# In [None]:
import os, re, shutil
import pandas as pd
from tqdm import tqdm
from pdfminer.high_level import extract_text
from pathlib import Path

# === CONFIG ===
# Source tree that is walked for filings, the year label written into output
# names/rows, and the flat folder that receives the renamed copies.
BASE_DIR = r"D:/SEDAR/2022"
year = 2022
OUTPUT_DIR = r"D:/SEDAR_texts"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Firm list. The three columns are pulled out as parallel lists that the
# processing loop indexes together: a = profile, b = sedar_name, c = gvkey.
data = pd.read_csv("D:/SEDAR/matched_2022_clean.csv")
a, b, c = (
    list(data[column]) for column in ("profile", "sedar_name", "gvkey")
)


# === STEP 1. Keyword setup ===
# Substrings searched for (case-insensitively) by extract_cyber_clauses.
# Leading/trailing spaces inside an entry are significant: " SOC " is padded
# so that it does not match inside longer words.
CYBER_KEYWORDS = [
    "cyber",
    "data breach",
    "information security",
    "network intrusion",
    "data protection",
    "information systems",
    "data privacy",
    "personal data",
    "privacy",
    "data processing",
    "security incident",
    "private data",
    "information technology security",
    "security breach",
    "data restoration",
    " SOC ",
    "IT asset",
    "Privacy Legal Requirements",
    "unauthorized access",
]
# === STEP 2. Helper functions ===
def long_path(p):
    r"""Return *p* as an absolute ``Path``, adding the ``\\?\`` prefix on Windows.

    The prefix is only added when ``os.name == "nt"`` and the absolute path
    does not already start with it; on other platforms the absolute path is
    returned unchanged. (Going by the function name, the prefix is there to
    get past Windows path-length limits.)
    """
    absolute = os.path.abspath(p)
    needs_prefix = os.name == "nt" and not absolute.startswith("\\\\?\\")
    return Path("\\\\?\\" + absolute if needs_prefix else absolute)

def safe_name(s):
    """Turn *s* into a string usable as a file-name component.

    Each of the characters ``\\ / * ? : " < > |`` becomes ``_``; the result is
    then stripped of surrounding whitespace and every remaining run of
    whitespace is collapsed to a single ``_``.
    """
    without_reserved = re.sub(r'[\\/*?:"<>|]', "_", s)
    trimmed = without_reserved.strip()
    return re.sub(r"\s+", "_", trimmed)

def pdf_to_text(pdf_path):
    """Extract the text of the PDF at *pdf_path*, best effort.

    Delegates to ``extract_text``. Any exception is reported on stdout and
    turned into an empty string, so one unreadable file does not stop a batch.
    """
    try:
        extracted = extract_text(pdf_path)
    except Exception as e:
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        extracted = ""
    return extracted

# Tried in order; the first pattern that matches anywhere in the text wins.
# "master services" is listed ahead of plain "services" inside the same
# alternation: as a separate, later pattern it could never win, because any
# text containing "master services agreement" also contains "services agreement".
_CONTRACT_TYPE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\b(master\s+services?|services?|supply|licen[sc]e|distribution"
        r"|outsourcing|data\s+processing|consulting)\s+agreement\b",
        r"\b(amend(ed|ment)\s+to\s+.*agreement)\b",
    )
)


def find_contract_type(text):
    """Guess the contract type named in *text*.

    Args:
        text: Full document text (may be empty).

    Returns:
        The matched phrase in title case with internal whitespace collapsed to
        single spaces (e.g. ``"Master Services Agreement"``), or ``None`` when
        no known agreement phrase is found.
    """
    if not text:
        return None
    for pattern in _CONTRACT_TYPE_PATTERNS:
        match = pattern.search(text)
        if match:
            # Extracted PDF text often wraps mid-phrase; join the words with
            # single spaces so the label never carries newlines or runs of blanks.
            return " ".join(match.group(0).split()).title()
    return None

def extract_cyber_clauses(text, keywords):
    """Return the paragraphs of *text* that mention any of *keywords*.

    Paragraphs are separated by blank lines. Each paragraph has its whitespace
    collapsed to single spaces *before* matching, so a multi-word keyword that
    is split by a line wrap (``"data\\nbreach"``) is still found. Matching is
    case-insensitive substring search.

    Args:
        text: Document text.
        keywords: Iterable of substrings to look for. A keyword written with
            surrounding spaces (e.g. ``" SOC "``) also matches at the very
            start or end of a paragraph.

    Returns:
        List of whitespace-normalised paragraphs longer than 30 characters
        that contain at least one keyword, in document order.
    """
    # Lower-case the keywords once instead of once per paragraph.
    needles = [k.lower() for k in keywords]
    matches = []
    for para in re.split(r"\n\s*\n", text):
        clean_para = re.sub(r"\s+", " ", para).strip()
        if len(clean_para) <= 30:  # skip tiny fragments
            continue
        # Pad with spaces so space-delimited keywords can match at the edges.
        haystack = f" {clean_para.lower()} "
        if any(needle in haystack for needle in needles):
            matches.append(clean_para)
    return matches

# === STEP 3. Process all firms ===
# One dict per extracted clause. Initialised here so the script runs on its
# own; without it the first append below raises NameError.
rows = []

for i, (profile, firm_name, gvkey) in enumerate(zip(a, b, c)):
    print(i)  # progress: index of the firm being processed
    # Walk BASE_DIR/"<profile left-padded with zeros to 9 chars> <sedar_name>".
    firm_dir = BASE_DIR + "/" + str(profile).zfill(9) + " " + firm_name
    for root, _, files in os.walk(firm_dir):
        for f in files:
            # Make the copy's name end in ".pdf":
            #  - no ".pdf" anywhere        -> append it;
            #  - ".pdf" not at the end     -> lower-case the name and move the
            #    first ".pdf" to the end (anything after a second ".pdf" is dropped);
            #  - already ends with ".pdf"  -> keep as is.
            lowered = f.lower()
            if ".pdf" not in lowered:
                t = f + ".pdf"
            elif not lowered.endswith(".pdf"):
                pieces = lowered.split(".pdf")
                t = pieces[0] + pieces[1] + ".pdf"
            else:
                t = f
            pdf_path = os.path.join(root, f)

            # Copy into the flat output folder as "<firm>_<year>_<file>".
            new_name = f"{safe_name(firm_name)}_{year}_{safe_name(t)}"
            new_path = os.path.join(OUTPUT_DIR, new_name)
            shutil.copy2(long_path(pdf_path), new_path)

            # Extract text and find clauses
            text = pdf_to_text(new_path)
            contract_type = find_contract_type(text)
            clauses = extract_cyber_clauses(text, CYBER_KEYWORDS)

            rows.extend(
                {
                    "profile": profile,
                    "gvkey": gvkey,
                    "firm_name": firm_name,
                    "year": year,
                    "file_name": new_name,
                    "contract_type": contract_type,
                    "clause_text": clause,
                }
                for clause in clauses
            )

# === STEP 4. Save results ===
# Columns are given explicitly so that an empty result still produces a CSV
# with headers and the summary below does not fail on a missing "file_name".
df = pd.DataFrame(
    rows,
    columns=[
        "profile",
        "gvkey",
        "firm_name",
        "year",
        "file_name",
        "contract_type",
        "clause_text",
    ],
)
df.to_csv(f"D:/cyber_clauses_by_firm_{year}.csv", index=False, encoding="utf-8-sig")

n_contracts = df["file_name"].nunique()
print(f"✅ Extracted {len(df)} clauses from {n_contracts} contracts.")
