In [None]:
import os
import re
import time
import requests
import pandas as pd
import fitz  # PyMuPDF for PDFs
import docx  # For .docx extraction

# Hugging Face API settings.
# HF_API_URL targets the hosted inference endpoint of the zero-shot model used below.
HF_API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
# The token is taken from the HF_TOKEN environment variable when it is set, so the
# secret does not have to be committed with the source. The original placeholder is
# kept as the fallback so the module still loads unchanged when the variable is absent.
HEADERS = {
    "Authorization": "Bearer " + (os.environ.get("HF_TOKEN") or "hf_your_actual_token")
}

# DLP dictionary (example).
# Layout: category -> "Fields" -> field name -> detection rules, where the rules are
#   Field_Identifier   : list of regexes that spot the field's label in a document
#   Content_Identifier : regex for the field's value, or None when the value has to
#                        be classified through the HF API instead
#   DLP_Strategy       : human-readable description of the detection approach
_complete_name_rules = dict(
    Field_Identifier=[
        r"\b(?:[Nn]ome)\b",
        r"\b[Nn]ome\s{0,3}(?:d[oa])?\s{0,3}[Pp]aciente\b",
    ],
    Content_Identifier=r"\b[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3}-?(?:da|de|do|dos|das|di|von)?\s{0,3}[A-ZÀ-Ÿ][a-zà-ÿ]+\b",
    DLP_Strategy="Detects full names.",
)

_diagnosis_rules = dict(
    Field_Identifier=[r"\b(?:Diagnóstico|Diagnósticos|Doença)\b"],
    Content_Identifier=None,  # Requires HF API classification
    DLP_Strategy="Uses NLP to classify medical diagnosis.",
)

icu_data_dict = {
    "Patient_Identifiers": {
        "Fields": {
            "Complete_Name": _complete_name_rules,
            "Diagnósticos": _diagnosis_rules,
        },
    },
}

# Hugging Face API Call for Classification
def classify_sensitive_data(text, max_retries=30, retry_delay=10, timeout=30):
    """
    Uses HF API to classify text into categories & determines if it's sensitive.

    Args:
        text: Text snippet sent to the zero-shot classification endpoint.
        max_retries: How many times to retry while the API answers 503
            (model still loading) before giving up.
        retry_delay: Seconds to sleep between two 503 retries.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        The first label of the response when its score is above 0.5,
        ``"Non-Sensitive"`` when it is not, and ``None`` when the text is
        empty, the request fails, the response cannot be parsed, or the model
        never became available.
    """
    # Nothing to classify: avoid a pointless network round trip.
    if not text or not text.strip():
        return None

    payload = {
        "inputs": text,
        "parameters": {
            "candidate_labels": [
                "Medical Diagnosis", "Medication", "Clinical Notes",
                "Patient Identifiers", "General Information"
            ],
            "multi_label": True
        }
    }

    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        try:
            response = requests.post(
                HF_API_URL, headers=HEADERS, json=payload, timeout=timeout
            )
        except requests.RequestException as exc:
            # Connection problems and timeouts are reported like any other API error.
            print(f"Error: request failed, {exc}")
            return None

        if response.status_code == 503:
            # 503 is treated as "model still loading"; retry a bounded number of times.
            if attempt + 1 < attempts:
                print(f"Model is still loading... Retrying in {retry_delay} seconds.")
                time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            return None

        try:
            results = response.json()
            labels = results["labels"]
            scores = results["scores"]
        except (ValueError, KeyError, TypeError) as exc:
            print(f"Error: unexpected response format, {exc}")
            return None

        if not labels or not scores:
            print("Error: response contained no labels.")
            return None

        # Print probability scores
        print("\n🔹 **Classification Probabilities:**")
        for label, score in zip(labels, scores):
            print(f"{label}: {score:.2%}")

        # Return the top classification label.
        # NOTE(review): this assumes the API lists labels by descending score - confirm.
        return labels[0] if scores[0] > 0.5 else "Non-Sensitive"

    print(f"Error: model still unavailable after {attempts} attempts.")
    return None

# Function to Extract Text from Different File Types
def extract_text_from_file(file_path):
    """Extracts text from PDFs, DOCX, and CSV files.

    The file type is chosen from the (case-insensitive) file extension.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text as a single string. For CSV files this is the
        column headers followed by every non-empty cell, separated by spaces,
        or an empty string when the file has no data. ``None`` is returned
        for unsupported extensions.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)

    if ext == ".docx":
        document = docx.Document(file_path)
        return " ".join(para.text for para in document.paragraphs)

    if ext == ".csv":
        try:
            # read_csv has no `errors` keyword; undecodable bytes are skipped
            # through `encoding_errors`.
            df = pd.read_csv(
                file_path, dtype=str, encoding="utf-8", encoding_errors="ignore"
            )
        except pd.errors.EmptyDataError:
            return ""  # completely empty file: no text to scan

        # Missing cells would otherwise be rendered as the literal text "nan".
        df = df.fillna("")
        # Headers are kept because field labels usually live in the column names.
        header_cells = [str(column) for column in df.columns]
        body_cells = [str(cell) for cell in df.to_numpy().ravel()]
        return " ".join(cell for cell in header_cells + body_cells if cell)

    return None  # Unsupported file type

# Function to Scan Folder for Sensitive Data
def scan_folder_for_sensitive_data(folder_path):
    """Traverses folders, extracts text, matches DLP patterns, and classifies sensitive data.

    For every file under ``folder_path`` the extracted text is checked against
    each field of ``icu_data_dict``. A field is reported at most once per
    file: when any of its ``Field_Identifier`` patterns occurs in the text,
    either the ``Content_Identifier`` regex must also match (classification
    ``"Sensitive"``) or, when that regex is ``None``, the first 500 characters
    are sent to ``classify_sensitive_data`` and its result is recorded.

    Args:
        folder_path: Root folder to walk recursively.

    Returns:
        A DataFrame with the columns "File Path", "Filename",
        "Detected Field" and "Classification"; it is also printed.
    """
    alerts = []

    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                file_text = extract_text_from_file(file_path)
            except Exception as exc:
                # Best-effort scan: one corrupt or unreadable file must not
                # abort the whole folder traversal.
                print(f"Skipping {file_path}: could not extract text ({exc})")
                continue

            if not file_text:
                continue  # Skip if no text extracted

            for data in icu_data_dict.values():
                for field_name, field_info in data["Fields"].items():
                    # Field labels are matched case-insensitively; any() keeps
                    # it to one alert (and one API call) per field and file.
                    label_found = any(
                        re.search(field_pattern, file_text, re.IGNORECASE)
                        for field_pattern in field_info["Field_Identifier"]
                    )
                    if not label_found:
                        continue

                    content_pattern = field_info["Content_Identifier"]
                    if content_pattern is None:
                        # Complex field: let the HF API classify the first 500 chars.
                        classification = classify_sensitive_data(file_text[:500])
                        alerts.append((file_path, file, field_name, classification))
                    elif re.search(content_pattern, file_text):
                        # Content regexes rely on letter case (e.g. capitalised
                        # names), so they are matched without re.IGNORECASE.
                        alerts.append((file_path, file, field_name, "Sensitive"))

    # Display Results
    df_alerts = pd.DataFrame(alerts, columns=["File Path", "Filename", "Detected Field", "Classification"])
    print(df_alerts)
    return df_alerts

# Kick off the scan: point folder_to_scan at the directory tree to inspect
# (change it to your target folder); the resulting alert table is kept in scan_results.
folder_to_scan = "/path/to/your/documents"
scan_results = scan_folder_for_sensitive_data(folder_path=folder_to_scan)
