In [None]:
#!/usr/bin/env python3
import os
import time
import logging
import requests
import pandas as pd
import fitz  # PyMuPDF for PDFs
from docx import Document
from docx.shared import Inches

# ==============================
# Configuration
# ==============================
# Hugging Face API Configuration:
API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
# SECURITY: the access token must never be committed to source control. It is
# read from the HF_TOKEN environment variable instead; any token that was
# previously hard-coded here should be treated as leaked and revoked.
API_TOKEN = os.environ.get("HF_TOKEN", "")
# Only send an Authorization header when a token is actually configured, so a
# missing token does not produce a malformed "Bearer " header.
HEADERS = {"Authorization": f"Bearer {API_TOKEN}"} if API_TOKEN else {}

# Configure logging
# Route timestamped, level-tagged records to the console through the root
# logger, then create this module's own logger.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
_console_handler = logging.StreamHandler()
logging.basicConfig(
    handlers=[_console_handler],
    format=_LOG_FORMAT,
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# ==============================
# Classification Function
# ==============================
def classify_sensitive_data(text, max_retries=3, threshold=0.5, timeout=30):
    """
    Classifies text using the Hugging Face zero-shot API as "Sensitive" or
    "Non-Sensitive".

    The request is sent with ``multi_label=True``, so the response lists every
    candidate label together with a score. The text is therefore considered
    sensitive only when at least one sensitive label scores at or above
    ``threshold``; merely checking which labels are present would flag every
    input.

    Args:
        text: The text to classify.
        max_retries: Maximum number of attempts while the API answers 503
            (model still loading).
        threshold: Minimum score a sensitive label must reach for the text to
            be reported as "Sensitive".
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        "Sensitive", "Non-Sensitive", or None when the request fails, the
        response is malformed, or the retries are exhausted.
    """
    sensitive_categories = {
        "Medical Diagnosis", "Medication", "Clinical Notes", "Patient Identifiers"
    }
    payload = {
        "inputs": text,
        "parameters": {
            "candidate_labels": [
                "Medical Diagnosis", "Medication", "Clinical Notes",
                "Patient Identifiers", "General Information"
            ],
            "multi_label": True
        }
    }

    for attempt in range(1, max_retries + 1):
        try:
            # A timeout keeps one stalled connection from hanging the whole scan.
            response = requests.post(
                API_URL, headers=HEADERS, json=payload, timeout=timeout
            )
        except requests.RequestException as e:
            logger.error(f"Error calling Hugging Face API: {e}")
            return None

        if response.status_code == 503:
            # Model is still loading; wait only if another attempt will follow.
            if attempt < max_retries:
                logger.info("Model loading... Retrying in 10 seconds.")
                time.sleep(10)
            continue

        if response.status_code != 200:
            logger.error(f"Error: {response.status_code} - {response.text}")
            return None

        try:
            results = response.json()
            labels = results.get("labels")
            scores = results.get("scores")
        except (ValueError, AttributeError, TypeError) as e:
            logger.error(f"Error parsing API response: {e}")
            return None

        if not labels or not scores or len(labels) != len(scores):
            # Never report "Non-Sensitive" on the strength of a malformed reply.
            logger.error(f"Error parsing API response: unexpected payload {results!r}")
            return None

        try:
            is_sensitive = any(
                label in sensitive_categories and float(score) >= threshold
                for label, score in zip(labels, scores)
            )
        except (TypeError, ValueError) as e:
            logger.error(f"Error parsing API response: {e}")
            return None
        return "Sensitive" if is_sensitive else "Non-Sensitive"

    logger.error("Max retries exceeded for classification.")
    return None

# ==============================
# File Text Extraction
# ==============================
def extract_text_from_file(file_path):
    """
    Extracts text from PDF, DOCX, or CSV files.

    The file type is chosen from the (case-insensitive) file extension.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text joined with single spaces, or None if the file type
        is unsupported or an error occurs (the error is logged).
    """
    ext = os.path.splitext(file_path)[-1].lower()
    try:
        if ext == ".pdf":
            # The context manager closes the document even if a page fails.
            with fitz.open(file_path) as doc:
                return " ".join(page.get_text("text") for page in doc)
        if ext == ".docx":
            doc = Document(file_path)
            return " ".join(para.text for para in doc.paragraphs)
        if ext == ".csv":
            df = pd.read_csv(file_path, dtype=str, encoding="utf-8")
            # Blank cells are read as NaN; replace them so the literal token
            # "nan" does not end up in the text handed to the classifier.
            return " ".join(df.fillna("").astype(str).to_numpy().ravel())
    except Exception as e:
        logger.error(f"Error extracting text from {file_path}: {e}")
    return None

# ==============================
# PDF Processing Functions
# ==============================
def add_watermark_to_pdf(pdf_path, watermark_text="SENSITIVE DATA - observe company policy."):
    """
    Adds a watermark to a PDF file if not already present on each page.

    The watermark is applied by saving to a temporary file (with a full save)
    next to the original and then replacing the original file. Failures are
    logged and never raised to the caller.

    Args:
        pdf_path: Path of the PDF to watermark in place.
        watermark_text: Text inserted on every page that does not already
            contain it.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logger.error(f"Could not open PDF {pdf_path}: {e}")
        return

    # Build the temp name from the real extension: str.replace(".pdf", ...) is
    # case-sensitive and touches every occurrence, so "X.PDF" would map onto
    # itself and "a.pdf.d/b.pdf" would point into a different directory.
    base, ext = os.path.splitext(pdf_path)
    temp_pdf = f"{base}_watermarked{ext}"
    save_attempted = False

    try:
        # authenticate() reports failure through a falsy return value rather
        # than an exception, so the result has to be checked explicitly.
        if doc.is_encrypted and not doc.authenticate(""):
            logger.error(f"Skipping encrypted PDF {pdf_path}: password required")
            return

        modified = False
        for page in doc:
            # Insert the watermark only on pages that do not already have it.
            if not page.search_for(watermark_text):
                page.insert_text(
                    (50, 500),
                    watermark_text,
                    fontsize=15,
                    color=(1, 0, 0),
                    rotate=90
                )
                modified = True

        if not modified:
            logger.info(f"Watermark already present in PDF: {pdf_path}")
            return

        # Save to a temporary file using a full (non-incremental) save.
        save_attempted = True
        doc.save(temp_pdf, incremental=False)
    except Exception as e:
        logger.error(f"Error processing PDF {pdf_path}: {e}")
        if save_attempted and os.path.exists(temp_pdf):
            # Do not leave a partially written temp file behind.
            try:
                os.remove(temp_pdf)
            except OSError:
                pass
        return
    finally:
        # Always release the handle; it must be closed before the replace below.
        doc.close()

    try:
        # Replace the original file with the updated file.
        os.replace(temp_pdf, pdf_path)
        logger.info(f"Watermark added to PDF: {pdf_path}")
    except OSError as e:
        logger.error(f"Error processing PDF {pdf_path}: {e}")
        try:
            os.remove(temp_pdf)
        except OSError:
            pass


def modify_pdf_metadata(pdf_path):
    """
    Modifies metadata of a PDF file to mark it as 'Sensitive'.

    Sets the subject, keywords and producer fields, saves to a temporary file
    next to the original and then replaces the original file. Failures are
    logged and never raised to the caller.

    Args:
        pdf_path: Path of the PDF whose metadata is updated in place.
    """
    # Build the temp name from the real extension: str.replace(".pdf", ...) is
    # case-sensitive and touches every occurrence in the path.
    base, ext = os.path.splitext(pdf_path)
    temp_pdf = f"{base}_metadata{ext}"
    save_attempted = False

    try:
        doc = fitz.open(pdf_path)
        try:
            metadata = dict(doc.metadata or {})
            metadata["subject"] = "Sensitive Document"
            metadata["keywords"] = "Sensitive, Confidential, Restricted"
            metadata["producer"] = "DLP System"
            doc.set_metadata(metadata)

            # Save to a temporary file instead of modifying the original directly.
            save_attempted = True
            doc.save(temp_pdf)  # Full save to temp file
        finally:
            # Close even on failure; must be closed before the replace below.
            doc.close()

        # Replace the original PDF with the updated one.
        os.replace(temp_pdf, pdf_path)
        logger.info(f"Metadata updated in PDF: {pdf_path}")
    except Exception as e:
        logger.error(f"Error modifying PDF metadata for {pdf_path}: {e}")
        if save_attempted and os.path.exists(temp_pdf):
            # Do not leave a partially written temp file behind.
            try:
                os.remove(temp_pdf)
            except OSError:
                pass



# ==============================
# DOCX Processing Functions
# ==============================
from docx.shared import RGBColor
from docx.shared import RGBColor, Pt

def add_watermark_to_docx(docx_path, watermark_text="SENSITIVE DATA - observe company policy."):
    """
    Adds a red, bold watermark line with font size 14 to the header of a DOCX file.

    Existing header content is preserved: the watermark goes into the first
    header paragraph when that paragraph has no text, otherwise into a new
    paragraph inserted above it. If the header already contains the watermark
    text the file is left untouched. Failures are logged and never raised.

    Args:
        docx_path: Path of the DOCX file to watermark in place.
        watermark_text: Text of the watermark line.
    """
    try:
        doc = Document(docx_path)
        header = doc.sections[0].header
        paragraphs = header.paragraphs

        # Skip files that already carry the watermark so repeated scans do not
        # stack duplicate lines.
        if any(watermark_text in para.text for para in paragraphs):
            logger.info(f"Watermark already present in DOCX: {docx_path}")
            return

        if not paragraphs:
            header_paragraph = header.add_paragraph()
        elif not paragraphs[0].text.strip():
            # Reuse a text-less first paragraph; nothing is cleared from it.
            header_paragraph = paragraphs[0]
        else:
            # Keep the document's own header text and put the watermark above it.
            header_paragraph = paragraphs[0].insert_paragraph_before()

        # Add a new run with the watermark text and set its formatting.
        run = header_paragraph.add_run(watermark_text)
        run.font.color.rgb = RGBColor(255, 0, 0)  # Set text color to red.
        run.font.bold = True                      # Make the text bold.
        run.font.size = Pt(14)                    # Set font size to 14 points.

        doc.save(docx_path)
        logger.info(f"Watermark added to DOCX: {docx_path}")
    except Exception as e:
        logger.error(f"Error processing DOCX {docx_path}: {e}")



def modify_docx_metadata(docx_path):
    """
    Marks a DOCX file as 'Sensitive' through its core document properties.

    Overwrites the subject, keywords and author properties and saves the
    document back to the same path. Any failure is logged rather than raised.
    """
    sensitive_properties = (
        ("subject", "Sensitive Document"),
        ("keywords", "Sensitive, Confidential"),
        ("author", "DLP System"),
    )
    try:
        document = Document(docx_path)
        properties = document.core_properties
        for prop_name, prop_value in sensitive_properties:
            setattr(properties, prop_name, prop_value)
        document.save(docx_path)
    except Exception as err:
        logger.error(f"Error modifying DOCX metadata for {docx_path}: {err}")
    else:
        logger.info(f"Metadata updated in DOCX: {docx_path}")

# ==============================
# CSV Processing Function
# ==============================
def process_csv_file(csv_path):
    """
    Adds a 'Sensitive_Flag' column to a CSV file.

    The file is read with every column as a string, a 'Sensitive_Flag' column
    holding "True" is set on every row, and the result is written back to the
    same path. Failures are logged and never raised to the caller.

    Args:
        csv_path: Path of the CSV file to flag in place.
    """
    try:
        # keep_default_na=False stops pandas from turning literal cells such as
        # "NA", "null" or "N/A" into missing values, which would then be
        # written back as empty cells and silently change the user's data.
        df = pd.read_csv(
            csv_path, dtype=str, encoding="utf-8", keep_default_na=False
        )
        df["Sensitive_Flag"] = "True"
        df.to_csv(csv_path, index=False, encoding="utf-8")
        logger.info(f"Sensitivity flag added to CSV: {csv_path}")
    except Exception as e:
        logger.error(f"Error processing CSV {csv_path}: {e}")

# ==============================
# Main Processing Function
# ==============================
def scan_folder_for_sensitive_data(folder_path):
    """
    Walks a folder tree, classifies each file's extracted text, and marks the
    files classified as "Sensitive".

    PDF and DOCX files get a watermark followed by a metadata update; CSV files
    get a sensitivity flag column. Files with no extractable text, or that
    cannot be classified, are skipped.
    """
    # Marking steps to run, in order, for each supported extension.
    handlers_by_ext = {
        ".pdf": (add_watermark_to_pdf, modify_pdf_metadata),
        ".docx": (add_watermark_to_docx, modify_docx_metadata),
        ".csv": (process_csv_file,),
    }

    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(current_dir, filename)

            text = extract_text_from_file(file_path)
            if not text:
                logger.debug(f"No text extracted from {file_path}. Skipping.")
                continue

            classification = classify_sensitive_data(text)
            if classification is None:
                logger.error(f"Could not classify {file_path}. Skipping.")
                continue

            logger.info(f"File: {file_path} -> Classification: {classification}")
            if classification != "Sensitive":
                continue

            suffix = os.path.splitext(filename)[-1].lower()
            handlers = handlers_by_ext.get(suffix)
            if handlers is None:
                logger.warning(f"Unsupported file type for {file_path}")
                continue

            for handler in handlers:
                handler(file_path)

# ==============================
# Command-line Interface
# ==============================


In [51]:
# Run the scan over the target folder. NOTE(review): `folder_to_scan` is not
# defined in the visible code — presumably set in an earlier notebook cell; confirm.
scan_folder_for_sensitive_data(folder_to_scan)

2025-02-08 19:49:21,671 [INFO] File: test_documents\test_file_1.csv -> Classification: Sensitive
2025-02-08 19:49:21,677 [INFO] Sensitivity flag added to CSV: test_documents\test_file_1.csv
2025-02-08 19:49:21,901 [INFO] File: test_documents\test_file_1.docx -> Classification: Sensitive
2025-02-08 19:49:21,965 [INFO] Watermark added to DOCX: test_documents\test_file_1.docx
2025-02-08 19:49:22,022 [INFO] Metadata updated in DOCX: test_documents\test_file_1.docx
2025-02-08 19:49:22,238 [INFO] File: test_documents\test_file_1.pdf -> Classification: Sensitive
2025-02-08 19:49:22,245 [INFO] Watermark already present in PDF: test_documents\test_file_1.pdf
2025-02-08 19:49:22,254 [INFO] Metadata updated in PDF: test_documents\test_file_1.pdf
2025-02-08 19:49:22,462 [INFO] File: test_documents\test_file_2.csv -> Classification: Sensitive
2025-02-08 19:49:22,468 [INFO] Sensitivity flag added to CSV: test_documents\test_file_2.csv
2025-02-08 19:49:22,675 [INFO] File: test_documents\test_file_2.d