In [1]:
"""
NOTEBOOK 02: DE-IDENTIFICATION PIPELINE
Remove PHI (Protected Health Information) from clinical notes
"""

import os
import json
import re
from datetime import datetime
import pandas as pd

# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# Paths
PROJECT_ROOT = "/content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project"
INPUT_DIR = f"{PROJECT_ROOT}/01_data_generation/outputs/raw_clinical_notes"
OUTPUT_DIR = f"{PROJECT_ROOT}/02_data_preprocessing/outputs"
DEID_DIR = f"{OUTPUT_DIR}/deidentified_notes"
LOGS_DIR = f"{OUTPUT_DIR}/deidentification_logs"

os.makedirs(DEID_DIR, exist_ok=True)
os.makedirs(LOGS_DIR, exist_ok=True)

# Load patient metadata
with open(f"{PROJECT_ROOT}/01_data_generation/outputs/patient_metadata.json", 'r') as f:
    metadata = json.load(f)

patients = metadata['patients']
print(f"‚úÖ Setup complete")
print(f"üìÇ Input: {INPUT_DIR}")
print(f"üìÇ Output: {DEID_DIR}")
print(f"üìä Patients to process: {len(patients)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Setup complete
üìÇ Input: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/01_data_generation/outputs/raw_clinical_notes
üìÇ Output: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/02_data_preprocessing/outputs/deidentified_notes
üìä Patients to process: 10


In [2]:
# PHI patterns to remove/replace
PHI_PATTERNS = {
    'names': r'Patient: \b[A-Z][a-z]+ [A-Z][a-z]+\b',  # First Last
    'dates': r'\d{4}-\d{2}-\d{2}',
    'ages': r'\b\d{2}\s+years?\b',
    'doctors': r'Dr\.\s+[A-Z][a-z]+',
    'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
    'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
}

# Replacement tokens
REPLACEMENTS = {
    'names': 'Patient: [PATIENT_NAME]',
    'dates': '[DATE]',
    'ages': '[AGE] years',
    'doctors': 'Dr. [PHYSICIAN]',
    'phone': '[PHONE]',
    'email': '[EMAIL]'
}

def deidentify_text(text, patient_name=None):
    """Remove PHI from clinical text"""
    deid_text = text
    redactions = []

    # Replace each PHI pattern
    for phi_type, pattern in PHI_PATTERNS.items():
        matches = re.findall(pattern, deid_text)
        if matches:
            for match in set(matches):  # Unique matches only
                redactions.append({
                    'type': phi_type,
                    'original': match,
                    'replacement': REPLACEMENTS[phi_type]
                })
            deid_text = re.sub(pattern, REPLACEMENTS[phi_type], deid_text)

    return deid_text, redactions

# Test on sample
sample_text = "Patient: John Smith (ID: patient_001)\nDate: 2024-05-15 | Age: 58 years"
deid_sample, redactions = deidentify_text(sample_text)
print("Original:", sample_text[:80])
print("De-identified:", deid_sample[:80])
print(f"‚úÖ Redacted {len(redactions)} PHI elements")

Original: Patient: John Smith (ID: patient_001)
Date: 2024-05-15 | Age: 58 years
De-identified: [PATIENT_NAME] (ID: patient_001)
Date: [DATE] | Age: [AGE] years
‚úÖ Redacted 3 PHI elements


In [3]:
# De-identify all notes
total_files = 0
total_redactions = 0
all_logs = []

for patient in patients:
    patient_id = patient['patient_id']
    patient_input_dir = f"{INPUT_DIR}/{patient_id}"
    patient_output_dir = f"{DEID_DIR}/{patient_id}"
    os.makedirs(patient_output_dir, exist_ok=True)

    # Process each visit note
    for filename in os.listdir(patient_input_dir):
        if not filename.endswith('.txt'):
            continue

        # Read original note
        with open(f"{patient_input_dir}/{filename}", 'r') as f:
            original_text = f.read()

        # De-identify
        deid_text, redactions = deidentify_text(original_text, patient['name'])

        # Save de-identified note
        with open(f"{patient_output_dir}/{filename}", 'w') as f:
            f.write(deid_text)

        # Log redactions
        all_logs.append({
            'patient_id': patient_id,
            'filename': filename,
            'num_redactions': len(redactions),
            'redaction_types': list(set([r['type'] for r in redactions])),
            'redactions': redactions
        })

        total_files += 1
        total_redactions += len(redactions)

print(f"‚úÖ Processed {total_files} files")
print(f"üìä Total redactions: {total_redactions}")
print(f"üìä Avg redactions per file: {total_redactions/total_files:.1f}")

‚úÖ Processed 36 files
üìä Total redactions: 108
üìä Avg redactions per file: 3.0


In [4]:
# Validate de-identification quality
validation_results = {
    'total_files': total_files,
    'files_with_phi': 0,
    'phi_types_found': {},
    'sample_leaks': []
}

# Check for remaining PHI in de-identified notes
for patient in patients:
    patient_output_dir = f"{DEID_DIR}/{patient['patient_id']}"

    for filename in os.listdir(patient_output_dir):
        with open(f"{patient_output_dir}/{filename}", 'r') as f:
            deid_text = f.read()

        # Check for patient name (should be redacted)
        if patient['name'] in deid_text:
            validation_results['files_with_phi'] += 1
            validation_results['sample_leaks'].append({
                'file': filename,
                'leak': 'patient_name'
            })

        # Check for original patient_id pattern
        if re.search(r'patient_\d{3}', deid_text):
            if 'patient_id' not in validation_results['phi_types_found']:
                validation_results['phi_types_found']['patient_id'] = 0
            validation_results['phi_types_found']['patient_id'] += 1

# Validation summary
print("="*70)
print("VALIDATION RESULTS")
print("="*70)
print(f"‚úÖ Files processed: {validation_results['total_files']}")
print(f"‚ö†Ô∏è  Files with potential PHI leaks: {validation_results['files_with_phi']}")
print(f"‚úÖ De-identification success rate: {((total_files - validation_results['files_with_phi'])/total_files)*100:.1f}%")

if validation_results['files_with_phi'] == 0:
    print("\n‚úÖ All validation checks passed!")
else:
    print(f"\n‚ö†Ô∏è Warning: Found {validation_results['files_with_phi']} potential leaks")

VALIDATION RESULTS
‚úÖ Files processed: 36
‚ö†Ô∏è  Files with potential PHI leaks: 0
‚úÖ De-identification success rate: 100.0%

‚úÖ All validation checks passed!


In [5]:
# Save detailed redaction logs
logs_file = f"{LOGS_DIR}/redaction_logs.json"
with open(logs_file, 'w') as f:
    json.dump(all_logs, f, indent=2)

# Create summary statistics
redaction_stats = pd.DataFrame([{
    'patient_id': log['patient_id'],
    'filename': log['filename'],
    'num_redactions': log['num_redactions'],
    'types': ', '.join(log['redaction_types'])
} for log in all_logs])

stats_file = f"{OUTPUT_DIR}/deidentification_summary.csv"
redaction_stats.to_csv(stats_file, index=False)

# Redaction type breakdown
type_counts = {}
for log in all_logs:
    for redaction in log['redactions']:
        phi_type = redaction['type']
        type_counts[phi_type] = type_counts.get(phi_type, 0) + 1

print("\nüìä Redaction Breakdown by Type:")
for phi_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"  ‚Ä¢ {phi_type}: {count}")

print(f"\n‚úÖ Logs saved to: {LOGS_DIR}")
print(f"‚úÖ Summary saved to: {stats_file}")


üìä Redaction Breakdown by Type:
  ‚Ä¢ names: 36
  ‚Ä¢ dates: 36
  ‚Ä¢ doctors: 36

‚úÖ Logs saved to: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/02_data_preprocessing/outputs/deidentification_logs
‚úÖ Summary saved to: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/02_data_preprocessing/outputs/deidentification_summary.csv


In [6]:
# Show before/after comparison
sample_patient = patients[0]
sample_file = f"visit_1_{sample_patient['visit_dates'][0]}.txt"

# Original
with open(f"{INPUT_DIR}/{sample_patient['patient_id']}/{sample_file}", 'r') as f:
    original = f.read()

# De-identified
with open(f"{DEID_DIR}/{sample_patient['patient_id']}/{sample_file}", 'r') as f:
    deidentified = f.read()

print("="*70)
print("BEFORE DE-IDENTIFICATION (First 400 chars)")
print("="*70)
print(original[:400])
print("\n" + "="*70)
print("AFTER DE-IDENTIFICATION (First 400 chars)")
print("="*70)
print(deidentified[:400])

BEFORE DE-IDENTIFICATION (First 400 chars)
CLINICAL NOTE
Patient: Allison Hill (ID: patient_001)
Date: 2024-05-01 | Age: 69 | Gender: Male
Visit Type: Initial Consultation

CHIEF COMPLAINT:
Follow-up for Asthma management.

HISTORY:
Patient presents for routine follow-up. Reports: shortness of breath, coughing. Managing Type 2 Diabetes, Hyperlipidemia, Asthma.

CURRENT 

AFTER DE-IDENTIFICATION (First 400 chars)
CLINICAL NOTE
[PATIENT_NAME] (ID: patient_001)
Date: [DATE] | Age: 69 | Gender: Male
Visit Type: Initial Consultation

CHIEF COMPLAINT:
Follow-up for Asthma management.

HISTORY:
Patient presents for routine follow-up. Reports: shortness of breath, coughing. Managing Type 2 Diabetes, Hyperlipidemia, Asthma.

CURRENT MEDICATIONS


In [7]:
# MLOps artifact logging
mlops_log = {
    "notebook": "02_deidentification_pipeline",
    "execution_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "data_version": "v1.0",
    "inputs": {
        "raw_notes_dir": INPUT_DIR,
        "num_patients": len(patients),
        "total_input_files": total_files
    },
    "outputs": {
        "deidentified_notes_dir": DEID_DIR,
        "logs_dir": LOGS_DIR,
        "summary_file": stats_file
    },
    "statistics": {
        "total_files_processed": total_files,
        "total_redactions": total_redactions,
        "avg_redactions_per_file": round(total_redactions/total_files, 2),
        "redaction_type_breakdown": type_counts
    },
    "validation": validation_results,
    "phi_patterns": list(PHI_PATTERNS.keys())
}

mlops_log_path = f"{OUTPUT_DIR}/mlops_deidentification_log.json"
with open(mlops_log_path, 'w') as f:
    json.dump(mlops_log, f, indent=2)

print("\n" + "="*70)
print("‚úÖ NOTEBOOK 02 COMPLETE")
print("="*70)
print(f"""
üìÇ Outputs:
  ‚Ä¢ De-identified notes: {DEID_DIR}
  ‚Ä¢ Redaction logs: {LOGS_DIR}
  ‚Ä¢ Summary: {stats_file}
  ‚Ä¢ MLOps log: {mlops_log_path}

üìä Summary:
  ‚Ä¢ Files processed: {total_files}
  ‚Ä¢ Total redactions: {total_redactions}
  ‚Ä¢ Success rate: {((total_files - validation_results['files_with_phi'])/total_files)*100:.1f}%

""")


‚úÖ NOTEBOOK 02 COMPLETE

üìÇ Outputs:
  ‚Ä¢ De-identified notes: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/02_data_preprocessing/outputs/deidentified_notes
  ‚Ä¢ Redaction logs: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/02_data_preprocessing/outputs/deidentification_logs
  ‚Ä¢ Summary: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/02_data_preprocessing/outputs/deidentification_summary.csv
  ‚Ä¢ MLOps log: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/02_data_preprocessing/outputs/mlops_deidentification_log.json

üìä Summary:
  ‚Ä¢ Files processed: 36
  ‚Ä¢ Total redactions: 108
  ‚Ä¢ Success rate: 100.0%


