In [1]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path
from IPython.display import HTML, display
import warnings
warnings.filterwarnings('ignore')

# Make the project's `src` directory importable.
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a direct subfolder of the repo — TODO confirm).
project_root = Path().absolute().parent
_src_dir = str(project_root / 'src')
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Import existing functions
from evaluation.metrics import PIIEvaluator
from evaluation.diagnostics import (
    get_transcript_cases_by_performance,
    create_three_stage_html_table  # New function we'll add
)
from baseline.presidio_framework import PurePresidioFramework
from utils.text_normaliser import TextNormaliser


In [2]:
# Demo settings — everything a reader might want to tune lives here.
DATA_PATH = project_root / '.data' / 'synthetic_call_transcripts_voice_to_texts.csv'
EVALUATION_MODE = 'business'  # Favour "was the PII protected" over exact entity-type matching
N_SAMPLES = 3  # Number of leading transcripts used by the demo

print(f"🔍 Loading data from: {DATA_PATH}")

# Fail fast (after reporting the problem) when the CSV is not where we expect it.
if not DATA_PATH.exists():
    print(f"❌ Data file not found at {DATA_PATH}")
    raise FileNotFoundError(f"Please ensure {DATA_PATH} exists")

# Read the voice-to-text transcripts and report what was loaded.
raw_df = pd.read_csv(DATA_PATH)
print(f"✅ Loaded raw data: {len(raw_df)} transcripts")
print(f"📋 Columns: {list(raw_df.columns)}")

# Build the canonical frame by selecting columns by NAME, never by position.
# The previous positional lookups (iloc 3 / 6 / 8) assumed a layout with repeated
# columns; against the 9-column file printed by the load step, positions 6 and 8
# are `member_email` and `call_transcript`, so the mobile and email ground truth
# was silently filled with the wrong data.
print(f"\n🔧 Converting to canonical format...")

CANONICAL_COLUMNS = [
    'call_id',
    'consultant_first_name',
    'member_number',
    'member_first_name',
    'member_full_name',
    'member_mobile',
    'member_email',
    'member_address',
    'call_transcript',
]

# If a label is repeated, keep only its first occurrence (the canonical one).
# NOTE(review): pandas.read_csv normally renames repeated headers to `name.1`,
# `name.2`, ... so the plain name already refers to the first column — confirm
# against the real file if it ever carries duplicates.
_deduplicated_df = raw_df.loc[:, ~raw_df.columns.duplicated()]

_missing_columns = [name for name in CANONICAL_COLUMNS if name not in _deduplicated_df.columns]
if _missing_columns:
    raise KeyError(f"Input data is missing required columns: {_missing_columns}")

# Member numbers are identifiers, not quantities, so they are carried as strings.
clean_df = _deduplicated_df[CANONICAL_COLUMNS].assign(
    member_number=lambda frame: frame['member_number'].astype(str)
)

print(f"✅ Canonical format: {clean_df.shape}")
print(f"📊 Using first {N_SAMPLES} transcripts for demo")

# Work on an independent copy of the first N_SAMPLES rows for the demo.
demo_df = clean_df.head(N_SAMPLES).copy()


🔍 Loading data from: c:\Users\ningw\Desktop\Repo\agentic-pii-deidentification_original\.data\synthetic_call_transcripts_voice_to_texts.csv
✅ Loaded raw data: 3 transcripts
📋 Columns: ['call_id', 'consultant_first_name', 'member_number', 'member_first_name', 'member_full_name', 'member_mobile', 'member_email', 'member_address', 'call_transcript']

🔧 Converting to canonical format...
✅ Canonical format: (3, 9)
📊 Using first 3 transcripts for demo


In [3]:
# Set up the two pipeline components (MLflow tracking is switched off for the demo).
normalizer = TextNormaliser()
framework = PurePresidioFramework(enable_mlflow=False)


def _run_three_stages(record):
    """Push one transcript row through the original -> normalized -> cleaned stages.

    Args:
        record: One row of ``demo_df``; must support ``record['column']`` lookups.

    Returns:
        dict: The three text stages, the PII detections reported by the framework,
        length/normalization metadata, and the row's ground-truth member and
        consultant fields.
    """
    transcript_id = record['call_id']

    # Stage A is the transcript exactly as stored; stage B is its normalized form.
    original_text = record['call_transcript']
    normalized_text = normalizer.normalize_text(original_text)

    # Stage C: the PII framework is fed the normalized text, not the raw transcript.
    framework_output = framework.process_transcript(normalized_text)
    cleaned_text = framework_output['anonymized_text']

    return {
        'call_id': transcript_id,
        'stage_a_original': original_text,
        'stage_b_normalized': normalized_text,
        'stage_c_cleaned': cleaned_text,
        'pii_detections': framework_output['pii_detections'],
        'processing_metadata': {
            'normalization_applied': original_text != normalized_text,
            'original_length': len(original_text),
            'normalized_length': len(normalized_text),
            'cleaned_length': len(cleaned_text),
        },
        # Ground-truth values taken straight from the canonical row.
        **{
            field: record[field]
            for field in (
                'member_first_name',
                'member_full_name',
                'member_email',
                'member_mobile',
                'member_address',
            )
        },
        'member_number': str(record['member_number']),
        'consultant_first_name': record['consultant_first_name'],
    }


print(f"🔄 Processing {len(demo_df)} transcripts through three-stage workflow...")

# Collect one result record per transcript, reporting progress as we go.
three_stage_results = []
for _, transcript_row in demo_df.iterrows():
    outcome = _run_three_stages(transcript_row)
    three_stage_results.append(outcome)
    print(f"   ✅ Processed call_id {outcome['call_id']}")

print(f"\n🎉 Three-stage processing complete!")


🔄 Processing 3 transcripts through three-stage workflow...
   ✅ Processed call_id 1
   ✅ Processed call_id 2
   ✅ Processed call_id 3

🎉 Three-stage processing complete!


In [None]:
# Render the three-stage workflow results as an HTML table.
# The matching mode comes from the EVALUATION_MODE constant in the configuration
# cell, so changing the mode there is reflected here instead of being silently
# overridden by a hard-coded 'business'.
html = create_three_stage_html_table(
    transcript_data=three_stage_results,
    title="Three-Stage Workflow Analysis",
    description="Demonstration of original, normalized, and cleaned transcripts with PII detection results.",
    matching_mode=EVALUATION_MODE,
    show_original=False
)

# `display` and `HTML` are already imported in the notebook's first cell.
display(HTML(html))