# Error Analysis
## AI Document Intelligence - Week 1

This notebook analyzes OCR errors by comparing the OCR output against ground-truth labels.

## Setup

In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# that the `src.*` imports below can be resolved. This relies on the kernel's
# working directory sitting one level below the directory that contains `src`
# (presumably the project root — confirm against the repo layout).
sys.path.insert(0, str(Path.cwd().parent))

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from src.ocr import OCRErrorAnalyzer
from src.utils.file_utils import read_json, list_files

# Configure matplotlib: render figures inline and set the default figure size
# (individual cells below override the size where they need to).
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)

## Initialize Error Analyzer

In [None]:
# The dataset directory is a sibling of the notebook's working directory.
dataset_path = Path.cwd().parent.joinpath("dataset")

# The analyzer is handed plain string paths: one for the ground-truth labels,
# one for the root of the stored OCR output.
analyzer = OCRErrorAnalyzer(
    labels_dir=str(dataset_path.joinpath("labels")),
    ocr_results_dir=str(dataset_path.joinpath("ocr_text")),
)

print("Error Analyzer initialized")

## Analyze Single Document

In [None]:
# Collect the label files, skipping any whose name ends in "_metadata.json".
labels_dir = dataset_path / "labels"
label_files = [
    path
    for path in list_files(labels_dir, pattern="*.json", recursive=False)
    if not path.name.endswith("_metadata.json")
]

if not label_files:
    print("No labels found. Generate dataset first.")
else:
    # The first label file serves as the sample; its stem is the document ID.
    sample_doc_id = label_files[0].stem
    print(f"Analyzing document: {sample_doc_id}")

    # Field-by-field comparison for the PaddleOCR output of that document.
    errors = analyzer.analyze_field_errors(sample_doc_id, engine="paddle")

    if not errors:
        print("No OCR results found. Run notebook 02 first.")
    else:
        print("\nField Analysis:")
        print("=" * 80)
        for field_name, details in errors['field_errors'].items():
            print(f"\n{field_name.upper()}:")
            print(f"  Ground Truth: {details['ground_truth']}")
            print(f"  OCR Extracted: {details['ocr_extracted']}")
            print(f"  Similarity: {details['similarity']:.2%}")
            verdict = '✓' if details['is_correct'] else '✗'
            print(f"  Correct: {verdict}")
        print("=" * 80)

## Generate Comprehensive Error Report

In [None]:
# Get all document IDs (label files ending in "_metadata.json" are skipped)
doc_ids = [f.stem for f in label_files if not f.name.endswith("_metadata.json")]

# Filter to docs with OCR results
ocr_dir = dataset_path / "ocr_text" / "paddle"
ocr_files = list_files(ocr_dir, pattern="*_ocr.json", recursive=False)

# Strip only the trailing "_ocr" marker from each stem. A blanket
# str.replace would also remove "_ocr" from the middle of a document ID
# (e.g. "scan_ocr_test_ocr" -> "scan_test"), so that document would never
# match its label and would silently drop out of the report.
ocr_suffix = "_ocr"
ocr_doc_ids = [
    f.stem[:-len(ocr_suffix)] if f.stem.endswith(ocr_suffix) else f.stem
    for f in ocr_files
]

# Only analyze docs with both labels and OCR results.
# A set keeps the membership test O(1) per document.
ocr_doc_id_lookup = set(ocr_doc_ids)
available_doc_ids = [doc_id for doc_id in doc_ids if doc_id in ocr_doc_id_lookup]

print(f"Found {len(available_doc_ids)} documents with OCR results")

# Reset up front so the later `if report:` cells never see a stale report
# from a previous run of this cell if report generation fails part-way.
report = None

if available_doc_ids:
    # Generate report
    print("\nGenerating error report...")
    report = analyzer.generate_error_report(
        engine="paddle",
        doc_ids=available_doc_ids
    )

    # Save report. Whether save_report creates missing directories is not
    # visible from here, so make sure the target directory exists first;
    # this is a no-op when it already does.
    report_path = Path.cwd().parent / "reports" / "paddle_error_report.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    analyzer.save_report(report, str(report_path))
else:
    print("No OCR results available. Process images in notebook 02 first.")

## Visualize Field Accuracy

In [None]:
if report:
    # Per-field accuracies; the aggregate 'overall' entry is not charted.
    field_accuracy = report['field_accuracy']
    fields = [k for k in field_accuracy if k != 'overall']
    accuracies = [field_accuracy[k] for k in fields]

    # Colour bands: at/above target -> green, at/above warning -> orange, else red.
    target_accuracy = 0.85
    warning_accuracy = 0.70
    bar_colors = [
        'green' if a >= target_accuracy else 'orange' if a >= warning_accuracy else 'red'
        for a in accuracies
    ]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(fields, accuracies, color=bar_colors)
    plt.axhline(y=target_accuracy, color='r', linestyle='--', label='Target Threshold (85%)')
    # Leave headroom above 1.0: the value labels sit 0.02 above each bar, so
    # with an upper limit of exactly 1.0 the labels of near-perfect bars would
    # be drawn outside the axes and run into the title.
    plt.ylim(0, 1.1)
    plt.ylabel('Accuracy')
    plt.title('Field Extraction Accuracy - PaddleOCR')
    plt.xticks(rotation=45, ha='right')
    plt.legend()

    # Add value labels on bars
    for bar, acc in zip(bars, accuracies):
        plt.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.02,
                 f'{acc:.1%}', ha='center', va='bottom')

    # Lay out once everything (including the labels) has been added.
    plt.tight_layout()
    plt.show()
else:
    print("No report available for visualization")

## Analyze Error Patterns

In [None]:
# Flatten the report into one record per (document, field) pair.
all_errors = []
if report and report['detailed_errors']:
    all_errors = [
        {
            'doc_id': doc_error['doc_id'],
            'field': field,
            'similarity': error['similarity'],
            'is_correct': error['is_correct'],
        }
        for doc_error in report['detailed_errors']
        for field, error in doc_error['field_errors'].items()
    ]

# Guard on the flattened records rather than on the report alone: a report
# whose documents carry no field entries would otherwise produce a DataFrame
# without a 'field' column and fail with a KeyError below.
if all_errors:
    # Create DataFrame
    df_errors = pd.DataFrame(all_errors)

    # Summary statistics
    print("\nError Pattern Analysis:")
    print("="*60)
    print("\nSimilarity Distribution by Field:")
    print(df_errors.groupby('field')['similarity'].describe())

    print("\nError Count by Field:")
    # astype(bool) keeps `~` a logical negation even if the column was
    # inferred as object dtype (e.g. because of a missing value).
    incorrect_mask = ~df_errors['is_correct'].astype(bool)
    error_counts = df_errors[incorrect_mask].groupby('field').size()
    print(error_counts)

    # Visualize similarity distribution: one histogram per field. The grid is
    # sized from the number of fields so none is silently dropped (a fixed
    # 2x2 grid only ever showed the first four); four fields still yield the
    # same 2x2, 14x10 figure as before.
    fields = df_errors['field'].unique()
    n_cols = 2
    n_rows = (len(fields) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, field in zip(axes, fields):
        field_data = df_errors[df_errors['field'] == field]
        ax.hist(field_data['similarity'], bins=20, edgecolor='black')
        ax.set_title(f'{field} - Similarity Distribution')
        ax.set_xlabel('Similarity Score')
        ax.set_ylabel('Count')
        ax.axvline(x=0.85, color='r', linestyle='--', label='Threshold')
        ax.legend()

    # Hide any leftover panel (odd number of fields) instead of showing it empty.
    for ax in axes[len(fields):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No detailed errors available")

## Compare with Confidence Scores

In [None]:
# Pair each field result with its document's average OCR confidence.
confidence_data = []
if report and report['detailed_errors']:
    for doc_error in report['detailed_errors']:
        # The confidence is per document, so every field of that document
        # shares the same x-value in the scatter plot.
        avg_confidence = doc_error['avg_confidence']

        for field, error in doc_error['field_errors'].items():
            confidence_data.append({
                'confidence': avg_confidence,
                'similarity': error['similarity'],
                'is_correct': error['is_correct']
            })

# Guard on the collected records: an empty list would give a DataFrame
# without the columns used below.
if confidence_data:
    df_conf = pd.DataFrame(confidence_data)
    correct_mask = df_conf['is_correct'].astype(bool)

    # Scatter plot. Correct and incorrect points are drawn as two separately
    # labelled series so that the legend gets one entry per colour. Passing a
    # bare list of three labels to plt.legend() assigned them positionally to
    # the only two artists on the axes (one scatter, one line), which
    # mislabelled both and never showed a "Correct"/"Incorrect" distinction.
    plt.figure(figsize=(10, 6))
    for mask, color, label in (
        (correct_mask, 'green', 'Correct'),
        (~correct_mask, 'red', 'Incorrect'),
    ):
        subset = df_conf[mask]
        plt.scatter(subset['confidence'], subset['similarity'],
                    color=color, alpha=0.5, label=label)
    plt.xlabel('OCR Confidence Score')
    plt.ylabel('Field Extraction Similarity')
    plt.title('OCR Confidence vs Field Extraction Accuracy')
    plt.axhline(y=0.85, color='blue', linestyle='--', label='Similarity Threshold')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("No data available for confidence analysis")

## Key Findings Summary

In [None]:
if not report:
    print("No report available for summary")
else:
    # Printed summary of the report: overall accuracy, per-field ranking,
    # OCR statistics and a short list of recommendations.
    print("\n" + "=" * 80)
    print("KEY FINDINGS SUMMARY")
    print("=" * 80)

    print(f"\n1. Overall Accuracy: {report['field_accuracy']['overall']:.2%}")

    print("\n2. Field-Level Performance:")
    # Rank every entry by accuracy, best first; the aggregate 'overall'
    # entry takes part in the sort but is not printed.
    ranked = sorted(
        report['field_accuracy'].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for name, score in ranked:
        if name == 'overall':
            continue
        if score >= 0.85:
            marker = "✓"
        elif score >= 0.70:
            marker = "⚠"
        else:
            marker = "✗"
        print(f"   {marker} {name}: {score:.2%}")

    print(f"\n3. Average OCR Confidence: {report['summary']['avg_confidence']:.2%}")
    print(f"4. Average Blocks per Document: {report['summary']['avg_blocks_per_doc']:.1f}")

    print("\n5. Recommendations:")
    # Fields below the 85% target, in the report's own order.
    low_accuracy_fields = [
        name
        for name, score in report['field_accuracy'].items()
        if name != 'overall' and score < 0.85
    ]
    if not low_accuracy_fields:
        print("   - All fields meet accuracy threshold")
        print("   - Consider testing on real-world noisy documents")
    else:
        print(f"   - Focus on improving extraction for: {', '.join(low_accuracy_fields)}")
        print("   - Consider field-specific extraction patterns")
        print("   - Enhance preprocessing for better OCR quality")

    print("\n" + "=" * 80)

## Conclusion

This notebook analyzed OCR errors:
- Compared OCR results with ground truth labels
- Measured field-level accuracy
- Identified error patterns
- Correlated confidence scores with accuracy

Use these insights for Week 2 improvements!