# 13 - Error Analysis (Phase 3)

**Author:** Tan Ming Kai (24PMR12003)  
**Date:** 2025-11-24  
**Purpose:** Deep analysis of misclassifications for thesis Chapter 5

---

## Objectives

1. **Per-class performance breakdown** (Precision, Recall, F1, Support)
2. **Confusion matrix analysis** (which classes are confused with each other?)
3. **Medical metrics calculation** (Sensitivity, Specificity, PPV, NPV)
4. **Misclassification visualization** (visualize failed cases)
5. **Pattern identification** (common failure modes)

---

## Why Error Analysis?

Understanding **where and why** models fail is crucial for:
- Thesis discussion (Chapter 6)
- Clinical deployment considerations
- Future improvement directions
- Identifying systematic biases

---

In [None]:
# Standard imports
import os, sys, warnings, random
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import cv2
from PIL import Image
import timm

from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, 
    confusion_matrix, classification_report,
    roc_curve, auc, roc_auc_score
)

# Ignore every warning raised from here on.
# NOTE(review): this also hides warnings that may be relevant (e.g. from
# checkpoint loading or metric computation) - confirm this is intended.
warnings.filterwarnings('ignore')
# Select the Matplotlib style sheet used by every figure created below.
plt.style.use('seaborn-v0_8-darkgrid')

print("[OK] Imports complete")

In [None]:
# Configuration
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

# Project directories, expressed relative to the current working directory.
DATA_DIR = Path("../data/processed")
MODELS_DIR = Path("../experiments/phase2_systematic/models")
OUTPUT_DIR = Path("../experiments/phase3_analysis/error_analysis")
# Create the output directory (and missing parents); an existing one is fine.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Class display names; the position in this list is used as the class index
# when labels and predictions are translated back to names.
CLASS_NAMES = ['COVID', 'Normal', 'Lung_Opacity', 'Viral_Pneumonia']

CONFIG = dict(
    seed=42,  # Analyze best seed for each model
    batch_size=16,
    num_workers=0,
    # Per-channel statistics passed to transforms.Normalize.
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

print("[OK] Configuration set")

## 1. Load Test Data

In [None]:
# Dataset class
class COVID19Dataset(Dataset):
    """Image dataset backed by a dataframe of file paths and integer labels.

    The dataframe must provide a ``processed_path`` column (image file
    locations) and a ``label`` column (class indices). Each item is the tuple
    ``(image, label, path)`` so that callers can trace a prediction back to
    the file it came from.

    Args:
        dataframe: Table with ``processed_path`` and ``label`` columns.
        transform: Optional callable applied to the PIL image before it is
            returned.
    """

    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe.reset_index(drop=True)
        self.transform = transform
        # Read both columns from the re-indexed copy so every attribute is
        # derived from the same frame.
        self.image_paths = self.dataframe['processed_path'].values
        self.labels = self.dataframe['label'].values

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load, convert and transform the image at position ``idx``.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be
                decoded.
        """
        path = self.image_paths[idx]
        image = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image file: {path}")
        # OpenCV loads images as BGR; convert to RGB before building the PIL image.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)
        if self.transform:
            image = self.transform(image)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return image, label, path

# Read the test split and report its size and per-label counts
test_df = pd.read_csv(DATA_DIR / "test_processed.csv")
print(f"Test samples: {len(test_df):,}")
print(f"Class distribution:\n{test_df['label'].value_counts().sort_index()}")

# Deterministic preprocessing only (resize, tensor conversion, normalisation);
# no augmentation is applied at test time.
_test_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=CONFIG['mean'], std=CONFIG['std']),
]
test_transform = transforms.Compose(_test_steps)

test_dataset = COVID19Dataset(test_df, transform=test_transform)

# shuffle=False keeps the sample order fixed, so predictions collected for
# different models line up index by index.
test_loader = DataLoader(
    test_dataset,
    batch_size=CONFIG['batch_size'],
    shuffle=False,
    num_workers=CONFIG['num_workers'],
)

print("[OK] Test data loaded")

## 2. Get Predictions from Best Models

In [None]:
def get_predictions(model, loader, device):
    """
    Run ``model`` over every batch of ``loader`` and collect the results.

    The model is switched to evaluation mode and executed without gradient
    tracking. ``loader`` must yield ``(images, labels, paths)`` batches.

    Args:
        model: Network called as ``model(images)``; its output is treated as
            per-class scores along dimension 1.
        loader: Iterable of ``(images, labels, paths)`` batches.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        predictions, true_labels, image_paths, probabilities
        (NumPy array of predicted class indices, NumPy array of true labels,
        list of image paths, NumPy array of softmax probabilities).
    """
    model.eval()
    pred_buffer, label_buffer, path_buffer, prob_buffer = [], [], [], []

    with torch.no_grad():
        for batch_images, batch_labels, batch_paths in tqdm(loader, desc="Getting predictions"):
            scores = model(batch_images.to(device))
            # Softmax turns the raw scores into per-class probabilities.
            batch_probs = torch.softmax(scores, dim=1)
            # Index of the highest score per sample is the predicted class.
            batch_preds = scores.max(1)[1]

            pred_buffer.extend(batch_preds.cpu().numpy())
            label_buffer.extend(batch_labels.numpy())
            path_buffer.extend(batch_paths)
            prob_buffer.extend(batch_probs.cpu().numpy())

    predictions = np.array(pred_buffer)
    true_labels = np.array(label_buffer)
    probabilities = np.array(prob_buffer)
    return predictions, true_labels, path_buffer, probabilities

# Load best models (seed 42 for consistency)
def _timm_factory(arch_name):
    """Return a zero-argument builder for ``arch_name`` with 4 output classes."""
    def _build():
        # pretrained=False: the weights come from the checkpoint loaded below.
        return timm.create_model(arch_name, pretrained=False, num_classes=4)
    return _build


# Display name -> checkpoint file and a factory for the matching architecture.
models_to_analyze = {
    'CrossViT-Tiny': {
        'path': MODELS_DIR / 'crossvit' / 'crossvit_best_seed42.pth',
        'model_fn': _timm_factory('crossvit_tiny_240'),
    },
    'ResNet-50': {
        'path': MODELS_DIR / 'resnet50' / 'resnet50_best_seed42.pth',
        'model_fn': _timm_factory('resnet50'),
    },
    'DenseNet-121': {
        'path': MODELS_DIR / 'densenet121' / 'densenet121_best_seed42.pth',
        'model_fn': _timm_factory('densenet121'),
    },
}

# Evaluate each model on the test loader and keep its raw outputs by name.
predictions_dict = {}

for model_name, model_info in models_to_analyze.items():
    print(f"\nLoading {model_name}...")
    model = model_info['model_fn']()
    checkpoint_state = torch.load(model_info['path'], map_location=device)
    model.load_state_dict(checkpoint_state)
    model = model.to(device)

    preds, labels, paths, probs = get_predictions(model, test_loader, device)

    predictions_dict[model_name] = dict(
        predictions=preds,
        true_labels=labels,
        image_paths=paths,
        probabilities=probs,
    )

    acc = accuracy_score(labels, preds) * 100
    print(f"[OK] {model_name} accuracy: {acc:.2f}%")

print("\n[OK] All predictions obtained")

## 3. Per-Class Performance Analysis

In [None]:
# Print a per-class precision / recall / F1 / support table for each model
print("\nPER-CLASS PERFORMANCE ANALYSIS")
print("="*100)

for model_name, pred_data in predictions_dict.items():
    preds, labels = pred_data['predictions'], pred_data['true_labels']

    print(f"\n{model_name}")
    print("-"*100)

    # output_dict=True returns nested dicts instead of a formatted string;
    # zero_division=0 reports 0 for undefined ratios.
    report = classification_report(labels, preds, target_names=CLASS_NAMES,
                                   output_dict=True, zero_division=0)

    print(f"{'Class':<20s} {'Precision':<12s} {'Recall':<12s} {'F1-Score':<12s} {'Support'}")
    print("-"*70)

    for class_name in CLASS_NAMES:
        metrics = report[class_name]
        precision = metrics['precision']
        recall = metrics['recall']
        f1_score = metrics['f1-score']
        support = int(metrics['support'])
        print(f"{class_name:<20s} {precision:>6.2f}{'':6s} "
              f"{recall:>6.2f}{'':6s} {f1_score:>6.2f}{'':6s} "
              f"{support:>5d}")

    # Aggregate figures across all classes
    print("-"*70)
    summary_rows = (
        ('Overall Accuracy', report['accuracy']),
        ('Macro Avg F1', report['macro avg']['f1-score']),
        ('Weighted Avg F1', report['weighted avg']['f1-score']),
    )
    for row_label, row_value in summary_rows:
        print(f"{row_label:<20s} {row_value:.4f}")

print("\n[OK] Per-class analysis complete")

## 4. Confusion Matrix Comparison

In [None]:
# Plot confusion matrices side-by-side, one panel per analysed model
n_models = len(predictions_dict)
class_ids = list(range(len(CLASS_NAMES)))

# Size the grid from the number of models instead of assuming three.
# squeeze=False keeps `axes` an array even for a single model, so it can
# always be flattened and indexed by position.
fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)
axes = axes.flatten()

for idx, (model_name, pred_data) in enumerate(predictions_dict.items()):
    # Passing `labels` pins the matrix to one row/column per CLASS_NAMES entry,
    # so the tick labels stay aligned even if a class never occurs.
    cm = confusion_matrix(pred_data['true_labels'], pred_data['predictions'],
                          labels=class_ids)

    # Row-normalise for the colour scale only. Rows without any true samples
    # are divided by 1 (staying at 0) instead of producing NaN from 0/0.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    safe_totals = row_totals.copy()
    safe_totals[safe_totals == 0] = 1
    cm_normalized = cm.astype('float') / safe_totals

    # Colours show the row-normalised rate; annotations show the raw counts.
    sns.heatmap(cm_normalized, annot=cm, fmt='d', cmap='Blues',
                xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES,
                cbar=True, ax=axes[idx], vmin=0, vmax=1)

    axes[idx].set_ylabel('True Label', fontsize=11)
    axes[idx].set_xlabel('Predicted Label', fontsize=11)
    axes[idx].set_title(model_name, fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'confusion_matrices_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("[OK] Confusion matrices plotted")

## 5. Medical Metrics (Sensitivity, Specificity, PPV, NPV)

In [None]:
def calculate_medical_metrics(cm, class_idx):
    """
    Calculate medical metrics for binary classification (one-vs-rest).

    The multi-class confusion matrix is collapsed into a 2x2 problem in which
    ``class_idx`` is the positive class and all other classes are negative.
    Rows are true classes and columns are predicted classes.

    Args:
        cm: Square confusion matrix (multi-class); any array-like accepted by
            ``np.asarray``, e.g. a NumPy array or nested lists.
        class_idx: Index of the class to calculate metrics for

    Returns:
        sensitivity, specificity, ppv, npv as Python floats. A metric whose
        denominator is zero is reported as 0.0.

    Raises:
        ValueError: If ``cm`` is not a square two-dimensional matrix.
    """
    cm = np.asarray(cm)
    if cm.ndim != 2 or cm.shape[0] != cm.shape[1]:
        raise ValueError(
            f"cm must be a square 2-D confusion matrix, got shape {cm.shape}"
        )

    # Convert to binary (one-vs-rest)
    tp = cm[class_idx, class_idx]
    fn = cm[class_idx, :].sum() - tp   # rest of the true-class row
    fp = cm[:, class_idx].sum() - tp   # rest of the predicted-class column
    tn = cm.sum() - tp - fn - fp

    def _ratio(numerator, denominator):
        # Guard against empty denominators and always return a plain float.
        return float(numerator / denominator) if denominator > 0 else 0.0

    # Medical metrics
    sensitivity = _ratio(tp, tp + fn)  # Recall
    specificity = _ratio(tn, tn + fp)
    ppv = _ratio(tp, tp + fp)  # Precision
    npv = _ratio(tn, tn + fn)

    return sensitivity, specificity, ppv, npv

# Calculate for COVID class (most critical)
print("\nMEDICAL METRICS FOR COVID CLASS (One-vs-Rest)")
print("="*100)
print(f"{'Model':<20s} {'Sensitivity':<15s} {'Specificity':<15s} {'PPV':<15s} {'NPV'}")
print("-"*100)

# Derive the index from CLASS_NAMES instead of hard-coding it, so the table
# cannot silently report a different class.
covid_idx = CLASS_NAMES.index('COVID')
class_ids = list(range(len(CLASS_NAMES)))

for model_name, pred_data in predictions_dict.items():
    # Passing `labels` keeps one row/column per CLASS_NAMES entry even if a
    # class never occurs, so `covid_idx` always addresses the COVID row/column.
    cm = confusion_matrix(pred_data['true_labels'], pred_data['predictions'],
                          labels=class_ids)
    sens, spec, ppv, npv = calculate_medical_metrics(cm, covid_idx)

    print(f"{model_name:<20s} {sens:>6.2%}{'':9s} {spec:>6.2%}{'':9s} "
          f"{ppv:>6.2%}{'':9s} {npv:>6.2%}")

print("\nNote: Sensitivity = Recall, PPV = Precision")
print("High sensitivity crucial for COVID detection (minimize false negatives)")
print("High specificity crucial to avoid false alarms (minimize false positives)")

## 6. Identify Misclassified Cases

In [None]:
# Collect every test sample that CrossViT classified incorrectly
crossvit_data = predictions_dict['CrossViT-Tiny']
preds, labels, paths, probs = (
    crossvit_data[key]
    for key in ('predictions', 'true_labels', 'image_paths', 'probabilities')
)

# Positions where the predicted class differs from the true class
misclassified_mask = preds != labels
misclassified_indices = np.flatnonzero(misclassified_mask)

n_errors = len(misclassified_indices)
n_samples = len(labels)
print(f"\nTotal misclassifications: {n_errors} / {n_samples} "
      f"({n_errors/n_samples*100:.2f}%)")

# Gather one record per error; "confidence" is the probability the model
# assigned to the (wrong) class it predicted.
error_paths, error_true, error_pred, error_conf = [], [], [], []
for sample_pos in misclassified_indices:
    predicted_class = preds[sample_pos]
    error_paths.append(paths[sample_pos])
    error_true.append(CLASS_NAMES[labels[sample_pos]])
    error_pred.append(CLASS_NAMES[predicted_class])
    error_conf.append(probs[sample_pos, predicted_class])

# Most confident mistakes first (high confidence errors are interesting)
misclassified_df = pd.DataFrame({
    'image_path': error_paths,
    'true_label': error_true,
    'predicted_label': error_pred,
    'confidence': error_conf,
}).sort_values('confidence', ascending=False)

# Persist the full list for later inspection
misclassified_df.to_csv(OUTPUT_DIR / 'misclassified_cases.csv', index=False)
print("[OK] Misclassification details saved")

# Show top 10 high-confidence errors
print("\nTop 10 High-Confidence Misclassifications:")
print(misclassified_df.head(10))

## 7. Visualize Misclassified Samples

In [None]:
# Visualize up to 12 random misclassified samples
# Fixed seed so the same samples are drawn on every run.
np.random.seed(42)
sample_indices = np.random.choice(misclassified_indices, size=min(12, len(misclassified_indices)), replace=False)

fig, axes = plt.subplots(3, 4, figsize=(16, 12))
axes = axes.flatten()

for idx, sample_idx in enumerate(sample_indices):
    img_path = paths[sample_idx]
    img = cv2.imread(img_path)
    # OpenCV loads BGR; convert so matplotlib shows the correct colours.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    true_label = CLASS_NAMES[labels[sample_idx]]
    pred_label = CLASS_NAMES[preds[sample_idx]]
    # Probability the model assigned to the class it (wrongly) predicted.
    confidence = probs[sample_idx, preds[sample_idx]]

    axes[idx].imshow(img)
    axes[idx].axis('off')
    axes[idx].set_title(f"True: {true_label}\nPred: {pred_label} ({confidence:.1%})",
                       fontsize=10, color='red')

# With fewer than 12 errors the remaining panels would otherwise be drawn as
# empty framed axes; hide them.
for unused_ax in axes[len(sample_indices):]:
    unused_ax.axis('off')

plt.suptitle('Misclassified Samples (CrossViT-Tiny)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'misclassified_samples_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

print("[OK] Misclassification visualization saved")

## 8. Error Patterns Analysis

In [None]:
# Analyze which class pairs are most confused
print("\nERROR PATTERN ANALYSIS: Which classes are most confused?")
print("="*80)

# Passing `labels` guarantees one row/column per CLASS_NAMES entry, so the
# index loops below cannot run past the matrix when a class never occurs.
class_ids = list(range(len(CLASS_NAMES)))
cm = confusion_matrix(labels, preds, labels=class_ids)

# Find top confusions (off-diagonal)
confusions = []
for i, true_name in enumerate(CLASS_NAMES):
    true_total = cm[i, :].sum()
    for j, pred_name in enumerate(CLASS_NAMES):
        # cm[i, j] > 0 implies true_total > 0, so the division is safe.
        if i != j and cm[i, j] > 0:
            confusions.append({
                'True': true_name,
                'Predicted': pred_name,
                'Count': cm[i, j],
                'Percentage': cm[i, j] / true_total * 100
            })

# Explicit columns keep 'Count' present even when there are no confusions,
# so sorting an empty table does not raise KeyError.
confusion_df = pd.DataFrame(
    confusions, columns=['True', 'Predicted', 'Count', 'Percentage']
).sort_values('Count', ascending=False)

print(f"{'True Label':<20s} {'Predicted As':<20s} {'Count':<10s} {'% of True Class'}")
print("-"*80)
for _, row in confusion_df.head(10).iterrows():
    print(f"{row['True']:<20s} {row['Predicted']:<20s} {row['Count']:<10.0f} {row['Percentage']:.2f}%")

print("\n[OK] Error pattern analysis complete")

## 9. Summary for Thesis

### Key Findings

1. **Overall Performance:**
   - CrossViT-Tiny: X.XX% accuracy
   - Misclassified XX / XXXX samples (X.XX%)

2. **Per-Class Performance:**
   - **Best:** [Class with highest F1]
   - **Worst:** [Class with lowest F1]
   - **COVID Detection:** Sensitivity = X.XX%, Specificity = X.XX%

3. **Common Errors:**
   - Most frequent confusion: [Class A] → [Class B]
   - High-confidence errors: XX cases (may indicate systematic bias)

4. **Clinical Implications:**
   - False negatives (COVID missed): XX cases - **HIGH RISK**
   - False positives (COVID over-diagnosed): XX cases - Lower risk

---

In [None]:
# Closing banner followed by the list of generated artefacts and next steps
_rule = "="*80
print("\n" + _rule)
print("ERROR ANALYSIS COMPLETE")
print(_rule)

_closing_lines = (
    "\nGenerated files:",
    "1. confusion_matrices_comparison.png - Side-by-side confusion matrices",
    "2. misclassified_cases.csv - Detailed list of all errors",
    "3. misclassified_samples_visualization.png - Visual inspection of failures",
    "\nNext steps:",
    "1. Review misclassified cases for patterns",
    "2. Discuss clinical implications in thesis Chapter 6",
    "3. Proceed to 14_ablation_studies.ipynb for hypothesis testing",
)
for _line in _closing_lines:
    print(_line)