# 📊 Model Evaluation - Performance Analysis

This notebook performs comprehensive evaluation of the trained sign language classifier.

## Objectives
- Load trained model and test data
- Generate predictions on test set
- Calculate detailed performance metrics
- Create confusion matrix visualization
- Analyze per-class performance
- Identify misclassifications
- Generate classification report

---

## 1. Import Libraries

In [None]:
# Standard-library and scientific-stack imports used by the cells below.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json
from datetime import datetime
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including any warnings the metric functions below might emit. Consider
# narrowing the filter if diagnostics are needed.
warnings.filterwarnings('ignore')

# TensorFlow and Keras (Keras is used below to load the saved model)
import tensorflow as tf
from tensorflow import keras

# Scikit-learn metrics
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score,
    roc_auc_score, cohen_kappa_score
)

# Confirmation message so the reader can see that the import cell finished.
print("‚úÖ Libraries imported successfully")

## 2. Load Model and Data

In [None]:
# Directories holding the preprocessed arrays and the saved model artifacts
PROCESSED_DIR = 'data/processed'
MODELS_DIR = 'models/saved_models'

# Read the test features and the test labels from the processed-data directory
X_test, y_test = (
    np.load(os.path.join(PROCESSED_DIR, array_file))
    for array_file in ('X_test.npy', 'y_test.npy')
)

# Report how many samples and features were loaded
divider = "="*60
print("\n" + divider, "TEST DATA LOADED", divider, sep="\n")
print(f"Test samples: {X_test.shape[0]}")
print(f"Features: {X_test.shape[1]}")
print(divider)

In [None]:
# Load the pickled label encoder.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load encoder files that were produced by this project.
encoder_path = 'models/label_encoder.pkl'
with open(encoder_path, mode='rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

# The encoder's classes_ attribute supplies the class names; its length is
# the number of classes.
class_names = label_encoder.classes_
num_classes = len(class_names)

print(
    f"\nClasses: {class_names}",
    f"Number of classes: {num_classes}",
    sep="\n",
)

In [None]:
# Load the saved Keras model from the saved-models directory
best_model_path = os.path.join(MODELS_DIR, 'best_model.keras')
model = keras.models.load_model(best_model_path)

# Summarise what was loaded: model name and total parameter count
print(
    "\n‚úÖ Model loaded successfully",
    f"   Model: {model.name}",
    f"   Total parameters: {model.count_params():,}",
    sep="\n",
)

## 3. Generate Predictions

In [None]:
# Run the model on the test features. y_pred_proba holds the model's raw
# outputs; y_pred is the index of the largest output in each row.
print("\nGenerating predictions...")
y_pred_proba = model.predict(X_test, verbose=0)
y_pred = y_pred_proba.argmax(axis=1)

print(
    "‚úÖ Predictions generated",
    f"   Prediction shape: {y_pred.shape}",
    f"   Probability shape: {y_pred_proba.shape}",
    sep="\n",
)

## 4. Overall Performance Metrics

In [None]:
# Precision, recall and F1 all use the same support-weighted averaging
weighted = {'average': 'weighted'}

# Headline scores on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
kappa = cohen_kappa_score(y_test, y_pred)

# Print the scores between two divider lines; the first four as percentages
divider = "="*60
print("\n" + divider, "OVERALL PERFORMANCE METRICS", divider, sep="\n")
print(
    f"Accuracy:  {accuracy*100:.2f}%",
    f"Precision: {precision*100:.2f}%",
    f"Recall:    {recall*100:.2f}%",
    f"F1-Score:  {f1*100:.2f}%",
    f"Cohen's Kappa: {kappa:.4f}",
    divider,
    sep="\n",
)

## 5. Confusion Matrix

In [None]:
# Generate confusion matrix.
# y_test / y_pred values are used elsewhere in this notebook as integer
# indices into class_names, so the full label set is 0..num_classes-1.
# Passing it explicitly keeps the matrix num_classes x num_classes, so rows and
# columns stay aligned with class_names even when a class is missing from both
# the test labels and the predictions.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(num_classes))

# savefig does not create missing folders, so make sure the target exists
# (matters on a fresh checkout where outputs/ has not been created yet).
os.makedirs('outputs/visualizations', exist_ok=True)

# Plot confusion matrix: raw counts, true labels on rows, predictions on columns
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
    linewidths=0.5,
    linecolor='gray'
)
plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
plt.ylabel('True Label', fontsize=12, fontweight='bold')
plt.title('Confusion Matrix - Test Set', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('outputs/visualizations/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Confusion matrix saved to outputs/visualizations/confusion_matrix.png")

In [None]:
# Normalized confusion matrix: every row is divided by its number of true
# samples, so each cell is the share of that true class given each prediction.
row_totals = cm.sum(axis=1, keepdims=True)

# A class with no true samples has a zero row total; dividing would produce
# NaN (the warning is hidden by the global warnings filter). Such rows are
# left at 0 instead.
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure(figsize=(12, 10))
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Proportion'},
    linewidths=0.5,
    linecolor='gray',
    vmin=0,
    vmax=1
)
plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
plt.ylabel('True Label', fontsize=12, fontweight='bold')
plt.title('Normalized Confusion Matrix - Test Set', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('outputs/visualizations/confusion_matrix_normalized.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Normalized confusion matrix saved")

## 6. Per-Class Performance

In [None]:
# Arguments shared by both report calls.
# y_test / y_pred values are used elsewhere in this notebook as integer
# indices into class_names, so the full label set is 0..num_classes-1.
# Pinning `labels` keeps target_names aligned with the labels; without it,
# scikit-learn raises ValueError when the number of classes present in
# y_test / y_pred differs from len(class_names).
report_kwargs = dict(labels=np.arange(num_classes), target_names=class_names)

# Dictionary form of the report, reused later for plotting and for the JSON export
report = classification_report(y_test, y_pred, output_dict=True, **report_kwargs)

# Convert to DataFrame: one row per class plus the summary rows
report_df = pd.DataFrame(report).transpose()

print("\n" + "="*60)
print("CLASSIFICATION REPORT")
print("="*60)
# Text form of the same report for display
print(classification_report(y_test, y_pred, **report_kwargs))
print("="*60)

In [None]:
# Per-class rows only: drop the last three (summary) rows of the report and
# keep its first three columns (precision / recall / f1-score).
metrics_df = report_df.iloc[:-3, :3]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One entry per panel: (column, y-axis label, title, fill colour, edge colour)
panel_specs = [
    ('precision', 'Precision', 'Precision by Class', 'skyblue', 'navy'),
    ('recall', 'Recall', 'Recall by Class', 'lightgreen', 'darkgreen'),
    ('f1-score', 'F1-Score', 'F1-Score by Class', 'lightcoral', 'darkred'),
]

# Draw the three bar charts with identical formatting
for panel, (column, y_label, panel_title, fill, edge) in zip(axes, panel_specs):
    panel.bar(metrics_df.index, metrics_df[column], color=fill, edgecolor=edge)
    panel.set_xlabel('Class', fontsize=11, fontweight='bold')
    panel.set_ylabel(y_label, fontsize=11, fontweight='bold')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_ylim([0, 1.1])
    panel.grid(axis='y', alpha=0.3)
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('outputs/visualizations/per_class_metrics.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Prediction Confidence Analysis

In [None]:
# Confidence of each prediction = the largest value in its output row
confidences = y_pred_proba.max(axis=1)

# Split the confidences by whether the prediction matched the true label
correct_mask = (y_pred == y_test)
correct_confidences = confidences[correct_mask]
incorrect_confidences = confidences[np.logical_not(correct_mask)]

n_total = len(y_test)
n_correct = len(correct_confidences)
n_incorrect = len(incorrect_confidences)
divider = "="*60

print("\nPrediction Confidence Analysis:")
print(divider)
print(f"Correct predictions:   {n_correct} ({n_correct/n_total*100:.1f}%)")
print(f"  Mean confidence: {correct_confidences.mean():.4f}")
print(f"  Std confidence:  {correct_confidences.std():.4f}")
print(f"\nIncorrect predictions: {n_incorrect} ({n_incorrect/n_total*100:.1f}%)")
# Statistics for the incorrect group are only printed when it is non-empty
if n_incorrect > 0:
    print(f"  Mean confidence: {incorrect_confidences.mean():.4f}")
    print(f"  Std confidence:  {incorrect_confidences.std():.4f}")
print(divider)

In [None]:
# Histograms of prediction confidence, one panel per outcome group
confidence_fig = plt.figure(figsize=(12, 5))

# One entry per panel: (subplot position, values, bar colour, title, mean-line colour)
histogram_specs = [
    (1, correct_confidences, 'green',
     'Confidence Distribution - Correct Predictions', 'red'),
]
# The right-hand panel is only added when there are incorrect predictions
if len(incorrect_confidences) > 0:
    histogram_specs.append(
        (2, incorrect_confidences, 'red',
         'Confidence Distribution - Incorrect Predictions', 'blue')
    )

for position, values, bar_color, hist_title, mean_color in histogram_specs:
    hist_ax = confidence_fig.add_subplot(1, 2, position)
    hist_ax.hist(values, bins=30, color=bar_color, alpha=0.7, edgecolor='black')
    hist_ax.set_xlabel('Confidence', fontsize=11)
    hist_ax.set_ylabel('Frequency', fontsize=11)
    hist_ax.set_title(hist_title, fontsize=12, fontweight='bold')
    # Dashed vertical line marks the group's mean confidence
    hist_ax.axvline(values.mean(), color=mean_color, linestyle='--', linewidth=2, label=f'Mean: {values.mean():.3f}')
    hist_ax.legend()
    hist_ax.grid(alpha=0.3)

confidence_fig.tight_layout()
confidence_fig.savefig('outputs/visualizations/confidence_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Misclassification Analysis

In [None]:
# Find misclassified samples
misclassified_indices = np.where(y_pred != y_test)[0]

if len(misclassified_indices) > 0:
    print(f"\nMisclassified Samples: {len(misclassified_indices)}")
    print("="*60)
    
    # Show first 10 misclassifications
    print("\nFirst 10 Misclassifications:")
    print(f"{'Index':<8} {'True':<10} {'Predicted':<10} {'Confidence':<12}")
    print("-" * 60)
    
    for idx in misclassified_indices[:10]:
        true_label = class_names[y_test[idx]]
        pred_label = class_names[y_pred[idx]]
        confidence = confidences[idx]
        print(f"{idx:<8} {true_label:<10} {pred_label:<10} {confidence:<12.4f}")
else:
    print("\nüéâ Perfect classification! No misclassifications found!")

## 9. Save Evaluation Results

In [None]:
# Read the clock once so the date stored inside the JSON and the timestamp in
# the file name always refer to the same moment.
evaluation_time = datetime.now()
timestamp = evaluation_time.strftime("%Y%m%d_%H%M%S")

# Create evaluation metadata (values are cast to built-in types for JSON)
evaluation_metadata = {
    'evaluation_date': evaluation_time.strftime("%Y-%m-%d %H:%M:%S"),
    'test_samples': int(len(y_test)),
    'num_classes': int(num_classes),
    'classes': class_names.tolist(),
    'overall_metrics': {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'cohen_kappa': float(kappa)
    },
    'per_class_metrics': report,
    'misclassified_count': int(len(misclassified_indices)),
    'correct_predictions': int(len(correct_confidences)),
    # The mean of an empty group is NaN, which json.dump would write as the
    # non-standard token NaN; store None (JSON null) for an empty group instead.
    'mean_confidence_correct': float(correct_confidences.mean()) if len(correct_confidences) > 0 else None,
    'mean_confidence_incorrect': float(incorrect_confidences.mean()) if len(incorrect_confidences) > 0 else None
}

# open() does not create missing folders, so make sure the target exists
# (matters on a fresh checkout where outputs/ has not been created yet).
os.makedirs('outputs/metrics', exist_ok=True)

# Save metadata
with open(f'outputs/metrics/evaluation_results_{timestamp}.json', 'w') as f:
    json.dump(evaluation_metadata, f, indent=4)

print("\n‚úÖ Evaluation results saved!")
print(f"   File: outputs/metrics/evaluation_results_{timestamp}.json")

In [None]:
# Write the raw confusion matrix to CSV, labelled with class names on both axes;
# the file name reuses the timestamp of the evaluation-results file.
cm_csv_path = f'outputs/metrics/confusion_matrix_{timestamp}.csv'
cm_df = pd.DataFrame(data=cm, index=class_names, columns=class_names)
cm_df.to_csv(cm_csv_path)

print("‚úÖ Confusion matrix saved as CSV")

## 10. Final Summary

In [None]:
# Values reused across the summary lines
divider = "="*60
n_evaluated = len(y_test)
n_correct = len(correct_confidences)
n_wrong = len(misclassified_indices)
f1_by_class = metrics_df['f1-score']

print("\n" + divider, "EVALUATION SUMMARY", divider, sep="\n")

# Headline scores
print(f"\nüìä Test Set Performance:")
print(f"   Samples evaluated: {n_evaluated}")
print(f"   Accuracy: {accuracy*100:.2f}%")
print(f"   Precision: {precision*100:.2f}%")
print(f"   Recall: {recall*100:.2f}%")
print(f"   F1-Score: {f1*100:.2f}%")

# Counts and shares of correct and incorrect predictions
print(f"\n‚úÖ Correct predictions: {n_correct} ({n_correct/n_evaluated*100:.1f}%)")
print(f"‚ùå Incorrect predictions: {n_wrong} ({n_wrong/n_evaluated*100:.1f}%)")

# Classes with the highest and lowest per-class F1
print(f"\nüìà Best performing class: {f1_by_class.idxmax()} (F1: {f1_by_class.max():.3f})")
print(f"üìâ Worst performing class: {f1_by_class.idxmin()} (F1: {f1_by_class.min():.3f})")

print("\n" + divider, "‚úÖ EVALUATION COMPLETE!", divider, sep="\n")
print("\nNext: Proceed to 06_realtime_detection.ipynb for live testing")

---

## 🎯 Summary

Model evaluation completed successfully!

### What Was Done:
- ✅ Generated predictions on test set
- ✅ Calculated comprehensive performance metrics
- ✅ Created confusion matrices (raw and normalized)
- ✅ Analyzed per-class performance
- ✅ Examined prediction confidence
- ✅ Identified misclassifications
- ✅ Saved evaluation results

### What's Next?
Proceed to **06_realtime_detection.ipynb** to:
- Test the model with live webcam feed
- Perform real-time sign language translation
- Evaluate real-world performance

---