# EEG Person Identification - Part 3: Performance Analysis & Visualization

## Overview
This notebook provides a comprehensive analysis of the trained CNN+RNN model:
1. Load trained model and test results
2. Detailed confusion matrix analysis
3. Per-subject performance breakdown
4. Feature visualization with t-SNE
5. Error analysis and discussion
6. Discussion of results, model insights, and a final summary report

In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle
import h5py
import json
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import tensorflow as tf
from tensorflow import keras

# Metrics and Visualization
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    accuracy_score, f1_score, precision_recall_fscore_support
)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Set random seeds
np.random.seed(42)

# Configure matplotlib
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

print("Libraries imported successfully!")

## 1. Load Model and Data

In [None]:
# Define paths
DATA_FILE = 'data/processed/preprocessed_data.h5'
MODEL_DIR = 'models'
FIGURES_DIR = 'figures'

# Create figures directory
os.makedirs(FIGURES_DIR, exist_ok=True)

# Find the most recent model.
# NOTE(review): "most recent" is decided by lexicographic filename order, which
# is only chronological if the training notebook puts a sortable timestamp in
# the filename -- confirm against 2_model_training.ipynb.
model_files = [f for f in os.listdir(MODEL_DIR) if f.endswith('.keras')]
if not model_files:
    raise FileNotFoundError("No trained model found. Please run 2_model_training.ipynb first.")

latest_model = sorted(model_files)[-1]
model_path = os.path.join(MODEL_DIR, latest_model)
# Swap only the trailing extension; str.replace would also rewrite any other
# '.keras' occurrence in the path (e.g. in a directory name).
results_file = os.path.splitext(model_path)[0] + '_results.json'

print(f"Loading model: {latest_model}")

# Load model
model = keras.models.load_model(model_path)
print("Model loaded successfully!")

# Load results if available (the JSON sits next to the model file)
if os.path.exists(results_file):
    with open(results_file, 'r') as f:
        results = json.load(f)
    print(f"\nPrevious Results:")
    print(f"  Test Accuracy: {results['test_metrics']['accuracy']:.4f}")
    print(f"  F1-Score (Macro): {results['test_metrics']['f1_macro']:.4f}")

# Load test data
print("\nLoading test data...")
with h5py.File(DATA_FILE, 'r') as f:
    X_test = f['X_test'][:]
    y_test = f['y_test'][:]
    # h5py hands attributes back as NumPy scalars. Convert to a plain int so
    # the value can later be written with json.dump, which rejects np.int64.
    n_subjects = int(f.attrs['n_subjects'])

# Add channel dimension if needed
# NOTE(review): assumes a 4-D array is missing its trailing channel axis --
# confirm against the model's expected input shape.
if X_test.ndim == 4:
    X_test = X_test[..., np.newaxis]

print(f"Test data shape: {X_test.shape}")
print(f"Number of subjects: {n_subjects}")

## 2. Generate Predictions

In [None]:
# Generate predictions: class probabilities per sample, then the arg-max label
print("Generating predictions on test set...")
y_pred_probs = model.predict(X_test, batch_size=32, verbose=1)
y_pred = np.argmax(y_pred_probs, axis=1)

# Get top-5 predictions for each sample.
# argsort is ascending, so take the last five columns and reverse them to get
# the five most probable classes in descending order of probability.
top5_pred = np.argsort(y_pred_probs, axis=1)[:, -5:][:, ::-1]

# Calculate top-5 accuracy: a sample counts as correct when its true label
# appears anywhere in its top-5 row. Comparing against y_test as a column
# vector checks every row at once instead of looping in Python.
top5_correct = (top5_pred == y_test.reshape(-1, 1)).any(axis=1)
top5_accuracy = top5_correct.mean()

print(f"\nPredictions complete!")
print(f"  Test samples: {len(y_test)}")
print(f"  Top-5 Accuracy: {top5_accuracy:.4f} ({top5_accuracy*100:.2f}%)")

## 3. Overall Performance Metrics

In [None]:
# Headline scores over the whole test set
accuracy = accuracy_score(y_test, y_pred)
f1_macro, f1_micro, f1_weighted = (
    f1_score(y_test, y_pred, average=averaging, zero_division=0)
    for averaging in ('macro', 'micro', 'weighted')
)

# Per-class arrays; with no `labels` argument sklearn returns one entry per
# label that occurs in y_test or y_pred, in sorted order.
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred, zero_division=0, average=None
)

# Text report
rule = "=" * 70
print(rule)
print("OVERALL PERFORMANCE METRICS")
print(rule)
print(f"\nClassification Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Top-5 Accuracy: {top5_accuracy:.4f} ({top5_accuracy*100:.2f}%)")
print(f"\nF1-Score Metrics:")
print(f"  Macro Average: {f1_macro:.4f}")
print(f"  Micro Average: {f1_micro:.4f}")
print(f"  Weighted Average: {f1_weighted:.4f}")
print(f"\nPer-Class Statistics:")
print(f"  Mean Precision: {precision.mean():.4f} ± {precision.std():.4f}")
print(f"  Mean Recall: {recall.mean():.4f} ± {recall.std():.4f}")
print(f"  Mean F1-Score: {f1.mean():.4f} ± {f1.std():.4f}")
print(rule)

# Three-panel figure: accuracy bars, F1 bars, per-class F1 histogram
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# The two bar panels differ only in their data and captions, so describe each
# one as a tuple and draw them with the same code.
bar_panels = (
    (axes[0], ['Top-1\nAccuracy', 'Top-5\nAccuracy'], [accuracy, top5_accuracy],
     ['#3498db', '#2ecc71'], 'Score', 'Accuracy Metrics'),
    (axes[1], ['Macro', 'Micro', 'Weighted'], [f1_macro, f1_micro, f1_weighted],
     ['#e74c3c', '#f39c12', '#9b59b6'], 'F1-Score', 'F1-Score Variants'),
)
for panel, bar_names, bar_values, palette, y_caption, heading in bar_panels:
    panel.bar(bar_names, bar_values, color=palette, alpha=0.7, edgecolor='black', linewidth=2)
    panel.set_ylabel(y_caption, fontsize=12, fontweight='bold')
    panel.set_title(heading, fontsize=14, fontweight='bold')
    panel.set_ylim([0, 1])
    panel.grid(axis='y', alpha=0.3)
    # Print each bar's value just above it
    for position, value in enumerate(bar_values):
        panel.text(position, value + 0.02, f'{value:.3f}', ha='center', fontsize=12, fontweight='bold')

# Histogram of the per-class F1 values with the mean marked
hist_panel = axes[2]
hist_panel.hist(f1, bins=30, alpha=0.7, edgecolor='black', color='#16a085')
hist_panel.axvline(f1.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {f1.mean():.3f}')
hist_panel.set_xlabel('F1-Score', fontsize=12, fontweight='bold')
hist_panel.set_ylabel('Frequency', fontsize=12, fontweight='bold')
hist_panel.set_title('Distribution of Per-Subject F1-Scores', fontsize=14, fontweight='bold')
hist_panel.legend(fontsize=10)
hist_panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, 'overall_metrics.png'), dpi=150, bbox_inches='tight')
plt.show()

## 4. Detailed Confusion Matrix Analysis

In [None]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
# With no `labels` argument, row/column k of `cm` belongs to the k-th entry of
# the sorted union of true and predicted labels -- not necessarily to subject
# ID k. Keep that mapping so positions can be translated back to subject IDs.
cm_labels = np.union1d(y_test, y_pred)

# Row-normalise. A label that was only ever predicted (never true) has an
# all-zero row; leave that row at 0 instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm, row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals > 0,
)

# Plot confusion matrix with better visualization
fig, axes = plt.subplots(1, 2, figsize=(24, 10))

# Absolute counts
im1 = axes[0].imshow(cm, cmap='Blues', aspect='auto', interpolation='nearest')
axes[0].set_xlabel('Predicted Subject ID', fontsize=14, fontweight='bold')
axes[0].set_ylabel('True Subject ID', fontsize=14, fontweight='bold')
axes[0].set_title('Confusion Matrix (Absolute Counts)', fontsize=16, fontweight='bold')
cbar1 = plt.colorbar(im1, ax=axes[0])
cbar1.set_label('Count', fontsize=12)

# Normalized (percentages)
im2 = axes[1].imshow(cm_normalized, cmap='RdYlGn', aspect='auto',
                     interpolation='nearest', vmin=0, vmax=1)
axes[1].set_xlabel('Predicted Subject ID', fontsize=14, fontweight='bold')
axes[1].set_ylabel('True Subject ID', fontsize=14, fontweight='bold')
axes[1].set_title('Confusion Matrix (Normalized)', fontsize=16, fontweight='bold')
cbar2 = plt.colorbar(im2, ax=axes[1])
cbar2.set_label('Proportion', fontsize=12)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, 'confusion_matrix_detailed.png'), dpi=150, bbox_inches='tight')
plt.show()

# Analyze most confused pairs
print("\nMost Confused Subject Pairs:")
print("="*60)
# Zero the diagonal so only misclassifications remain, then collect every
# non-zero cell as (true subject, predicted subject, count). argwhere walks the
# matrix in row-major order, the same order a nested i/j loop would produce.
off_diagonal = cm.copy()
np.fill_diagonal(off_diagonal, 0)
confusion_pairs = [
    (int(cm_labels[row]), int(cm_labels[col]), int(off_diagonal[row, col]))
    for row, col in np.argwhere(off_diagonal > 0)
]

# Most frequent confusions first (the sort is stable, so ties keep matrix order)
confusion_pairs.sort(key=lambda pair: pair[2], reverse=True)
for rank, (true_sub, pred_sub, count) in enumerate(confusion_pairs[:10], start=1):
    print(f"{rank}. Subject {true_sub} → Subject {pred_sub}: {count} times")
print("="*60)

## 5. Per-Subject Performance Analysis

In [None]:
# Create detailed per-subject analysis
# `precision`, `recall`, `f1`, `support` and `cm` were computed without an
# explicit `labels` argument, so their k-th entry belongs to the k-th label in
# the sorted union of true and predicted labels. Translate subject IDs to that
# position instead of indexing by the ID itself, which is misaligned (or out of
# range) as soon as one subject is missing from the test split.
present_labels = np.union1d(y_test, y_pred)
row_of_label = {int(label): row for row, label in enumerate(present_labels)}

subject_metrics = []
for subject_id in range(n_subjects):
    row = row_of_label.get(subject_id)
    # Skip subjects that have no test samples (absent entirely, or only ever
    # predicted and never true).
    if row is None or support[row] == 0:
        continue

    n_samples = int(support[row])
    n_correct = int(cm[row, row])
    subject_metrics.append({
        'Subject_ID': subject_id,
        'Test_Samples': n_samples,
        'Precision': precision[row],
        'Recall': recall[row],
        'F1_Score': f1[row],
        'Correct_Predictions': n_correct,
        'Accuracy': n_correct / n_samples
    })

# Create DataFrame
df_metrics = pd.DataFrame(subject_metrics)

# Summary statistics
print("\nPer-Subject Performance Summary:")
print("="*70)
print(df_metrics.describe())
print("="*70)

# Identify best and worst performing subjects (ranked by F1)
best_subjects = df_metrics.nlargest(10, 'F1_Score')
worst_subjects = df_metrics.nsmallest(10, 'F1_Score')

print("\nTop 10 Best Performing Subjects:")
print(best_subjects.to_string(index=False))

print("\nTop 10 Worst Performing Subjects:")
print(worst_subjects.to_string(index=False))

# 2x2 dashboard of the per-subject table built above
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
ax_f1, ax_pr = axes[0, 0], axes[0, 1]
ax_size, ax_box = axes[1, 0], axes[1, 1]

# Top-left: one F1 bar per subject, with the mean drawn as a reference line
mean_f1 = df_metrics['F1_Score'].mean()
ax_f1.bar(df_metrics['Subject_ID'], df_metrics['F1_Score'],
          alpha=0.7, edgecolor='black', color='steelblue')
ax_f1.axhline(mean_f1, color='red', linestyle='--', linewidth=2,
              label=f'Mean: {mean_f1:.3f}')
ax_f1.set_xlabel('Subject ID', fontsize=12, fontweight='bold')
ax_f1.set_ylabel('F1-Score', fontsize=12, fontweight='bold')
ax_f1.set_title('F1-Score per Subject', fontsize=14, fontweight='bold')
ax_f1.legend(fontsize=10)
ax_f1.grid(axis='y', alpha=0.3)

# Top-right: precision against recall, each point coloured by its F1
pr_points = ax_pr.scatter(df_metrics['Recall'], df_metrics['Precision'],
                          alpha=0.6, s=100, c=df_metrics['F1_Score'],
                          cmap='viridis', edgecolors='black', linewidth=1)
ax_pr.plot([0, 1], [0, 1], 'r--', alpha=0.3, linewidth=2)  # precision == recall diagonal
ax_pr.set_xlabel('Recall', fontsize=12, fontweight='bold')
ax_pr.set_ylabel('Precision', fontsize=12, fontweight='bold')
ax_pr.set_title('Precision vs Recall (colored by F1)', fontsize=14, fontweight='bold')
ax_pr.grid(alpha=0.3)
cbar = plt.colorbar(pr_points, ax=ax_pr)
cbar.set_label('F1-Score', fontsize=10)

# Bottom-left: does accuracy depend on how many test samples a subject has?
ax_size.scatter(df_metrics['Test_Samples'], df_metrics['Accuracy'],
                alpha=0.6, s=100, c='coral', edgecolors='black', linewidth=1)
ax_size.set_xlabel('Number of Test Samples', fontsize=12, fontweight='bold')
ax_size.set_ylabel('Accuracy', fontsize=12, fontweight='bold')
ax_size.set_title('Accuracy vs Sample Size', fontsize=14, fontweight='bold')
ax_size.grid(alpha=0.3)

# Bottom-right: box plots comparing the spread of the three metrics
metrics_to_plot = ['Precision', 'Recall', 'F1_Score']
df_metrics[metrics_to_plot].boxplot(ax=ax_box, patch_artist=True)
ax_box.set_ylabel('Score', fontsize=12, fontweight='bold')
ax_box.set_title('Distribution of Performance Metrics', fontsize=14, fontweight='bold')
ax_box.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, 'per_subject_analysis.png'), dpi=150, bbox_inches='tight')
plt.show()

# Persist the table next to the figures
metrics_csv_path = os.path.join(FIGURES_DIR, 'per_subject_metrics.csv')
df_metrics.to_csv(metrics_csv_path, index=False)
print(f"\nPer-subject metrics saved to: {metrics_csv_path}")

## 6. Feature Visualization with t-SNE

Visualize the learned representations using t-SNE dimensionality reduction.

In [None]:
# Build a second model that shares the trained network's input but stops at an
# intermediate layer, so predict() returns that layer's activations.
print("Extracting features from model...")
# NOTE(review): index -3 is the third layer from the end; whether that is the
# layer feeding the final dense layer depends on the architecture defined in
# the training notebook -- confirm.
embedding_layer = model.layers[-3]
feature_extractor = keras.Model(inputs=model.input, outputs=embedding_layer.output)

# Use only a few test samples per subject to keep the visualization fast.
# Subjects with fewer samples than the quota contribute all they have.
n_samples_per_subject = 5
sample_indices = []
for subject_id in range(n_subjects):
    candidates = np.nonzero(y_test == subject_id)[0]
    if len(candidates) < n_samples_per_subject:
        chosen = candidates
    else:
        # Drawn from NumPy's global random state
        chosen = np.random.choice(candidates, n_samples_per_subject, replace=False)
    sample_indices.extend(chosen)

X_sample = X_test[sample_indices]
y_sample = y_test[sample_indices]

print(f"Extracting features for {len(X_sample)} samples...")
features = feature_extractor.predict(X_sample, batch_size=32, verbose=1)
print(f"Feature shape: {features.shape}")

In [None]:
# Apply PCA first to reduce dimensionality (faster t-SNE)
print("\nApplying PCA for initial dimensionality reduction...")
# PCA needs a 2-D (samples, features) matrix; flatten any extra axes the
# extracted layer may have. For features that are already 2-D this is a no-op.
features_2d = features.reshape(len(features), -1)
n_points, n_dims = features_2d.shape

# PCA rejects n_components larger than min(n_samples, n_features), so cap the
# target of 50 for small samples or narrow feature layers.
n_pca_components = min(50, n_points, n_dims)
pca = PCA(n_components=n_pca_components, random_state=42)
features_pca = pca.fit_transform(features_2d)
print(f"PCA variance explained: {pca.explained_variance_ratio_.sum():.3f}")

# Apply t-SNE
print("\nApplying t-SNE (this may take several minutes)...")
# t-SNE requires perplexity < n_samples, so shrink it for very small samples.
tsne_perplexity = min(30, max(n_points - 1, 1))
# The iteration count is left at the library default (1000, the value used
# before): its keyword was renamed from `n_iter` to `max_iter` in scikit-learn
# 1.5, so passing either name explicitly breaks on some installed versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity, verbose=1)
features_tsne = tsne.fit_transform(features_pca)
print("t-SNE complete!")

In [None]:
# Visualize t-SNE embedding
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Plot 1: Colored by subject (with legend for subset)
unique_subjects = np.unique(y_sample)
# Only the first 20 subjects get a legend entry, and tab20 has exactly 20
# colours. Index the colormap with integers so each of them gets its own
# colour; spreading the colormap over *all* subjects and then using only the
# first 20 entries would leave them sharing just a few neighbouring colours.
n_legend_subjects = min(20, len(unique_subjects))
colors = plt.cm.tab20(np.arange(n_legend_subjects))

for idx, subject_id in enumerate(unique_subjects[:n_legend_subjects]):  # Show first 20 in legend
    mask = (y_sample == subject_id)
    axes[0].scatter(features_tsne[mask, 0], features_tsne[mask, 1],
                   c=[colors[idx]], label=f'Subject {subject_id}',
                   alpha=0.6, s=50, edgecolors='black', linewidth=0.5)

# Plot remaining subjects without legend (colours come from the default cycle)
for subject_id in unique_subjects[n_legend_subjects:]:
    mask = (y_sample == subject_id)
    axes[0].scatter(features_tsne[mask, 0], features_tsne[mask, 1],
                   alpha=0.4, s=50, edgecolors='black', linewidth=0.5)

axes[0].set_xlabel('t-SNE Dimension 1', fontsize=12, fontweight='bold')
axes[0].set_ylabel('t-SNE Dimension 2', fontsize=12, fontweight='bold')
axes[0].set_title('t-SNE Visualization of Learned Features\n(Colored by Subject)',
                 fontsize=14, fontweight='bold')
axes[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8, ncol=2)
axes[0].grid(alpha=0.3)

# Plot 2: Heatmap visualization
# Colour every point by a Gaussian kernel density estimate evaluated at the
# point itself, so dense regions of the embedding stand out.
from scipy.stats import gaussian_kde
xy = features_tsne.T
z = gaussian_kde(xy)(xy)

scatter = axes[1].scatter(features_tsne[:, 0], features_tsne[:, 1],
                         c=z, s=50, cmap='viridis', alpha=0.6,
                         edgecolors='black', linewidth=0.5)
axes[1].set_xlabel('t-SNE Dimension 1', fontsize=12, fontweight='bold')
axes[1].set_ylabel('t-SNE Dimension 2', fontsize=12, fontweight='bold')
axes[1].set_title('t-SNE Visualization\n(Density Heatmap)',
                 fontsize=14, fontweight='bold')
plt.colorbar(scatter, ax=axes[1], label='Density')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, 'tsne_visualization.png'), dpi=150, bbox_inches='tight')
plt.show()

print("\nt-SNE visualization saved!")

## 7. Error Analysis

Analyze misclassified samples to understand model limitations.

In [None]:
# Find misclassified samples
misclassified_mask = (y_test != y_pred)
misclassified_indices = np.where(misclassified_mask)[0]
correctly_classified_mask = (y_test == y_pred)

print(f"Error Analysis:")
print("="*70)
print(f"Total test samples: {len(y_test)}")
print(f"Correctly classified: {correctly_classified_mask.sum()} ({correctly_classified_mask.sum()/len(y_test)*100:.2f}%)")
print(f"Misclassified: {len(misclassified_indices)} ({len(misclassified_indices)/len(y_test)*100:.2f}%)")
print("="*70)

# Analyze confidence of predictions (confidence = probability of the predicted class)
pred_confidence = y_pred_probs.max(axis=1)
correct_confidence = pred_confidence[correctly_classified_mask]
incorrect_confidence = pred_confidence[misclassified_mask]

# NOTE: if one of the two groups is empty its mean/std print as nan.
print(f"\nPrediction Confidence:")
print(f"  Correct predictions - Mean: {correct_confidence.mean():.3f}, Std: {correct_confidence.std():.3f}")
print(f"  Incorrect predictions - Mean: {incorrect_confidence.mean():.3f}, Std: {incorrect_confidence.std():.3f}")

# Visualize confidence distributions
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Confidence histogram
axes[0].hist(correct_confidence, bins=50, alpha=0.7, label='Correct', color='green', edgecolor='black')
axes[0].hist(incorrect_confidence, bins=50, alpha=0.7, label='Incorrect', color='red', edgecolor='black')
axes[0].set_xlabel('Prediction Confidence', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Frequency', fontsize=12, fontweight='bold')
axes[0].set_title('Prediction Confidence Distribution', fontsize=14, fontweight='bold')
axes[0].legend(fontsize=11)
axes[0].grid(axis='y', alpha=0.3)

# Confidence vs correctness: accuracy within ten equal-width confidence bins
bins = np.linspace(0, 1, 11)
bin_centers = (bins[:-1] + bins[1:]) / 2

# np.histogram uses half-open bins [lo, hi) except for the last one, which is
# closed, so predictions with confidence exactly 1.0 are counted in the top
# bin instead of being dropped. Clipping guards against values that land a
# rounding error outside [0, 1].
binned_confidence = np.clip(pred_confidence, 0.0, 1.0)
total_by_conf, _ = np.histogram(binned_confidence, bins=bins)
correct_by_conf, _ = np.histogram(binned_confidence[correctly_classified_mask], bins=bins)

# Accuracy is undefined for bins that received no predictions; mark those as
# NaN and leave them off the curve rather than drawing them as 0% accuracy.
populated_bins = total_by_conf > 0
accuracy_by_conf = np.full(len(bin_centers), np.nan)
accuracy_by_conf[populated_bins] = correct_by_conf[populated_bins] / total_by_conf[populated_bins]

axes[1].plot(bin_centers[populated_bins], accuracy_by_conf[populated_bins],
             marker='o', linewidth=2, markersize=8, color='steelblue')
axes[1].plot([0, 1], [0, 1], 'r--', alpha=0.5, linewidth=2, label='Perfect Calibration')
axes[1].set_xlabel('Prediction Confidence', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Accuracy', fontsize=12, fontweight='bold')
axes[1].set_title('Model Calibration Curve', fontsize=14, fontweight='bold')
axes[1].legend(fontsize=11)
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, 'error_analysis.png'), dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Analyze error patterns by subject
# One slot per subject ID. Subjects without any test sample stay NaN: their
# error rate is unknown, and recording it as 0 would both draw a misleading
# "perfect" bar and pull the reported mean error rate down.
error_rate_by_subject = np.full(n_subjects, np.nan)
for subject_id in range(n_subjects):
    subject_mask = (y_test == subject_id)
    n_subject_samples = subject_mask.sum()
    if n_subject_samples > 0:
        n_subject_errors = (y_pred[subject_mask] != subject_id).sum()
        error_rate_by_subject[subject_id] = n_subject_errors / n_subject_samples

# Restrict the plot and the mean to subjects that were actually evaluated
evaluated = ~np.isnan(error_rate_by_subject)
evaluated_ids = np.arange(n_subjects)[evaluated]
evaluated_rates = error_rate_by_subject[evaluated]
mean_error_rate = evaluated_rates.mean() if evaluated.any() else float('nan')

# Plot error rates
plt.figure(figsize=(16, 6))
plt.bar(evaluated_ids, evaluated_rates, alpha=0.7, edgecolor='black', color='coral')
plt.axhline(mean_error_rate, color='red', linestyle='--',
            linewidth=2, label=f'Mean Error Rate: {mean_error_rate:.3f}')
plt.xlabel('Subject ID', fontsize=12, fontweight='bold')
plt.ylabel('Error Rate', fontsize=12, fontweight='bold')
plt.title('Error Rate per Subject', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, 'error_rate_by_subject.png'), dpi=150, bbox_inches='tight')
plt.show()

## 8. Discussion and Insights

In [None]:
# Generate comprehensive discussion report
# The report is a single f-string filled in from the metrics computed in the
# earlier cells; the qualitative words (strong/moderate/weak, ...) are chosen
# by the fixed thresholds written inline below.

# A model with no misclassifications has no confused pair at all, so build
# this fragment up front instead of indexing confusion_pairs[0] unguarded.
if confusion_pairs:
    top_true_subject, top_pred_subject, top_pair_count = confusion_pairs[0]
    most_confused_text = (
        f"Subject {top_true_subject} → Subject {top_pred_subject} \n"
        f"     ({top_pair_count} times)"
    )
else:
    most_confused_text = "none (no misclassifications)"

discussion = f"""
{'='*80}
PERFORMANCE ANALYSIS AND DISCUSSION
{'='*80}

1. OVERALL MODEL PERFORMANCE
   - Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)
   - Top-5 Accuracy: {top5_accuracy:.4f} ({top5_accuracy*100:.2f}%)
   - F1-Score (Macro): {f1_macro:.4f}
   
   The model demonstrates {'strong' if accuracy > 0.8 else 'moderate' if accuracy > 0.6 else 'weak'} 
   performance in identifying individuals from EEG patterns.

2. PER-SUBJECT VARIABILITY
   - Best performing subject F1-score: {df_metrics['F1_Score'].max():.4f}
   - Worst performing subject F1-score: {df_metrics['F1_Score'].min():.4f}
   - Standard deviation of F1-scores: {df_metrics['F1_Score'].std():.4f}
   
   {'High' if df_metrics['F1_Score'].std() > 0.15 else 'Moderate' if df_metrics['F1_Score'].std() > 0.08 else 'Low'} 
   variability across subjects suggests that some individuals have more distinctive 
   EEG patterns than others.

3. CONFUSION PATTERNS
   - Total misclassifications: {len(misclassified_indices)}
   - Most confused pair: {most_confused_text}
   
   The confusion matrix reveals specific subject pairs with similar EEG signatures.

4. MODEL CONFIDENCE AND CALIBRATION
   - Mean confidence (correct): {correct_confidence.mean():.3f}
   - Mean confidence (incorrect): {incorrect_confidence.mean():.3f}
   
   {'Good' if correct_confidence.mean() - incorrect_confidence.mean() > 0.15 else 'Moderate'} 
   separation between correct and incorrect prediction confidences indicates 
   {'well' if correct_confidence.mean() - incorrect_confidence.mean() > 0.15 else 'moderately'}-calibrated predictions.

5. KEY INSIGHTS
   a) The CNN+RNN hybrid architecture effectively captures both spatial-spectral 
      features (via CNN) and temporal dependencies (via LSTM) in EEG signals.
   
   b) Top-5 accuracy of {top5_accuracy*100:.2f}% suggests the model narrows down 
      candidates effectively, useful for identification in constrained scenarios.
   
   c) t-SNE visualization shows {'clear' if accuracy > 0.8 else 'some'} clustering 
      of subjects in the learned feature space, validating that the model learns 
      discriminative representations.

6. POTENTIAL IMPROVEMENTS
   - Data augmentation (time-warping, frequency masking)
   - Attention mechanisms to focus on discriminative time/frequency regions
   - Multi-session training to improve cross-session generalization
   - Subject-specific fine-tuning for difficult cases
   - Ensemble methods combining multiple model architectures

7. PRACTICAL APPLICATIONS
   - Biometric authentication using EEG signals
   - Security systems requiring brainwave-based identification
   - Research on individual differences in brain activity patterns
   - Quality control in neuroscience studies (subject identification)

{'='*80}
"""

print(discussion)

# Save discussion to file
# The text contains a non-ASCII arrow, so fix the encoding explicitly; the
# platform default (e.g. cp1252 on Windows) cannot encode it and would raise
# UnicodeEncodeError.
discussion_path = os.path.join(FIGURES_DIR, 'performance_discussion.txt')
with open(discussion_path, 'w', encoding='utf-8') as f:
    f.write(discussion)

print(f"\nDiscussion saved to: {discussion_path}")

## 9. Generate Final Summary Report

In [None]:
# Create comprehensive summary report
def _f1_records(frame, limit=5):
    """Return the first `limit` rows of `frame` as JSON-safe dicts.

    Each dict holds 'Subject_ID' (int) and 'F1_Score' (float), converted to
    built-in Python types so json can serialise them.
    """
    head = frame.head(limit)
    return [
        {'Subject_ID': int(subject_id), 'F1_Score': float(score)}
        for subject_id, score in zip(head['Subject_ID'], head['F1_Score'])
    ]


# Every value is converted to a built-in int/float: json cannot serialise
# NumPy scalar types such as the np.int64 that h5py returns for `n_subjects`.
summary_report = {
    'model_info': {
        'architecture': 'CNN + RNN (Bidirectional LSTM)',
        'model_name': latest_model,
        'n_subjects': int(n_subjects),
        'total_parameters': int(model.count_params())
    },
    'performance_metrics': {
        'test_accuracy': float(accuracy),
        'top5_accuracy': float(top5_accuracy),
        'f1_macro': float(f1_macro),
        'f1_micro': float(f1_micro),
        'f1_weighted': float(f1_weighted)
    },
    'per_subject_stats': {
        'mean_precision': float(precision.mean()),
        'mean_recall': float(recall.mean()),
        'mean_f1': float(f1.mean()),
        'std_f1': float(f1.std()),
        'min_f1': float(f1.min()),
        'max_f1': float(f1.max())
    },
    'error_analysis': {
        'total_errors': int(len(misclassified_indices)),
        'error_rate': float(len(misclassified_indices) / len(y_test)),
        'mean_confidence_correct': float(correct_confidence.mean()),
        'mean_confidence_incorrect': float(incorrect_confidence.mean())
    },
    'best_subjects': _f1_records(best_subjects),
    'worst_subjects': _f1_records(worst_subjects)
}

# Save summary report
# Serialise to a string first so that a serialisation error cannot leave a
# truncated JSON file behind.
summary_file = os.path.join(FIGURES_DIR, 'final_summary_report.json')
summary_json = json.dumps(summary_report, indent=4)
with open(summary_file, 'w', encoding='utf-8') as f:
    f.write(summary_json)

print("\n" + "="*80)
print("FINAL SUMMARY REPORT")
print("="*80)
print(json.dumps(summary_report, indent=2))
print("="*80)
print(f"\nFull report saved to: {summary_file}")
print(f"All visualizations saved to: {FIGURES_DIR}")
print("\nPerformance analysis complete!")

## Summary

### Performance Analysis Complete!

**What we accomplished:**
1. Comprehensive evaluation of the CNN+RNN model on test data
2. Detailed confusion matrix analysis showing classification patterns
3. Per-subject performance breakdown identifying strengths/weaknesses
4. t-SNE visualization of learned feature representations
5. Error analysis examining misclassifications and model confidence
6. Discussion of results and potential improvements

**Generated Outputs:**
- Multiple visualization plots saved in `figures/` directory
- Per-subject metrics CSV file
- Performance discussion text file
- Comprehensive JSON summary report

**Next Steps:**
- Review visualizations to understand model behavior
- Identify areas for improvement (data augmentation, architecture changes)
- Consider ensemble methods or transfer learning
- Test on cross-session data for generalization analysis

This completes the EEG person identification project!