# Let's test the model

In [1]:
import tensorflow as tf
import pickle

2025-07-19 19:23:41.646040: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-19 19:23:41.832497: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752971021.951604  251289 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752971021.976309  251289 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752971022.089398  251289 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import os
import cv2
from tensorflow.keras.preprocessing import image

In [4]:
# Restore the trained Keras model from its HDF5 checkpoint.
model = tf.keras.models.load_model('models/emotion_model.h5')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load metadata files that come from a trusted source.
with open('models/model_metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Pull the class list and the input image size out of the metadata dict.
class_names = metadata['class_names']
img_height, img_width = metadata['img_height'], metadata['img_width']

print(f"Model loaded with class names: {class_names}")

I0000 00:00:1752971278.166656  251289 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 555 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


Model loaded with class names: ['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise']


In [None]:
# Load test dataset
test_data_dir = 'raw_data/archive/test'
batch_size = 32

# Build the evaluation dataset from the class sub-directories of test_data_dir.
# Images are read as single-channel and resized to the size stored in the
# model metadata.
test_dataset = tf.keras.utils.image_dataset_from_directory(
    test_data_dir,
    color_mode='grayscale',
    image_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=False  # Don't shuffle: later cells match predictions to samples by position
)

# Snapshot the directory-derived class names once as a plain list.
dataset_class_names = list(test_dataset.class_names)

print(f"Test dataset class names: {dataset_class_names}")
print(f"Number of test batches: {len(test_dataset)}")

# No filtering is applied: per the original note, the 'disgust' files were
# already removed from the directory, so the dataset is used as-is. The
# `_filtered` alias is kept because the cells below iterate over that name.
test_dataset_filtered = test_dataset

print(f"Using class names: {class_names}")
print(f"Expected classes: {len(class_names)}")
print(f"Dataset classes: {len(dataset_class_names)}")

# Verify class alignment.
# Compare the names themselves (and their order), not just the counts: the
# integer labels produced by the dataset are indices into its class list, so
# two lists of equal length but different order would silently mislabel
# every prediction.
if dataset_class_names == list(class_names):
    print("✓ Class names match between model and dataset")
else:
    print("⚠️ Warning: Mismatch between model classes and dataset classes")
    print(f"Model classes: {class_names}")
    print(f"Dataset classes: {dataset_class_names}")

Found 7067 files belonging to 6 classes.
Test dataset class names: ['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise']
Number of test batches: 221
Test dataset class names: ['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise']
Number of test batches: 221


ValueError: Invalid `predicate`. `predicate` must return a `tf.bool` scalar tensor, but its return type is TensorSpec(shape=(None,), dtype=tf.bool, name=None).

In [None]:
# Generate predictions
print("Generating predictions on test set...")

# Collect one array per batch and concatenate once at the end.
true_batches = []
pred_batches = []

for images, labels in test_dataset_filtered:
    predictions = model.predict(images, verbose=0)
    # The predicted class is the index of the highest score in each row.
    pred_batches.append(np.argmax(predictions, axis=1))
    true_batches.append(labels.numpy())

if not true_batches:
    # Fail with a clear message instead of an opaque scikit-learn error.
    raise ValueError("Test dataset yielded no batches; nothing to evaluate.")

y_true = np.concatenate(true_batches)
y_pred = np.concatenate(pred_batches)

print(f"Total test samples: {len(y_true)}")
print(f"Unique true labels: {np.unique(y_true)}")
print(f"Unique predicted labels: {np.unique(y_pred)}")

# Calculate key metrics
accuracy = accuracy_score(y_true, y_pred)
print(f"\nOverall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Detailed classification report
print("\n" + "="*50)
print("CLASSIFICATION REPORT")
print("="*50)
# Pass the full label set explicitly. Without `labels`, scikit-learn infers the
# classes from the data, and `target_names` then raises a ValueError whenever a
# class is missing from both y_true and y_pred.
# zero_division=0 reports 0 for classes with no predictions without warning.
label_ids = np.arange(len(class_names))
print(classification_report(
    y_true,
    y_pred,
    labels=label_ids,
    target_names=class_names,
    zero_division=0,
))

In [None]:
# Show sample predictions
def show_sample_predictions(num_samples=12, n_cols=4, seed=None):
    """Plot a random selection of test images with true and predicted labels.

    Reads the notebook-level `y_true`, `y_pred`, `class_names` and
    `test_dataset_filtered`. Samples are matched to `y_pred` by position, so
    the dataset must iterate in the same order as when `y_pred` was computed.

    Args:
        num_samples: Number of images to show. Clamped to the number of
            available test samples.
        n_cols: Number of grid columns; rows are derived from `num_samples`.
        seed: Optional integer for a reproducible selection. When None the
            global NumPy random state is used.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    total = len(y_true)
    num_samples = min(num_samples, total)
    if num_samples <= 0:
        print("No samples available to display.")
        return

    rng = np.random if seed is None else np.random.RandomState(seed)
    # A set gives O(1) membership tests while scanning the whole dataset.
    chosen = set(rng.choice(total, num_samples, replace=False).tolist())

    # Get sample images and labels
    sample_images = []
    sample_true = []
    sample_pred = []

    current_idx = 0
    for images, labels in test_dataset_filtered:
        batch_len = len(images)
        hits = [i for i in range(batch_len) if (current_idx + i) in chosen]
        if hits:
            # Convert the batch once rather than once per selected image.
            images_np = images.numpy()
            labels_np = labels.numpy()
            for i in hits:
                sample_images.append(images_np[i])
                sample_true.append(int(labels_np[i]))
                sample_pred.append(int(y_pred[current_idx + i]))
        current_idx += batch_len
        if len(sample_images) >= num_samples:
            break

    n_shown = len(sample_images)
    if n_shown == 0:
        print("No samples available to display.")
        return

    # Size the grid from the number of images instead of a fixed 3x4 layout.
    n_rows = -(-n_shown // n_cols)  # ceiling division
    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(3.75 * n_cols, 4 * n_rows),
        squeeze=False,  # always a 2-D array, even for a single row or column
    )
    fig.suptitle('Sample Predictions vs Ground Truth', fontsize=16, fontweight='bold')

    # Hide every axis first so unused grid cells stay blank.
    for ax in axes.flat:
        ax.axis('off')

    for ax, img, true_idx, pred_idx in zip(axes.flat, sample_images, sample_true, sample_pred):
        # Display image
        ax.imshow(img.squeeze(), cmap='gray')

        # Create title with prediction info
        true_label = class_names[true_idx]
        pred_label = class_names[pred_idx]

        if true_idx == pred_idx:
            title_color = 'green'
            result = '✓'
        else:
            title_color = 'red'
            result = '✗'

        title = f"{result} True: {true_label}\nPred: {pred_label}"
        ax.set_title(title, fontsize=10, color=title_color, fontweight='bold')

    plt.tight_layout()
    plt.show()

show_sample_predictions()

In [None]:
# Generate and plot confusion matrix
def plot_confusion_matrix():
    """Plot raw and row-normalized confusion matrices and print per-class metrics.

    Reads the notebook-level `y_true`, `y_pred` and `class_names`.

    Returns:
        The raw confusion matrix as a 2-D integer array with one row and one
        column per entry of `class_names` (rows are true labels, columns are
        predicted labels).
    """
    # Fix the label set so the matrix always has one row/column per class name,
    # even when a class never occurs in y_true or y_pred. Otherwise the tick
    # labels and the per-class loop below would be misaligned.
    label_ids = np.arange(len(class_names))
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Plot 1: Raw counts
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax1)
    ax1.set_title('Confusion Matrix (Raw Counts)', fontweight='bold', fontsize=14)
    ax1.set_xlabel('Predicted Label')
    ax1.set_ylabel('True Label')

    # Plot 2: Normalized by true-label row (each row sums to 1).
    # Rows with no true samples are left at 0 instead of dividing by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm, row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals > 0,
    )
    sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax2)
    ax2.set_title('Confusion Matrix (Normalized)', fontweight='bold', fontsize=14)
    ax2.set_xlabel('Predicted Label')
    ax2.set_ylabel('True Label')

    plt.tight_layout()
    plt.show()

    # Print detailed confusion matrix analysis
    print("\n" + "="*60)
    print("CONFUSION MATRIX ANALYSIS")
    print("="*60)

    print(f"\nClass-wise Performance:")
    print("-" * 40)

    for i, class_name in enumerate(class_names):
        true_positives = cm[i, i]
        # Column total minus the diagonal: predicted as class i but actually another class.
        false_positives = cm[:, i].sum() - true_positives
        # Row total minus the diagonal: actually class i but predicted as another class.
        false_negatives = cm[i, :].sum() - true_positives

        predicted_total = true_positives + false_positives
        actual_total = true_positives + false_negatives

        # Each ratio falls back to 0 when its denominator is 0.
        precision = true_positives / predicted_total if predicted_total > 0 else 0
        recall = true_positives / actual_total if actual_total > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f"{class_name:>10}: Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}")

    return cm

cm = plot_confusion_matrix()

In [None]:
# Analyze prediction confidence
def analyze_prediction_confidence(low_confidence_threshold=0.6):
    """Summarize how confident the model is on correct vs incorrect predictions.

    Re-runs the notebook-level `model` over `test_dataset_filtered`, takes the
    highest score of each prediction row as its confidence, and then shows a
    histogram, a box plot and summary statistics.

    Args:
        low_confidence_threshold: Predictions whose confidence is below this
            value are counted as low-confidence.

    Returns:
        None. Figures are displayed and statistics are printed.
    """
    print("\n" + "="*60)
    print("PREDICTION CONFIDENCE ANALYSIS")
    print("="*60)

    confidence_batches = []
    correct_batches = []

    for images, labels in test_dataset_filtered:
        predictions = model.predict(images, verbose=0)
        # Confidence is the top score; correctness is whether its index equals the label.
        confidence_batches.append(np.max(predictions, axis=1))
        correct_batches.append(np.argmax(predictions, axis=1) == labels.numpy())

    if not confidence_batches:
        print("No predictions to analyze.")
        return

    all_confidences = np.concatenate(confidence_batches)
    is_correct = np.concatenate(correct_batches)
    correct_confidences = all_confidences[is_correct]
    incorrect_confidences = all_confidences[~is_correct]

    # Plot confidence distributions
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Use one set of bin edges for all three histograms so the overlaid bars
    # are directly comparable.
    bin_edges = np.histogram_bin_edges(all_confidences, bins=30)
    ax1.hist(all_confidences, bins=bin_edges, alpha=0.7, color='blue', label='All predictions')
    ax1.hist(correct_confidences, bins=bin_edges, alpha=0.7, color='green', label='Correct predictions')
    ax1.hist(incorrect_confidences, bins=bin_edges, alpha=0.7, color='red', label='Incorrect predictions')
    ax1.set_xlabel('Prediction Confidence')
    ax1.set_ylabel('Frequency')
    ax1.set_title('Distribution of Prediction Confidences')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Box plot comparison
    data_to_plot = [correct_confidences, incorrect_confidences]
    labels_to_plot = ['Correct\nPredictions', 'Incorrect\nPredictions']

    # Set tick labels after drawing: the `labels=` keyword of boxplot was
    # renamed to `tick_labels` in Matplotlib 3.9, and this form works on both.
    bp = ax2.boxplot(data_to_plot, patch_artist=True)
    ax2.set_xticklabels(labels_to_plot)
    bp['boxes'][0].set_facecolor('lightgreen')
    bp['boxes'][1].set_facecolor('lightcoral')
    ax2.set_ylabel('Prediction Confidence')
    ax2.set_title('Confidence: Correct vs Incorrect Predictions')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    def _mean_or_nan(values):
        # np.mean of an empty array warns; report NaN explicitly instead.
        return np.mean(values) if len(values) else float('nan')

    # Print statistics
    print(f"Average confidence (all): {_mean_or_nan(all_confidences):.3f}")
    print(f"Average confidence (correct): {_mean_or_nan(correct_confidences):.3f}")
    print(f"Average confidence (incorrect): {_mean_or_nan(incorrect_confidences):.3f}")
    print(f"Standard deviation (all): {np.std(all_confidences):.3f}")

    # Low confidence predictions
    low_confidence_count = int(np.sum(all_confidences < low_confidence_threshold))
    print(f"\nPredictions with confidence < {low_confidence_threshold}: {low_confidence_count} ({low_confidence_count/len(all_confidences)*100:.1f}%)")

analyze_prediction_confidence()

# Model Evaluation Summary

## Key Findings:

### Performance Metrics:
- **Overall Accuracy**: Check the accuracy score above
- **Class-wise Performance**: Review the classification report for detailed metrics per emotion

### Confusion Matrix Insights:
- **Diagonal elements**: Show correct predictions for each class
- **Off-diagonal elements**: Indicate misclassifications between emotions
- **Common confusions**: Look for high values in off-diagonal cells to identify which emotions are commonly confused

### Prediction Confidence:
- **High confidence correct predictions**: Indicate the model is certain and accurate
- **Low confidence predictions**: May require additional training or data
- **Confidence distribution**: Shows how certain the model is across all predictions

### Potential Issues to Address:
1. **Class imbalance**: Some emotions may have fewer samples
2. **Difficult distinctions**: Similar emotions (e.g., sad vs neutral) may be harder to distinguish
3. **Model bias**: Check if the model consistently predicts certain emotions more often

### Next Steps:
- If accuracy is low, consider data augmentation or model architecture improvements
- For class imbalance, implement better class weighting or data balancing techniques
- For specific emotion confusions, analyze those image samples more closely