In [None]:
# Cell 1: Import Libraries and Dependencies
import os
import torch
import numpy as np
import pandas as pd
from PIL import Image
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, average_precision_score
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForImageClassification, AutoImageProcessor
from torch import nn
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including library diagnostics (e.g. undefined-metric or deprecation
# warnings); narrow the filter if those messages are needed while debugging.
warnings.filterwarnings('ignore')

# Sanity output: confirms the import cell ran and reports whether CUDA is visible.
print("All libraries imported successfully!")
print(f"Using device: {'cuda' if torch.cuda.is_available() else 'cpu'}")


In [None]:
# Cell 2: Dataset Class Definition
class EMOTICDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing saved image arrays with multi-hot labels.

    Each annotation is a mapping with a ``'filename'`` entry (joined onto
    ``img_dir`` and read with ``np.load``) and a ``'categories'`` entry (an
    iterable of integer class indices that are active for the sample).

    Args:
        annotations: Indexable collection of annotation mappings.
        img_dir: Directory that every ``'filename'`` is resolved against.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=..., return_tensors="pt", antialias=True)``;
            every value it returns must support ``.squeeze(0)``.
        num_categories: Length of the multi-hot label vector.
    """

    def __init__(self, annotations, img_dir, feature_extractor, num_categories=26):
        self.annotations = annotations
        self.img_dir = img_dir
        self.feature_extractor = feature_extractor
        self.num_categories = num_categories

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load, preprocess and label the sample at position ``idx``.

        Returns:
            dict: The feature extractor's outputs with their leading dimension
            squeezed away, plus a ``"labels"`` float32 tensor of length
            ``num_categories``.

        Raises:
            FileNotFoundError: If the array file for the sample does not exist.
        """
        entry = self.annotations[idx]
        img_path = os.path.join(self.img_dir, entry['filename'])

        if not os.path.exists(img_path):
            raise FileNotFoundError(f"File not found: {img_path}")

        # Samples are stored as NumPy array files rather than encoded images.
        image = np.load(img_path)
        if len(image.shape) == 2:
            # Two-dimensional (single-channel) array: repeat it along a new last
            # axis so the preprocessor receives three channels.
            image = np.stack([image] * 3, axis=-1)

        # Preprocess the image and drop the batch dimension added by the extractor.
        inputs = self.feature_extractor(images=image, return_tensors="pt", antialias=True)
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}

        # Multi-hot encoding for labels
        categories = torch.zeros(self.num_categories, dtype=torch.float32)
        for category in entry['categories']:
            # The lower bound matters: a negative index would otherwise wrap
            # around and switch on a class counted from the end of the vector.
            if 0 <= category < self.num_categories:
                categories[category] = 1.0

        inputs["labels"] = categories
        return inputs

def parse_annotations(csv_path, category_start=8, num_categories=26):
    """Read an annotation CSV into per-sample records and per-class counts.

    The columns at positions ``category_start`` up to (not including)
    ``category_start + num_categories`` are treated as per-class indicator
    columns; a cell equal to 1 marks that class as active for the row. The
    defaults select columns 8 through 33.
    NOTE(review): assumes the CSV really stores the category indicators at
    those positions -- confirm against the file header.

    Args:
        csv_path: Path of the CSV file; it must contain a ``Crop_name`` column.
        category_start: Position of the first indicator column.
        num_categories: Number of consecutive indicator columns.

    Returns:
        tuple: ``(annotations, class_counts)`` where ``annotations`` is a list
        of ``{"filename": <Crop_name value>, "categories": [int, ...]}`` dicts
        in row order (indices are relative to the first indicator column) and
        ``class_counts`` is a float32 array holding each indicator column's sum.
    """
    df = pd.read_csv(csv_path)
    category_columns = df.columns[category_start:category_start + num_categories]
    indicators = df[category_columns]

    # Calculate class counts
    class_counts = indicators.sum().to_numpy(dtype=np.float32)

    # One boolean mask for the whole table replaces a Python-level pass over
    # every row with ``iterrows`` (which builds a Series object per row).
    active = indicators.to_numpy() == 1
    annotations = [
        {"filename": filename, "categories": np.flatnonzero(row_mask).tolist()}
        for filename, row_mask in zip(df["Crop_name"].tolist(), active)
    ]

    return annotations, class_counts

print("Dataset classes defined successfully!")


In [None]:
# Cell 3: Configuration and Paths Setup
# UPDATE THESE PATHS TO MATCH YOUR SETUP
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "best_vit_emotic.pth"
val_annotations_path = "/content/emotic_data/annots_arrs/annot_arrs_val.csv"
img_dir = "/content/emotic_data/img_arrs/"

# Per-class thresholds are tuned on the validation annotations, so scoring the
# same annotations afterwards gives optimistic F1 numbers. Use a separate
# held-out CSV whenever one exists and fall back to validation (with a visible
# warning) only when it does not.
# NOTE(review): the held-out file name is an assumption about the data layout --
# point it at your real test annotations.
held_out_annotations_path = "/content/emotic_data/annots_arrs/annot_arrs_test.csv"
if os.path.exists(held_out_annotations_path):
    test_annotations_path = held_out_annotations_path
else:
    test_annotations_path = val_annotations_path
    print(f"⚠️  Warning: {held_out_annotations_path} not found. "
          "Using validation annotations as the test set; thresholds are tuned on "
          "this split, so the reported F1 scores will be optimistic.")

# Parameters
batch_size = 32
num_classes = 26
# NOTE(review): the label order must match the order of the category columns
# read from the annotation CSV -- confirm against the file header.
emotion_labels = [
    'Peace', 'Affection', 'Esteem', 'Anticipation', 'Engagement',
    'Confidence', 'Happiness', 'Pleasure', 'Excitement', 'Surprise',
    'Sympathy', 'Doubt/Confusion', 'Disconnection', 'Fatigue',
    'Embarrassment', 'Yearning', 'Disapproval', 'Aversion',
    'Annoyance', 'Anger', 'Sensitivity', 'Sadness',
    'Disquietment', 'Fear', 'Pain', 'Suffering'
]

# Every per-class table and plot indexes emotion_labels by class position.
assert len(emotion_labels) == num_classes, "emotion_labels must list one name per class"

print("Configuration set:")
print(f"Device: {device}")
print(f"Model path: {model_path}")
print(f"Test annotations: {test_annotations_path}")
print(f"Number of classes: {num_classes}")
print(f"Batch size: {batch_size}")


In [None]:
# Cell 4: Model Loading
def load_model(model_path, device, num_classes=26):
    """Build the ViT classifier and restore saved weights when they exist.

    Args:
        model_path: Location of a state dict saved with ``torch.save``.
        device: Device the model is moved to and the checkpoint is mapped onto.
        num_classes: Output width of the replacement classification head.

    Returns:
        The model in evaluation mode. When ``model_path`` does not exist a
        warning is printed and the freshly initialised head is kept.
    """
    checkpoint_name = "google/vit-base-patch16-224"
    model = AutoModelForImageClassification.from_pretrained(
        checkpoint_name, ignore_mismatched_sizes=True
    )
    model = model.to(device)

    # Swap the stock head for dropout followed by a num_classes-wide linear layer.
    head = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(model.config.hidden_size, num_classes),
    )
    model.classifier = head.to(device)

    # Restore the trained parameters, or say clearly that none were found.
    if not os.path.exists(model_path):
        print(f"⚠️  Warning: Model file {model_path} not found. Using randomly initialized model.")
    else:
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        print(f"✓ Model loaded successfully from {model_path}")

    model.eval()
    return model

# Instantiate the evaluation model and the image preprocessor for the same checkpoint.
model = load_model(model_path=model_path, device=device, num_classes=num_classes)
feature_extractor = AutoImageProcessor.from_pretrained(
    "google/vit-base-patch16-224",
    use_fast=True,
)

print("Model loaded and ready for testing!")


In [None]:
# Colab-only step: mounts Google Drive at /content/drive.
# NOTE(review): the import below is only available inside Google Colab, and this
# cell sits after the model-loading cell -- confirm nothing earlier in the
# notebook needs files that live on Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Cell 5: Dataset Loading and Basic Info
# Parse both annotation files into per-sample records and per-class positive counts.
print("Loading datasets...")
val_annotations, val_class_counts = parse_annotations(val_annotations_path)
test_annotations, test_class_counts = parse_annotations(test_annotations_path)

# Both datasets share the image directory, the preprocessor and the label width.
val_dataset = EMOTICDataset(
    annotations=val_annotations,
    img_dir=img_dir,
    feature_extractor=feature_extractor,
    num_categories=num_classes,
)
test_dataset = EMOTICDataset(
    annotations=test_annotations,
    img_dir=img_dir,
    feature_extractor=feature_extractor,
    num_categories=num_classes,
)

# Evaluation loaders keep the sample order fixed and use two workers each.
eval_loader_options = dict(batch_size=batch_size, shuffle=False, num_workers=2)
val_loader = DataLoader(val_dataset, **eval_loader_options)
test_loader = DataLoader(test_dataset, **eval_loader_options)

print(f"✓ Validation set: {len(val_dataset)} samples")
print(f"✓ Test set: {len(test_dataset)} samples")

# Per-class positive counts next to their share of each split, in percent.
val_percentage = val_class_counts / len(val_dataset) * 100
test_percentage = test_class_counts / len(test_dataset) * 100
class_dist_df = pd.DataFrame({
    'Emotion': emotion_labels,
    'Val_Count': val_class_counts,
    'Test_Count': test_class_counts,
    'Val_Percentage': val_percentage,
    'Test_Percentage': test_percentage,
})

print("\nClass Distribution:")
print(class_dist_df.head(10))


In [None]:
# Cell 6: Basic Model Prediction Test
def test_single_batch(model, data_loader, device):
    """Test model on a single batch to verify it's working"""
    model.eval()

    # Get one batch
    batch = next(iter(data_loader))
    images = batch["pixel_values"].to(device)
    labels = batch["labels"]

    print(f"Batch shape: {images.shape}")
    print(f"Labels shape: {labels.shape}")

    with torch.no_grad():
        outputs = model(images)
        logits = outputs.logits
        probs = torch.sigmoid(logits)

    print(f"Logits shape: {logits.shape}")
    print(f"Logits range: [{logits.min().item():.4f}, {logits.max().item():.4f}]")
    print(f"Probabilities range: [{probs.min().item():.4f}, {probs.max().item():.4f}]")
    print(f"Mean probability: {probs.mean().item():.4f}")

    # Show predictions for first sample
    first_sample_probs = probs[0].cpu().numpy()
    first_sample_labels = labels[0].numpy()

    print(f"\nFirst sample predictions:")
    print(f"True labels: {np.where(first_sample_labels == 1)[0]}")
    print(f"Top 5 predicted emotions (with probabilities):")
    top_indices = np.argsort(first_sample_probs)[-5:][::-1]
    for idx in top_indices:
        print(f"  {emotion_labels[idx]}: {first_sample_probs[idx]:.4f}")

    return logits.cpu().numpy(), probs.cpu().numpy(), labels.numpy()

# Smoke-test the pipeline on one batch before running the full evaluation.
# The results get their own names: the full-dataset prediction cell further
# down assigns `test_probs` / `test_logits`, and re-running this cell afterwards
# must not replace those arrays with a single batch.
print("Testing model on a single batch...")
batch_logits, batch_probs, batch_labels = test_single_batch(model, test_loader, device)


In [None]:
# Cell 7: Get All Predictions and Targets
def get_predictions_and_targets(model, data_loader, device):
    """Run the model over every batch and gather labels, probabilities and logits.

    Args:
        model: Callable whose result exposes a ``logits`` tensor.
        data_loader: Sized iterable yielding dicts with ``"pixel_values"`` and
            ``"labels"`` tensors.
        device: Device the pixel values are moved to before the forward pass.

    Returns:
        tuple: ``(targets, probabilities, logits)`` as NumPy arrays whose rows
        follow the order in which the loader produced the samples.
    """
    target_chunks, prob_chunks, logit_chunks = [], [], []

    def _stack(chunks):
        # Join the per-batch arrays row-wise; nothing collected gives an empty array.
        return np.concatenate(chunks, axis=0) if chunks else np.array(chunks)

    model.eval()
    print("Getting predictions for entire dataset...")

    with torch.no_grad():
        for batch_index, batch in enumerate(data_loader):
            # Progress line on the first batch and every tenth one after it.
            if batch_index % 10 == 0:
                print(f"Processing batch {batch_index+1}/{len(data_loader)}")

            pixel_values = batch["pixel_values"].to(device)
            batch_targets = batch["labels"]

            # Forward pass; sigmoid turns each logit into an independent probability.
            batch_logits = model(pixel_values).logits
            batch_probs = torch.sigmoid(batch_logits)

            target_chunks.append(batch_targets.numpy())
            prob_chunks.append(batch_probs.cpu().numpy())
            logit_chunks.append(batch_logits.cpu().numpy())

    all_targets = _stack(target_chunks)
    all_probs = _stack(prob_chunks)
    all_logits = _stack(logit_chunks)

    print(f"✓ Collected predictions from {len(all_targets)} samples")
    print(f"Targets shape: {all_targets.shape}")
    print(f"Probabilities shape: {all_probs.shape}")

    return all_targets, all_probs, all_logits

# Get predictions for validation set (for threshold calculation)
val_targets, val_probs, val_logits = get_predictions_and_targets(model, val_loader, device)

# Get predictions for test set (for final evaluation)
if test_annotations_path == val_annotations_path:
    # Both loaders were built from the same annotation file, image directory and
    # preprocessor, without shuffling, and the model is in eval mode -- a second
    # pass would only recompute the arrays collected above.
    print("Test annotations are the validation annotations; reusing validation predictions.")
    test_targets, test_probs, test_logits = (
        val_targets.copy(), val_probs.copy(), val_logits.copy()
    )
else:
    test_targets, test_probs, test_logits = get_predictions_and_targets(model, test_loader, device)


In [None]:
# Cell 8: Threshold Optimization
def calculate_optimal_thresholds(targets, probs, emotion_labels, threshold_range=(0.001, 0.5, 0.01)):
    """Pick, for every class, the probability cut-off with the best binary F1.

    Args:
        targets: 2-D array read column-wise as ``targets[:, class_idx]``; the
            column sum is used as the number of positives (0/1 entries expected).
        probs: Array with the same layout holding the predicted probabilities.
        emotion_labels: Class names; its length sets how many columns are processed.
        threshold_range: ``(start, stop, step)`` passed to ``np.arange`` to
            generate the candidate thresholds (``stop`` is exclusive).

    Returns:
        tuple: ``(thresholds, details)``. ``thresholds`` is an array with one
        cut-off per class. ``details`` is a list with one dict per class holding
        ``'class'``, ``'best_threshold'``, ``'best_f1'``, ``'num_positives'``
        and ``'details'`` (the per-threshold precision/recall/F1 records; empty
        for a class without positives). A class without positives, or one where
        no candidate reaches an F1 above zero, keeps the default threshold 0.1.
    """
    print("Calculating optimal thresholds for each emotion class...")

    default_threshold = 0.1
    # The candidate grid is the same for every class, so build it once.
    candidate_thresholds = np.arange(*threshold_range)

    optimal_thresholds = []
    threshold_details = []

    for class_idx in range(len(emotion_labels)):
        if class_idx % 5 == 0:
            print(f"Processing classes {class_idx}-{min(class_idx+4, len(emotion_labels)-1)}...")

        class_name = emotion_labels[class_idx]
        class_targets = targets[:, class_idx]
        class_probs = probs[:, class_idx]

        # Skip classes with no positive samples
        if class_targets.sum() == 0:
            print(f"⚠️  {class_name}: No positive samples, using default threshold")
            optimal_thresholds.append(default_threshold)
            threshold_details.append({
                'class': class_name,
                'best_threshold': default_threshold,
                'best_f1': 0.0,
                'num_positives': 0,
                # Same keys as an evaluated class so consumers need no special case.
                'details': []
            })
            continue

        best_threshold = default_threshold
        best_f1 = 0.0
        class_details = []

        for threshold in candidate_thresholds:
            preds = (class_probs > threshold).astype(int)

            # Skip if no predictions made
            if preds.sum() == 0:
                continue

            precision, recall, f1, _ = precision_recall_fscore_support(
                class_targets, preds, average='binary', zero_division=0
            )

            class_details.append({
                'threshold': threshold,
                'precision': precision,
                'recall': recall,
                'f1': f1
            })

            # Strict comparison: on ties the lowest threshold seen first wins.
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold

        optimal_thresholds.append(best_threshold)
        threshold_details.append({
            'class': class_name,
            'best_threshold': best_threshold,
            'best_f1': best_f1,
            'num_positives': int(class_targets.sum()),
            'details': class_details
        })

    return np.array(optimal_thresholds), threshold_details

# Tune one threshold per class on the validation predictions.
optimal_thresholds, threshold_details = calculate_optimal_thresholds(
    targets=val_targets,
    probs=val_probs,
    emotion_labels=emotion_labels,
)

# One summary row per class: chosen threshold, its F1 and the positive count.
print("\nOptimal Thresholds Summary:")
print("-" * 60)
for i, detail in enumerate(threshold_details):
    summary_line = (
        f"{detail['class']:<20}: threshold={detail['best_threshold']:.4f}, "
        f"F1={detail['best_f1']:.4f}, positives={detail['num_positives']}"
    )
    print(summary_line)


In [None]:
# Cell 9: mAP Calculation
def calculate_map(targets, probs, emotion_labels):
    """Compute per-class average precision and their mean (mAP).

    Average precision is undefined for a class without positive samples. Such
    a class keeps a 0.0 placeholder in the per-class list (so the list stays
    aligned with ``emotion_labels``) but is left out of the mean, which
    therefore covers exactly the classes reported as valid.

    Args:
        targets: 2-D array read column-wise as ``targets[:, class_idx]``.
        probs: Array with the same layout holding the predicted scores.
        emotion_labels: Class names; its length sets how many columns are scored.

    Returns:
        tuple: ``(map_score, ap_scores)`` -- the mean over the valid classes
        (0.0 when there is none) and the list of per-class values.
    """
    print("Calculating mean Average Precision (mAP)...")

    ap_scores = []
    valid_ap_scores = []

    for class_idx, class_name in enumerate(emotion_labels):
        class_targets = targets[:, class_idx]

        # Skip classes with no positive samples
        if class_targets.sum() == 0:
            print(f"⚠️  {class_name}: No positive samples, AP = 0")
            ap_scores.append(0.0)
            continue

        ap = average_precision_score(class_targets, probs[:, class_idx])
        ap_scores.append(ap)
        valid_ap_scores.append(ap)

        if class_idx < 5:  # Show first 5 for debugging
            print(f"✓ {class_name}: AP = {ap:.4f}")

    valid_classes = len(valid_ap_scores)
    # Averaging the placeholders as well would pull the score towards zero.
    map_score = np.mean(valid_ap_scores) if valid_ap_scores else 0.0
    print(f"\n✓ Mean Average Precision (mAP): {map_score:.4f}")
    print(f"✓ Valid classes for mAP calculation: {valid_classes}/{len(emotion_labels)}")

    return map_score, ap_scores

# Score the test-set predictions: overall mAP plus the per-class AP list.
test_map_score, test_ap_scores = calculate_map(
    targets=test_targets,
    probs=test_probs,
    emotion_labels=emotion_labels,
)


In [None]:
# Cell 10: F1 Score Calculation
def calculate_f1_scores(targets, probs, thresholds, emotion_labels):
    """Binarise probabilities per class and report precision, recall and F1.

    Args:
        targets: 2-D array of ground-truth labels, one column per class.
        probs: Array with the same layout holding predicted probabilities.
        thresholds: Indexable with one cut-off per class; a prediction is
            positive when its probability is strictly greater than the cut-off.
        emotion_labels: Class names; its length sets how many columns are scored.

    Returns:
        dict: Macro-averaged F1/precision/recall, the exact-match accuracy
        (fraction of rows whose whole label vector is predicted correctly), the
        three per-class lists and the binarised ``'predictions'`` array.
    """
    print("Calculating F1 scores with optimal thresholds...")

    class_indices = range(len(emotion_labels))

    # Threshold one column at a time, each against its own cut-off.
    predictions = np.zeros_like(probs)
    for col in class_indices:
        predictions[:, col] = (probs[:, col] > thresholds[col]).astype(int)

    # (precision, recall, f1) per class; the support entry is not needed.
    per_class = [
        precision_recall_fscore_support(
            targets[:, col], predictions[:, col],
            average='binary', zero_division=0
        )[:3]
        for col in class_indices
    ]
    precision_scores = [precision for precision, _, _ in per_class]
    recall_scores = [recall for _, recall, _ in per_class]
    f1_scores = [f1 for _, _, f1 in per_class]

    # A row counts as an exact match only when every class agrees.
    row_matches = np.all(targets == predictions, axis=1)

    summary = {
        'macro_f1': np.mean(f1_scores),
        'macro_precision': np.mean(precision_scores),
        'macro_recall': np.mean(recall_scores),
        'exact_match_accuracy': np.mean(row_matches),
        'per_class_f1': f1_scores,
        'per_class_precision': precision_scores,
        'per_class_recall': recall_scores,
        'predictions': predictions
    }

    print(f"✓ Macro F1 Score: {summary['macro_f1']:.4f}")
    print(f"✓ Macro Precision: {summary['macro_precision']:.4f}")
    print(f"✓ Macro Recall: {summary['macro_recall']:.4f}")
    print(f"✓ Exact Match Accuracy: {summary['exact_match_accuracy']:.4f}")

    return summary

# Apply the tuned per-class thresholds to the test-set probabilities.
f1_results = calculate_f1_scores(
    targets=test_targets,
    probs=test_probs,
    thresholds=optimal_thresholds,
    emotion_labels=emotion_labels,
)


In [None]:
# Cell 11: Results Summary and Per-Class Analysis
# Headline numbers first, then the per-class table.
results_banner = "=" * 80
print(results_banner)
print("FINAL MODEL EVALUATION RESULTS")
print(results_banner)
headline_metrics = (
    ("Mean Average Precision (mAP)", test_map_score),
    ("Macro F1 Score", f1_results['macro_f1']),
    ("Macro Precision", f1_results['macro_precision']),
    ("Macro Recall", f1_results['macro_recall']),
    ("Exact Match Accuracy", f1_results['exact_match_accuracy']),
)
for metric_name, metric_value in headline_metrics:
    print(f"{metric_name}: {metric_value:.4f}")

# One row per emotion; every column is aligned with emotion_labels by position.
positives_per_class = [
    int(test_targets[:, class_idx].sum()) for class_idx in range(num_classes)
]
results_df = pd.DataFrame({
    'Emotion': emotion_labels,
    'Average_Precision': test_ap_scores,
    'F1_Score': f1_results['per_class_f1'],
    'Precision': f1_results['per_class_precision'],
    'Recall': f1_results['per_class_recall'],
    'Optimal_Threshold': optimal_thresholds,
    'Num_Positives': positives_per_class,
})

print("\nDetailed Per-Class Results:")
print("-" * 100)
print(results_df.to_string(index=False, float_format='%.4f'))

# Persist the per-class table next to the notebook.
results_csv_name = 'detailed_model_results.csv'
results_df.to_csv(results_csv_name, index=False)
print(f"\n✓ Detailed results saved to '{results_csv_name}'")


In [None]:
# Cell 12: Visualizations
def create_evaluation_plots(results_df, test_map_score, macro_f1):
    """Draw the 2x2 evaluation figure, save it as a PNG and show it.

    Panels: per-class average precision, per-class F1, per-class threshold
    (all horizontal bars) and a precision-vs-recall scatter coloured by F1.

    Args:
        results_df: Frame with ``Average_Precision``, ``F1_Score``,
            ``Optimal_Threshold``, ``Recall`` and ``Precision`` columns, one
            row per entry of the module-level ``emotion_labels``.
        test_map_score: mAP value used in the first panel's title and marker line.
        macro_f1: Macro F1 value used in the second panel's title and marker line.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    positions = range(len(emotion_labels))

    def _class_bar_panel(ax, column, color, xlabel, title, marker=None, marker_label=None):
        # Horizontal bar per class, optionally with a dashed red reference line.
        ax.barh(positions, results_df[column], color=color)
        ax.set_yticks(positions)
        ax.set_yticklabels(emotion_labels, fontsize=8)
        ax.set_xlabel(xlabel)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        if marker is not None:
            ax.axvline(x=marker, color='red', linestyle='--', alpha=0.7, label=marker_label)
            ax.legend()

    # Panels 1-3: per-class bars.
    _class_bar_panel(
        axes[0, 0], 'Average_Precision', 'skyblue', 'Average Precision',
        f'Per-Class Average Precision (mAP: {test_map_score:.4f})',
        marker=test_map_score, marker_label=f'mAP: {test_map_score:.4f}',
    )
    _class_bar_panel(
        axes[0, 1], 'F1_Score', 'lightgreen', 'F1 Score',
        f'Per-Class F1 Scores (Macro F1: {macro_f1:.4f})',
        marker=macro_f1, marker_label=f'Macro F1: {macro_f1:.4f}',
    )
    _class_bar_panel(
        axes[1, 0], 'Optimal_Threshold', 'orange', 'Optimal Threshold',
        'Optimal Thresholds per Class',
    )

    # Panel 4: one point per class, coloured by its F1 score.
    pr_ax = axes[1, 1]
    scatter = pr_ax.scatter(results_df['Recall'], results_df['Precision'],
                            c=results_df['F1_Score'], cmap='viridis', s=60, alpha=0.7)
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision vs Recall per Class (colored by F1)')
    pr_ax.grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=pr_ax, label='F1 Score')

    # Dashed precision == recall diagonal, drawn up to the largest plotted value.
    diagonal_end = max(results_df['Recall'].max(), results_df['Precision'].max())
    pr_ax.plot([0, diagonal_end], [0, diagonal_end], 'k--', alpha=0.5)

    plt.tight_layout()
    plt.savefig('model_evaluation_results.png', dpi=300, bbox_inches='tight')
    print("✓ Evaluation plots saved to 'model_evaluation_results.png'")
    plt.show()

# Render, save and show the four-panel evaluation figure.
create_evaluation_plots(
    results_df=results_df,
    test_map_score=test_map_score,
    macro_f1=f1_results['macro_f1'],
)


In [None]:
# Cell 13: Additional Analysis - Top and Bottom Performing Classes
section_rule = "=" * 60
print(section_rule)
print("TOP AND BOTTOM PERFORMING CLASSES")
print(section_rule)

# Rank classes from best to worst F1; the first and last five rows are reported.
sorted_results = results_df.sort_values('F1_Score', ascending=False)
top_5 = sorted_results.head(5)
bottom_5 = sorted_results.tail(5)

for heading, subset in (
    ("Top 5 performing classes (by F1 score):", top_5),
    ("\nBottom 5 performing classes (by F1 score):", bottom_5),
):
    print(heading)
    print("-" * 40)
    for _, row in subset.iterrows():
        print(f"{row['Emotion']:<20}: F1={row['F1_Score']:.4f}, AP={row['Average_Precision']:.4f}")

# Relate per-class quality to how often each class occurs in the test targets.
print("\nClass Imbalance Analysis:")
print("-" * 40)
# Adds the share of test samples that are positive for each class to results_df.
results_df['Imbalance_Ratio'] = results_df['Num_Positives'] / len(test_targets)
f1_vs_ratio = results_df[['F1_Score', 'Imbalance_Ratio']].corr()
ap_vs_ratio = results_df[['Average_Precision', 'Imbalance_Ratio']].corr()
correlation_f1_balance = f1_vs_ratio.loc['F1_Score', 'Imbalance_Ratio']
correlation_ap_balance = ap_vs_ratio.loc['Average_Precision', 'Imbalance_Ratio']

print(f"Correlation between F1 and class balance: {correlation_f1_balance:.4f}")
print(f"Correlation between AP and class balance: {correlation_ap_balance:.4f}")


In [None]:
# Cell 14: Model Comparison with Different Thresholds
def compare_threshold_strategies(targets, probs, optimal_thresholds, emotion_labels):
    """Score three fixed cut-offs and the tuned per-class thresholds side by side.

    Args:
        targets: Ground-truth label array forwarded to ``calculate_f1_scores``.
        probs: Predicted probabilities forwarded to ``calculate_f1_scores``.
        optimal_thresholds: Per-class thresholds used for the ``'Optimal'`` row.
        emotion_labels: Class names; its length sizes the fixed threshold vectors.

    Returns:
        pandas.DataFrame: One row per strategy (``Fixed_0.5``, ``Fixed_0.3``,
        ``Fixed_0.1``, ``Optimal``) with its macro F1, precision, recall and
        exact-match accuracy.
    """
    print("Comparing different threshold strategies...")

    # Fixed strategies use one value for every class; 'Optimal' is evaluated last.
    n_classes = len(emotion_labels)
    strategies = {
        f'Fixed_{cutoff}': np.full(n_classes, cutoff) for cutoff in (0.5, 0.3, 0.1)
    }
    strategies['Optimal'] = optimal_thresholds

    # Output column -> key in the dict returned by calculate_f1_scores.
    reported_metrics = (
        ('Macro_F1', 'macro_f1'),
        ('Macro_Precision', 'macro_precision'),
        ('Macro_Recall', 'macro_recall'),
        ('Exact_Match_Acc', 'exact_match_accuracy'),
    )

    rows = []
    for strategy_name, thresholds in strategies.items():
        scores = calculate_f1_scores(targets, probs, thresholds, emotion_labels)
        row = {'Strategy': strategy_name}
        row.update({column: scores[key] for column, key in reported_metrics})
        rows.append(row)

    comparison_df = pd.DataFrame(rows)
    print("\nThreshold Strategy Comparison:")
    print("-" * 60)
    print(comparison_df.to_string(index=False, float_format='%.4f'))

    return comparison_df

# Score the fixed and tuned threshold strategies on the test-set predictions.
threshold_comparison = compare_threshold_strategies(
    targets=test_targets,
    probs=test_probs,
    optimal_thresholds=optimal_thresholds,
    emotion_labels=emotion_labels,
)

# Closing banner listing the artefacts written by the earlier cells.
closing_rule = "=" * 80
closing_lines = (
    "\n" + closing_rule,
    "EVALUATION COMPLETE!",
    closing_rule,
    "Files saved:",
    "- detailed_model_results.csv: Per-class detailed results",
    "- model_evaluation_results.png: Visualization plots",
    "\nAll evaluation metrics have been calculated and saved.",
)
for closing_line in closing_lines:
    print(closing_line)