# 🏆 Transformer Models Comparison: BERT vs RoBERTa vs ELECTRA

This notebook compares the performance of three state-of-the-art transformer models on the emotion classification test dataset:
- **BERT** (bert-base-uncased) - 110M parameters
- **RoBERTa** (roberta-base) - 125M parameters  
- **ELECTRA** (electra-base-discriminator) - 110M parameters


## 📚 Import Libraries


In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

# Transformer imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import time


## 📂 Load Test Data

**Note:** Make sure you've run `01_preprocessing.ipynb` with `split='test'` to generate `test_preprocessed.pkl`.


In [None]:
# Load preprocessed test data
test_df = pd.read_pickle('./data/test_preprocessed.pkl')

print(f"Test data shape: {test_df.shape}")
print(f"Columns: {test_df.columns.tolist()}")
print(f"\nFirst few rows:")
print(test_df.head())

# Prepare data
X_test = test_df['Text']
y_test = test_df['Label']

# Emotion labels
emotion_labels = ['Sadness', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']

print(f"\nTest samples: {len(X_test)}")
print(f"\nLabel distribution in test set:")
print(y_test.value_counts().sort_index())


## 🔄 Load Transformer Models


In [None]:
# Model configurations
models_config = {
    'BERT': './data/bert/final_model',
    'RoBERTa': './data/roberta/final_model',
    'ELECTRA': './data/electra/final_model'
}

# Load all models and tokenizers
models = {}
tokenizers = {}
metadata = {}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}\n")

for model_name, model_path in models_config.items():
    print(f"Loading {model_name} model...")
    
    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.to(device)
    model.eval()
    models[model_name] = model
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizers[model_name] = tokenizer
    
    # Load metadata
    metadata_path = f'./data/{model_name.lower()}/{model_name.lower()}_metadata.pkl'
    with open(metadata_path, 'rb') as f:
        metadata[model_name] = pickle.load(f)
    
    print(f"   ‚úÖ {model_name} loaded successfully")
    print(f"      Training validation accuracy: {metadata[model_name]['val_accuracy']:.4f}")
    print(f"      Parameters: {metadata[model_name]['num_parameters']:,}")
    print(f"      Size: {metadata[model_name]['model_size_mb']:.2f} MB\n")


## 🎯 Evaluate Models on Test Set


In [None]:
# Function to evaluate a model
def evaluate_model(model, tokenizer, texts, labels, model_name, max_length=128, batch_size=32):
    """Evaluate a transformer model on test data"""
    
    print(f"Evaluating {model_name}...")
    start_time = time.time()
    
    # Tokenize data
    encodings = tokenizer(
        texts.tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    
    # Create dataset
    dataset = torch.utils.data.TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask']
    )
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    
    # Get predictions
    all_predictions = []
    model.eval()
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = [b.to(device) for b in batch]
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
    
    inference_time = time.time() - start_time
    
    # Calculate metrics
    y_pred = np.array(all_predictions)
    accuracy = accuracy_score(labels, y_pred)
    
    print(f"   ‚úÖ {model_name} evaluation complete")
    print(f"      Accuracy: {accuracy:.4f}")
    print(f"      Inference time: {inference_time:.2f} seconds\n")
    
    return {
        'predictions': y_pred,
        'accuracy': accuracy,
        'inference_time': inference_time
    }

# Evaluate all models
results = {}

for model_name in models_config.keys():
    results[model_name] = evaluate_model(
        models[model_name],
        tokenizers[model_name],
        X_test,
        y_test,
        model_name
    )


In [None]:
# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Test Accuracy': [results[m]['accuracy'] for m in results.keys()],
    'Inference Time (s)': [results[m]['inference_time'] for m in results.keys()],
    'Parameters (M)': [metadata[m]['num_parameters']/1e6 for m in results.keys()],
    'Model Size (MB)': [metadata[m]['model_size_mb'] for m in results.keys()]
})

print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)
print(comparison_df.to_string(index=False))
print("="*80)

# Visualize comparison
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Accuracy comparison
ax1 = axes[0, 0]
bars1 = ax1.bar(comparison_df['Model'], comparison_df['Test Accuracy'], 
                color=['#3498db', '#2ecc71', '#9b59b6'], edgecolor='black', linewidth=2)
ax1.set_ylabel('Accuracy', fontsize=12)
ax1.set_title('Test Accuracy Comparison', fontsize=14, fontweight='bold')
ax1.set_ylim([0, 1])
ax1.grid(True, alpha=0.3, axis='y')
for i, v in enumerate(comparison_df['Test Accuracy']):
    ax1.text(i, v + 0.02, f'{v:.4f}', ha='center', fontweight='bold')

# Inference time comparison
ax2 = axes[0, 1]
bars2 = ax2.bar(comparison_df['Model'], comparison_df['Inference Time (s)'], 
                color=['#e74c3c', '#f39c12', '#e67e22'], edgecolor='black', linewidth=2)
ax2.set_ylabel('Time (seconds)', fontsize=12)
ax2.set_title('Inference Time Comparison', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')
for i, v in enumerate(comparison_df['Inference Time (s)']):
    ax2.text(i, v + 0.5, f'{v:.2f}s', ha='center', fontweight='bold')

# Model size comparison
ax3 = axes[1, 0]
bars3 = ax3.bar(comparison_df['Model'], comparison_df['Model Size (MB)'], 
                color=['#16a085', '#27ae60', '#2980b9'], edgecolor='black', linewidth=2)
ax3.set_ylabel('Size (MB)', fontsize=12)
ax3.set_title('Model Size Comparison', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3, axis='y')
for i, v in enumerate(comparison_df['Model Size (MB)']):
    ax3.text(i, v + 5, f'{v:.1f} MB', ha='center', fontweight='bold')

# Parameters comparison
ax4 = axes[1, 1]
bars4 = ax4.bar(comparison_df['Model'], comparison_df['Parameters (M)'], 
                color=['#8e44ad', '#c0392b', '#d35400'], edgecolor='black', linewidth=2)
ax4.set_ylabel('Parameters (millions)', fontsize=12)
ax4.set_title('Model Parameters Comparison', fontsize=14, fontweight='bold')
ax4.grid(True, alpha=0.3, axis='y')
for i, v in enumerate(comparison_df['Parameters (M)']):
    ax4.text(i, v + 2, f'{v:.1f}M', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()


## 🎯 Confusion Matrices - Side by Side


In [None]:
# Create confusion matrices for all models
fig, axes = plt.subplots(1, 3, figsize=(24, 7))

colors = ['Blues', 'Greens', 'Purples']

for idx, (model_name, color) in enumerate(zip(results.keys(), colors)):
    cm = confusion_matrix(y_test, results[model_name]['predictions'])
    
    sns.heatmap(cm, annot=True, fmt='d', cmap=color, ax=axes[idx],
                xticklabels=emotion_labels, yticklabels=emotion_labels)
    axes[idx].set_xlabel('Predicted Label', fontsize=11)
    axes[idx].set_ylabel('True Label', fontsize=11)
    axes[idx].set_title(f'{model_name} Model - Confusion Matrix\nAccuracy: {results[model_name]["accuracy"]:.4f}', 
                       fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()


## 📝 Classification Reports


In [None]:
# Print classification reports for all models
for model_name in results.keys():
    print("\n" + "="*60)
    print(f"{model_name} MODEL - CLASSIFICATION REPORT")
    print("="*60)
    print(classification_report(y_test, results[model_name]['predictions'], 
                               target_names=emotion_labels))


## 📊 Per-Class Performance Comparison


In [None]:
# Get per-class metrics for all models
metrics_by_model = {}

for model_name in results.keys():
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, 
        results[model_name]['predictions'], 
        average=None, 
        labels=list(range(6))
    )
    metrics_by_model[model_name] = {'precision': precision, 'recall': recall, 'f1': f1}

# Create comparison dataframe
emotion_comparison = pd.DataFrame({
    'Emotion': emotion_labels,
    'BERT F1': metrics_by_model['BERT']['f1'],
    'RoBERTa F1': metrics_by_model['RoBERTa']['f1'],
    'ELECTRA F1': metrics_by_model['ELECTRA']['f1']
})

print("\nPer-Class F1-Score Comparison:")
print(emotion_comparison.to_string(index=False))

# Visualize F1-scores comparison
fig, ax = plt.subplots(figsize=(14, 7))

x = np.arange(len(emotion_labels))
width = 0.25

bars1 = ax.bar(x - width, metrics_by_model['BERT']['f1'], width, 
               label='BERT', color='#3498db', edgecolor='black')
bars2 = ax.bar(x, metrics_by_model['RoBERTa']['f1'], width, 
               label='RoBERTa', color='#2ecc71', edgecolor='black')
bars3 = ax.bar(x + width, metrics_by_model['ELECTRA']['f1'], width, 
               label='ELECTRA', color='#9b59b6', edgecolor='black')

ax.set_xlabel('Emotion', fontsize=12)
ax.set_ylabel('F1-Score', fontsize=12)
ax.set_title('F1-Score Comparison by Emotion Class', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(emotion_labels)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()


## 🏆 Final Verdict


In [None]:
# Determine best model
best_model = max(results.keys(), key=lambda k: results[k]['accuracy'])
best_accuracy = results[best_model]['accuracy']

print("\n" + "="*80)
print("üèÜ FINAL VERDICT")
print("="*80)
print(f"\nü•á Best Model: {best_model}")
print(f"   Test Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print(f"   Inference Time: {results[best_model]['inference_time']:.2f} seconds")
print(f"   Model Size: {metadata[best_model]['model_size_mb']:.2f} MB")
print(f"   Parameters: {metadata[best_model]['num_parameters']:,}")

print("\nüìä All Models Summary:")
for model_name in results.keys():
    icon = "ü•á" if model_name == best_model else "  "
    print(f"{icon} {model_name:10s}: {results[model_name]['accuracy']:.4f} accuracy | "
          f"{results[model_name]['inference_time']:.2f}s inference | "
          f"{metadata[model_name]['model_size_mb']:.1f} MB")

print("\nüí° Key Insights:")
print("   - All transformer models significantly outperform traditional RNN architectures")
print("   - ELECTRA offers excellent sample efficiency with discriminative pre-training")
print("   - RoBERTa benefits from improved pre-training over BERT")
print("   - Model size and inference time are comparable across all three models")
print("="*80)


## 💾 Save Predictions

Save predictions from the best model to CSV for submission.


In [None]:
# Create predictions DataFrame for best model
predictions_df = pd.DataFrame({
    'text': X_test.values,
    'true_label': y_test.values,
    'predicted_label': results[best_model]['predictions'],
    'true_emotion': [emotion_labels[i] for i in y_test.values],
    'predicted_emotion': [emotion_labels[i] for i in results[best_model]['predictions']]
})

# Save to CSV
output_path = f'./data/test_predictions_{best_model.lower()}.csv'
predictions_df.to_csv(output_path, index=False)
print(f"\n‚úÖ Predictions saved to: {output_path}")
print(f"   Total predictions: {len(predictions_df)}")
print(f"   Accuracy: {best_accuracy:.4f}")

# Show sample predictions
print(f"\nSample predictions:")
print(predictions_df[['text', 'true_emotion', 'predicted_emotion']].head(10).to_string(index=False))
