# 🏆 Model Comparison: GRU vs BiLSTM

This notebook compares the performance of GRU and BiLSTM models on the test dataset.


## 📚 Import Libraries


In [None]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
)
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
warnings.filterwarnings('ignore')


## 📂 Load Test Data

This notebook does not preprocess the data itself: it loads the test split that `preprocessing.ipynb` writes when run with `split='test'`, so run that notebook first.


In [None]:
# Read the already-preprocessed test split from disk.
# Prerequisite: preprocessing.ipynb must have been run with split='test',
# which is what produces the pickle read here.
test_df = pd.read_pickle('./data/test_preprocessed.pkl')

# Sanity-check what was loaded: dimensions, column names, then a sample of rows.
print("Test data shape: {}".format(test_df.shape))
print("Columns: {}".format(test_df.columns.tolist()))
print("\nFirst few rows:", test_df.head(), sep="\n")


## 📊 Prepare Data

Split the data into features (X) and labels (y).

In [None]:
# Prepare X and y
X_test = test_df['Text']
y_test = test_df['Label']

# Human-readable class names, indexed by integer label id.
# The confusion-matrix, classification-report and per-class cells further down
# all read `emotion_labels`; without this definition a fresh
# "Restart Kernel -> Run All" stops with a NameError at the first of them.
# NOTE(review): the names and their order are ASSUMED (the common six-emotion
# encoding 0=sadness ... 5=surprise). Confirm against preprocessing.ipynb and
# the training notebooks before trusting any per-class labels in the plots.
emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# Fail early, with a readable message, if the data contains a label id that
# has no name; downstream cells index classes as 0 .. len(emotion_labels) - 1.
unexpected_labels = set(y_test.unique()) - set(range(len(emotion_labels)))
assert not unexpected_labels, (
    f"Label ids without a name in emotion_labels: {unexpected_labels}"
)

print(f"Test samples: {len(X_test)}")
print("\nLabel distribution in test set:")
print(y_test.value_counts().sort_index())


## 🔄 Load Models and Tokenizers


In [None]:
def _load_model_assets(display_name, asset_dir, file_prefix):
    """Load one trained model together with its tokenizer and metadata.

    Args:
        display_name: Name used in the status messages (e.g. "GRU").
        asset_dir: Directory that holds the three artifact files.
        file_prefix: Shared filename prefix; the files read are
            ``<prefix>_model.keras``, ``<prefix>_tokenizer.pkl`` and
            ``<prefix>_metadata.pkl``.

    Returns:
        Tuple ``(model, tokenizer, metadata)``. ``metadata`` must support
        ``metadata['val_accuracy']``, which is printed as part of the status.
    """
    model = load_model(f'{asset_dir}/{file_prefix}_model.keras')

    # SECURITY: pickle.load can execute arbitrary code stored in the file.
    # Only load artifacts written by your own training notebooks.
    with open(f'{asset_dir}/{file_prefix}_tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)

    with open(f'{asset_dir}/{file_prefix}_metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)

    print(f"✅ {display_name} model loaded successfully")
    print(f"   Training validation accuracy: {metadata['val_accuracy']:.4f}")
    return model, tokenizer, metadata


# Load GRU model and assets
print("Loading GRU model...")
gru_model, gru_tokenizer, gru_metadata = _load_model_assets('GRU', './data/gru', 'gru')

# Load BiLSTM model and assets
print("\nLoading BiLSTM model...")
lstm_model, lstm_tokenizer, lstm_metadata = _load_model_assets('BiLSTM', './data/lstm', 'lstm')


## 🔠 Prepare Test Data for GRU Model


In [None]:
# Turn the raw test texts into integer sequences with the tokenizer that was
# loaded alongside the GRU model, then pad each sequence at its end up to the
# length recorded under 'maxlen' in the GRU metadata.
X_test_gru_sequences = gru_tokenizer.texts_to_sequences(X_test)
X_test_gru_padded = pad_sequences(
    X_test_gru_sequences,
    padding='post',
    maxlen=gru_metadata['maxlen'],
)

print("GRU test data shape:", X_test_gru_padded.shape)


## 🔠 Prepare Test Data for BiLSTM Model


In [None]:
# Same preparation as for any model input, but with the BiLSTM's own tokenizer
# and its own 'maxlen' from the BiLSTM metadata: texts -> integer sequences ->
# sequences padded at the end to a fixed length.
X_test_lstm_sequences = lstm_tokenizer.texts_to_sequences(X_test)
X_test_lstm_padded = pad_sequences(
    X_test_lstm_sequences,
    padding='post',
    maxlen=lstm_metadata['maxlen'],
)

print("BiLSTM test data shape:", X_test_lstm_padded.shape)


## 🎯 Evaluate GRU Model on Test Set


In [None]:
# Score the GRU model on the padded test inputs.
# NOTE(review): unpacking exactly two values assumes the model was compiled
# with a single metric (accuracy) in addition to the loss — confirm against
# the training notebook.
gru_test_loss, gru_test_accuracy = gru_model.evaluate(
    X_test_gru_padded, y_test, verbose=0
)

# Predicted class id per sample = position of the largest value in each output row.
y_pred_gru = np.argmax(gru_model.predict(X_test_gru_padded, verbose=0), axis=1)

print("GRU Model Test Results:")
print("  Test Loss: {:.4f}".format(gru_test_loss))
print("  Test Accuracy: {:.4f}".format(gru_test_accuracy))


## 🎯 Evaluate BiLSTM Model on Test Set


In [None]:
# Score the BiLSTM model on the padded test inputs.
# NOTE(review): unpacking exactly two values assumes the model was compiled
# with a single metric (accuracy) in addition to the loss — confirm against
# the training notebook.
lstm_test_loss, lstm_test_accuracy = lstm_model.evaluate(
    X_test_lstm_padded, y_test, verbose=0
)

# Predicted class id per sample = position of the largest value in each output row.
y_pred_lstm = np.argmax(lstm_model.predict(X_test_lstm_padded, verbose=0), axis=1)

print("BiLSTM Model Test Results:")
print("  Test Loss: {:.4f}".format(lstm_test_loss))
print("  Test Accuracy: {:.4f}".format(lstm_test_accuracy))


## 📊 Compare Model Performance


In [None]:
# One row per model, holding the headline test metrics computed earlier.
comparison_df = pd.DataFrame(
    [
        ('GRU', gru_test_accuracy, gru_test_loss),
        ('BiLSTM', lstm_test_accuracy, lstm_test_loss),
    ],
    columns=['Model', 'Test Accuracy', 'Test Loss'],
)

# Text summary framed by 60-character rules.
summary_rule = "=" * 60
print(
    "\n" + summary_rule,
    "MODEL COMPARISON SUMMARY",
    summary_rule,
    comparison_df.to_string(index=False),
    summary_rule,
    sep="\n",
)

# Two bar charts side by side: accuracy on the left, loss on the right.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))

# Per panel: (axis, metric column, y-label, title, bar colours,
#             fixed y-limits or None, vertical gap between bar top and its label)
panel_specs = [
    (axs[0], 'Test Accuracy', 'Accuracy', 'Test Accuracy Comparison',
     ['#3498db', '#2ecc71'], [0, 1], 0.02),
    (axs[1], 'Test Loss', 'Loss', 'Test Loss Comparison',
     ['#e74c3c', '#f39c12'], None, 0.01),
]

for panel, metric, y_label, heading, bar_colors, y_limits, label_gap in panel_specs:
    panel.bar(comparison_df['Model'], comparison_df[metric],
              color=bar_colors, edgecolor='black', linewidth=2)
    panel.set_ylabel(y_label)
    panel.set_title(heading)
    # Only the accuracy panel pins its y-range; the loss panel autoscales.
    if y_limits is not None:
        panel.set_ylim(y_limits)
    panel.grid(True, alpha=0.3)
    # Print each value just above its bar.
    for bar_index, value in enumerate(comparison_df[metric]):
        panel.text(bar_index, value + label_gap, f'{value:.4f}',
                   ha='center', fontweight='bold')

plt.tight_layout()
plt.show()


## 🎯 Confusion Matrices - Side by Side


In [None]:
# Confusion matrices for both models, drawn side by side.
# `emotion_labels` (class names indexed by integer label id) must already exist
# in the kernel; this cell only reads it.

# Pass the class ids explicitly. Without `labels=`, confusion_matrix only keeps
# classes that occur in y_test or the predictions, so a class that is absent
# from both would shrink the matrix and shift it against the emotion_labels
# tick labels.
class_ids = list(range(len(emotion_labels)))

cm_gru = confusion_matrix(y_test, y_pred_gru, labels=class_ids)
cm_lstm = confusion_matrix(y_test, y_pred_lstm, labels=class_ids)

# Plot confusion matrices side by side
fig, axs = plt.subplots(1, 2, figsize=(18, 7))

# Per panel: (axis, matrix, colour map, model name for the title, test accuracy)
heatmap_specs = [
    (axs[0], cm_gru, 'Blues', 'GRU', gru_test_accuracy),
    (axs[1], cm_lstm, 'Greens', 'BiLSTM', lstm_test_accuracy),
]

for panel, matrix, color_map, model_name, accuracy in heatmap_specs:
    sns.heatmap(matrix, annot=True, fmt='d', cmap=color_map, ax=panel,
                xticklabels=emotion_labels, yticklabels=emotion_labels)
    panel.set_xlabel('Predicted labels')
    panel.set_ylabel('True labels')
    panel.set_title(f'{model_name} Model - Confusion Matrix\nAccuracy: {accuracy:.4f}')

plt.tight_layout()
plt.show()


## 📝 Classification Reports


In [None]:
# Per-class precision / recall / F1 report for each model.
# `emotion_labels` (class names indexed by integer label id) must already exist
# in the kernel; this cell only reads it.

# Give classification_report the class ids that the names belong to. With
# `target_names` alone it raises ValueError whenever the number of classes
# observed in y_test / predictions differs from len(emotion_labels).
class_ids = list(range(len(emotion_labels)))

for model_name, predictions in [('GRU', y_pred_gru), ('BiLSTM', y_pred_lstm)]:
    print("\n" + "="*60)
    print(f"{model_name} MODEL - CLASSIFICATION REPORT")
    print("="*60)
    print(classification_report(y_test, predictions,
                                labels=class_ids, target_names=emotion_labels))


## 📊 Per-Class Performance Comparison


In [None]:
# Per-class precision, recall and F1 for both models, as a table and a grouped
# bar chart. `precision_recall_fscore_support` comes from the notebook's import
# cell; `emotion_labels` (class names indexed by label id) must already exist.

# Derive the class ids from emotion_labels instead of hard-coding range(6):
# the table and the x-axis below are sized by emotion_labels, so the metric
# arrays must have exactly that many entries or DataFrame construction fails.
class_ids = list(range(len(emotion_labels)))

# average=None returns one value per class, in `labels` order.
gru_precision, gru_recall, gru_f1, _ = precision_recall_fscore_support(
    y_test, y_pred_gru, average=None, labels=class_ids
)

lstm_precision, lstm_recall, lstm_f1, _ = precision_recall_fscore_support(
    y_test, y_pred_lstm, average=None, labels=class_ids
)

# Create comparison dataframe
emotion_comparison = pd.DataFrame({
    'Emotion': emotion_labels,
    'GRU Precision': gru_precision,
    'LSTM Precision': lstm_precision,
    'GRU Recall': gru_recall,
    'LSTM Recall': lstm_recall,
    'GRU F1': gru_f1,
    'LSTM F1': lstm_f1
})

print("\nPer-Class Performance Comparison:")
print(emotion_comparison.to_string(index=False))

# Visualize F1-scores comparison
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(emotion_labels))
width = 0.35  # bar width; the two models sit half a width either side of each tick

bars1 = ax.bar(x - width/2, gru_f1, width, label='GRU', color='#3498db', edgecolor='black')
bars2 = ax.bar(x + width/2, lstm_f1, width, label='BiLSTM', color='#2ecc71', edgecolor='black')

ax.set_xlabel('Emotion')
ax.set_ylabel('F1-Score')
ax.set_title('F1-Score Comparison by Emotion Class')
ax.set_xticks(x)
ax.set_xticklabels(emotion_labels)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.3f}', ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()


## 🏆 Final Verdict


In [None]:
# Pick the model with the higher test accuracy; anything that is neither
# strictly greater nor strictly smaller is reported as a tie.
if gru_test_accuracy > lstm_test_accuracy:
    winner, winner_acc = "GRU", gru_test_accuracy
    diff = gru_test_accuracy - lstm_test_accuracy
elif lstm_test_accuracy > gru_test_accuracy:
    winner, winner_acc = "BiLSTM", lstm_test_accuracy
    diff = lstm_test_accuracy - gru_test_accuracy
else:
    winner, winner_acc, diff = "TIE", gru_test_accuracy, 0

# The emoji in the two headings below were stored as mis-decoded bytes
# (mojibake); they are written here as the intended characters.
print("\n" + "="*60)
print("🏆 FINAL VERDICT")
print("="*60)

if winner == "TIE":
    print("Both models performed equally well!")
else:
    print(f"Winner: {winner} Model")
    print(f"  Accuracy: {winner_acc:.4f}")
    # diff is an absolute accuracy gap; the bracketed figure is the same gap x100.
    print(f"  Margin: +{diff:.4f} ({diff*100:.2f}%)")

print("\n📊 Summary:")
print(f"  GRU Model:    {gru_test_accuracy:.4f} accuracy")
print(f"  BiLSTM Model: {lstm_test_accuracy:.4f} accuracy")
print("="*60)
