# Speech Emotion Recognition - SOTA Model Benchmark
## Using Existing Data Pipeline from Russian-Hacker.ipynb

This notebook benchmarks multiple SOTA models while keeping the **exact same** data loading and preprocessing pipeline.

In [None]:
# =============================================
# CELL 1: Imports and Configuration (SAME AS YOUR NOTEBOOK)
# =============================================
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# Configuration (SAME AS YOUR NOTEBOOK)
# Dataset roots. SUBESCO_PATH and BANSPEMO_PATH are walked recursively by
# load_data(); BANGLA_SER_PATH is listed one level deep and every
# sub-directory is treated as an actor folder.
# NOTE(review): SUBESCO_PATH and BANGLA_SER_PATH both point at the drive root,
# so every .wav under H:/ is considered for both datasets and only the
# filename parsers decide which files are kept - confirm this is intended.
SUBESCO_PATH = 'H:/'
BANGLA_SER_PATH = 'H:/'
BANSPEMO_PATH = 'H:/A Bangla Language Emotional Speech Recognition Dataset/A Bangla Language Emotional Speech Recognition Dataset/Dataset'

# Full emotion label set; not referenced elsewhere in the visible code
# (the label encoder derives classes from the loaded data instead).
EMOTIONS = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
SAMPLE_RATE = 16000  # passed to librosa.load as the target sampling rate
DURATION = 3         # passed to librosa.load as the maximum clip duration
N_MFCC = 40          # number of MFCC coefficients per frame (model input width)
BATCH_SIZE = 32      # batch size of the MFCC data loaders (raw-audio loaders use 8)
LEARNING_RATE = 0.001  # Adam learning rate for the MFCC models
EPOCHS = 50          # training epochs for the MFCC models

# Emotion mappings (SAME AS YOUR NOTEBOOK)
# SUBESCO: upper-case emotion token from the filename -> canonical label.
SUBESCO_EMOTIONS = {
    'ANGRY': 'Angry', 'DISGUST': 'Disgust', 'FEAR': 'Fear',
    'HAPPY': 'Happy', 'NEUTRAL': 'Neutral', 'SAD': 'Sad', 'SURPRISE': 'Surprise'
}

# BANGLA_SER: two-digit code from the filename -> canonical label.
# This mapping has no 'Disgust' or 'Fear' entry.
BANGLA_SER_EMOTIONS = {
    '01': 'Happy', '02': 'Sad', '03': 'Angry', '04': 'Surprise', '05': 'Neutral'
}

# BANSPEMO: two-digit code from the filename -> canonical label.
# This mapping has no 'Neutral' entry.
BANSPEMO_EMOTIONS = {
    '01': 'Angry', '02': 'Disgust', '03': 'Fear',
    '04': 'Happy', '05': 'Sad', '06': 'Surprise'
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [None]:
# =============================================
# CELL 2: Feature Extraction Functions (SAME AS YOUR NOTEBOOK)
# =============================================
def extract_features(file_path):
    """Load an audio file and return its MFCC matrix, transposed.

    The clip is loaded at ``SAMPLE_RATE`` and limited to ``DURATION`` seconds,
    then ``N_MFCC`` coefficients are computed. The librosa result is
    transposed before being returned.

    Returns ``None`` (after printing the error) if loading or feature
    extraction fails, so one bad file does not abort the whole dataset scan.
    """
    try:
        waveform, rate = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
        coefficients = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=N_MFCC)
    except Exception as err:
        print(f'Error processing {file_path}: {err}')
        return None
    return coefficients.T

def extract_raw_audio(file_path):
    """Load a waveform of exactly ``SAMPLE_RATE * DURATION`` samples.

    Clips shorter than the target are zero-padded at the end; longer clips
    are truncated. Returns ``None`` (after printing the error) when the file
    cannot be processed.
    """
    try:
        waveform, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
        target_length = SAMPLE_RATE * DURATION
        missing = target_length - len(waveform)
        if missing > 0:
            # Too short: append zeros so every clip has the same length.
            return np.pad(waveform, (0, missing), mode='constant')
        return waveform[:target_length]
    except Exception as err:
        print(f'Error processing {file_path}: {err}')
        return None

def get_subesco_emotion(filename):
    """Return the emotion label for a SUBESCO filename, or ``None``.

    The sixth ``_``-separated token of the filename is looked up in
    ``SUBESCO_EMOTIONS``; names with fewer tokens or an unknown token give
    ``None``.
    """
    tokens = filename.split('_')
    if len(tokens) <= 5:
        return None
    return SUBESCO_EMOTIONS.get(tokens[5], None)

def get_bangla_ser_emotion(filename):
    """Return the emotion label for a BANGLA_SER filename, or ``None``.

    The third ``-``-separated token of the filename is looked up in
    ``BANGLA_SER_EMOTIONS``; names with fewer tokens or an unknown code give
    ``None``.
    """
    tokens = filename.split('-')
    if len(tokens) < 3:
        return None
    return BANGLA_SER_EMOTIONS.get(tokens[2], None)

def get_banspemo_emotion(filename):
    """Return the emotion label for a BANSPEMO filename, or ``None``.

    The emotion code is the last ``_``-separated token with everything from
    its first ``.`` onwards removed (e.g. the extension); it is looked up in
    ``BANSPEMO_EMOTIONS``. Unknown codes give ``None``.
    """
    # str.split always returns at least one element, so no length guard is
    # needed before taking the last token.
    last_token = filename.split('_')[-1]
    code = last_token.split('.')[0]
    return BANSPEMO_EMOTIONS.get(code)

In [None]:
# =============================================
# CELL 3: Data Loading (SAME AS YOUR NOTEBOOK)
# =============================================
def _iter_walked_wavs(root_dir):
    """Yield ``(directory, filename)`` for every ``.wav`` file found by walking ``root_dir``."""
    for root, _, files in os.walk(root_dir):
        for filename in files:
            if filename.endswith('.wav'):
                yield root, filename


def _iter_actor_wavs(base_dir):
    """Yield ``(directory, filename)`` for ``.wav`` files exactly one folder below ``base_dir``."""
    for actor_folder in os.listdir(base_dir):
        actor_path = os.path.join(base_dir, actor_folder)
        if not os.path.isdir(actor_path):
            continue
        for filename in os.listdir(actor_path):
            if filename.endswith('.wav'):
                yield actor_path, filename


def _append_sample(filepath, emotion, extract_raw, features, raw_audios, labels):
    """Extract one file and append it to the accumulators, keeping them aligned."""
    mfcc = extract_features(filepath)
    if mfcc is None:
        return
    if extract_raw:
        raw = extract_raw_audio(filepath)
        if raw is None:
            # Skip the whole sample: appending None would make raw_audios a
            # ragged object array and break tensor conversion later on.
            return
        raw_audios.append(raw)
    features.append(mfcc)
    labels.append(emotion)


def load_data(extract_raw=False):
    """Load MFCC features (and optionally raw waveforms) from the three datasets.

    Files are processed in the order SUBESCO, BANSPEMO, BANGLA_SER. A file is
    kept only if its filename maps to an emotion and feature extraction
    succeeds; with ``extract_raw=True`` the raw waveform must load as well,
    otherwise the sample is dropped so all outputs stay index-aligned.

    Args:
        extract_raw: when true, also return the fixed-length raw waveforms.

    Returns:
        ``(features, labels, le)`` or, with ``extract_raw=True``,
        ``(features, raw_audios, labels, le)``. ``features`` is an array of
        MFCC matrices zero-padded along the first axis to the longest
        sample, ``labels`` are integer-encoded emotions and ``le`` is the
        fitted ``LabelEncoder``.

    Raises:
        ValueError: if no sample could be loaded from any dataset.
    """
    features, raw_audios, labels = [], [], []

    # The iterators are lazy, so each directory is only touched after its
    # "Processing ..." banner has been printed.
    sources = (
        ('SUBESCO', _iter_walked_wavs(SUBESCO_PATH), get_subesco_emotion),
        ('BANSPEMO', _iter_walked_wavs(BANSPEMO_PATH), get_banspemo_emotion),
        ('BANGLA_SER', _iter_actor_wavs(BANGLA_SER_PATH), get_bangla_ser_emotion),
    )
    for dataset_name, wav_files, emotion_of in sources:
        print(f'Processing {dataset_name} dataset...')
        for directory, filename in wav_files:
            emotion = emotion_of(filename)
            if emotion:
                _append_sample(os.path.join(directory, filename), emotion,
                               extract_raw, features, raw_audios, labels)

    if not features:
        raise ValueError('No audio samples were loaded; check the dataset paths '
                         'and filename conventions.')

    # Pad sequences (SAME as your notebook)
    max_len = max(len(f) for f in features)
    features = np.array([np.pad(f, ((0, max_len - len(f)), (0, 0)), mode='constant') for f in features])

    # Encode labels (SAME as your notebook)
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    if extract_raw:
        raw_audios = np.array(raw_audios)
        return features, raw_audios, labels, le
    return features, labels, le

In [None]:
# =============================================
# CELL 4: Dataset Classes
# =============================================
class EmotionDataset(Dataset):
    """Dataset pairing pre-computed feature arrays with integer labels."""

    def __init__(self, features, labels):
        # Kept as given; conversion to tensors happens lazily per item.
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(feature, label)`` as a float32 tensor and an int64 tensor."""
        return (
            torch.tensor(self.features[idx], dtype=torch.float32),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

class RawAudioDataset(Dataset):
    """Dataset pairing raw waveforms with integer labels (Wav2Vec2/HuBERT input)."""

    def __init__(self, raw_audios, labels):
        # Kept as given; conversion to tensors happens lazily per item.
        self.raw_audios = raw_audios
        self.labels = labels

    def __len__(self):
        return len(self.raw_audios)

    def __getitem__(self, idx):
        """Return ``(audio, label)`` as a float32 tensor and an int64 tensor."""
        return (
            torch.tensor(self.raw_audios[idx], dtype=torch.float32),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

In [None]:
# =============================================
# CELL 5: MODEL 1 - Your Original CNN-BiLSTM (SAME AS YOUR NOTEBOOK)
# =============================================
class EmotionModel(nn.Module):
    """Original baseline: two Conv1d stages followed by two stacked BiLSTMs.

    Args:
        input_size: number of input feature channels per frame.
        hidden_size: hidden units per LSTM direction.
        num_layers: layers inside each of the two LSTM modules.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        # Convolutional stage 1
        self.conv1 = nn.Conv1d(input_size, 64, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool1 = nn.MaxPool1d(2)
        self.dropout1 = nn.Dropout(0.3)

        # Convolutional stage 2
        self.conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(2)
        self.dropout2 = nn.Dropout(0.3)

        # Recurrent stack; the second LSTM consumes both directions of the first.
        self.lstm1 = nn.LSTM(128, hidden_size, num_layers, batch_first=True, dropout=0.3, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_size*2, hidden_size, num_layers, batch_first=True, dropout=0.3, bidirectional=True)

        # Classification head
        self.fc1 = nn.Linear(hidden_size*2, 64)
        self.fc2 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()
        self.dropout3 = nn.Dropout(0.3)

    def forward(self, x):
        """Map ``(batch, time, input_size)`` features to ``(batch, num_classes)`` logits."""
        # Conv1d expects channels before time.
        x = x.permute(0, 2, 1)
        x = self.dropout1(self.pool1(self.relu(self.bn1(self.conv1(x)))))
        x = self.dropout2(self.pool2(self.relu(self.bn2(self.conv2(x)))))

        # Back to (batch, time, channels) for the batch_first LSTMs.
        sequence = x.permute(0, 2, 1)
        sequence, _ = self.lstm1(sequence)
        sequence, _ = self.lstm2(sequence)
        # NOTE(review): only the final time step is kept; for the backward
        # direction that state has seen just the last frame - confirm this
        # is the intended summary of the sequence.
        summary = sequence[:, -1, :]

        hidden = self.dropout3(self.relu(self.fc1(summary)))
        return self.fc2(hidden)

In [None]:
# =============================================
# CELL 6: MODEL 2 - CNN-Transformer Hybrid (FROM YOUR NOTEBOOK)
# =============================================
class CNNBackbones(nn.Module):
    """Two Conv1d blocks that turn ``N_MFCC`` channels into 128 channels x 31 steps."""

    def __init__(self):
        super().__init__()
        self.block1 = nn.Sequential(
            nn.Conv1d(N_MFCC, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
        )
        self.pool1 = nn.MaxPool1d(2)
        self.block2 = nn.Sequential(
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        # Adaptive pooling fixes the output length at 31 regardless of input length.
        self.pool2 = nn.AdaptiveMaxPool1d(31)

    def forward(self, x):
        """Apply block1 -> pool1 -> block2 -> pool2 to a ``(batch, channels, time)`` input."""
        halved = self.pool1(self.block1(x))
        return self.pool2(self.block2(halved))

class CNNTransformerHybrid(nn.Module):
    """CNN backbone whose output is concatenated with its Transformer encoding.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes=7):
        super().__init__()
        self.cnn = CNNBackbones()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=128, nhead=4, dim_feedforward=512, dropout=0.1, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
        # 31 time steps x (128 CNN + 128 Transformer) channels, flattened.
        self.fc = nn.Sequential(
            nn.Linear(31 * 256, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Map ``(batch, time, features)`` input to ``(batch, num_classes)`` logits."""
        cnn_out = self.cnn(x.permute(0, 2, 1))        # (batch, channels, time)
        cnn_sequence = cnn_out.permute(0, 2, 1)        # (batch, time, channels)
        encoded = self.transformer(cnn_sequence)
        # Keep the raw CNN features next to their contextualised version.
        fused = torch.cat([cnn_sequence, encoded], dim=-1)
        flat = fused.reshape(fused.size(0), -1)
        return self.fc(flat)

In [None]:
# =============================================
# CELL 7: MODEL 3 - ECAPA-TDNN (SOTA for Speaker/Emotion Recognition)
# =============================================
class SEBlock(nn.Module):
    """Squeeze-and-Excitation: rescale each channel by a learned gate in (0, 1).

    Args:
        channels: number of input/output channels.
        reduction: bottleneck ratio of the gating MLP.
    """

    def __init__(self, channels, reduction=8):
        super().__init__()
        bottleneck = channels // reduction
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, bottleneck, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channels, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Scale a ``(batch, channels, time)`` tensor channel-wise."""
        batch, channels, _ = x.size()
        # Squeeze: one average per channel.
        squeezed = self.avg_pool(x).view(batch, channels)
        # Excite: per-channel gate, broadcast back over the time axis.
        gates = self.fc(squeezed).view(batch, channels, 1)
        return x * gates.expand_as(x)

class Res2Block(nn.Module):
    """Res2Net-style residual block with SE gating, as used by ECAPA-TDNN.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels of the output tensor.
        kernel_size: kernel size of the per-group convolutions.
        dilation: dilation of the per-group convolutions.
        scale: number of channel groups the features are split into.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1, scale=8):
        super().__init__()
        self.scale = scale
        width = out_channels // scale

        self.conv1 = nn.Conv1d(in_channels, out_channels, 1)
        self.bn1 = nn.BatchNorm1d(out_channels)

        # The first group is passed through untouched, hence scale - 1 convs.
        self.convs = nn.ModuleList([
            nn.Conv1d(width, width, kernel_size, dilation=dilation, padding=dilation*(kernel_size-1)//2)
            for _ in range(scale - 1)
        ])
        self.bns = nn.ModuleList([nn.BatchNorm1d(width) for _ in range(scale - 1)])

        self.conv3 = nn.Conv1d(out_channels, out_channels, 1)
        self.bn3 = nn.BatchNorm1d(out_channels)
        self.se = SEBlock(out_channels)
        self.relu = nn.ReLU(inplace=True)

        # Identity shortcut unless the channel count changes.
        if in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv1d(in_channels, out_channels, 1),
                nn.BatchNorm1d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Apply the block to a ``(batch, in_channels, time)`` tensor."""
        residual = self.shortcut(x)

        out = self.relu(self.bn1(self.conv1(x)))

        # Hierarchical processing: each group also receives the previous
        # group's output before its own convolution.
        groups = torch.split(out, out.size(1) // self.scale, dim=1)
        processed = [groups[0]]
        running = None
        for idx, (conv, bn) in enumerate(zip(self.convs, self.bns), start=1):
            running = groups[idx] if running is None else running + groups[idx]
            running = self.relu(bn(conv(running)))
            processed.append(running)
        out = torch.cat(processed, dim=1)

        out = self.se(self.bn3(self.conv3(out)))
        return self.relu(out + residual)

class ECAPA_TDNN(nn.Module):
    """ECAPA-TDNN style classifier with attentive statistics pooling.

    Args:
        input_size: feature channels per frame.
        num_classes: number of output logits.
        channels: width of the convolutional trunk.
    """

    def __init__(self, input_size=40, num_classes=7, channels=512):
        super().__init__()
        aggregated = channels * 3  # the three Res2Block outputs are concatenated

        self.conv1 = nn.Conv1d(input_size, channels, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(channels)
        self.relu = nn.ReLU(inplace=True)

        self.layer1 = Res2Block(channels, channels, kernel_size=3, dilation=2)
        self.layer2 = Res2Block(channels, channels, kernel_size=3, dilation=3)
        self.layer3 = Res2Block(channels, channels, kernel_size=3, dilation=4)

        self.conv2 = nn.Conv1d(aggregated, aggregated, kernel_size=1)

        # Produces per-channel weights normalised over the time axis.
        self.attention = nn.Sequential(
            nn.Conv1d(aggregated, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Tanh(),
            nn.Conv1d(128, aggregated, kernel_size=1),
            nn.Softmax(dim=2),
        )

        # Mean and std are concatenated, doubling the channel count.
        self.bn2 = nn.BatchNorm1d(channels * 6)
        self.fc = nn.Linear(channels * 6, num_classes)

    def forward(self, x):
        """Map ``(batch, time, input_size)`` features to ``(batch, num_classes)`` logits."""
        x = x.permute(0, 2, 1)  # (batch, features, time)
        x = self.relu(self.bn1(self.conv1(x)))

        # Run the blocks in sequence and keep every intermediate output.
        stage_outputs = []
        for layer in (self.layer1, self.layer2, self.layer3):
            x = layer(x)
            stage_outputs.append(x)
        frames = self.relu(self.conv2(torch.cat(stage_outputs, dim=1)))

        # Attentive Statistics Pooling
        alpha = self.attention(frames)
        mean = torch.sum(alpha * frames, dim=2)
        second_moment = torch.sum(alpha * frames ** 2, dim=2)
        # Clamp before sqrt so rounding cannot produce a negative variance.
        std = torch.sqrt((second_moment - mean ** 2).clamp(min=1e-9))
        pooled = torch.cat([mean, std], dim=1)

        return self.fc(self.bn2(pooled))

In [None]:
# =============================================
# CELL 8: MODEL 4 & 5 - Wav2Vec2 and HuBERT (Pretrained SOTA)
# =============================================
try:
    from transformers import Wav2Vec2Model, HubertModel, Wav2Vec2Config, HubertConfig
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    print('transformers not installed. Run: pip install transformers')
    TRANSFORMERS_AVAILABLE = False

class Wav2Vec2ForSER(nn.Module):
    """Pretrained Wav2Vec2 encoder with a mean-pooled MLP classification head.

    Args:
        num_classes: number of output logits.
        pretrained: checkpoint name passed to ``Wav2Vec2Model.from_pretrained``.
    """

    def __init__(self, num_classes=7, pretrained='facebook/wav2vec2-base'):
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained)
        # Keep the convolutional feature encoder fixed during fine-tuning.
        self.wav2vec2.freeze_feature_encoder()

        encoder_width = self.wav2vec2.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(encoder_width, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Classify a batch of raw waveforms.

        NOTE(review): the waveform is passed to the encoder as-is; confirm
        whether the checkpoint expects normalised input.
        """
        encoder_output = self.wav2vec2(x)
        # Average the frame-level states over the time axis.
        utterance_embedding = encoder_output.last_hidden_state.mean(dim=1)
        return self.classifier(utterance_embedding)

class HuBERTForSER(nn.Module):
    """Pretrained HuBERT encoder with a mean-pooled MLP classification head.

    Args:
        num_classes: number of output logits.
        pretrained: checkpoint name passed to ``HubertModel.from_pretrained``.
    """

    def __init__(self, num_classes=7, pretrained='facebook/hubert-base-ls960'):
        super().__init__()
        self.hubert = HubertModel.from_pretrained(pretrained)
        # Keep the convolutional feature encoder fixed during fine-tuning.
        self.hubert.freeze_feature_encoder()

        encoder_width = self.hubert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(encoder_width, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Classify a batch of raw waveforms."""
        encoder_output = self.hubert(x)
        # Average the frame-level states over the time axis.
        utterance_embedding = encoder_output.last_hidden_state.mean(dim=1)
        return self.classifier(utterance_embedding)

In [None]:
# =============================================
# CELL 9: Training Function (SAME LOGIC AS YOUR NOTEBOOK)
# =============================================
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, model_name):
    """Train ``model`` and checkpoint the weights with the best validation accuracy.

    Args:
        model: network to optimise; expected to already live on ``device``.
        train_loader: iterable of ``(features, labels)`` training batches.
        val_loader: iterable of ``(features, labels)`` validation batches.
        criterion: loss function called as ``criterion(outputs, labels)``.
        optimizer: optimiser bound to ``model``'s parameters.
        epochs: number of passes over ``train_loader``.
        model_name: prefix of the checkpoint file ``{model_name}_best.pth``.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs)``: per-epoch lists.
        Losses are the mean of the per-batch losses; accuracies are percentages.
    """
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; with 0 and a strict '>' a model stuck at 0% would never be
    # saved and the caller's torch.load() would fail.
    best_val_acc = -1.0

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        correct, total = 0, 0

        for features, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=False):
            features, labels = features.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            # Same prediction rule as evaluate_model (no legacy .data access).
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        train_loss /= len(train_loader)
        train_acc = 100 * correct / total
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct, total = 0, 0

        with torch.no_grad():
            for features, labels in val_loader:
                features, labels = features.to(device), labels.to(device)
                outputs = model(features)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(val_loader)
        val_acc = 100 * correct / total
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        # Keep only the weights of the best epoch seen so far.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), f'{model_name}_best.pth')

        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')

    return train_losses, val_losses, train_accs, val_accs

In [None]:
# =============================================
# CELL 10: Evaluation Function
# =============================================
def evaluate_model(model, test_loader, le, model_name):
    """Evaluate ``model`` on ``test_loader`` and print/return summary metrics.

    Args:
        model: trained network, expected to already live on ``device``.
        test_loader: iterable of ``(features, labels)`` batches.
        le: fitted ``LabelEncoder``; ``le.classes_`` names the classes in the report.
        model_name: label used in the printed header and the returned dict.

    Returns:
        Dict with keys ``model``, ``accuracy``, ``f1_score``, ``precision``,
        ``recall`` (weighted averages, in percent), ``inference_time_ms``
        (mean of the per-batch ms-per-sample values), ``y_true`` and ``y_pred``.
    """
    model.eval()
    y_true, y_pred = [], []
    inference_times = []

    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)

            # CUDA kernels run asynchronously: synchronise before and after
            # the forward pass so the timer covers the computation itself
            # rather than just the kernel launches.
            if features.is_cuda:
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            outputs = model(features)
            if features.is_cuda:
                torch.cuda.synchronize()
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            inference_times.append(elapsed_ms / features.size(0))  # ms per sample

            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred) * 100
    f1 = f1_score(y_true, y_pred, average='weighted') * 100
    precision = precision_score(y_true, y_pred, average='weighted') * 100
    recall = recall_score(y_true, y_pred, average='weighted') * 100
    avg_inference_time = np.mean(inference_times)

    print(f'\n=== {model_name} Results ===')
    print(f'Accuracy: {accuracy:.2f}%')
    print(f'F1-Score: {f1:.2f}%')
    print(f'Precision: {precision:.2f}%')
    print(f'Recall: {recall:.2f}%')
    print(f'Avg Inference Time: {avg_inference_time:.2f} ms/sample')
    print('\nClassification Report:')
    # Pass the full label set explicitly so the report still lines up with
    # target_names when a class is absent from both y_true and y_pred.
    print(classification_report(y_true, y_pred,
                                labels=np.arange(len(le.classes_)),
                                target_names=le.classes_))

    return {
        'model': model_name,
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall,
        'inference_time_ms': avg_inference_time,
        'y_true': y_true,
        'y_pred': y_pred
    }

In [None]:
# =============================================
# CELL 11: Load Data and Create Data Loaders (SAME AS YOUR NOTEBOOK)
# =============================================
print('Loading datasets...')

# Load data with raw audio for Wav2Vec2/HuBERT
features, raw_audios, labels, le = load_data(extract_raw=True)
print(f'Datasets loaded: {features.shape[0]} samples')

# Split data - SAME as your notebook (80/10/10).
# MFCC features and raw audio go through the SAME train_test_split call, so
# both views of a sample are guaranteed to land in the same partition (no
# reliance on two separate calls happening to draw identical indices).
X_train, X_temp, raw_train, raw_temp, y_train, y_temp = train_test_split(
    features, raw_audios, labels, test_size=0.2, random_state=42, stratify=labels
)
X_val, X_test, raw_val, raw_test, y_val, y_test = train_test_split(
    X_temp, raw_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f'Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}')

# Create DataLoaders for MFCC models
train_dataset = EmotionDataset(X_train, y_train)
val_dataset = EmotionDataset(X_val, y_val)
test_dataset = EmotionDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Create DataLoaders for raw audio models
raw_train_dataset = RawAudioDataset(raw_train, y_train)
raw_val_dataset = RawAudioDataset(raw_val, y_val)
raw_test_dataset = RawAudioDataset(raw_test, y_test)

raw_train_loader = DataLoader(raw_train_dataset, batch_size=8, shuffle=True)  # Smaller batch for large models
raw_val_loader = DataLoader(raw_val_dataset, batch_size=8, shuffle=False)
raw_test_loader = DataLoader(raw_test_dataset, batch_size=8, shuffle=False)

# Hyper-parameters shared by the model-construction cells below.
num_classes = len(le.classes_)
input_size = N_MFCC
hidden_size = 128
num_layers = 2

In [None]:
# =============================================
# CELL 12: Train and Evaluate All Models
# =============================================
# Collected metric dicts, one per benchmarked model (filled by the cells below).
results = []

# Model 1: Your Original CNN-BiLSTM
banner = '='*60
print('\n' + banner)
print('Training Model 1: CNN-BiLSTM (Your Original)')
print(banner)

# The loss is shared by every model in the benchmark.
criterion = nn.CrossEntropyLoss()
model1 = EmotionModel(input_size, hidden_size, num_layers, num_classes).to(device)
optimizer = optim.Adam(model1.parameters(), lr=LEARNING_RATE)
train_model(model1, train_loader, val_loader, criterion, optimizer, EPOCHS, 'CNN_BiLSTM')

# Evaluate the best-validation checkpoint rather than the last epoch.
model1.load_state_dict(torch.load('CNN_BiLSTM_best.pth'))
model1_metrics = evaluate_model(model1, test_loader, le, 'CNN-BiLSTM')
results.append(model1_metrics)

In [None]:
# Model 2: CNN-Transformer Hybrid
banner = '='*60
print('\n' + banner)
print('Training Model 2: CNN-Transformer Hybrid')
print(banner)

model2 = CNNTransformerHybrid(num_classes).to(device)
optimizer = optim.Adam(model2.parameters(), lr=LEARNING_RATE)
train_model(model2, train_loader, val_loader, criterion, optimizer, EPOCHS, 'CNN_Transformer')

# Evaluate the best-validation checkpoint rather than the last epoch.
model2.load_state_dict(torch.load('CNN_Transformer_best.pth'))
model2_metrics = evaluate_model(model2, test_loader, le, 'CNN-Transformer')
results.append(model2_metrics)

In [None]:
# Model 3: ECAPA-TDNN
banner = '='*60
print('\n' + banner)
print('Training Model 3: ECAPA-TDNN')
print(banner)

model3 = ECAPA_TDNN(input_size=N_MFCC, num_classes=num_classes).to(device)
optimizer = optim.Adam(model3.parameters(), lr=LEARNING_RATE)
train_model(model3, train_loader, val_loader, criterion, optimizer, EPOCHS, 'ECAPA_TDNN')

# Evaluate the best-validation checkpoint rather than the last epoch.
model3.load_state_dict(torch.load('ECAPA_TDNN_best.pth'))
model3_metrics = evaluate_model(model3, test_loader, le, 'ECAPA-TDNN')
results.append(model3_metrics)

In [None]:
# Model 4: Wav2Vec2 (if transformers available)
if TRANSFORMERS_AVAILABLE:
    banner = '='*60
    print('\n' + banner)
    print('Training Model 4: Wav2Vec2')
    print(banner)

    model4 = Wav2Vec2ForSER(num_classes=num_classes).to(device)
    # Lower LR and fewer epochs than the MFCC models: the encoder is pretrained.
    optimizer = optim.Adam(model4.parameters(), lr=1e-4)
    train_model(model4, raw_train_loader, raw_val_loader, criterion, optimizer, 20, 'Wav2Vec2')

    # Evaluate the best-validation checkpoint rather than the last epoch.
    model4.load_state_dict(torch.load('Wav2Vec2_best.pth'))
    model4_metrics = evaluate_model(model4, raw_test_loader, le, 'Wav2Vec2')
    results.append(model4_metrics)

In [None]:
# Model 5: HuBERT (if transformers available)
if TRANSFORMERS_AVAILABLE:
    banner = '='*60
    print('\n' + banner)
    print('Training Model 5: HuBERT')
    print(banner)

    model5 = HuBERTForSER(num_classes=num_classes).to(device)
    # Same fine-tuning schedule as Wav2Vec2: LR 1e-4 for 20 epochs.
    optimizer = optim.Adam(model5.parameters(), lr=1e-4)
    train_model(model5, raw_train_loader, raw_val_loader, criterion, optimizer, 20, 'HuBERT')

    # Evaluate the best-validation checkpoint rather than the last epoch.
    model5.load_state_dict(torch.load('HuBERT_best.pth'))
    model5_metrics = evaluate_model(model5, raw_test_loader, le, 'HuBERT')
    results.append(model5_metrics)

In [None]:
# =============================================
# CELL 13: Results Comparison Table
# =============================================
import pandas as pd

# Create results dataframe: one row per model, every metric rendered with
# two decimals so the printed table and the CSV agree.
metric_columns = {
    'Accuracy (%)': 'accuracy',
    'F1-Score (%)': 'f1_score',
    'Precision (%)': 'precision',
    'Recall (%)': 'recall',
    'Inference (ms)': 'inference_time_ms',
}
results_df = pd.DataFrame([
    {'Model': r['model'],
     **{column: f"{r[key]:.2f}" for column, key in metric_columns.items()}}
    for r in results
])

banner = '='*80
print('\n' + banner)
print('BENCHMARK RESULTS SUMMARY')
print(banner)
print(results_df.to_string(index=False))
results_df.to_csv('benchmark_results.csv', index=False)

In [None]:
# =============================================
# CELL 14: Visualization - Performance Comparison
# =============================================
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

models = [r['model'] for r in results]
colors = plt.cm.Set2(np.linspace(0, 1, len(models)))

# One entry per bar panel:
# (axis, result key, y-label, title, fix y-range to 0-100?, label offset, label format)
bar_panels = [
    (axes[0, 0], 'accuracy', 'Accuracy (%)', 'Model Accuracy Comparison', True, 1, '{:.1f}%'),
    (axes[0, 1], 'f1_score', 'F1-Score (%)', 'Model F1-Score Comparison', True, 1, '{:.1f}%'),
    (axes[1, 0], 'inference_time_ms', 'Inference Time (ms/sample)',
     'Model Inference Speed Comparison', False, 0.1, '{:.2f}'),
]
for panel_ax, key, ylabel, title, percent_axis, offset, label_fmt in bar_panels:
    heights = [r[key] for r in results]
    bars = panel_ax.bar(models, heights, color=colors)
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_title(title)
    if percent_axis:
        panel_ax.set_ylim([0, 100])
    # Print each value just above its bar.
    for bar, height in zip(bars, heights):
        panel_ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + offset,
                      label_fmt.format(height), ha='center', va='bottom', fontsize=9)
    panel_ax.tick_params(axis='x', rotation=45)

# Radar chart for overall comparison: swap the Cartesian axes in the last
# grid slot for a polar one.
axes[1, 1].remove()
radar_ax = fig.add_subplot(2, 2, 4, projection='polar')

categories = ['Accuracy', 'F1-Score', 'Precision', 'Recall']
N = len(categories)
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles.append(angles[0])  # repeat the first angle to close the polygon

for i, r in enumerate(results):
    values = [r['accuracy'], r['f1_score'], r['precision'], r['recall']]
    values.append(values[0])
    radar_ax.plot(angles, values, 'o-', linewidth=2, label=r['model'], color=colors[i])
    radar_ax.fill(angles, values, alpha=0.1, color=colors[i])

radar_ax.set_xticks(angles[:-1])
radar_ax.set_xticklabels(categories)
radar_ax.set_ylim([0, 100])
radar_ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
radar_ax.set_title('Overall Performance Radar')

plt.tight_layout()
plt.savefig('benchmark_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# =============================================
# CELL 15: Confusion Matrices for All Models
# =============================================
n_models = len(results)
fig, axes = plt.subplots(1, n_models, figsize=(5*n_models, 5))

# With a single model plt.subplots returns a bare Axes instead of an array.
if n_models == 1:
    axes = [axes]

# Fix the label set so every matrix has one row/column per known class, even
# if a class never occurs in a model's y_true/y_pred; otherwise the matrix
# would be smaller than the tick labels drawn on it.
class_ids = np.arange(len(le.classes_))

for ax, r in zip(axes, results):
    cm = confusion_matrix(r['y_true'], r['y_pred'], labels=class_ids)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    ax.set_title(f"{r['model']}\nAcc: {r['accuracy']:.1f}%")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300)
plt.show()

In [None]:
# =============================================
# CELL 16: Statistical Significance Test (McNemar's Test)
# =============================================
from scipy.stats import chi2

def mcnemar_test(y_true, y_pred1, y_pred2):
    """Return the p-value of McNemar's test (with continuity correction).

    Compares two classifiers evaluated on the same samples using only the
    discordant pairs (samples one model gets right and the other wrong).

    Args:
        y_true: ground-truth labels.
        y_pred1: predictions of the first model, aligned with ``y_true``.
        y_pred2: predictions of the second model, aligned with ``y_true``.

    Returns:
        The p-value as a Python float; ``1.0`` when there are no discordant
        pairs.
    """
    truth = np.asarray(y_true)
    correct1 = np.asarray(y_pred1) == truth
    correct2 = np.asarray(y_pred2) == truth

    # b: model1 correct, model2 wrong
    b = int(np.sum(correct1 & ~correct2))
    # c: model1 wrong, model2 correct
    c = int(np.sum(~correct1 & correct2))

    if b + c == 0:
        return 1.0  # No difference

    # McNemar's test statistic
    statistic = (abs(b - c) - 1) ** 2 / (b + c)
    # Use the survival function rather than 1 - cdf: for large statistics the
    # cdf rounds to 1.0 and the subtraction would collapse the p-value to 0.
    return float(chi2.sf(statistic, df=1))

banner = '='*60
print('\n' + banner)
print('Statistical Significance Tests (McNemar)')
print(banner)

if len(results) > 1:
    # Ground truth of the first model is used as the reference for every pair.
    y_true = results[0]['y_true']
    for i, first in enumerate(results):
        for second in results[i + 1:]:
            p_val = mcnemar_test(y_true, first['y_pred'], second['y_pred'])
            # Star rating by conventional significance thresholds.
            if p_val < 0.001:
                sig = '***'
            elif p_val < 0.01:
                sig = '**'
            elif p_val < 0.05:
                sig = '*'
            else:
                sig = ''
            print(f"{first['model']} vs {second['model']}: p={p_val:.4f} {sig}")

In [None]:
# =============================================
# CELL 17: Final Summary
# =============================================
banner = '='*80
print('\n' + banner)
print('FINAL BENCHMARK SUMMARY')
print(banner)

# Find best model per criterion (first one wins on ties).
accuracy_by_model = [r['accuracy'] for r in results]
f1_by_model = [r['f1_score'] for r in results]
latency_by_model = [r['inference_time_ms'] for r in results]

best_acc_idx = np.argmax(accuracy_by_model)
best_f1_idx = np.argmax(f1_by_model)
fastest_idx = np.argmin(latency_by_model)

most_accurate = results[best_acc_idx]
best_f1 = results[best_f1_idx]
fastest = results[fastest_idx]

print(f"\nBest Accuracy: {most_accurate['model']} ({most_accurate['accuracy']:.2f}%)")
print(f"Best F1-Score: {best_f1['model']} ({best_f1['f1_score']:.2f}%)")
print(f"Fastest Model: {fastest['model']} ({fastest['inference_time_ms']:.2f} ms/sample)")

print('\n' + banner)
print('Results saved to: benchmark_results.csv')
print('Plots saved to: benchmark_comparison.png, confusion_matrices.png')
print(banner)