# 04 - RNN/LSTM Model Training & Evaluation

Train and evaluate an LSTM-based recurrent neural network for audio classification.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

print("✓ Imports complete")

## Prepare Data for RNN/LSTM

In [None]:
# Swap the last two axes of every split: (batch, n_mfcc, time_steps) becomes
# (batch, time_steps, n_mfcc), so each time step is one element of the sequence.
X_train_rnn, X_val_rnn, X_test_rnn = (
    split.transpose(0, 2, 1) for split in (X_train, X_val, X_test)
)

# Feature arrays -> float32 tensors.
X_train_tensor_rnn, X_val_tensor_rnn, X_test_tensor_rnn = (
    torch.tensor(sequences, dtype=torch.float32)
    for sequences in (X_train_rnn, X_val_rnn, X_test_rnn)
)

# Class names -> integer ids, using the encoder that was fitted earlier.
y_train_encoded_rnn, y_val_encoded_rnn, y_test_encoded_rnn = (
    label_encoder.transform(targets) for targets in (y_train, y_val, y_test)
)

# Integer ids -> long tensors.
y_train_tensor_rnn, y_val_tensor_rnn, y_test_tensor_rnn = (
    torch.tensor(encoded, dtype=torch.long)
    for encoded in (y_train_encoded_rnn, y_val_encoded_rnn, y_test_encoded_rnn)
)

# Report the resulting feature tensor shapes, one split per line.
print(f"✓ Data prepared")
print("\n".join(
    f"  {split_name}: {split_tensor.shape}"
    for split_name, split_tensor in (
        ("X_train_rnn", X_train_tensor_rnn),
        ("X_val_rnn", X_val_tensor_rnn),
        ("X_test_rnn", X_test_tensor_rnn),
    )
))

## Define RNN/LSTM Architecture

In [None]:
class AudioRNN(nn.Module):
    """LSTM-based sequence classifier.

    The input sequence is run through a (possibly stacked) LSTM, the LSTM
    output at the final time step is passed through dropout, and a linear
    layer maps it to one logit per class.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the linear classifier
            and, when ``num_layers > 1``, between stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # nn.LSTM applies its dropout only between stacked layers; with a
        # single layer a non-zero value has no effect and triggers a
        # UserWarning, so pass 0 in that case.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0

        # LSTM layer - better than simple RNN for long-term dependencies
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        # Fully connected classification head
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, sequence_length, input_size).

        Returns:
            Tensor of shape (batch_size, num_classes) holding unnormalised logits.
        """
        # When no initial state is given, nn.LSTM starts from zero hidden and
        # cell states created on the input's device and dtype, so building
        # them by hand is unnecessary (and a hand-built float32 state breaks
        # models converted to another dtype).
        out, _ = self.lstm(x)  # out: (batch, sequence_length, hidden_size)

        # Classify from the output at the last time step
        return self.fc(self.dropout(out[:, -1, :]))

print("✓ RNN/LSTM model defined")

## Create Dataset and DataLoader

In [None]:
class AudioDatasetRNN(Dataset):
    """Map-style dataset that pairs each feature entry with its label by index.

    Args:
        features: Indexable collection of per-sample features.
        labels: Indexable, sized collection of per-sample labels.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

    def __len__(self):
        """Return the number of samples, as given by the label collection."""
        return len(self.labels)

# Mini-batch size shared by all three loaders.
batch_size = 32

# One dataset per split, each pairing a feature tensor with its label tensor.
train_dataset_rnn, val_dataset_rnn, test_dataset_rnn = (
    AudioDatasetRNN(split_features, split_labels)
    for split_features, split_labels in (
        (X_train_tensor_rnn, y_train_tensor_rnn),
        (X_val_tensor_rnn, y_val_tensor_rnn),
        (X_test_tensor_rnn, y_test_tensor_rnn),
    )
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader_rnn = DataLoader(
    train_dataset_rnn,
    batch_size=batch_size,
    shuffle=True,
)
val_loader_rnn = DataLoader(
    val_dataset_rnn,
    batch_size=batch_size,
    shuffle=False,
)
test_loader_rnn = DataLoader(
    test_dataset_rnn,
    batch_size=batch_size,
    shuffle=False,
)

print(f"✓ DataLoaders created")

## Train RNN/LSTM Model

In [None]:
# --- Model hyperparameters ---
input_size = X_train_rnn.shape[2]  # n_mfcc
hidden_size = 128
num_layers = 2
# Size the output layer from the fitted encoder, not from the labels that
# happen to occur in the training split: targets are produced by
# label_encoder.transform, so their ids range over all of classes_. A class
# missing from the training split would otherwise make the model too narrow
# and CrossEntropyLoss would fail on an out-of-range target.
num_classes = len(label_encoder.classes_)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

rnn_model = AudioRNN(input_size, hidden_size, num_layers, num_classes).to(device)

criterion_rnn = nn.CrossEntropyLoss()
optimizer_rnn = optim.Adam(rnn_model.parameters(), lr=0.001)

# --- Training history (one entry per epoch) ---
num_epochs_rnn = 20
train_losses_rnn = []
val_losses_rnn = []
val_accuracies_rnn = []

print("Starting RNN/LSTM model training...")
for epoch in range(num_epochs_rnn):
    # Training pass: dropout active, parameters updated after every batch.
    rnn_model.train()
    running_loss = 0.0
    for inputs, labels in train_loader_rnn:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer_rnn.zero_grad()
        outputs = rnn_model(inputs)
        loss = criterion_rnn(outputs, labels)
        loss.backward()
        optimizer_rnn.step()
        # The criterion returns the batch mean; weight it by the batch size so
        # the epoch figure is a true per-sample average even when the last
        # batch is smaller.
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_dataset_rnn)
    train_losses_rnn.append(epoch_loss)

    # Validation pass: dropout disabled, no gradients tracked.
    rnn_model.eval()
    running_val_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for inputs, labels in val_loader_rnn:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = rnn_model(inputs)
            loss = criterion_rnn(outputs, labels)
            running_val_loss += loss.item() * inputs.size(0)
            # Predicted class = index of the largest logit.
            predicted = outputs.argmax(dim=1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

    epoch_val_loss = running_val_loss / len(val_dataset_rnn)
    val_losses_rnn.append(epoch_val_loss)
    epoch_val_accuracy = correct_predictions / total_predictions
    val_accuracies_rnn.append(epoch_val_accuracy)

    # Log every fifth epoch to keep the output short.
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}/{num_epochs_rnn}, Train Loss: {epoch_loss:.4f}, Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}")

print("✓ RNN/LSTM Training complete")

## Evaluate on Test Set

In [None]:
# Collect predictions for the whole test set with dropout disabled and no
# gradient tracking.
rnn_model.eval()
all_predictions_rnn = []
all_true_labels_rnn = []

with torch.no_grad():
    for inputs, labels in test_loader_rnn:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = rnn_model(inputs)
        # Predicted class = index of the largest logit.
        predicted = outputs.argmax(dim=1)
        all_predictions_rnn.extend(predicted.tolist())
        all_true_labels_rnn.extend(labels.tolist())

# Calculate metrics (class-frequency-weighted averages)
accuracy_rnn = accuracy_score(all_true_labels_rnn, all_predictions_rnn)
precision_rnn = precision_score(all_true_labels_rnn, all_predictions_rnn, average='weighted')
recall_rnn = recall_score(all_true_labels_rnn, all_predictions_rnn, average='weighted')
f1_rnn = f1_score(all_true_labels_rnn, all_predictions_rnn, average='weighted')

# Pin the matrix to every class known to the encoder. Without `labels`,
# a class that appears in neither the test labels nor the predictions is
# dropped, and the matrix no longer lines up with label_encoder.classes_.
cm_rnn = confusion_matrix(
    all_true_labels_rnn,
    all_predictions_rnn,
    labels=np.arange(len(label_encoder.classes_)),
)

# Per-class accuracy = correct / true samples of that class. Classes with no
# test samples get NaN instead of triggering a divide-by-zero warning.
class_support_rnn = cm_rnn.sum(axis=1)
per_class_accuracy_rnn = np.divide(
    cm_rnn.diagonal(),
    class_support_rnn,
    out=np.full(class_support_rnn.shape, np.nan, dtype=float),
    where=class_support_rnn > 0,
)

print("\n" + "="*60)
print("RNN/LSTM Model Evaluation Metrics")
print("="*60)
print(f"Accuracy:  {accuracy_rnn:.4f}")
print(f"Precision: {precision_rnn:.4f}")
print(f"Recall:    {recall_rnn:.4f}")
print(f"F1-score:  {f1_rnn:.4f}")
print("="*60)

## Visualize Results

In [None]:
# Human-readable class names, in encoder order, used to label the plots.
class_labels = label_encoder.classes_

# Figure 1: confusion matrix as an annotated heatmap.
fig_cm, ax_cm = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm_rnn,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax_cm,
)
ax_cm.set_title('RNN/LSTM Confusion Matrix')
ax_cm.set_xlabel('Predicted Label')
ax_cm.set_ylabel('True Label')
fig_cm.tight_layout()
plt.show()

# Figure 2: one bar per class showing its accuracy.
fig_acc, ax_acc = plt.subplots(figsize=(10, 5))
ax_acc.bar(class_labels, per_class_accuracy_rnn, color='orange', edgecolor='darkorange')
ax_acc.set_title('RNN/LSTM Per-Class Accuracy')
ax_acc.set_xlabel('Class')
ax_acc.set_ylabel('Accuracy')
# Tilt the class names so they stay legible.
plt.setp(ax_acc.get_xticklabels(), rotation=45)
fig_acc.tight_layout()
plt.show()

# Figure 3: training and validation loss per epoch.
fig_hist, ax_hist = plt.subplots(figsize=(10, 5))
for loss_curve, curve_name in (
    (train_losses_rnn, 'Training Loss'),
    (val_losses_rnn, 'Validation Loss'),
):
    ax_hist.plot(loss_curve, label=curve_name, linewidth=2)
ax_hist.set_title('RNN/LSTM Training History')
ax_hist.set_xlabel('Epoch')
ax_hist.set_ylabel('Loss')
ax_hist.legend()
ax_hist.grid(alpha=0.3)
fig_hist.tight_layout()
plt.show()