# 🎙️ Speech Emotion Recognition - Train All Models

Train **3 different models** on Google Colab with a free GPU:
1. **CNN** - Convolutional Neural Network
2. **CNN-LSTM** - Hybrid model with temporal features
3. **LSTM** - Recurrent network for sequential features

**Steps:**
1. Upload RAVDESS dataset
2. Extract features
3. Train all 3 models
4. Compare results
5. Download trained model files

## 1. Setup & Install Dependencies

In [None]:
!pip install librosa soundfile tqdm -q
print("‚úì Dependencies installed")

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Dense, Dropout, Flatten,
    BatchNormalization, GlobalAveragePooling2D,
    LSTM, Bidirectional, Input, Reshape
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

## 2. Upload RAVDESS Dataset

**Option A:** Download from Kaggle using API key

In [None]:
# Option A: Download from Kaggle using API credentials
# Enter your Kaggle username and API key below (from kaggle.com/settings → API)

KAGGLE_USERNAME = "your_username_here"  # <-- Replace with your Kaggle username
KAGGLE_KEY = "your_api_key_here"        # <-- Replace with your API key

# Create kaggle.json from credentials
kaggle_creds = {"username": KAGGLE_USERNAME, "key": KAGGLE_KEY}

# The file holds a secret, so create it readable/writable by the owner only
# instead of relying on the process umask.
kaggle_fd = os.open("kaggle.json", os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(kaggle_fd, "w") as f:
    json.dump(kaggle_creds, f)
# os.open() applies the mode only when it creates the file; tighten a
# pre-existing kaggle.json as well.
os.chmod("kaggle.json", 0o600)

print(f"✓ Kaggle credentials set for user: {KAGGLE_USERNAME}")

In [None]:
# Setup Kaggle and download dataset
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio
!unzip -q ravdess-emotional-speech-audio.zip -d data
print("‚úì Dataset downloaded and extracted")

In [None]:
# Option B: Upload ZIP manually (skip Kaggle entirely)
# Uncomment the lines below if you prefer to upload your own dataset

# from google.colab import files
# print("Upload your RAVDESS archive.zip:")
# uploaded = files.upload()
# !unzip -q archive.zip -d data
# print("‚úì Dataset extracted")

In [None]:
# Check dataset structure
!ls data/
!ls data/ | head -5

## 3. Configuration

In [None]:
# Configuration
# --- Audio front-end: every clip is resampled, fixed to DURATION seconds,
# --- and analysed with the same STFT framing.
DATA_PATH = "data"
SAMPLE_RATE, DURATION = 22050, 3
N_MFCC, N_MELS = 40, 128
N_FFT, HOP_LENGTH = 2048, 512

# Two-digit emotion code (third dash-separated field of a RAVDESS filename)
# mapped to its human-readable label: '01' -> 'neutral' ... '08' -> 'surprised'.
EMOTIONS = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            'neutral', 'calm', 'happy', 'sad',
            'angry', 'fearful', 'disgust', 'surprised',
        ),
        start=1,
    )
}

# --- Optimisation hyper-parameters shared by all three models.
BATCH_SIZE, EPOCHS, LEARNING_RATE = 32, 50, 0.001

# Store results for comparison (filled in by the training cells, keyed by model name)
results = dict()

print("‚úì Configuration set")

## 4. Feature Extraction Functions

In [None]:
def extract_mel_spectrogram(file_path, sr=SAMPLE_RATE, duration=DURATION):
    """Extract a fixed-length log-mel spectrogram for the CNN/CNN-LSTM models.

    The clip is loaded at ``sr`` Hz, zero-padded or truncated to exactly
    ``sr * duration`` samples so every file yields the same number of frames,
    converted to a mel spectrogram (N_MELS bands, N_FFT window, HOP_LENGTH
    hop) and expressed in dB relative to the clip's own maximum.

    Args:
        file_path: Path of the audio file to load.
        sr: Target sampling rate passed to ``librosa.load``.
        duration: Clip length in seconds; longer audio is cut, shorter padded.

    Returns:
        The dB-scaled mel spectrogram as returned by ``librosa.power_to_db``,
        or ``None`` if the file could not be processed (the failure is
        printed together with the offending path).
    """
    try:
        y, sr = librosa.load(file_path, sr=sr, duration=duration)
        # int() keeps the sample count usable for padding/slicing even when
        # a fractional duration is passed.
        max_len = int(sr * duration)
        if len(y) < max_len:
            y = np.pad(y, (0, max_len - len(y)), mode='constant')
        else:
            y = y[:max_len]

        mel_spec = librosa.feature.melspectrogram(
            y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH
        )
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        return mel_spec_db
    except Exception as e:
        # Best-effort by design: a bad file is skipped by the caller, but the
        # path is reported so it can be located afterwards.
        print(f"Error processing {file_path}: {e}")
        return None


def extract_combined_features(file_path, sr=SAMPLE_RATE, duration=DURATION):
    """Extract one summary feature vector per clip for the LSTM model.

    The clip is loaded at ``sr`` Hz and fixed to ``sr * duration`` samples.
    For MFCCs, the dB mel spectrogram, chroma and spectral contrast, the
    per-coefficient mean and standard deviation over time are taken; for
    zero-crossing rate and RMS energy the overall mean and standard
    deviation are taken. Everything is concatenated in that order:

        mfcc mean, mfcc std, mel mean, mel std, chroma mean, chroma std,
        contrast mean, contrast std, [zcr mean, zcr std], [rms mean, rms std]

    Args:
        file_path: Path of the audio file to load.
        sr: Target sampling rate passed to ``librosa.load``.
        duration: Clip length in seconds; longer audio is cut, shorter padded.

    Returns:
        A 1-D feature array, or ``None`` if the file could not be processed
        (the failure is printed together with the offending path).
    """
    try:
        y, sr = librosa.load(file_path, sr=sr, duration=duration)
        # int() keeps the sample count usable for padding/slicing even when
        # a fractional duration is passed.
        max_len = int(sr * duration)
        if len(y) < max_len:
            y = np.pad(y, (0, max_len - len(y)), mode='constant')
        else:
            y = y[:max_len]

        # Every spectral feature shares the same STFT framing.
        stft_args = {'n_fft': N_FFT, 'hop_length': HOP_LENGTH}

        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, **stft_args)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, **stft_args)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, **stft_args)
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr, **stft_args)

        # ZCR and RMS: frame explicitly with the configured window/hop so they
        # stay aligned with the spectral features if N_FFT or HOP_LENGTH is
        # changed (with the current values this matches the library defaults).
        zcr = librosa.feature.zero_crossing_rate(
            y, frame_length=N_FFT, hop_length=HOP_LENGTH
        )
        rms = librosa.feature.rms(y=y, frame_length=N_FFT, hop_length=HOP_LENGTH)

        pieces = []
        # Per-coefficient statistics over the time axis.
        for matrix in (mfccs, mel_spec_db, chroma, contrast):
            pieces.append(np.mean(matrix, axis=1))
            pieces.append(np.std(matrix, axis=1))
        # Single-track features collapse to two scalars each.
        for track in (zcr, rms):
            pieces.append([np.mean(track), np.std(track)])

        return np.concatenate(pieces)
    except Exception as e:
        # Best-effort by design: a bad file is skipped by the caller, but the
        # path is reported so it can be located afterwards.
        print(f"Error processing {file_path}: {e}")
        return None


def parse_ravdess_filename(filename):
    """Split a RAVDESS filename into its emotion and actor codes.

    The name (with any '.wav' removed) must consist of exactly seven
    dash-separated fields; the third field is returned as 'emotion' and the
    seventh as 'actor'. Any other field count yields None.
    """
    stem = filename.replace('.wav', '')
    fields = stem.split('-')
    if len(fields) == 7:
        # With exactly seven fields the last one is the actor id.
        return {'emotion': fields[2], 'actor': fields[-1]}
    return None

print("‚úì Feature extraction functions defined")

In [None]:
def load_ravdess_data(data_path, feature_type='mel'):
    """Load the RAVDESS dataset and extract one feature array per recording.

    Every directory below ``data_path`` whose name starts with 'Actor_' is
    scanned for '.wav' files. Directories and files are visited in sorted
    order so that repeated calls (e.g. once per feature type) return samples
    in the same order. A filename is processed only the first time it is
    seen; RAVDESS filenames identify a recording, so a second occurrence is
    the same clip stored under another folder and would otherwise end up on
    both sides of a train/test split.
    NOTE(review): some distributions of the archive reportedly nest a second
    copy of the Actor_* folders - confirm against the extracted `data/` tree.

    Args:
        data_path: Root directory to search.
        feature_type: 'mel' for mel spectrograms; any other value selects the
            combined summary features.

    Returns:
        Tuple ``(X, y)``: array of stacked features and array of emotion
        label strings, one entry per successfully processed recording.
    """
    features_list = []
    labels = []

    # Find all Actor directories
    actor_dirs = []
    for root, dirs, _files in os.walk(data_path):
        for d in dirs:
            if d.startswith('Actor_'):
                actor_dirs.append(os.path.join(root, d))
    actor_dirs.sort()

    print(f"Found {len(actor_dirs)} actor directories")

    if feature_type == 'mel':
        extract = extract_mel_spectrogram
    else:
        extract = extract_combined_features

    seen_files = set()
    duplicates = 0

    for actor_path in tqdm(actor_dirs, desc=f"Extracting {feature_type} features"):
        for wav_file in sorted(os.listdir(actor_path)):
            if not wav_file.endswith('.wav'):
                continue
            if wav_file in seen_files:
                duplicates += 1
                continue
            seen_files.add(wav_file)

            # Resolve the label first so no audio is decoded for files that
            # would be discarded anyway.
            file_info = parse_ravdess_filename(wav_file)
            if file_info is None:
                continue
            emotion_label = EMOTIONS.get(file_info['emotion'])
            if emotion_label is None:
                continue

            feat = extract(os.path.join(actor_path, wav_file))
            if feat is None:
                continue

            features_list.append(feat)
            labels.append(emotion_label)

    if duplicates:
        print(f"Skipped {duplicates} duplicate recordings")

    X = np.array(features_list)
    y = np.array(labels)

    print(f"Loaded {len(X)} samples, shape: {X.shape}")
    return X, y

print("‚úì Data loader defined")

## 5. Load Data (Both Feature Types)

In [None]:
# Mel spectrograms are the shared input of the CNN and CNN-LSTM models;
# the labels returned by this pass are the ones used for every model.
banner = "=" * 60
print(banner)
print("Loading MEL SPECTROGRAM features...")
print(banner)
X_mel, y = load_ravdess_data(DATA_PATH, feature_type='mel')

print(f"\n‚úì Mel features: {X_mel.shape}")
print("\nEmotion distribution:")
emotion_counts = pd.Series(y).value_counts()
print(emotion_counts)

In [None]:
# Load combined features (for LSTM)
print("="*60)
print("Loading COMBINED features for LSTM...")
print("="*60)
X_combined, y_combined = load_ravdess_data(DATA_PATH, feature_type='combined')

# Later cells pair X_combined with the labels `y` from the mel pass. If a file
# failed in only one of the two extraction passes, rows and labels would be
# silently shifted against each other, so verify the label sequences agree.
if not np.array_equal(y, y_combined):
    raise ValueError(
        "Combined features are not aligned with the mel labels "
        f"({len(y_combined)} vs {len(y)} samples); check the extraction errors above."
    )

print(f"\n✓ Combined features: {X_combined.shape}")

## 6. Prepare Data

In [None]:
# Encode labels: strings -> integer ids -> one-hot rows
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)
num_classes = len(label_encoder.classes_)

print(f"Classes: {label_encoder.classes_}")
print(f"Number of classes: {num_classes}")

# Split mel features (CNN, CNN-LSTM) and combined features (LSTM) in a single
# call: both feature sets are guaranteed to receive the same train/test
# partition, and a length mismatch between them raises immediately.
(
    X_mel_train, X_mel_test,
    X_comb_train, X_comb_test,
    y_train, y_test,
) = train_test_split(
    X_mel, X_combined, y_categorical,
    test_size=0.2, random_state=42, stratify=y_encoded
)

# Normalize mel features. The statistics come from the training set only and
# are reused for the test set, so both are scaled identically and nothing
# about the test distribution leaks into preprocessing.
mel_mean = X_mel_train.mean()
mel_std = X_mel_train.std() + 1e-8  # epsilon guards against division by zero
X_mel_train_norm = (X_mel_train - mel_mean) / mel_std
X_mel_test_norm = (X_mel_test - mel_mean) / mel_std

# Add channel dimension for CNN
X_mel_train_cnn = X_mel_train_norm[..., np.newaxis]
X_mel_test_cnn = X_mel_test_norm[..., np.newaxis]

# Normalize combined features with scaler (fitted on the training set only)
scaler = StandardScaler()
X_comb_train_norm = scaler.fit_transform(X_comb_train)
X_comb_test_norm = scaler.transform(X_comb_test)

# Reshape for LSTM (samples, timesteps, features); each clip is one timestep
X_comb_train_lstm = X_comb_train_norm.reshape(X_comb_train_norm.shape[0], 1, X_comb_train_norm.shape[1])
X_comb_test_lstm = X_comb_test_norm.reshape(X_comb_test_norm.shape[0], 1, X_comb_test_norm.shape[1])

print(f"\nTraining samples: {len(X_mel_train)}")
print(f"Test samples: {len(X_mel_test)}")
print(f"\nCNN input shape: {X_mel_train_cnn.shape[1:]}")
print(f"LSTM input shape: {X_comb_train_lstm.shape[1:]}")

## 7. Define All Model Architectures

In [None]:
def build_cnn_model(input_shape, num_classes):
    """Build the CNN classifier for mel spectrogram input.

    Architecture: three stages of two 3x3 convolutions (64, 128, 256 filters,
    each followed by batch normalization), every stage closed by 2x2 max
    pooling and 25% dropout; one 512-filter convolution with global average
    pooling; two dense layers (512, 256) with batch normalization and 50%
    dropout; softmax output.

    Args:
        input_shape: Shape of one sample without the batch axis.
        num_classes: Number of output classes.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # Declare the input with an explicit Input layer rather than the
    # `input_shape=` layer argument, which recent Keras releases warn about.
    layers = [Input(shape=input_shape)]

    for filters in (64, 128, 256):
        layers += [
            Conv2D(filters, (3, 3), activation='relu', padding='same'),
            BatchNormalization(),
            Conv2D(filters, (3, 3), activation='relu', padding='same'),
            BatchNormalization(),
            MaxPooling2D((2, 2)),
            Dropout(0.25),
        ]

    layers += [
        Conv2D(512, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        GlobalAveragePooling2D(),
    ]

    for units in (512, 256):
        layers += [
            Dense(units, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
        ]

    layers.append(Dense(num_classes, activation='softmax'))
    return Sequential(layers)


def build_cnn_lstm_model(input_shape, num_classes):
    """Build the CNN-LSTM hybrid: convolutional front-end, recurrent back-end.

    Three conv/batch-norm/pool/dropout stages (64, 128, 256 filters) reduce
    the spectrogram; the result is rearranged into a sequence over the time
    axis and fed to two bidirectional LSTMs (128 and 64 units), followed by
    two dense layers (256, 128) and a softmax output.

    Args:
        input_shape: Shape of one sample without the batch axis. The notebook
            passes mel spectrograms with a trailing channel axis, i.e.
            (mel bands, frames, 1).
        num_classes: Number of output classes.

    Returns:
        An uncompiled functional ``Model``.
    """
    inputs = Input(shape=input_shape)

    x = inputs
    for filters in (64, 128, 256):
        x = Conv2D(filters, (3, 3), activation='relu', padding='same')(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D((2, 2))(x)
        x = Dropout(0.25)(x)

    # Reshape for LSTM. The conv output is laid out (frequency, time,
    # channels) because the input is (mel bands, frames, 1). Reshaping it
    # directly would make the LSTM step over frequency bins; swap the first
    # two axes so each timestep holds all frequency/channel values of one
    # time slice.
    x = tf.keras.layers.Permute((2, 1, 3))(x)
    time_steps, freq_bins, channels = x.shape[1], x.shape[2], x.shape[3]
    x = Reshape((time_steps, freq_bins * channels))(x)

    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.3)(x)
    x = Bidirectional(LSTM(64))(x)
    x = Dropout(0.3)(x)

    for units in (256, 128):
        x = Dense(units, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(0.5)(x)

    outputs = Dense(num_classes, activation='softmax')(x)

    return Model(inputs=inputs, outputs=outputs)


def build_lstm_model(input_shape, num_classes):
    """Build the stacked LSTM classifier for the combined 1D features.

    Three LSTM layers (256, 128, 64 units, 30% dropout after each) followed
    by two dense layers (128, 64) with batch normalization and 50% dropout,
    and a softmax output.

    Args:
        input_shape: Shape of one sample without the batch axis, as
            (timesteps, features).
        num_classes: Number of output classes.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # Declare the input with an explicit Input layer rather than the
    # `input_shape=` layer argument, which recent Keras releases warn about.
    layers = [
        Input(shape=input_shape),
        LSTM(256, return_sequences=True),
        Dropout(0.3),
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        LSTM(64),
        Dropout(0.3),
    ]

    for units in (128, 64):
        layers += [
            Dense(units, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
        ]

    layers.append(Dense(num_classes, activation='softmax'))
    return Sequential(layers)

print("‚úì All model architectures defined")

## 8. Training Function

In [None]:
def train_and_evaluate(model, model_name, X_train, X_test, y_train, y_test,
                       validation_split=0.15):
    """Compile, train and evaluate one model, returning its results bundle.

    Early stopping (with best-weight restoration) and learning-rate reduction
    both monitor ``val_loss``. By default that validation loss is computed on
    a slice held out from the training data, so the test set plays no part in
    choosing the final weights and the reported test accuracy is not
    optimistically biased.

    Relies on the module-level ``label_encoder`` to map class indices back to
    emotion names, and on ``LEARNING_RATE``, ``EPOCHS`` and ``BATCH_SIZE``.

    Args:
        model: Uncompiled Keras model.
        model_name: Display name used in log output.
        X_train, y_train: Training inputs and one-hot targets.
        X_test, y_test: Test inputs and one-hot targets.
        validation_split: Fraction of the training data held out for
            validation. Pass ``None`` (or 0) to validate on the test set
            instead, which was the previous behavior.

    Returns:
        Dict with keys 'model', 'history', 'accuracy', 'loss', 'y_pred' and
        'y_true' (the last two as emotion-name arrays for the test set).
    """
    print(f"\n{'='*60}")
    print(f"Training {model_name.upper()} Model")
    print(f"{'='*60}")

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
    ]

    if validation_split:
        # Keras takes the held-out samples from the end of the arrays; the
        # caller is expected to pass already-shuffled training data.
        validation_kwargs = {'validation_split': validation_split}
    else:
        validation_kwargs = {'validation_data': (X_test, y_test)}

    history = model.fit(
        X_train, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        verbose=1,
        **validation_kwargs
    )

    # Evaluate
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"\n{model_name} - Test Accuracy: {accuracy:.4f}")

    # Predictions: arg-max over class probabilities / one-hot targets,
    # then back to emotion names.
    y_pred = model.predict(X_test, verbose=0)
    y_pred_labels = label_encoder.inverse_transform(np.argmax(y_pred, axis=1))
    y_true_labels = label_encoder.inverse_transform(np.argmax(y_test, axis=1))

    return {
        'model': model,
        'history': history,
        'accuracy': accuracy,
        'loss': loss,
        'y_pred': y_pred_labels,
        'y_true': y_true_labels
    }

print("‚úì Training function defined")

## 9. Train All Models

In [None]:
# Train CNN on the mel spectrograms and record its results under 'cnn'
cnn_input_shape = X_mel_train_cnn.shape[1:]  # per-sample shape, batch axis dropped
cnn_model = build_cnn_model(cnn_input_shape, num_classes)
results['cnn'] = train_and_evaluate(
    model=cnn_model,
    model_name='CNN',
    X_train=X_mel_train_cnn,
    X_test=X_mel_test_cnn,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Train CNN-LSTM on the same mel inputs as the CNN; results go under 'cnn_lstm'
cnn_lstm_input_shape = X_mel_train_cnn.shape[1:]  # per-sample shape, batch axis dropped
cnn_lstm_model = build_cnn_lstm_model(cnn_lstm_input_shape, num_classes)
results['cnn_lstm'] = train_and_evaluate(
    model=cnn_lstm_model,
    model_name='CNN-LSTM',
    X_train=X_mel_train_cnn,
    X_test=X_mel_test_cnn,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Train LSTM on the scaled combined features; results go under 'lstm'
lstm_input_shape = X_comb_train_lstm.shape[1:]  # (timesteps, features)
lstm_model = build_lstm_model(lstm_input_shape, num_classes)
results['lstm'] = train_and_evaluate(
    model=lstm_model,
    model_name='LSTM',
    X_train=X_comb_train_lstm,
    X_test=X_comb_test_lstm,
    y_train=y_train,
    y_test=y_test,
)

## 10. Compare Results

In [None]:
# Summary comparison
print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)

if not results:
    # Nothing has been trained in this session; avoid a crash in max() below.
    print("No trained models found - run the training cells first.")
else:
    # Build the table from whatever is in `results` (like the plotting cells
    # do) so the comparison still works when only some models were trained.
    # Display names: 'cnn' -> 'CNN', 'cnn_lstm' -> 'CNN-LSTM', 'lstm' -> 'LSTM'.
    comparison_df = pd.DataFrame({
        'Model': [name.upper().replace('_', '-') for name in results],
        'Test Accuracy': [f"{res['accuracy']:.4f}" for res in results.values()],
        'Test Loss': [f"{res['loss']:.4f}" for res in results.values()],
    })

    print(comparison_df.to_string(index=False))

    # Find best model (key of the entry with the highest test accuracy)
    best_model = max(results, key=lambda k: results[k]['accuracy'])
    print(f"\n🏆 Best Model: {best_model.upper()} with {results[best_model]['accuracy']:.4f} accuracy")

In [None]:
# Training curves: one column per model, accuracy on the top row and loss
# on the bottom row, each with its train and validation trace.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# (history key, axis/title label) for each row of the grid
panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

for col, (name, res) in enumerate(results.items()):
    curves = res['history'].history
    for row, (metric, label) in enumerate(panels):
        ax = axes[row, col]
        ax.plot(curves[metric], label='Train')
        ax.plot(curves[f'val_{metric}'], label='Val')
        ax.set_title(f'{name.upper()} {label}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(label)
        ax.legend()
        ax.grid(True)

plt.tight_layout()
plt.savefig('all_models_training_history.png', dpi=150)
plt.show()

In [None]:
# One confusion-matrix heatmap per trained model, side by side.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

# Fix the class order once so every matrix and its tick labels line up.
class_names = label_encoder.classes_

for idx, (name, res) in enumerate(results.items()):
    ax = axes[idx]
    cm = confusion_matrix(res['y_true'], res['y_pred'], labels=class_names)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title(f'{name.upper()} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.savefig('all_models_confusion_matrices.png', dpi=150)
plt.show()

In [None]:
# Per-class precision/recall/F1 report for every trained model
rule = '=' * 60
for name, res in results.items():
    heading = f"{name.upper()} Classification Report"
    print(f"\n{rule}")
    print(heading)
    print(rule)
    report = classification_report(res['y_true'], res['y_pred'])
    print(report)

## 11. Save All Models

In [None]:
os.makedirs('models', exist_ok=True)

# Mean/std of the training mel spectrograms. They are stored in the config of
# the mel-based models so inference code can apply the same scaling as
# training: (x - mel_mean) / (mel_std + 1e-8).
# NOTE(review): the consuming app must read these keys to benefit - confirm.
mel_norm_stats = {
    'mel_mean': float(X_mel_train.mean()),
    'mel_std': float(X_mel_train.std()),
}

# Save each model
for name, res in results.items():
    # Save model
    model_path = f'models/emotion_model_{name}.keras'
    res['model'].save(model_path)
    print(f"✓ Saved: {model_path}")

    # Save config
    uses_mel_input = name != 'lstm'
    if uses_mel_input:
        input_shape = X_mel_train_cnn.shape[1:]
    else:
        input_shape = X_comb_train_lstm.shape[1:]

    config = {
        'model_type': name,
        'input_shape': input_shape,
        'num_classes': num_classes,
        'accuracy': float(res['accuracy'])
    }
    if uses_mel_input:
        config.update(mel_norm_stats)

    config_path = f'models/emotion_model_{name}_config.pkl'
    with open(config_path, 'wb') as f:
        pickle.dump(config, f)
    print(f"✓ Saved: {config_path}")

# Every model gets the same set of companion files, whether or not it was
# trained in this session.
saved_model_names = ('cnn', 'cnn_lstm', 'lstm')

# Save label encoder (same for all)
for name in saved_model_names:
    with open(f'models/emotion_model_{name}_label_encoder.pkl', 'wb') as f:
        pickle.dump(label_encoder, f)
print("✓ Saved: label encoders for all models")

# Save scaler: the real fitted scaler for LSTM, a dummy placeholder for the
# mel-based models so each model has a scaler file.
for name in saved_model_names:
    if name == 'lstm':
        model_scaler = scaler
    else:
        model_scaler = StandardScaler().fit([[0]])
    with open(f'models/emotion_model_{name}_scaler.pkl', 'wb') as f:
        pickle.dump(model_scaler, f)
print("✓ Saved: scalers for all models")

print("\n" + "="*60)
print("All models saved!")
print("="*60)

In [None]:
# List all saved files
!ls -la models/

## 12. Download All Model Files

In [None]:
# Zip all models for easy download
!zip -r trained_models.zip models/
print("‚úì Created trained_models.zip")

In [None]:
# Download the zip file
from google.colab import files

print("Downloading all trained models...")
print("Extract to your local: SpeechEmotionRecognition/models/ folder")
print()

files.download('trained_models.zip')
print("\n‚úì Download complete!")

In [None]:
# Also download plots
files.download('all_models_training_history.png')
files.download('all_models_confusion_matrices.png')
print("‚úì Plots downloaded!")

## 13. Next Steps

After downloading `trained_models.zip`:

1. **Extract to your local project:**
   ```bash
   cd ~/Downloads/Sem-1/SpeechEmotionRecognition
   unzip ~/Downloads/trained_models.zip
   ```

2. **Your models folder should have:**
   ```
   models/
   ├── emotion_model_cnn.keras
   ├── emotion_model_cnn_config.pkl
   ├── emotion_model_cnn_label_encoder.pkl
   ├── emotion_model_cnn_scaler.pkl
   ├── emotion_model_cnn_lstm.keras
   ├── emotion_model_cnn_lstm_config.pkl
   ├── emotion_model_cnn_lstm_label_encoder.pkl
   ├── emotion_model_cnn_lstm_scaler.pkl
   ├── emotion_model_lstm.keras
   ├── emotion_model_lstm_config.pkl
   ├── emotion_model_lstm_label_encoder.pkl
   └── emotion_model_lstm_scaler.pkl
   ```

3. **Run the web app:**
   ```bash
   source audioML/bin/activate
   python app.py
   ```

4. **Open browser:** http://localhost:5000

The app uses CNN by default. To use a different model, modify `voice_assistant.py`.