# 🎙️ Speech Emotion Recognition - Training Notebook

Train the emotion recognition model on Google Colab with a free GPU.

**Steps:**
1. Upload RAVDESS dataset
2. Extract features
3. Train CNN model
4. Download trained model files

## 1. Setup & Install Dependencies

In [None]:
# Install the extra packages with pip; -q keeps pip's output quiet.
# librosa and tqdm are imported in the next cell. soundfile is not imported
# directly anywhere in this notebook (presumably it is librosa's audio I/O
# backend - confirm before removing it).
!pip install librosa soundfile tqdm -q
print("‚úì Dependencies installed")

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Dense, Dropout, Flatten,
    BatchNormalization, GlobalAveragePooling2D
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Report the TensorFlow version and the GPU devices TensorFlow can see.
# An empty list on the second line means no GPU is visible to TensorFlow.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

## 2. Upload RAVDESS Dataset

**Option A:** Download from Kaggle using your `kaggle.json` API token (Recommended)

In [None]:
# Option A: Download from Kaggle
# First, upload your kaggle.json file.
# The next cell copies `kaggle.json` from the current working directory into
# ~/.kaggle/, so the uploaded file must keep that exact name.
from google.colab import files
print("Upload your kaggle.json file:")
files.upload()

In [None]:
# Setup Kaggle and download dataset
# 1. Put the uploaded API token in ~/.kaggle/ (created if missing).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# 2. Make the token readable/writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json
# 3. Download the RAVDESS archive and unpack it quietly into ./data
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio
!unzip -q ravdess-emotional-speech-audio.zip -d data
print("‚úì Dataset downloaded and extracted")

In [None]:
# Option B: Upload ZIP manually
# from google.colab import files
# print("Upload your RAVDESS archive.zip:")
# uploaded = files.upload()
# !unzip -q archive.zip -d data
# print("‚úì Dataset extracted")

In [None]:
# Check dataset structure
# Both commands list the extracted `data/` folder; the second one only shows
# the first five entries. Use the output to confirm DATA_PATH in the
# configuration cell points at the folder that contains the Actor_* directories.
!ls data/
!ls data/ | head -5

## 3. Configuration

In [None]:
# ---------------------------------------------------------------------------
# Notebook-wide settings
# ---------------------------------------------------------------------------

# Root folder that is searched recursively for Actor_* directories.
DATA_PATH = "data"  # Adjust if needed based on extracted structure

# Audio loading: sample rate passed to librosa and clip length in seconds.
SAMPLE_RATE, DURATION = 22050, 3

# Mel-spectrogram parameters (number of mel bands, FFT size, hop size).
N_MELS, N_FFT, HOP_LENGTH = 128, 2048, 512

# Emotion code (third dash-separated field of a RAVDESS file name) -> label.
EMOTIONS = dict(zip(
    ('01', '02', '03', '04', '05', '06', '07', '08'),
    ('neutral', 'calm', 'happy', 'sad',
     'angry', 'fearful', 'disgust', 'surprised'),
))

# Optimisation settings used by the training cells.
BATCH_SIZE, EPOCHS = 32, 50
LEARNING_RATE = 0.001

print("‚úì Configuration set")

## 4. Feature Extraction

In [None]:
def extract_mel_spectrogram(file_path, sr=SAMPLE_RATE, duration=DURATION):
    """Compute a fixed-length, dB-scaled mel spectrogram for one audio file.

    The clip is loaded with librosa, zero-padded or truncated to exactly
    ``sr * duration`` samples, converted to a mel power spectrogram using the
    notebook-level ``N_MELS`` / ``N_FFT`` / ``HOP_LENGTH`` settings, and then
    converted to decibels relative to the clip's own maximum.

    Args:
        file_path: Path of the audio file to load.
        sr: Sample rate passed to ``librosa.load``.
        duration: Clip length in seconds. May be fractional (e.g. ``2.5``).

    Returns:
        The array returned by ``librosa.power_to_db`` (mel bands x frames),
        or ``None`` if anything went wrong while loading or processing the
        file (the error is printed, not raised).
    """
    try:
        y, sr = librosa.load(file_path, sr=sr, duration=duration)

        # Pad or truncate to a fixed length. The sample count must be an int:
        # a fractional `duration` would otherwise make it a float, which
        # np.pad and slicing both reject.
        max_len = int(round(sr * duration))
        if len(y) < max_len:
            y = np.pad(y, (0, max_len - len(y)), mode='constant')
        else:
            y = y[:max_len]

        # Extract mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH
        )
        # ref=np.max puts the loudest bin of this clip at 0 dB.
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        return mel_spec_db
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the
        # whole dataset scan, so report it and let the caller skip it.
        print(f"Error processing {file_path}: {e}")
        return None


def parse_ravdess_filename(filename):
    """Extract the emotion and actor codes from a RAVDESS file name.

    The name (without directory and extension) is expected to consist of
    exactly seven dash-separated fields; the third field is returned as the
    emotion code and the seventh as the actor code.

    Args:
        filename: File name or path, e.g. ``"03-01-05-01-02-01-12.wav"``.

    Returns:
        ``{'emotion': <field 3>, 'actor': <field 7>}`` as strings, or ``None``
        if the name does not have seven dash-separated fields.
    """
    # Strip directories and the extension properly instead of deleting the
    # substring '.wav': this also handles '.WAV' and keeps the extension out
    # of the actor field.
    stem, _ext = os.path.splitext(os.path.basename(filename))
    parts = stem.split('-')
    if len(parts) != 7:
        return None
    return {
        'emotion': parts[2],
        'actor': parts[6]
    }

print("‚úì Functions defined")

In [None]:
def load_ravdess_data(data_path):
    """Scan ``data_path`` for RAVDESS recordings and build training arrays.

    Every directory whose name starts with ``Actor_`` (at any depth below
    ``data_path``) is searched for ``.wav`` files. Files whose name cannot be
    parsed, whose emotion code is not in ``EMOTIONS``, or whose features
    cannot be extracted are skipped.

    Each file name is loaded at most once. The file name encodes the actor
    (see ``parse_ravdess_filename``), so the same name appearing in two
    folders is treated as the same recording. Some distributions of the
    archive reportedly ship a second, nested copy of the ``Actor_*`` folders;
    without this check every clip would be loaded twice and identical samples
    could land in both the training and the test split.

    Directories and files are visited in sorted order so that the resulting
    arrays, and therefore any seeded split of them, are reproducible.

    Args:
        data_path: Root directory of the extracted dataset.

    Returns:
        ``(X, y)``: ``X`` is ``np.array`` of the stacked spectrograms and
        ``y`` is ``np.array`` of the matching emotion label strings.
    """
    # Find all Actor directories. Sorting `dirs` in place makes os.walk
    # descend in a stable order.
    actor_dirs = []
    for root, dirs, _ in os.walk(data_path):
        dirs.sort()
        actor_dirs.extend(
            os.path.join(root, d) for d in dirs if d.startswith('Actor_')
        )

    print(f"Found {len(actor_dirs)} actor directories")

    features_list = []
    labels = []
    loaded_names = set()   # file names already turned into a sample
    duplicates_skipped = 0

    for actor_path in tqdm(actor_dirs, desc="Processing actors"):
        for wav_file in sorted(os.listdir(actor_path)):
            if not wav_file.endswith('.wav'):
                continue

            # Parse filename
            file_info = parse_ravdess_filename(wav_file)
            if file_info is None:
                continue

            # Resolve the label before the expensive feature extraction.
            emotion_label = EMOTIONS.get(file_info['emotion'], 'unknown')
            if emotion_label == 'unknown':
                continue

            if wav_file in loaded_names:
                duplicates_skipped += 1
                continue

            # Extract features
            mel_spec = extract_mel_spectrogram(
                os.path.join(actor_path, wav_file)
            )
            if mel_spec is None:
                continue

            # Only mark the name as loaded after success, so a readable copy
            # elsewhere can still be used if this one was corrupt.
            loaded_names.add(wav_file)
            features_list.append(mel_spec)
            labels.append(emotion_label)

    X = np.array(features_list)
    y = np.array(labels)

    if duplicates_skipped:
        print(f"\nSkipped {duplicates_skipped} duplicate files")
    if len(X) == 0:
        print(f"\nWarning: no usable .wav files found under {data_path!r}")

    print(f"\nLoaded {len(X)} samples")
    print(f"Feature shape: {X.shape}")
    print(f"\nEmotion distribution:")
    print(pd.Series(y).value_counts())

    return X, y

print("‚úì Data loader defined")

In [None]:
# Load and extract features
# X holds the stacked spectrograms and y the matching emotion label strings,
# both built by load_ravdess_data from everything found under DATA_PATH.
print("Loading RAVDESS dataset...")
X, y = load_ravdess_data(DATA_PATH)
print(f"\n‚úì Features extracted: {X.shape}")

## 5. Prepare Data for Training

In [None]:
# Encode labels: strings -> integer ids -> one-hot rows for the softmax output
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)
num_classes = len(label_encoder.classes_)

print(f"Classes: {label_encoder.classes_}")
print(f"Number of classes: {num_classes}")

# Split data (stratified on the integer labels so every class keeps the same
# proportion in both splits; fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

# Normalize. The statistics come from the training split ONLY and are applied
# unchanged to the test split: standardising the test set with its own
# mean/std would feed the model differently scaled inputs than it was trained
# on and lets test-set statistics influence the evaluation.
# Inference code must reuse these same two values.
train_mean = X_train.mean()
train_std = X_train.std()
X_train = (X_train - train_mean) / (train_std + 1e-8)
X_test = (X_test - train_mean) / (train_std + 1e-8)

print(f"Normalization (train) mean: {train_mean:.4f}, std: {train_std:.4f}")

# Add channel dimension for CNN
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

print(f"\nTraining samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Input shape: {X_train.shape[1:]}")

## 6. Build CNN Model

In [None]:
def build_cnn_model(input_shape, num_classes):
    """Build the (uncompiled) CNN used for emotion recognition.

    Architecture, in order:
      * three blocks of [Conv-BN-Conv-BN-MaxPool-Dropout(0.25)] with
        64, 128 and 256 filters,
      * one Conv(512)-BN block followed by global average pooling,
      * two [Dense-BN-Dropout(0.5)] blocks with 512 and 256 units,
      * a softmax layer with ``num_classes`` outputs.

    Args:
        input_shape: Shape of one input sample, without the batch axis.
        num_classes: Number of output classes.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    # Declare the input with an explicit Input object rather than passing
    # `input_shape=` to the first layer, which newer Keras releases deprecate.
    layers = [tf.keras.Input(shape=input_shape)]

    # Conv blocks 1-3: two same-padded 3x3 convolutions, then downsample.
    for filters in (64, 128, 256):
        layers += [
            Conv2D(filters, (3, 3), activation='relu', padding='same'),
            BatchNormalization(),
            Conv2D(filters, (3, 3), activation='relu', padding='same'),
            BatchNormalization(),
            MaxPooling2D((2, 2)),
            Dropout(0.25),
        ]

    # Conv block 4: single convolution, then collapse the spatial axes.
    layers += [
        Conv2D(512, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        GlobalAveragePooling2D(),
    ]

    # Dense head
    for units in (512, 256):
        layers += [
            Dense(units, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
        ]
    layers.append(Dense(num_classes, activation='softmax'))

    model = Sequential(layers)

    return model

# Build model
# The input shape is one training sample's shape (sample axis dropped).
model = build_cnn_model(X_train.shape[1:], num_classes)

# categorical_crossentropy matches the one-hot labels produced by
# to_categorical and the softmax output layer of the model.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Print the layer-by-layer architecture.
model.summary()

## 7. Train Model

In [None]:
# Callbacks
# - EarlyStopping: stop after 10 epochs without val_loss improvement and roll
#   back to the best weights seen.
# - ReduceLROnPlateau: halve the learning rate after 5 stagnant epochs,
#   never going below 1e-6.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
]

# Hold out part of the TRAINING data for validation. Both callbacks pick the
# stopping epoch / learning rate from val_loss, so validating on X_test would
# tune the model on the very data that is later reported as "test" accuracy.
# The split is stratified on the class index recovered from the one-hot labels.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=np.argmax(y_train, axis=1)
)
print(f"Fit samples: {len(X_fit)}, validation samples: {len(X_val)}")

# Train
print("Starting training...")
history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

print("\n‚úì Training complete!")

## 8. Evaluate Model

In [None]:
# Evaluate
# Loss and accuracy (the metric configured in model.compile) on the test split.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Predictions
# argmax turns each softmax output row / one-hot label row into a class index;
# inverse_transform maps the index back to the emotion name.
y_pred = model.predict(X_test)
y_pred_labels = label_encoder.inverse_transform(np.argmax(y_pred, axis=1))
y_true_labels = label_encoder.inverse_transform(np.argmax(y_test, axis=1))

# Classification report (per-class precision / recall / F1)
print("\nClassification Report:")
print(classification_report(y_true_labels, y_pred_labels))

In [None]:
# Plot training history: accuracy on the left panel, loss on the right,
# each with the training and validation curve per epoch.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (history key, panel title, y-axis label) for each panel, left to right.
panel_specs = (
    ('accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'Model Loss', 'Loss'),
)

for ax, (metric, title, ylabel) in zip(axes, panel_specs):
    ax.plot(history.history[metric], label='Train')
    ax.plot(history.history['val_' + metric], label='Validation')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(True)

# Save before show() so the PNG is written from the fully laid-out figure.
plt.tight_layout()
plt.savefig('training_history.png', dpi=150)
plt.show()

In [None]:
# Confusion matrix: rows are the actual emotion, columns the predicted one,
# both ordered like the label encoder's classes.
class_names = label_encoder.classes_
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_names)

plt.figure(figsize=(10, 8))
heatmap_options = dict(
    annot=True,          # write the count into every cell
    fmt='d',             # ...as an integer
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=150)
plt.show()

## 9. Save Model Files

**Download these 4 files to your local `models/` folder**

In [None]:
# Write the trained model plus its three pickled companions into ./models
from sklearn.preprocessing import StandardScaler

os.makedirs('models', exist_ok=True)

# Keras model
model.save('models/emotion_model_cnn.keras')
print("‚úì Model saved: models/emotion_model_cnn.keras")

# Minimal description of the model
config = dict(
    model_type='cnn',
    input_shape=X_train.shape[1:],
    num_classes=num_classes,
)

# Dummy scaler (not used for CNN but needed for compatibility)
scaler = StandardScaler()
scaler.fit([[0]])  # Dummy fit

# (description, target path, object) - pickled in this order
pickled_artifacts = (
    ("Label encoder", 'models/emotion_model_cnn_label_encoder.pkl', label_encoder),
    ("Config", 'models/emotion_model_cnn_config.pkl', config),
    ("Scaler", 'models/emotion_model_cnn_scaler.pkl', scaler),
)
for description, target_path, artifact in pickled_artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)
    print(f"‚úì {description} saved: {target_path}")

banner = "=" * 50
print("\n" + banner)
print("All model files saved!")
print(banner)

In [None]:
# Send every saved model artifact to the browser as a download
from google.colab import files

print("Downloading model files...")
print("Save these to your local: SpeechEmotionRecognition/models/ folder")
print()

# File names inside models/, downloaded in this order.
artifact_names = (
    'emotion_model_cnn.keras',
    'emotion_model_cnn_label_encoder.pkl',
    'emotion_model_cnn_config.pkl',
    'emotion_model_cnn_scaler.pkl',
)
for artifact_name in artifact_names:
    files.download('models/' + artifact_name)

print("\n‚úì All files downloaded!")

In [None]:
# Also download the plots
# Uses `files` from google.colab, imported in an earlier cell, and the two
# PNGs written by the plotting cells - run those cells first.
files.download('training_history.png')
files.download('confusion_matrix.png')
print("‚úì Plots downloaded!")

## 10. Next Steps

After downloading the model files:

1. **Copy files to your local project:**
   ```
   SpeechEmotionRecognition/
   └── models/
       ├── emotion_model_cnn.keras
       ├── emotion_model_cnn_label_encoder.pkl
       ├── emotion_model_cnn_config.pkl
       └── emotion_model_cnn_scaler.pkl
   ```

2. **Run the web app locally:**
   ```bash
   cd SpeechEmotionRecognition
   source audioML/bin/activate
   python app.py
   ```

3. **Open browser:** http://localhost:5000