# Emotion Recognition from Speech using MFCCs and CNN

This script loads pre-extracted MFCC features and their corresponding labels, builds and trains a Convolutional Neural Network (CNN) for emotion classification, and evaluates its performance.

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report


## Load and Preprocess Data
`load_data` loads pre-extracted Mel-Frequency Cepstral Coefficients (MFCCs), a compact representation of the short-term power spectrum of sound. Each feature position is standardized to zero mean and unit variance, with $\mu$ and $\sigma$ computed across all samples:

$$
X_{norm} = \frac{X - \mu}{\sigma}
$$

The labels are encoded using `LabelEncoder` and one-hot encoded to suit the categorical cross-entropy loss used in training.

In [None]:
def load_data(x_path="X_mfcc.npy", y_path="y_labels.npy"):
    """Load MFCC features and labels and prepare them for CNN training.

    The features are standardized per feature position (mean and standard
    deviation taken along axis 0) and given a trailing channel axis. The
    labels are integer-encoded with ``LabelEncoder`` and then one-hot
    encoded with ``to_categorical``.

    Args:
        x_path: Path to the ``.npy`` file holding the feature array.
        y_path: Path to the ``.npy`` file holding the labels.

    Returns:
        A tuple ``(X, y_cat, le)``: the standardized features with a
        trailing axis of length 1, the one-hot label matrix, and the
        fitted ``LabelEncoder`` (needed later to recover class names).
    """
    X = np.load(x_path)
    y = np.load(y_path)

    # Normalize X using statistics computed along axis 0.
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A position that is constant along axis 0 has zero std; dividing by
    # it would produce NaN/inf and poison training. Dividing by 1 instead
    # maps such positions to exactly 0 and leaves all others unchanged.
    safe_std = np.where(std == 0, np.ones_like(std), std)
    X = (X - mean) / safe_std
    # Add a channel axis; presumably (samples, 40, 174) -> (samples, 40, 174, 1),
    # matching build_model's default input_shape — confirm against the data.
    X = X[..., np.newaxis]

    # Encode labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    y_cat = to_categorical(y_encoded)

    return X, y_cat, le

## Train-Test Split
Splits the dataset into training and testing sets while preserving class distribution.

In [None]:
def split_data(X, y_cat, test_size=0.2, random_state=42):
    """Partition features and one-hot labels into train and test subsets.

    The label matrix is also passed as the ``stratify`` argument of
    ``train_test_split``.

    Args:
        X: Feature array.
        y_cat: One-hot encoded labels aligned with ``X``.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        The result of ``train_test_split``, unpacked by callers as
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "stratify": y_cat,
    }
    return train_test_split(X, y_cat, **split_options)

## Build CNN Model
Builds a deep learning model using a stack of convolutional layers followed by pooling, dropout (to reduce overfitting), and fully connected layers. The network learns local temporal-spectral features from the MFCCs, which capture emotional cues in speech.
Final activation:

$$
\hat{y} = \text{softmax}(Wx + b)
$$

used for multi-class emotion classification across 8 categories.

In [None]:
def build_model(input_shape=(40, 174, 1), num_classes=8):
    """Assemble and compile the CNN used for emotion recognition.

    Two convolution blocks (Conv2D -> MaxPooling2D -> Dropout) with 32 and
    64 filters feed a flattened dense head that ends in a softmax layer.

    Args:
        input_shape: Shape of a single input sample, given to the first
            convolution layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    layers = []
    for block_index, filters in enumerate((32, 64)):
        # Only the first convolution declares the input shape.
        extra = {"input_shape": input_shape} if block_index == 0 else {}
        layers.append(Conv2D(filters, (3, 3), activation='relu', **extra))
        layers.append(MaxPooling2D((2, 2)))
        layers.append(Dropout(0.3))

    # Classification head.
    layers.extend([
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax'),
    ])

    model = Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

## Plot Training History
Visualizes model accuracy and loss across training epochs for both training and validation sets.

In [None]:
def plot_training_history(history):
    """Draw side-by-side accuracy and loss curves for a training run.

    Args:
        history: Object whose ``history`` mapping holds the per-epoch
            series ``accuracy``, ``val_accuracy``, ``loss`` and
            ``val_loss`` (as returned by ``model.fit``).
    """
    # (train key, val key, train label, val label, title, y-axis label)
    panels = (
        ('accuracy', 'val_accuracy', 'Train Acc', 'Val Acc',
         "Model Accuracy", "Accuracy"),
        ('loss', 'val_loss', 'Train Loss', 'Val Loss',
         "Model Loss", "Loss"),
    )
    series = history.history

    plt.figure(figsize=(10, 4))
    for slot, panel in enumerate(panels, start=1):
        train_key, val_key, train_label, val_label, title, y_label = panel
        plt.subplot(1, 2, slot)
        plt.plot(series[train_key], label=train_label)
        plt.plot(series[val_key], label=val_label)
        plt.title(title)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

## Quantitative Evaluation
Generates predictions on the test set and evaluates them using:
- **Confusion Matrix**: to visualize class-wise prediction accuracy.
- **Classification Report**: to compute precision, recall, and F1-score.

Mathematically:
$$
F_1 = 2\cdot\frac{\text{precision}\cdot\text{recall}}{\text{precision}+\text{recall}}
$$
Provides insight into class-specific performance.

In [None]:
def evaluate_model(model, X_test, y_test, label_encoder):
    """Evaluate model and display confusion matrix and classification report.

    Predictions and one-hot targets are reduced to class indices with
    ``argmax``; the confusion matrix is shown as a heatmap and the
    classification report is printed.

    Args:
        model: Trained model exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: One-hot encoded test labels; column ``i`` is assumed to
            correspond to ``label_encoder.classes_[i]``.
        label_encoder: Fitted encoder whose ``classes_`` supply the
            display names.
    """
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    labels = label_encoder.classes_
    # Pin the full set of class indices. Otherwise scikit-learn infers the
    # classes from the data, and a class missing from both the targets and
    # the predictions would shrink the matrix (misaligning the tick labels)
    # and make classification_report reject target_names.
    class_ids = np.arange(len(labels))
    cm = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.show()

    print("Classification Report:\n")
    print(classification_report(
        y_true_classes,
        y_pred_classes,
        labels=class_ids,
        target_names=labels,
    ))

## Build Full Pipeline
Orchestrates the full training pipeline: data loading → preprocessing → model training → evaluation → model persistence. Acts as the script's entry point.

In [None]:
def main():
    """Run the whole workflow: load, split, train, evaluate, plot, save.

    The test split is also supplied to ``fit`` as validation data, so the
    validation curves and the printed test accuracy come from the same
    samples. The trained model is written to ``emotion_cnn_model.keras``.
    """
    features, targets, encoder = load_data()
    X_train, X_test, y_train, y_test = split_data(features, targets)

    model = build_model()
    model.summary()

    held_out = (X_test, y_test)
    history = model.fit(
        X_train,
        y_train,
        epochs=30,
        batch_size=32,
        validation_data=held_out,
    )

    # evaluate() returns (loss, accuracy); only the accuracy is reported.
    _, test_acc = model.evaluate(*held_out)
    print(f"\n🎯 Test Accuracy: {test_acc:.2%}")

    plot_training_history(history)
    evaluate_model(model, X_test, y_test, encoder)

    model.save("emotion_cnn_model.keras")

### Run the Pipeline

In [None]:
# Run the pipeline only when this module is executed directly, not on import.
if __name__ == "__main__":
    main()