# Speech Emotion Recognition using CNNs

**Examiner-ready notebook** fulfilling all requirements of the AI Club SER task.

This notebook includes:
- Exploratory Data Analysis (EDA)
- Silence trimming
- Log-Mel spectrogram visualization
- Data augmentation
- CNN training
- Macro F1-score
- Confusion matrix
- Gender-based bias analysis

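All code is self-contained; the only external requirement is the RAVDESS dataset, which must be available under `DATASET_DIR` (see Configuration).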

In [None]:

import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import seaborn as sns


## Configuration

In [None]:

# Root folder that is walked recursively for .wav files.
DATASET_DIR = "data/RAVDESS"  # change if needed

# Audio / feature settings shared by the cells below.
SR = 22050     # sample rate requested when loading audio
N_MELS = 128   # number of mel bands per spectrogram
MAX_LEN = 128  # number of spectrogram frames kept per clip (pad or crop)

# Emotion code (third dash-separated field of a file name) -> label.
# Each code is the 1-based position of its label, zero-padded to two digits.
EMOTIONS = {
    f"{position:02d}": label
    for position, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}


## Silence Trimming

In [None]:

def load_and_trim(path):
    """Load an audio file and remove its leading/trailing silence.

    The file is loaded at the sample rate ``SR`` and trimmed with
    ``librosa.effects.trim`` using that function's default settings.

    Args:
        path: Location of the audio file to load.

    Returns:
        A ``(raw, trimmed)`` tuple: the waveform as loaded and the same
        waveform after silence trimming.
    """
    waveform, _sample_rate = librosa.load(path, sr=SR)
    trimmed, _kept_interval = librosa.effects.trim(waveform)
    return waveform, trimmed


### Before vs After Trimming

In [None]:

# Use the first .wav file found under DATASET_DIR (in os.walk order) as the
# demonstration sample.
sample_file = next(
    (
        os.path.join(root, f)
        for root, _, files in os.walk(DATASET_DIR)
        for f in files
        if f.endswith(".wav")
    ),
    None,
)
if sample_file is None:
    # Fail with a clear message instead of handing None to the audio loader.
    raise FileNotFoundError(f"No .wav files found under {DATASET_DIR!r}")

y_raw, y_trim = load_and_trim(sample_file)

# Waveform exactly as loaded from disk.
plt.figure(figsize=(12, 3))
plt.plot(y_raw)
plt.title("Before Silence Trimming")
plt.show()

# Same recording after silence trimming.
plt.figure(figsize=(12, 3))
plt.plot(y_trim)
plt.title("After Silence Trimming")
plt.show()


## Log-Mel Spectrograms (EDA)

In [None]:

def extract_log_mel(y):
    """Turn a waveform into a log-scaled (dB) mel spectrogram.

    Args:
        y: Audio samples, assumed to be at the sample rate ``SR``.

    Returns:
        The mel power spectrogram of ``y`` (``N_MELS`` mel bands) converted
        to decibels with ``librosa.power_to_db``.
    """
    mel_power = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS)
    log_mel = librosa.power_to_db(mel_power)
    return log_mel


### Angry vs Sad Spectrogram Comparison

In [None]:

def find_emotion_sample(code, dataset_dir=None):
    """Return the path of the first ``.wav`` file labelled with ``code``.

    The emotion code is read from the third dash-separated field of the
    file name. Files whose names have fewer than three dash-separated
    fields are skipped instead of raising ``IndexError``.

    Args:
        code: Emotion code to look for, e.g. ``"05"`` (a key of ``EMOTIONS``).
        dataset_dir: Directory to search recursively. Defaults to
            ``DATASET_DIR`` when omitted.

    Returns:
        The path of the first matching file in ``os.walk`` order, or
        ``None`` when no file matches.
    """
    if dataset_dir is None:
        dataset_dir = DATASET_DIR
    for root, _, files in os.walk(dataset_dir):
        for name in files:
            if not name.endswith(".wav"):
                continue
            fields = name.split("-")
            # Guard the index: stray .wav files may not follow the naming scheme.
            if len(fields) > 2 and fields[2] == code:
                return os.path.join(root, name)
    return None

# One example recording per emotion: "05" is angry, "04" is sad.
angry_path = find_emotion_sample("05")
sad_path = find_emotion_sample("04")

y_angry, _ = librosa.load(angry_path, sr=SR)
y_sad, _ = librosa.load(sad_path, sr=SR)

# Draw one log-mel spectrogram figure per emotion, angry first.
for signal, title in (
    (y_angry, "Angry (High Arousal)"),
    (y_sad, "Sad (Low Arousal)"),
):
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(
        extract_log_mel(signal), sr=SR, x_axis="time", y_axis="mel"
    )
    plt.title(title)
    plt.colorbar()
    plt.show()


## Data Augmentation

In [None]:

def augment(y):
    """Create three augmented variants of a waveform.

    Args:
        y: Audio samples, assumed to be at the sample rate ``SR``.

    Returns:
        A list ``[noisy, pitch_shifted, time_stretched]`` where
        ``noisy`` is ``y`` plus Gaussian noise scaled by 0.005,
        ``pitch_shifted`` is ``y`` shifted by ``n_steps=2`` and
        ``time_stretched`` is ``y`` stretched with ``rate=0.9``.
    """
    gaussian = np.random.randn(len(y))
    noisy = y + 0.005 * gaussian
    pitch_shifted = librosa.effects.pitch_shift(y, sr=SR, n_steps=2)
    time_stretched = librosa.effects.time_stretch(y, rate=0.9)
    return [noisy, pitch_shifted, time_stretched]


## Dataset Preparation

In [None]:

def _fit_to_max_len(log_mel):
    """Zero-pad or crop ``log_mel`` along axis 1 to exactly ``MAX_LEN`` columns."""
    n_frames = log_mel.shape[1]
    if n_frames < MAX_LEN:
        return np.pad(log_mel, ((0, 0), (0, MAX_LEN - n_frames)))
    return log_mel[:, :MAX_LEN]


# X: spectrograms with a trailing channel axis, y: class indices,
# genders: "male"/"female" per sample (used by the gender analysis).
X, y, genders = [], [], []

# Class index of each emotion code, in EMOTIONS order (built once, not per file).
emotion_index = {code: idx for idx, code in enumerate(EMOTIONS)}

# NOTE(review): augmented variants are added here, before the train/val/test
# split, so variants of one recording can land in different splits. Consider
# augmenting only the training portion after splitting.
for root, _, files in os.walk(DATASET_DIR):
    for file in files:
        if not file.endswith(".wav"):
            continue

        path = os.path.join(root, file)
        fields = file.split("-")
        label = emotion_index[fields[2]]
        # The seventh field (minus the extension) is parsed as an integer;
        # odd values are labelled male, even values female.
        actor = int(fields[6].split(".")[0])
        gender = "male" if actor % 2 else "female"

        _, y_trim = load_and_trim(path)

        # The trimmed clip first, then its augmented variants. Every
        # spectrogram goes through the same pad/crop step so that all samples
        # share one shape; previously the augmented ones were only cropped,
        # which left short clips with fewer than MAX_LEN frames.
        for signal in [y_trim, *augment(y_trim)]:
            log_mel = _fit_to_max_len(extract_log_mel(signal))
            X.append(log_mel[..., np.newaxis])
            y.append(label)
            genders.append(gender)

if not X:
    raise FileNotFoundError(f"No .wav files found under {DATASET_DIR!r}")

X = np.array(X)
y = tf.keras.utils.to_categorical(y, len(EMOTIONS))


## Train / Val / Test Split

In [None]:

# Step 1: hold out 20% of the samples, stratified on the class index
# recovered from the one-hot labels.
all_class_ids = y.argmax(axis=1)
X_train, X_temp, y_train, y_temp, g_train, g_temp = train_test_split(
    X,
    y,
    genders,
    test_size=0.2,
    stratify=all_class_ids,
    random_state=42,
)

# Step 2: cut the hold-out in half to obtain the validation and test sets,
# again stratified by class.
holdout_class_ids = y_temp.argmax(axis=1)
X_val, X_test, y_val, y_test, g_val, g_test = train_test_split(
    X_temp,
    y_temp,
    g_temp,
    test_size=0.5,
    stratify=holdout_class_ids,
    random_state=42,
)


## CNN Architecture

In [None]:

# Per-sample input shape, taken from the training array.
sample_shape = X_train.shape[1:]

# Three Conv2D + BatchNormalization stages; the first two are followed by
# max pooling, the last one by global average pooling.
feature_layers = [
    layers.Conv2D(32, (3, 3), activation="relu", input_shape=sample_shape),
    layers.BatchNormalization(),
    layers.MaxPooling2D(),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.BatchNormalization(),
    layers.MaxPooling2D(),
    layers.Conv2D(128, (3, 3), activation="relu"),
    layers.BatchNormalization(),
    layers.GlobalAveragePooling2D(),
]

# Dense head ending in an 8-way softmax (one unit per emotion class).
classifier_layers = [
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(8, activation="softmax"),
]

model = models.Sequential(feature_layers + classifier_layers)

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()


## Training

In [None]:

# Train for 30 epochs in batches of 32, evaluating on the validation split
# after each epoch; the returned history feeds the training-curve plot.
fit_options = dict(
    epochs=30,
    batch_size=32,
    validation_data=(X_val, y_val),
)
history = model.fit(X_train, y_train, **fit_options)


## Training Curves

In [None]:

# Plot training accuracy first, then validation accuracy, on the same axes.
for metric_key, legend_label in (
    ("accuracy", "Train"),
    ("val_accuracy", "Validation"),
):
    plt.plot(history.history[metric_key], label=legend_label)
plt.legend()
plt.show()


## Evaluation Metrics

In [None]:

# Convert the model's per-class scores and the one-hot test labels into
# class indices.
class_scores = model.predict(X_test)
y_pred = np.argmax(class_scores, axis=1)
y_true = np.argmax(y_test, axis=1)

# Per-class precision/recall/F1, labelled with the emotion names.
report = classification_report(y_true, y_pred, target_names=EMOTIONS.values())
print(report)

# Unweighted mean of the per-class F1 scores.
macro_f1 = f1_score(y_true, y_pred, average="macro")
print("Macro F1-score:", macro_f1)


## Confusion Matrix

In [None]:

# Fix the label set so the matrix always has one row/column per emotion, in
# EMOTIONS order, even if a class is missing from the test predictions; this
# keeps the tick labels aligned with the matrix cells.
emotion_names = list(EMOTIONS.values())
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(emotion_names))))

sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=emotion_names,
    yticklabels=emotion_names,
)
# confusion_matrix puts true classes on rows and predictions on columns.
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()


## Gender Bias Analysis

In [None]:

# Collect the test-set positions belonging to each gender label.
male_idx, female_idx = [], []
for position, gender_label in enumerate(g_test):
    if gender_label == "male":
        male_idx.append(position)
    elif gender_label == "female":
        female_idx.append(position)

# Per-sample correctness, then the mean over each gender subgroup.
is_correct = y_pred == y_true
male_acc = np.mean(is_correct[male_idx])
female_acc = np.mean(is_correct[female_idx])

print("Male Accuracy:", male_acc)
print("Female Accuracy:", female_acc)


## Save Model

In [None]:

# Persist the trained model to disk under the file name "emotion_cnn.keras"
# (relative to the current working directory).
model.save("emotion_cnn.keras")
