# Speech Emotion Recognition (CNN)

Fully self-contained notebook. No external `src/` dependencies.
Covers preprocessing, EDA, training, and evaluation.

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import os


## Configuration

In [None]:

# --- Dataset location and audio front-end settings ---
DATASET_DIR = "data/RAVDESS"  # <-- change path if needed
SR = 22050     # sample rate used when loading audio and computing mel bands
N_MELS = 128   # number of mel bands per spectrogram
MAX_LEN = 128  # number of time frames every spectrogram is cropped/padded to

# Two-digit emotion code (third "-"-separated field of a file name) -> label.
# Codes are consecutive from "01"; insertion order defines the class index.
EMOTIONS = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}


## Feature Extraction

In [None]:

def extract_log_mel(file_path):
    """Load an audio file and return a fixed-width log-mel spectrogram.

    The clip is loaded at ``SR`` Hz, converted to an ``N_MELS``-band mel power
    spectrogram and then to decibels. The time axis is cropped, or padded on
    the right, to exactly ``MAX_LEN`` frames, and a trailing channel axis is
    appended.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The log-mel spectrogram with its second axis fixed to ``MAX_LEN``
        and a trailing axis of size 1.
    """
    y, _ = librosa.load(file_path, sr=SR)
    mel = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS)
    log_mel = librosa.power_to_db(mel)

    n_frames = log_mel.shape[1]
    if n_frames < MAX_LEN:
        # Pad with the clip's quietest dB value, not 0: with power_to_db's
        # default ref=1.0, 0 dB is reference power, so zero padding would
        # look like a loud block appended to every short clip.
        floor_db = log_mel.min() if log_mel.size else 0.0
        log_mel = np.pad(
            log_mel,
            ((0, 0), (0, MAX_LEN - n_frames)),
            constant_values=floor_db,
        )
    else:
        log_mel = log_mel[:, :MAX_LEN]

    return log_mel[..., np.newaxis]


## Load Dataset

In [None]:

# Build the feature tensor X and one-hot labels y from every .wav file found
# under DATASET_DIR, then split 80/10/10 into train/validation/test.
X, y = [], []

# Class index of an emotion code is its position in EMOTIONS; computed once
# instead of rebuilding a list for every file.
class_index = {code: idx for idx, code in enumerate(EMOTIONS)}

for root, dirs, files in os.walk(DATASET_DIR):
    # os.walk yields entries in filesystem order; sorting keeps the sample
    # order (and therefore the seeded splits below) reproducible.
    dirs.sort()
    for file in sorted(files):
        if file.endswith(".wav"):
            # The emotion code is the third "-"-separated field of the name.
            label = class_index[file.split("-")[2]]
            X.append(extract_log_mel(os.path.join(root, file)))
            y.append(label)

if not X:
    raise FileNotFoundError(f"No .wav files found under {DATASET_DIR!r}")

X = np.array(X)
y = tf.keras.utils.to_categorical(y, num_classes=len(EMOTIONS))

# Hold out 20% of the samples, stratified by class ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y.argmax(axis=1), random_state=42
)

# ... and divide the held-out part evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp.argmax(axis=1), random_state=42
)

X.shape, y.shape


## CNN Model

In [None]:

# Number of output units follows the one-hot label width instead of a
# hard-coded class count.
num_classes = y_train.shape[1]

# Three Conv -> BatchNorm stages (the first two followed by max pooling),
# global average pooling, then a small dense classifier head.
model = models.Sequential([
    # Explicit Input layer: passing `input_shape=` to the first layer is
    # deprecated in Keras 3.
    layers.Input(shape=X_train.shape[1:]),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(),

    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.GlobalAveragePooling2D(),

    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax'),
])

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()


## Training

In [None]:

# Fit on the training split; the validation split is passed so that
# per-epoch validation metrics are recorded in `history`.
EPOCHS = 30
BATCH_SIZE = 32

history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
)


## Training Curves

In [None]:

# Plot training and validation accuracy per epoch on one set of axes.
for metric_key, legend_label in (("accuracy", "Train"), ("val_accuracy", "Val")):
    plt.plot(history.history[metric_key], label=legend_label)

plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


## Evaluation

In [None]:

# Score the held-out test split.
model.evaluate(X_test, y_test)

# Collapse softmax outputs and one-hot labels to integer class ids.
y_pred = model.predict(X_test).argmax(axis=1)
y_true = y_test.argmax(axis=1)

# Pin the full label set so the report and the matrix always line up with
# the class names, even if some class is absent from the test split.
class_names = list(EMOTIONS.values())
class_ids = list(range(len(class_names)))

print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
))

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


## Save Trained Model

In [None]:

# Write the trained model to disk; the relative path resolves against the
# current working directory.
model_path = "emotion_cnn.keras"
model.save(model_path)
