In [8]:
import librosa 
import numpy as np

def extract_features(file_path, max_pad_len=174, n_mfcc=40):
    """Load an audio file and return a fixed-width MFCC matrix.

    The audio is loaded with ``sr=None`` (no target sampling rate is
    requested), and the sampling rate returned by ``librosa.load`` is passed
    on to the MFCC computation.

    Parameters
    ----------
    file_path : str
        Path of the audio file handed to ``librosa.load``.
    max_pad_len : int, default 174
        Number of columns (time frames) in the result. A shorter MFCC matrix
        is zero-padded on the right; a longer one is truncated.
    n_mfcc : int, default 40
        Number of MFCC coefficients to compute (rows of the result).

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(n_mfcc, max_pad_len)``, or ``None`` when loading or
        feature extraction raised an exception (the error is printed).
    """
    try:
        audio, sample_rate = librosa.load(file_path, sr=None, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

        # Force a fixed number of frames so that the per-file matrices can be
        # stacked into a single array later on.
        n_frames = mfccs.shape[1]
        if n_frames < max_pad_len:
            missing = max_pad_len - n_frames
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, missing)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]

        return mfccs
    except Exception as e:
        # Deliberately best-effort: report the offending file and let the
        # caller skip it instead of aborting a whole dataset scan.
        print(f"❌ Error: {e} | File: {file_path}")
        return None


In [9]:
import glob, os

# Built in lock-step: data[i] is the MFCC matrix whose label is labels[i].
data = []
labels = []

# Emotion code (third dash-separated field of the file name) -> label.
emotion_map = {
    "01": "neutral", "02": "calm", "03": "happy", "04": "sad",
    "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"
}

# glob yields files in filesystem order, which differs between machines;
# sorting keeps the sample order (and therefore any later seeded split)
# reproducible.
files = sorted(
    os.path.normpath(f)  # normalize slashes
    for f in glob.glob("../data/raw/kaggle_speech/**/*.wav", recursive=True)
)

for file in files:
    file_name = os.path.splitext(os.path.basename(file))[0]  # remove .wav
    parts = file_name.split("-")
    emotion = parts[2] if len(parts) > 2 else None  # get emotion code
    label = emotion_map.get(emotion)
    if label is None:
        # A stray .wav that does not follow the naming scheme should not
        # abort the whole scan; report it and move on.
        print(f"⚠️ Skipping file with unrecognized name: {file}")
        continue

    feature = extract_features(file)
    if feature is not None:
        data.append(feature)
        labels.append(label)



In [16]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

import os

import joblib

# Single source of truth for where the fitted encoder is stored.
LABEL_ENCODER_PATH = "../models/speech_label_encoder.pkl"

if not data:
    # Fail early with a clear message instead of an obscure encoder error.
    raise ValueError("No audio features were extracted; check the dataset path.")

# Stack the per-file feature matrices and their string labels into arrays.
X = np.array(data)
y = np.array(labels)

# String labels -> integer class ids -> one-hot rows.
lb = LabelEncoder()
y = to_categorical(lb.fit_transform(y))

# Save label encoder (the fitted encoder is needed later to map predicted
# class indices back to emotion names).
os.makedirs(os.path.dirname(LABEL_ENCODER_PATH), exist_ok=True)
joblib.dump(lb, LABEL_ENCODER_PATH)
print(f"✅ Label encoder saved as {os.path.basename(LABEL_ENCODER_PATH)}")

# Load later
# NOTE: joblib.load unpickles the file, so only load encoder files you trust.
le_loaded = joblib.load(LABEL_ENCODER_PATH)



✅ Label encoder saved as label_encoder.pkl


In [11]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Append a trailing singleton axis so each sample carries the channel
# dimension that the Conv2D input expects.
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Seed TensorFlow's global RNG to reduce run-to-run variation.
# NOTE(review): full determinism may additionally depend on hardware and
# library versions — confirm if exact reproducibility is required.
tf.random.set_seed(42)

# Take the shapes from the data rather than repeating the feature-extraction
# constants, so a change to the MFCC settings cannot silently desynchronise
# the network from its input.
input_shape = X_train.shape[1:]
num_classes = y_train.shape[1]

# Two Conv2D/MaxPooling/Dropout stages followed by a dense classifier head.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# categorical_crossentropy matches the one-hot encoded targets.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# NOTE(review): the held-out test split is also used as validation data here,
# so accuracy later reported on X_test is not from fully unseen data.
history = model.fit(X_train, y_train, epochs=50, batch_size=32,
                    validation_data=(X_test, y_test))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Score the in-memory model on the held-out split; evaluate returns the loss
# followed by the compiled metric.
test_metrics = model.evaluate(X_test, y_test)
test_loss, test_acc = test_metrics
print("Test accuracy:", test_acc)


In [14]:
from tensorflow.keras.models import load_model

import os

# Single source of truth for the saved model location.
MODEL_PATH = "../models/speech_emotion_recognition_model.h5"

# ==== SAVE MODEL ====
# After training
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)
print(f"✅ Model saved as {os.path.basename(MODEL_PATH)}")

# ==== LOAD MODEL ====
# In a new script / notebook later
loaded_model = load_model(MODEL_PATH)

# Check if it loads correctly: the reloaded model is scored on the same
# held-out split as the in-memory one.
loss, acc = loaded_model.evaluate(X_test, y_test, verbose=0)
print(f"🔁 Reloaded model accuracy: {acc:.4f}")


✅ Model saved as emotion_recognition_model.h5
🔁 Reloaded model accuracy: 0.8854
