In [2]:
import os
import librosa
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


2025-03-08 07:44:17.256946: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-08 07:44:18.355777: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-08 07:44:18.846748: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741400059.354405    5037 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741400059.545715    5037 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 07:44:20.900093: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [7]:
def extract_features(file_path, max_pad_len=188, num_mfcc=60):
    feature = None  # Initialize feature to avoid reference errors
    try:
        y, sr = librosa.load(file_path, duration=3, offset=0.5)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=num_mfcc)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)

        # Ensure both features have the same length
        min_length = min(mfccs.shape[1], mel_spec.shape[1], max_pad_len)
        mfccs = mfccs[:, :min_length]
        mel_spec = mel_spec[:, :min_length]

        # Pad if necessary
        pad_width_mfcc = max(0, max_pad_len - mfccs.shape[1])
        pad_width_mel = max(0, max_pad_len - mel_spec.shape[1])

        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width_mfcc)), mode='constant')
        mel_spec = np.pad(mel_spec, pad_width=((0, 0), (0, pad_width_mel)), mode='constant')

        feature = np.vstack((mfccs, mel_spec))[:, :128]  # Trim to (188, 128)
        print(f"Processed {file_path}: Shape {feature.shape}")
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

    return feature  # Ensures function always returns something (even if None)


In [6]:
emotions = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}

In [4]:
# Load dataset
X, Y = [], []
data_path = "Actor/"
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Data path '{data_path}' not found. Check the directory structure.")

actors = sorted(os.listdir(data_path))  # Sort actor folders for consistency
print(f"Found actor folders: {actors}")

Found actor folders: ['Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05', 'Actor_06', 'Actor_07', 'Actor_08', 'Actor_09', 'Actor_10', 'Actor_11', 'Actor_12', 'Actor_13', 'Actor_14', 'Actor_15', 'Actor_16', 'Actor_17', 'Actor_18', 'Actor_19', 'Actor_20', 'Actor_21', 'Actor_22', 'Actor_23', 'Actor_24']


In [8]:
for actor in actors:
    actor_folder = os.path.join(data_path, actor)
    if os.path.isdir(actor_folder):
        wav_files = sorted([f for f in os.listdir(actor_folder) if f.endswith(".wav")])
        if not wav_files:
            print(f"Warning: No .wav files found in {actor_folder}")
        for file in wav_files:
            try:
                parts = file.split("-")
                if len(parts) < 3:
                    print(f"Skipping invalid filename: {file}")
                    continue
                emotion_code = parts[2]
                if emotion_code in emotions:
                    emotion = emotions[emotion_code]
                    feature = extract_features(os.path.join(actor_folder, file))
                    if feature is not None:
                        X.append(feature)
                        Y.append(emotion)
            except Exception as e:
                print(f"Error processing file {file}: {e}")

Processed Actor/Actor_01/03-01-01-01-01-01-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-01-01-01-02-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-01-01-02-01-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-01-01-02-02-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-02-01-01-01-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-02-01-01-02-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-02-01-02-01-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-02-01-02-02-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-02-02-01-01-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-02-02-01-02-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-02-02-02-01-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-02-02-02-02-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-03-01-01-01-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-03-01-01-02-01.wav: Shape (188, 128)
Processed Actor/Actor_01/03-01-03-01-02-01-01.wa

In [9]:
# Check if data was loaded
if len(X) == 0 or len(Y) == 0:
    raise ValueError("No valid data was loaded. Check file paths and feature extraction.")

X = np.array(X)
Y = np.array(Y)

In [10]:
# Save extracted features
np.save("X_features.npy", X)
np.save("Y_labels.npy", Y)


In [11]:
# Encode labels
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)
Y = keras.utils.to_categorical(Y, num_classes=len(emotions))

In [12]:
# Split dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

In [13]:
# Build CNN Model
model = keras.Sequential([
    layers.Conv2D(64, kernel_size=(3,3), activation='relu', input_shape=(188, 128, 1)),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2,2)),
    layers.Conv2D(128, kernel_size=(3,3), activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2,2)),
    layers.Conv2D(256, kernel_size=(3,3), activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2,2)),
    layers.Flatten(),
    layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(len(emotions), activation='softmax')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-03-07 15:01:14.878024: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
2025-03-07 15:01:16.402370: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 77070336 exceeds 10% of free system memory.
2025-03-07 15:01:16.494165: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 77070336 exceeds 10% of free system memory.
2025-03-07 15:01:16.534161: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 77070336 exceeds 10% of free system memory.


In [14]:
# Compile Model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:

# Train Model
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=100, batch_size=32)


Epoch 1/100


2025-03-07 15:02:56.399206: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 110886912 exceeds 10% of free system memory.
2025-03-07 15:03:02.513362: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 77070336 exceeds 10% of free system memory.


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 7s/step - accuracy: 0.2058 - loss: 12.0357 - val_accuracy: 0.2465 - val_loss: 9.6558
Epoch 2/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 7s/step - accuracy: 0.3157 - loss: 9.1649 - val_accuracy: 0.4062 - val_loss: 8.6577
Epoch 3/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 7s/step - accuracy: 0.3887 - loss: 8.5173 - val_accuracy: 0.3333 - val_loss: 8.2472
Epoch 4/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 6s/step - accuracy: 0.3555 - loss: 8.1710 - val_accuracy: 0.4514 - val_loss: 7.8746
Epoch 5/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 5s/step - accuracy: 0.3999 - loss: 7.6882 - val_accuracy: 0.4688 - val_loss: 7.6297
Epoch 6/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 6s/step - accuracy: 0.5002 - loss: 7.2219 - val_accuracy: 0.4896 - val_loss: 7.5363
Epoch 7/100
[1m36/36[0m [32m━━━━━━━━

In [None]:
# Save Model
model.save("speech_emotion_model.h5")


In [8]:
# Load the saved model
model = keras.models.load_model("speech_emotion_model.h5")

# Function to predict emotion from a given audio file
def predict_emotion(file_path):
    feature = extract_features(file_path)
    if feature is not None:
        feature = np.expand_dims(feature, axis=[0, -1])  # Reshape to match model input
        prediction = model.predict(feature)
        predicted_emotion = list(emotions.values())[np.argmax(prediction)]
        print(f"Predicted emotion for {file_path}: {predicted_emotion}")
    else:
        print(f"Could not extract features for {file_path}")

# Test cases
sample_files = [
    "Actor/Actor_01/03-01-01-01-01-01-01.wav",
    "Actor/Actor_01/03-01-02-01-01-01-01.wav",
    "Actor/Actor_01/03-01-03-01-01-01-01.wav"
]

for file in sample_files:
    if os.path.exists(file):
        predict_emotion(file)
    else:
        print(f"Test file {file} not found.")



Processed Actor/Actor_01/03-01-01-01-01-01-01.wav: Shape (188, 128)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step
Predicted emotion for Actor/Actor_01/03-01-01-01-01-01-01.wav: fearful
Processed Actor/Actor_01/03-01-02-01-01-01-01.wav: Shape (188, 128)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Predicted emotion for Actor/Actor_01/03-01-02-01-01-01-01.wav: calm
Processed Actor/Actor_01/03-01-03-01-01-01-01.wav: Shape (188, 128)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Predicted emotion for Actor/Actor_01/03-01-03-01-01-01-01.wav: angry
