In [2]:
pip install librosa

Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.0-cp310-cp310-win_amd64.whl.metadata (2.8 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp310-cp310-win_amd64.whl.metadata (5.6 kB)
Collecting lazy-loader>=0.1 (from librosa)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.0-cp310-cp310-win_amd64.whl.metadata (8.6 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.51.0->librosa)
  Downloading llvmlite-0.44.0-cp310-cp310-win_amd64.whl.metadata (5.0 kB)
Do

In [12]:
pip install pyaudio

Collecting pyaudioNote: you may need to restart the kernel to use updated packages.

  Downloading PyAudio-0.2.14-cp310-cp310-win_amd64.whl.metadata (2.7 kB)
Downloading PyAudio-0.2.14-cp310-cp310-win_amd64.whl (164 kB)
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.14


In [7]:
import os

data_path = r"C:\Users\akram\Downloads\Audio_Song_Actors_01-24"

# List only .wav files
wav_files = [f for f in os.listdir(data_path) if f.endswith(".wav")]
print(f"Total .wav files: {len(wav_files)}")
print(f"Sample files: {wav_files[:5]}")  # Show first 5 files


Total .wav files: 0
Sample files: []


In [8]:
import os

data_path = r"C:\Users\akram\Downloads\Audio_Song_Actors_01-24"

# List all subdirectories
subdirs = [f for f in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, f))]
print(f"Subdirectories: {subdirs}")

# Check files inside the first subdirectory
if subdirs:
    first_subdir = os.path.join(data_path, subdirs[0])
    files = os.listdir(first_subdir)
    print(f"Files in {subdirs[0]}: {files[:5]}")  # Show first 5 files


Subdirectories: ['Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05', 'Actor_06', 'Actor_07', 'Actor_08', 'Actor_09', 'Actor_10', 'Actor_11', 'Actor_12', 'Actor_13', 'Actor_14', 'Actor_15', 'Actor_16', 'Actor_17', 'Actor_18', 'Actor_19', 'Actor_20', 'Actor_21', 'Actor_22', 'Actor_23', 'Actor_24']
Files in Actor_01: ['03-02-01-01-01-01-01.wav', '03-02-01-01-01-02-01.wav', '03-02-01-01-02-01-01.wav', '03-02-01-01-02-02-01.wav', '03-02-02-01-01-01-01.wav']


In [9]:
import os
import librosa
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Feature extraction function
def extract_features(file_path, max_pad_len=174):
    try:
        audio, sample_rate = librosa.load(file_path, sr=None)  # Load audio
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)  # Extract MFCCs
        pad_width = max_pad_len - mfccs.shape[1]  # Pad or trim to fixed length
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        return mfccs
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Dataset path
data_path = r"C:\Users\akram\Downloads\Audio_Song_Actors_01-24"

# Emotion labels from filename
emotions = {"01": "Neutral", "02": "Calm", "03": "Happy", "04": "Sad", 
            "05": "Angry", "06": "Fearful", "07": "Disgust", "08": "Surprised"}

features, labels = [], []

# Recursively scan all subdirectories
for subdir in os.listdir(data_path):
    subdir_path = os.path.join(data_path, subdir)
    if os.path.isdir(subdir_path):  # Only process folders
        for file in os.listdir(subdir_path):
            if file.endswith(".wav"):
                try:
                    emotion_label = emotions[file.split("-")[2]]  # Extract emotion from filename
                    file_path = os.path.join(subdir_path, file)
                    feature = extract_features(file_path)
                    if feature is not None:
                        features.append(feature)
                        labels.append(emotion_label)
                except KeyError:
                    print(f"Skipping unknown emotion label in file: {file}")

# Convert lists to NumPy arrays
X = np.array(features)
y = np.array(labels)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # Convert emotions to numerical labels

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Reshape for LSTM (samples, timesteps, features, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

# Output dataset info
print(f"Dataset loaded: {len(X_train)} train samples, {len(X_test)} test samples")
print(f"Classes: {label_encoder.classes_}")


Dataset loaded: 809 train samples, 203 test samples
Classes: ['Angry' 'Calm' 'Fearful' 'Happy' 'Neutral' 'Sad']


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten, Conv2D, MaxPooling2D

# Build model
model = Sequential([
    Conv2D(32, (3, 3), activation="relu", input_shape=(40, 174, 1)),  # CNN Layer
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(len(np.unique(y)), activation="softmax")  # Output layer
])

# Compile model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train model
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test))

# Save model
model.save("voice_emotion_model.h5")
print("Model training complete and saved!")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 87ms/step - accuracy: 0.1819 - loss: 94.3556 - val_accuracy: 0.1823 - val_loss: 1.7905
Epoch 2/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 78ms/step - accuracy: 0.1801 - loss: 1.7866 - val_accuracy: 0.1823 - val_loss: 1.8130
Epoch 3/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 80ms/step - accuracy: 0.1745 - loss: 1.8134 - val_accuracy: 0.1823 - val_loss: 1.7857
Epoch 4/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 84ms/step - accuracy: 0.1859 - loss: 1.7841 - val_accuracy: 0.1823 - val_loss: 1.7835
Epoch 5/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 78ms/step - accuracy: 0.1850 - loss: 1.7834 - val_accuracy: 0.1823 - val_loss: 1.7816
Epoch 6/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - accuracy: 0.1949 - loss: 1.7855 - val_accuracy: 0.1823 - val_loss: 1.7798
Epoch 7/50
[1m51/51[0m [32m━━━



Model training complete and saved!


In [27]:
import pyaudio
import wave

def record_audio(output_file, duration=3, sample_rate=22050):
    """Record audio for a given duration and save it as a WAV file."""
    chunk = 1024
    format = pyaudio.paInt16
    channels = 1

    audio = pyaudio.PyAudio()
    stream = audio.open(format=format, channels=channels, rate=sample_rate, input=True, frames_per_buffer=chunk)

    print("Recording...")
    frames = []
    for _ in range(0, int(sample_rate / chunk * duration)):
        data = stream.read(chunk)
        frames.append(data)

    print("Recording complete!")
    stream.stop_stream()
    stream.close()
    audio.terminate()

    wf = wave.open(output_file, "wb")
    wf.setnchannels(channels)
    wf.setsampwidth(audio.get_sample_size(format))
    wf.setframerate(sample_rate)
    wf.writeframes(b"".join(frames))
    wf.close()

def predict_emotion(file_path):
    """Predict emotion from an audio file."""
    model = tf.keras.models.load_model("voice_emotion_model.h5")
    feature = extract_features(file_path)
    if feature is not None:
        feature = feature.reshape(1, feature.shape[0], feature.shape[1], 1)
        prediction = model.predict(feature)
        emotion = label_encoder.inverse_transform([np.argmax(prediction)])[0]
        print(f"Predicted Emotion: {emotion}")
    else:
        print("Error in extracting features!")

# Record and predict emotion
record_audio("test_audio.wav", duration=3)
predict_emotion("test_audio.wav")

Recording...




Recording complete!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Predicted Emotion: Calm


In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, LSTM, Dense, Dropout, TimeDistributed

model = Sequential([
    Conv2D(32, (3, 3), activation="relu", input_shape=(40, 174, 1)),  # CNN feature extraction
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    
    Conv2D(64, (3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    
    TimeDistributed(Flatten()),  # Converts CNN features for LSTM
    LSTM(64, return_sequences=True),  
    LSTM(64),  

    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(len(np.unique(y)), activation="softmax")  # Output layer
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [26]:
import time
import pyaudio
import wave

def record_audio(output_file, duration=3, sample_rate=44100):
    """Record audio for a given duration and save it as a WAV file."""
    chunk = 1024
    format = pyaudio.paInt16
    channels = 1

    audio = pyaudio.PyAudio()
    stream = audio.open(format=format, channels=channels, rate=sample_rate, input=True, frames_per_buffer=chunk)

    print("Starting in 2 seconds... Speak clearly.")
    time.sleep(2)

    print("Recording...")
    frames = []
    for _ in range(0, int(sample_rate / chunk * duration)):
        data = stream.read(chunk)
        frames.append(data)

    print("Recording complete!")
    stream.stop_stream()
    stream.close()
    audio.terminate()

    wf = wave.open(output_file, "wb")
    wf.setnchannels(channels)
    wf.setsampwidth(audio.get_sample_size(format))
    wf.setframerate(sample_rate)
    wf.writeframes(b"".join(frames))
    wf.close()
