In [28]:
# Core Audio Processing
import librosa
import soundfile as sf
import pyAudioAnalysis
from pyAudioAnalysis import audioTrainTest as aT
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import MidTermFeatures

# Numerical Processing
import numpy as np
import pandas as pd

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Visualization
import matplotlib.pyplot as plt

# System utilities
import os
import warnings
warnings.filterwarnings('ignore')

# will be used to save the model to be used to make future predictions
import joblib

# Emotion (02): Encoded as:
# 01 = neutral
# 02 = calm
# 03 = happy
# 04 = sad
# 05 = angry
# 06 = fearful
# 07 = disgust
# 08 = surprised.

In [29]:
# Define data path
data_path = "data/audio/actor1"
files = []

# Iterate through each file and extract features
for file in os.listdir(data_path):
    if file.endswith(".wav"):
        file_path = os.path.join(data_path, file)
        # Parse labels from filename
        parts = file.split("-")
        emotion = int(parts[2])  # Extract emotion from filename
        actor = int(parts[-1].split(".")[0])  # Extract actor ID
        files.append({"file_path": file_path, "emotion": emotion, "actor": actor})

# Create DataFrame
data = pd.DataFrame(files)
print(data.head(20))

                                     file_path  emotion  actor
0   data/audio/actor1/03-01-06-01-02-02-02.wav        6      2
1   data/audio/actor1/03-01-05-01-02-01-16.wav        5     16
2   data/audio/actor1/03-01-08-01-01-01-14.wav        8     14
3   data/audio/actor1/03-01-06-01-02-02-16.wav        6     16
4   data/audio/actor1/03-01-05-01-02-01-02.wav        5      2
5   data/audio/actor1/03-01-01-01-02-02-06.wav        1      6
6   data/audio/actor1/03-01-02-01-02-01-12.wav        2     12
7   data/audio/actor1/03-01-01-01-02-02-12.wav        1     12
8   data/audio/actor1/03-01-02-01-02-01-06.wav        2      6
9   data/audio/actor1/03-01-02-02-01-01-06.wav        2      6
10  data/audio/actor1/03-01-02-02-01-01-12.wav        2     12
11  data/audio/actor1/03-01-06-02-01-02-16.wav        6     16
12  data/audio/actor1/03-01-05-02-01-01-02.wav        5      2
13  data/audio/actor1/03-01-08-02-02-01-14.wav        8     14
14  data/audio/actor1/03-01-06-02-01-02-02.wav        6

In [30]:
# this will give you the exact type of emotion the audio is displaying

emotion_dict = {
    1: "neutral",
    2: "calm",
    3: "happy",
    4: "sad",
    5: "angry",
    6: "fearful",
    7: "disgust",
    8: "surprised"
}

data["emotion_label"] = data["emotion"].map(emotion_dict)


In [19]:
print(data.head())

                                    file_path  emotion  actor emotion_label
0  data/audio/actor1/03-01-06-01-02-02-02.wav        6      2       fearful
1  data/audio/actor1/03-01-05-01-02-01-16.wav        5     16         angry
2  data/audio/actor1/03-01-08-01-01-01-14.wav        8     14     surprised
3  data/audio/actor1/03-01-06-01-02-02-16.wav        6     16       fearful
4  data/audio/actor1/03-01-05-01-02-01-02.wav        5      2         angry


In [35]:
# used to gather feautrues that will be used on the X test model
def extract_features(file_path):
    # Load the audio file
    y, sr = librosa.load(file_path, duration=2.5, offset=0.5)
    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # Compute mean of each coefficient
    mfccs_mean = mfccs.mean(axis=1)
    mfccs_std = mfccs.std()

    total = mfccs_mean + mfccs_std
    
    return mfccs_mean

# Add features to DataFrame
data["features"] = data["file_path"].apply(extract_features)

print(data.head())

                                    file_path  emotion  actor emotion_label  \
0  data/audio/actor1/03-01-06-01-02-02-02.wav        6      2       fearful   
1  data/audio/actor1/03-01-05-01-02-01-16.wav        5     16         angry   
2  data/audio/actor1/03-01-08-01-01-01-14.wav        8     14     surprised   
3  data/audio/actor1/03-01-06-01-02-02-16.wav        6     16       fearful   
4  data/audio/actor1/03-01-05-01-02-01-02.wav        5      2         angry   

                                            features  
0  [-484.8887, 38.065163, -37.226795, -2.0721314,...  
1  [-441.4606, 51.2628, -7.4304695, 3.497119, -16...  
2  [-617.22217, 55.499153, -9.51583, 9.527895, -1...  
3  [-486.03268, 51.385868, -8.118707, 5.6450877, ...  
4  [-476.39212, 65.21153, -18.103632, 3.509009, -...  


In [36]:

# Prepare features and labels
X = np.array(data["features"].tolist())
y = data["emotion"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [37]:
# Train the model
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=list(emotion_dict.values())))

joblib.dump(clf, 'emotion_model')

              precision    recall  f1-score   support

     neutral       0.70      0.41      0.52        17
        calm       0.43      0.71      0.54        28
       happy       0.65      0.59      0.62        37
         sad       0.65      0.49      0.56        45
       angry       0.80      0.78      0.79        50
     fearful       0.56      0.61      0.58        33
     disgust       0.45      0.58      0.51        33
   surprised       0.70      0.58      0.63        45

    accuracy                           0.61       288
   macro avg       0.62      0.59      0.59       288
weighted avg       0.63      0.61      0.61       288



['emotion_model']

Understanding the Model Used Chat to get a simple paragraph below

This data evaluates a classification model's performance in predicting emotions, showing metrics like precision, recall, and F1-score for each class (e.g., *happy, sad, angry*). Accuracy (0.61) indicates the model correctly predicted 61% of the 288 samples. Macro averages (precision: 0.60, recall: 0.60, F1: 0.59) show the unweighted mean performance across all classes, treating each equally. Weighted averages (precision: 0.62, recall: 0.61, F1: 0.61) account for class sizes, giving more weight to larger classes. The model performs best on "angry" (F1: 0.77) and struggles with "neutral" and "disgust." Overall, the model has moderate performance, with room for improvement, particularly in underrepresented classes.
