In [1]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb

In [2]:
# ---------- Feature Extraction ----------
def extract_features(file_path):
    try:
        audio, sr = librosa.load(file_path, sr=None)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y=audio)
        rms = librosa.feature.rms(y=audio)
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)

        # Combine all features
        features = np.hstack([
            np.mean(mfcc.T, axis=0),
            np.mean(chroma.T, axis=0),
            np.mean(zcr.T, axis=0),
            np.mean(rms.T, axis=0),
            np.mean(contrast.T, axis=0)
        ])
        return features
    except Exception as e:
        print(f"Error in {file_path}: {e}")
        return None


In [3]:
# ---------- Emotion Label Mapping ----------
# ---------- Unified Emotion Label ----------
def unify_label(emotion):
    label_map = {
        'neutral': 'neutral',
        'calm': 'neutral',
        'happy': 'happy',
        'sad': 'sad',
        'angry': 'angry',
        'fearful': 'fearful',
        'disgust': 'disgust',
        'surprised': 'surprised',
        'fear': 'fearful'
    }
    return label_map.get(emotion.lower(), None)


In [4]:
# ---------- RAVDESS ----------
ravdess_emotion_map = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}

def get_ravdess_label(file_name):
    parts = file_name.split('-')
    if len(parts) < 3:
        return None
    return ravdess_emotion_map.get(parts[2])

def load_ravdess_features(base_path):
    features = []
    for root, _, files in os.walk(base_path):
        for file in files:
            if file.endswith('.wav'):
                emotion = get_ravdess_label(file)
                full_path = os.path.join(root, file)
                mfccs = extract_features(full_path)
                if mfccs is not None and emotion is not None:
                    features.append([mfccs, emotion])
    return features


In [5]:
# ---------- Load TESS ----------
def load_tess(base_path):
    data = []
    for folder in tqdm(os.listdir(base_path), desc="TESS"):
        folder_path = os.path.join(base_path, folder)
        if not os.path.isdir(folder_path):
            continue
        emotion = folder.split('_')[-1].lower()
        label = unify_label(emotion)
        if label is None:
            continue
        for file in os.listdir(folder_path):
            if file.endswith('.wav'):
                full_path = os.path.join(folder_path, file)
                features = extract_features(full_path)
                if features is not None:
                    data.append([features, label])
    return data


In [6]:
# ---------- Load CREMA-D ----------
def load_cremad(base_path):
    emotion_map = {
        'ANG': 'angry', 'DIS': 'disgust', 'FEA': 'fearful',
        'HAP': 'happy', 'NEU': 'neutral', 'SAD': 'sad'
    }
    data = []
    for file in tqdm(os.listdir(base_path), desc="CREMA-D"):
        if file.endswith('.wav'):
            parts = file.split('_')
            if len(parts) >= 3:
                emotion = emotion_map.get(parts[2])
                label = unify_label(emotion)
                features = extract_features(os.path.join(base_path, file))
                if features is not None and label:
                    data.append([features, label])
    return data

In [7]:
def load_savee(base_path):
    emotion_map = {
        'a': 'angry', 'd': 'disgust', 'f': 'fearful',
        'h': 'happy', 'n': 'neutral', 'sa': 'sad', 'su': 'surprised'
    }
    data = []
    for file in tqdm(os.listdir(base_path), desc="SAVEE"):
        if file.endswith('.wav'):
            fname = file[:-4].lower()
            emotion = None
            for key in emotion_map:
                if fname.startswith(key):
                    emotion = emotion_map[key]
                    break
            if emotion:
                label = unify_label(emotion)
                features = extract_features(os.path.join(base_path, file))
                if features is not None and label:
                    data.append([features, label])
    return data

In [15]:
# ---------- Load All Data ----------
cremad_path = r'Crema'
ravdess_path = r'Ravdess'
savee_path = r'Savee'
tess_path = r'Tess'

all_data = (
    load_cremad(cremad_path) +
    load_ravdess_features(ravdess_path) +
    load_savee(savee_path) +
    load_tess(tess_path) 
)

CREMA-D:   0%|          | 0/7442 [00:00<?, ?it/s]

  return pitch_tuning(
CREMA-D: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7442/7442 [05:18<00:00, 23.40it/s]
SAVEE: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 480/480 [00:18<00:00, 25.49it/s]
TESS: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [01:22<00:00,  5.92s/it]


In [16]:
# ---------- Prepare Dataset ----------
df = pd.DataFrame(all_data, columns=['features', 'emotion'])
X = np.array(df['features'].tolist())
y = df['emotion'].values


In [21]:
df.to_csv('emotion_dataset.csv', index=False)

In [17]:
# ---------- Encode Labels ----------
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [18]:
from sklearn.preprocessing import StandardScaler
# ---------- Normalize ----------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [19]:
# ---------- Split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.20, stratify=y_encoded, random_state=42
)


In [20]:
from sklearn.model_selection import GridSearchCV
# ---------- Hyperparameter Tuning ----------
xgb_params = {
    'n_estimators': [500, 550],
    'max_depth': [7, 10],
    'learning_rate': [0.2, 0.1]
}
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_grid = GridSearchCV(xgb_clf, xgb_params, cv=3, n_jobs=-1, verbose=1)
xgb_grid.fit(X_train, y_train)

rf_params = {
    'n_estimators': [200, 250],
    'max_depth': [10, 20]
}
rf_clf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf_clf, rf_params, cv=3, n_jobs=-1, verbose=1)
rf_grid.fit(X_train, y_train)


Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fitting 3 folds for each of 4 candidates, totalling 12 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [10, 20], 'n_estimators': [200, 250]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,250
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# ---------- Final Ensemble ----------
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_grid.best_estimator_),
        ('rf', rf_grid.best_estimator_)
    ],
    voting='soft'
)
ensemble_model.fit(X_train, y_train)


In [15]:
# ---------- Evaluation ----------
y_pred = ensemble_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\n‚úÖ Test Accuracy: {acc:.4f}")
print("üéØ Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


‚úÖ Test Accuracy: 0.6267
üéØ Classification Report:
              precision    recall  f1-score   support

       angry       0.68      0.80      0.73       334
     disgust       0.59      0.52      0.55       334
     fearful       0.67      0.51      0.58       334
       happy       0.58      0.54      0.56       335
     neutral       0.56      0.64      0.60       298
         sad       0.63      0.71      0.66       334
   surprised       1.00      1.00      1.00        40

    accuracy                           0.63      2009
   macro avg       0.67      0.67      0.67      2009
weighted avg       0.63      0.63      0.62      2009



In [16]:
import soundfile as sf

def predict_emotion_from_audio(file_path, model, scaler, label_encoder):
    try:
        print(f"\nüîç Predicting emotion for: {file_path}")
        features = extract_features(file_path)
        if features is None:
            print("‚ö†Ô∏è Could not extract features.")
            return

        # Reshape and scale
        features = np.array(features).reshape(1, -1)
        features_scaled = scaler.transform(features)

        # Predict
        prediction = model.predict(features_scaled)
        predicted_emotion = label_encoder.inverse_transform(prediction)[0]

        print(f"üé§ Predicted Emotion: {predicted_emotion}")
        return predicted_emotion

    except Exception as e:
        print(f"‚ùå Error predicting emotion: {e}")


In [17]:
# Example: Predict emotion from a custom .wav file
test_audio_path = '/kaggle/input/voice-emotion-classification/Voice Emotion Dataset/neutral/03-01-01-01-01-01-01_aug0.wav'  # replace with your test file
predict_emotion_from_audio(test_audio_path, ensemble_model, scaler, le)



üîç Predicting emotion for: /kaggle/input/voice-emotion-classification/Voice Emotion Dataset/neutral/03-01-01-01-01-01-01_aug0.wav
üé§ Predicted Emotion: disgust


'disgust'