In [6]:
%%capture
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tabulate import tabulate
from collections import defaultdict
from xgboost import XGBClassifier as XGB
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.linear_model import RidgeClassifier as Ridge
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import ExtraTreesClassifier as ExtraTrees
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
import tensorflow as tf

In [2]:
# CSV file that the loading cell reads into `df`.
EXTRACTED_FEATURES_FILE = 'all_extracted_features_iemocap.csv'

# Disable pandas' column/row truncation so frames are displayed in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Seed TensorFlow's global random generator.
tf.random.set_seed(1)

# Render matplotlib figures at 300 dpi.
plt.rcParams.update({'figure.dpi': 300})

In [3]:
# Load the extracted features. Sorting happens BEFORE filtering (as it always
# did) so the resulting row order -- and therefore the cross-validation folds
# built from it later -- stays the same.
df = (
    pd.read_csv(EXTRACTED_FEATURES_FILE)
    .sort_values(['Emotion_Id', 'Gender'], ascending=(True, True))
    .set_index('File')
)

# Keep only the emotion categories under study whose three ratings all lie in
# the closed interval [1, 5] (`between` is inclusive on both ends by default).
keep_emotion = df['Emotion'].isin({'ang', 'neu', 'hap', 'exc', 'sad'})
ratings_in_range = (
    df['Activation'].between(1, 5)
    & df['Valence'].between(1, 5)
    & df['Dominance'].between(1, 5)
)

# Explicit copy: the `.loc` writes below must target an independent frame, not
# a filtered selection of the loaded one (avoids SettingWithCopyWarning on
# pandas versions without copy-on-write).
df = df[keep_emotion & ratings_in_range].copy()

# Expand the short emotion codes; 'exc' is folded into 'happy'.
for short_code, full_name in {'ang': 'angry', 'neu': 'neutral', 'exc': 'happy', 'hap': 'happy'}.items():
    df.loc[df['Emotion'] == short_code, 'Emotion'] = full_name

# Fold id 5 into id 1 so the merged 'happy' class has a single id.
# NOTE(review): id 5 is presumably the original id of 'exc' -- confirm against the CSV.
df.loc[df['Emotion_Id'] == 5, 'Emotion_Id'] = 1

print(f"Number of Audio Files: {df.shape[0]}")
df.groupby(['Emotion', 'Emotion_Id']).agg({'Emotion': ['count']})

Number of Audio Files: 5526


Unnamed: 0_level_0,Unnamed: 1_level_0,Emotion
Unnamed: 0_level_1,Unnamed: 1_level_1,count
Emotion,Emotion_Id,Unnamed: 2_level_2
angry,0,1103
happy,1,1633
neutral,3,1706
sad,2,1084


In [4]:
# Columns from position 8 onwards are used as model inputs.
# NOTE(review): the first 8 columns are presumably labels/annotations -- confirm
# against the CSV header before changing this offset.
features = df.columns[8:].values
data = df.iloc[:, 8:]

# Class id of every row, as a plain array.
labels = df.loc[:, "Emotion_Id"].values

In [7]:
def categorical_cross_validation(clf, data, labels, cv=5):
    """Score a classifier with shuffled, stratified k-fold cross-validation.

    A fresh model is created for every fold by calling ``clf()``; it is fitted
    on the training part of the fold and scored on the held-out part with
    plain accuracy.

    Parameters
    ----------
    clf : callable
        Zero-argument factory returning an unfitted estimator that provides
        ``fit(X, y)`` and ``predict(X)``.
    data : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    labels : array-like
        Class label for each row of ``data``.
    cv : int, default 5
        Number of folds.

    Returns
    -------
    list of float
        Held-out accuracy of each fold, in fold order.
    """
    # Fixed random_state: repeated calls produce the same splits.
    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=1)

    # split() yields positional indices. Converting the labels to an array
    # guarantees positional indexing even when a pandas Series (whose []
    # indexing is label-based) is passed in.
    y = np.asarray(labels)

    # Pandas objects are kept as-is (and indexed through .iloc); anything else
    # is treated as a plain array.
    is_pandas = hasattr(data, "iloc")
    X = data if is_pandas else np.asarray(data)

    scores = []
    for train, test in kfold.split(X, y):
        model = clf()

        if is_pandas:
            X_train, X_test = X.iloc[train], X.iloc[test]
        else:
            X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        scores.append(metrics.accuracy_score(y_test, y_pred))

    return scores

def compare_clfs(clfs, data, labels, cv=5):
    """Cross-validate several classifiers and summarise their fold accuracies.

    Parameters
    ----------
    clfs : iterable of callable
        Zero-argument factories, each returning a new unfitted classifier.
    data, labels
        Passed unchanged to ``categorical_cross_validation``.
    cv : int, default 5
        Number of cross-validation folds used for every classifier.

    Returns
    -------
    collections.defaultdict
        Maps the classifier's class name to ``(mean, std)`` of its per-fold
        accuracies. Results are keyed by class name, so two factories that
        build the same class overwrite each other.
    """
    # Kept as a defaultdict so callers receive the same mapping type as before.
    clfs_scores = defaultdict(dict)

    for clf in tqdm(clfs):
        # One accuracy value per fold.
        scores = categorical_cross_validation(clf, data, labels, cv=cv)

        # A throwaway instance is built only to read the class name.
        name = type(clf()).__name__
        clfs_scores[name] = (np.mean(scores), np.std(scores))

    return clfs_scores

def get_new_model0():
    """Build a new, unfitted random forest (seed 1, ``max_features=None``)."""
    return RandomForest(max_features=None, random_state=1)


def get_new_model1():
    """Build a new, unfitted extra-trees classifier (seed 1, ``max_features=None``)."""
    return ExtraTrees(max_features=None, random_state=1)


def get_new_model2():
    """Build a new, unfitted XGBoost classifier (seed 1, ``multi:softprob`` objective)."""
    return XGB(objective="multi:softprob", random_state=1)


def get_new_model3():
    """Build a new, unfitted ridge classifier (seed 1)."""
    return Ridge(random_state=1)
    
# Model factories (not instances): a fresh estimator is built from each one
# for every cross-validation fold.
clfs = [get_new_model0, get_new_model1, get_new_model2, get_new_model3]

# Cross-validate every classifier on the full feature set.
clfs_scores = compare_clfs(clfs, data, labels)

100%|██████████| 4/4 [04:48<00:00, 72.13s/it] 


Classifier comparison using all features

In [8]:
# Each entry: (heading to print, position inside the (mean, std) tuple to rank
# by, whether to sort in descending order).
rankings = (
    ("Classifiers Ordered by Highest Mean Accuracy:", 0, True),
    ("\nClassifiers Ordered by Lowest Standard Deviation Accuracy:", 1, False),
)

for heading, stat_index, descending in rankings:
    # sorted() is stable, so classifiers with equal statistics keep their
    # original relative order.
    ranked = sorted(
        clfs_scores.items(),
        key=lambda entry: entry[1][stat_index],
        reverse=descending,
    )
    table = pd.DataFrame.from_dict(dict(ranked), orient="index", columns=["Mean", "Std"])

    print(heading)
    print(tabulate(table, headers='keys'))

Classifiers Ordered by Highest Mean Accuracy:
                            Mean         Std
----------------------  --------  ----------
XGBClassifier           0.638073  0.0122026
ExtraTreesClassifier    0.625766  0.00926962
RandomForestClassifier  0.620521  0.00523204
RidgeClassifier         0.614545  0.0196593

Classifiers Ordered by Lowest Standard Deviation Accuracy:
                            Mean         Std
----------------------  --------  ----------
RandomForestClassifier  0.620521  0.00523204
ExtraTreesClassifier    0.625766  0.00926962
XGBClassifier           0.638073  0.0122026
RidgeClassifier         0.614545  0.0196593
