In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import pickle

In [214]:
with open('raw_df.pkl', 'rb') as f:
    audio_df = pickle.load(f)

In [215]:
audio_df.head()

Unnamed: 0,mfcc_mean1,mfcc_mean2,mfcc_mean3,mfcc_mean4,mfcc_mean5,mfcc_mean6,mfcc_mean7,mfcc_mean8,mfcc_mean9,mfcc_mean10,...,chroma_std7,chroma_std8,chroma_std9,chroma_std10,chroma_std11,chroma_std12,rolloff_mean,rolloff_std,label,fold
0,-131.9536,100.3318,-136.209613,59.544095,-41.002218,18.666155,-29.881801,7.720259,-13.521617,-2.4724,...,0.082744,0.086488,0.103079,0.091772,0.083112,0.0878,4190.474661,789.164506,6,fold1
1,-212.984794,113.986069,-27.193258,22.607542,0.675223,4.756774,15.624546,15.003598,12.439341,11.358548,...,0.235415,0.252663,0.19308,0.21045,0.206592,0.201269,3779.575026,698.880879,2,fold1
2,-58.958201,87.640273,-70.350945,-2.818272,-22.161617,-7.152818,-19.660093,-5.150989,-23.104785,-5.124065,...,0.122456,0.13395,0.129118,0.114089,0.124617,0.122561,4129.396225,773.733583,4,fold1
3,-365.765009,61.840576,8.505344,33.080384,6.873962,-5.327185,-3.849906,-5.238709,-4.314379,-4.959619,...,0.244525,0.233069,0.224638,0.231544,0.270749,0.263057,5021.219348,1082.168871,2,fold1
4,-84.87094,116.204518,-24.585843,9.107674,6.315533,3.426022,4.600642,4.316528,-0.700944,11.300272,...,0.11248,0.111136,0.099302,0.10139,0.095279,0.106619,3822.828136,411.186238,7,fold1


In [216]:
audio_df.label.value_counts()

7    1000
5    1000
4    1000
3    1000
2    1000
9    1000
0    1000
8     929
1     429
6     374
Name: label, dtype: int64

In [217]:
audio_df.fold.value_counts()

fold4     990
fold5     936
fold3     925
fold2     888
fold1     873
fold7     838
fold10    837
fold6     823
fold9     816
fold8     806
Name: fold, dtype: int64

In [218]:
from sklearn.model_selection import GroupKFold

In [219]:
groups = audio_df.loc[:, 'fold']
y = audio_df.loc[:, 'label']
X = audio_df.iloc[:, 0:-2]

**A Note on Train/Test**
The author's of this data set mention that they extracted the initial audio clips from different longer recordings. Thus, it's possible that if you naively shuffled, you could have a clip in the train and a different clip in the test that came from the same original recording. To combat this, they created 10 different folds corresponding to the 10 different original long recordings that were cut into 8,732 pieces. They suggest running 10-fold CV as the main (averaged) metric using the pre-determined folds, which will combat the problem above.

HOWEVER - I'd still like to pursue a modified standard train/validate(in CV loop)/test approach. To do this I propose the following:
1. Create 10 different splits: 1 fold for holdout, 9 folds for training/validation
2. Train 9-fold CV on the remaining data resulting from holding out 1
3. Report average test score. This way, the scored-test data was truly never seen in hyperparameter tuning phase.

Note I did run some initial models using their proposed 10-fold schema without doing this, so I have that here to see if any of this ends up making much of a difference:

Results for non-augmented data set from initial pull (i.e. no pitch-shifting) with StandardScaler on 10-fold CV of pre-determined splits:
1. Logistic Mean accuracy: 0.643 +- 0.036
2. Gaussian NB Mean accuracy: 0.490 +- 0.033
3. SVM (RBF) Mean accuracy: 0.713 +- 0.022

Poorer performance: MinMax scaler, other types of SVM kernels, PCA.

In [220]:
y_te1 = y.iloc[0:873]
y_tr1 = y.iloc[873:]
X_te1 = X.iloc[0:873, :]
X_tr1 = X.iloc[873:, :]
y_te1, y_tr1, X_te1, X_tr1 = np.array(y_te1), np.array(y_tr1), np.array(X_te1), np.array(X_tr1)
groups_2 = groups.drop(groups.index[0:873])

In [221]:
y_te2 = y.iloc[873:1761]
y_tr2 = pd.concat([y.iloc[0:873], y.iloc[1761:]])
X_te2 = X.iloc[873:1761, :]
X_tr2 = X.drop(X.index[873:1761])
y_te2, y_tr2, X_te2, X_tr2 = np.array(y_te2), np.array(y_tr2), np.array(X_te2), np.array(X_tr2)
groups_2 = groups.drop(groups.index[873:1761])

In [222]:
y_te3 = y.iloc[1761:2686]
y_tr3 = pd.concat([y.iloc[0:1761], y.iloc[2686:]])
X_te3 = X.iloc[1761:2686, :]
X_tr3 = X.drop(X.index[1761:2686])
y_te3, y_tr3, X_te3, X_tr3 = np.array(y_te3), np.array(y_tr3), np.array(X_te3), np.array(X_tr3)
groups_3 = groups.drop(groups.index[1761:2686])

In [223]:
y_te4 = y.iloc[3676:4612]
y_tr4 = pd.concat([y.iloc[0:3676], y.iloc[4612:]])
X_te4 = X.iloc[3676:4612, :]
X_tr4 = X.drop(X.index[3676:4612])
y_te4, y_tr4, X_te4, X_tr4 = np.array(y_te4), np.array(y_tr4), np.array(X_te4), np.array(X_tr4)
groups_4 = groups.drop(groups.index[3676:4612])

In [224]:
y_te5 = y.iloc[4612:5434]
y_tr5 = pd.concat([y.iloc[0:4612], y.iloc[5434:]])
X_te5 = X.iloc[4612:5434, :]
X_tr5 = X.drop(X.index[4612:5434])
y_te5, y_tr5, X_te5, X_tr5 = np.array(y_te5), np.array(y_tr5), np.array(X_te5), np.array(X_tr5)
groups_5 = groups.drop(groups.index[4612:5434])

In [225]:
y_te6 = y.iloc[5434:6273]
y_tr6 = pd.concat([y.iloc[0:5434], y.iloc[6273:]])
X_te6 = X.iloc[5434:6273, :]
X_tr6 = X.drop(X.index[5434:6273])
y_te6, y_tr6, X_te6, X_tr6 = np.array(y_te6), np.array(y_tr6), np.array(X_te6), np.array(X_tr6)
groups_6 = groups.drop(groups.index[5434:6273])

In [226]:
y_te7 = y.iloc[6273:7079]
y_tr7 = pd.concat([y.iloc[0:6273], y.iloc[7079:]])
X_te7 = X.iloc[6273:7079, :]
X_tr7 = X.drop(X.index[6273:7079])
y_te7, y_tr7, X_te7, X_tr7 = np.array(y_te7), np.array(y_tr7), np.array(X_te7), np.array(X_tr7)
groups_7 = groups.drop(groups.index[6273:7079])

In [227]:
y_te8 = y.iloc[7079:7895]
y_tr8 = pd.concat([y.iloc[0:7079], y.iloc[7895:]])
X_te8 = X.iloc[7079:7895, :]
X_tr8 = X.drop(X.index[7079:7895])
y_te8, y_tr8, X_te8, X_tr8 = np.array(y_te8), np.array(y_tr8), np.array(X_te8), np.array(X_tr8)
groups_8 = groups.drop(groups.index[7079:7895])

In [228]:
y_te9 = y.iloc[7895:8732]
y_tr9 = pd.concat([y.iloc[0:7895], y.iloc[8732:]])
X_te9 = X.iloc[7895:8732, :]
X_tr9 = X.drop(X.index[7895:8732])
y_te9, y_tr9, X_te9, X_tr9 = np.array(y_te9), np.array(y_tr9), np.array(X_te9), np.array(X_tr9)
groups_9 = groups.drop(groups.index[7895:8732])

In [229]:
group_list = [groups_1, groups_2, groups_3, groups_4, groups_5,
             groups_6, groups_7, groups_8, groups_9]

In [230]:
X_tr_list = [X_tr1, X_tr2, X_tr3, X_tr4, X_tr5, X_tr6, X_tr7, X_tr8, X_tr9]

In [231]:
y_tr_list = [y_tr1, y_tr2, y_tr3, y_tr4, y_tr5, y_tr6, y_tr7, y_tr8, y_tr9]

In [232]:
X_te_list = [X_te1, X_te2, X_te3, X_te4, X_te5, X_te6, X_te7, X_te8, X_te9]
y_te_list = [y_te1, y_te2, y_te3, y_te4, y_te5, y_te6, y_te7, y_te8, y_te9]

## Hyperparameter Tuning - Validation CV

In [235]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

logr_mean_acc, gnb_mean_acc, svm_mean_acc, rfc_mean_acc = [], [], [], []
logr_cv_mean, gnb_cv_mean, svm_cv_mean, rfc_cv_mean = [], [], [], []
logr_cv_std, gnb_cv_std, svm_cv_std, rfc_cv_std = [], [], [], []

for i in range(0,9):
    print(i)
    gkf = GroupKFold(n_splits=9)
    for train, val in gkf.split(X_tr_list[i], y_tr_list[i], groups=group_list[i]):
        X_tr, y_tr = X_tr_list[i][train], y_tr_list[i][train]
        X_val, y_val = X_tr_list[i][val], y_tr_list[i][val]

        scaler = StandardScaler()
        X_tr_scale = scaler.fit_transform(X_tr)
        X_val_scale = scaler.transform(X_val)

        logr = LogisticRegression(C=1000)
        logr.fit(X_tr_scale, y_tr)
        logr_mean_acc.append(logr.score(X_val_scale, y_val))

        svm_rbf = SVC(kernel="rbf")
        svm_rbf.fit(X_tr_scale, y_tr)
        svm_mean_acc.append(svm_rbf.score(X_val_scale, y_val))

        rfc = RandomForestClassifier(n_estimators=128)
        rfc.fit(X_tr_scale, y_tr)
        rfc_mean_acc.append(rfc.score(X_val_scale, y_val))
    
    logr_cv_mean.append(np.mean(logr_mean_acc))
    logr_cv_std.append(np.std(logr_mean_acc))
    svm_cv_mean.append(np.mean(svm_mean_acc))
    svm_cv_std.append(np.std(svm_mean_acc))
    rfc_cv_mean.append(np.mean(rfc_mean_acc))
    rfc_cv_std.append(np.std(rfc_mean_acc))
    
print(f'Logistic Train accuracy: {np.mean(logr_cv_mean):.3f} +- {np.std(logr_cv_std):.3f}')
print(f'SVM Train accuracy: {np.mean(svm_cv_mean):.3f} +- {np.std(svm_cv_std):.3f}')
print(f'Random Forest Train accuracy: {np.mean(rfc_cv_mean):.3f} +- {np.std(rfc_cv_std):.3f}');

0
1
2
3
4
5
6
7
8
Logistic Train accuracy: 0.642 +- 0.002
SVM Train accuracy: 0.708 +- 0.002
Random Forest Train accuracy: 0.688 +- 0.001


## Final Test Loop

In [236]:
logr_test_acc, gnb_test_acc, svm_test_acc, rfc_test_acc = [], [], [], []

for i in range(0,9):
    X_tr, y_tr = X_tr_list[i], y_tr_list[i]
    X_te, y_te = X_te_list[i], y_te_list[i]
    
    scaler = StandardScaler()
    X_tr_scale = scaler.fit_transform(X_tr)
    X_te_scale = scaler.transform(X_te)
    
    logr = LogisticRegression(C=1000)
    logr.fit(X_tr_scale, y_tr)
    logr_test_acc.append(logr.score(X_te_scale, y_te))

    gnb = GaussianNB()
    gnb.fit(X_tr_scale, y_tr)
    gnb_test_acc.append(gnb.score(X_te_scale, y_te))

    svm_rbf = SVC(kernel="rbf")
    svm_rbf.fit(X_tr_scale, y_tr)
    svm_test_acc.append(svm_rbf.score(X_te_scale, y_te))

    rfc = RandomForestClassifier(n_estimators=128)
    rfc.fit(X_tr_scale, y_tr)
    rfc_test_acc.append(rfc.score(X_te_scale, y_te))
    
print(f'Logistic Test accuracy: {np.mean(logr_test_acc):.3f} +- {np.std(logr_test_acc):.3f}')
print(f'GaussianNB Test accuracy: {np.mean(gnb_test_acc):.3f} +- {np.std(gnb_test_acc):.3f}')
print(f'SVM Test accuracy: {np.mean(svm_test_acc):.3f} +- {np.std(svm_test_acc):.3f}')
print(f'Random Forest Test accuracy: {np.mean(rfc_test_acc):.3f} +- {np.std(rfc_test_acc):.3f}');

Logistic Test accuracy: 0.643 +- 0.037
GaussianNB Test accuracy: 0.497 +- 0.026
SVM Test accuracy: 0.713 +- 0.023
Random Forest Test accuracy: 0.696 +- 0.042


In [26]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier

# gkf = GroupKFold(n_splits=10)
# logr_mean_acc, gnb_mean_acc, svm_mean_acc, rfc_mean_acc = [], [], [], []

# for train, test in gkf.split(X, y, groups=groups):
#     X_tr, y_tr = X[train], y[train]
#     X_te, y_te = X[test], y[test]
    
#     scaler = StandardScaler()
#     X_tr_scale = scaler.fit_transform(X_tr)
#     X_te_scale = scaler.transform(X_te)
    
#     logr = LogisticRegression(C=1000)
#     logr.fit(X_tr_scale, y_tr)
#     logr_mean_acc.append(logr.score(X_te_scale, y_te))
    
#     gnb = GaussianNB()
#     gnb.fit(X_tr_scale, y_tr)
#     gnb_mean_acc.append(gnb.score(X_te_scale, y_te))
    
#     svm_rbf = SVC(kernel="rbf")
#     svm_rbf.fit(X_tr_scale, y_tr)
#     svm_mean_acc.append(svm_rbf.score(X_te_scale, y_te))
    
#     rfc = RandomForestClassifier(n_estimators=128)
#     rfc.fit(X_tr_scale, y_tr)
#     rfc_mean_acc.append(rfc.score(X_te_scale, y_te))

    
# print(f'Logistic Mean accuracy: {np.mean(logr_mean_acc):.3f} +- {np.std(logr_mean_acc):.3f}')
# print(f'Gaussian NB Mean accuracy: {np.mean(gnb_mean_acc):.3f} +- {np.std(gnb_mean_acc):.3f}')
# print(f'SVM Mean accuracy: {np.mean(svm_mean_acc):.3f} +- {np.std(svm_mean_acc):.3f}')
# print(f'Random Forest Mean accuracy: {np.mean(rfc_mean_acc):.3f} +- {np.std(rfc_mean_acc):.3f}');

Logistic Mean accuracy: 0.643 +- 0.036
Gaussian NB Mean accuracy: 0.490 +- 0.033
SVM Mean accuracy: 0.713 +- 0.022
Random Forest Mean accuracy: 0.685 +- 0.039


**TODO**: 
1. Present accuracy of different models in terms of boxplots from above
2. Get this in terms of F1, since treats all class labels indifferently.