In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import pickle

In [69]:
# Load the pickled DataFrame of per-clip audio features into `audio_df`.
# NOTE: unpickling can execute arbitrary code — only load 'raw_df.pkl' if it
# comes from a trusted source.
with open('raw_df.pkl', mode='rb') as pkl_file:
    audio_df = pickle.load(pkl_file)

In [70]:
# Preview the first five rows to sanity-check the loaded columns.
audio_df.head(n=5)

Unnamed: 0,mfcc_mean1,mfcc_mean2,mfcc_mean3,mfcc_mean4,mfcc_mean5,mfcc_mean6,mfcc_mean7,mfcc_mean8,mfcc_mean9,mfcc_mean10,...,chroma_std7,chroma_std8,chroma_std9,chroma_std10,chroma_std11,chroma_std12,rolloff_mean,rolloff_std,label,fold
0,-131.9536,100.3318,-136.209613,59.544095,-41.002218,18.666155,-29.881801,7.720259,-13.521617,-2.4724,...,0.082744,0.086488,0.103079,0.091772,0.083112,0.0878,4190.474661,789.164506,6,fold1
1,-212.984794,113.986069,-27.193258,22.607542,0.675223,4.756774,15.624546,15.003598,12.439341,11.358548,...,0.235415,0.252663,0.19308,0.21045,0.206592,0.201269,3779.575026,698.880879,2,fold1
2,-58.958201,87.640273,-70.350945,-2.818272,-22.161617,-7.152818,-19.660093,-5.150989,-23.104785,-5.124065,...,0.122456,0.13395,0.129118,0.114089,0.124617,0.122561,4129.396225,773.733583,4,fold1
3,-365.765009,61.840576,8.505344,33.080384,6.873962,-5.327185,-3.849906,-5.238709,-4.314379,-4.959619,...,0.244525,0.233069,0.224638,0.231544,0.270749,0.263057,5021.219348,1082.168871,2,fold1
4,-84.87094,116.204518,-24.585843,9.107674,6.315533,3.426022,4.600642,4.316528,-0.700944,11.300272,...,0.11248,0.111136,0.099302,0.10139,0.095279,0.106619,3822.828136,411.186238,7,fold1


In [71]:
# Number of rows per class label (reveals how balanced the classes are).
audio_df['label'].value_counts()

7    1000
5    1000
4    1000
3    1000
2    1000
9    1000
0    1000
8     929
1     429
6     374
Name: label, dtype: int64

In [72]:
# Number of rows in each fold.
audio_df['fold'].value_counts()

fold4     990
fold5     936
fold3     925
fold2     888
fold1     873
fold7     838
fold10    837
fold6     823
fold9     816
fold8     806
Name: fold, dtype: int64

In [73]:
from sklearn.model_selection import GroupKFold

In [74]:
# Split the feature table into CV group ids, targets and model inputs.
# Feature columns are selected by name rather than by position, so reordering
# or appending columns in `audio_df` cannot silently leak 'label'/'fold' into X.
# NOTE(review): any other non-feature column (e.g. a leftover index column)
# would still end up in X — confirm against audio_df.columns.
groups = audio_df.loc[:, 'fold']   # fold id of each row; passed as `groups` to the CV splitter
y = audio_df.loc[:, 'label']       # class label of each row
X = audio_df.drop(columns=['label', 'fold'])
# Downstream code indexes X and y with integer index arrays, so convert both
# to plain NumPy arrays.
X, y = np.array(X), np.array(y)

Quick note on a possible extra layer of validation (not applied in the cross-validation below): "For training the proposed CNN architecture we use 1 of the 9 training folds in each split as a validation set for identifying the training epoch that yields the best model parameters when training with the remaining 8 folds." See [the paper](https://arxiv.org/pdf/1608.04363.pdf).

Results on the non-augmented dataset from the initial pull (i.e., no pitch-shifting), reported as mean accuracy ± standard deviation across the 10 cross-validation splits:
1. Logistic Mean accuracy: 0.643 +- 0.036
2. Gaussian NB Mean accuracy: 0.490 +- 0.033
3. SVM (RBF) Mean accuracy: 0.713 +- 0.022

In [75]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Grouped 10-split cross-validation: rows that share a fold id are never split
# between the training and test sides of the same split.
gkf = GroupKFold(n_splits=10)

# Per-split test accuracies, one list per model (one value appended per split).
logr_mean_acc = []
gnb_mean_acc = []
svm_mean_acc = []

for train_idx, test_idx in gkf.split(X, y, groups=groups):
    X_tr, X_te = X[train_idx], X[test_idx]
    y_tr, y_te = y[train_idx], y[test_idx]

    # Fit the scaler on the training rows only, then apply it to both sides,
    # so no statistics from the held-out rows reach the models.
    scaler = StandardScaler().fit(X_tr)
    X_tr_scale = scaler.transform(X_tr)
    X_te_scale = scaler.transform(X_te)

    # Logistic regression (C=1000, i.e. weak regularisation).
    logr = LogisticRegression(C=1000).fit(X_tr_scale, y_tr)
    logr_mean_acc.append(logr.score(X_te_scale, y_te))

    # Gaussian naive Bayes.
    gnb = GaussianNB().fit(X_tr_scale, y_tr)
    gnb_mean_acc.append(gnb.score(X_te_scale, y_te))

    # Support vector classifier with an RBF kernel.
    svm_rbf = SVC(kernel="rbf").fit(X_tr_scale, y_tr)
    svm_mean_acc.append(svm_rbf.score(X_te_scale, y_te))


# Summarise each model as mean +- standard deviation of its per-split accuracy.
print(f'Logistic Mean accuracy: {np.mean(logr_mean_acc):.3f} +- {np.std(logr_mean_acc):.3f}')
print(f'Gaussian NB Mean accuracy: {np.mean(gnb_mean_acc):.3f} +- {np.std(gnb_mean_acc):.3f}')
print(f'SVM Mean accuracy: {np.mean(svm_mean_acc):.3f} +- {np.std(svm_mean_acc):.3f}')

Logistic Mean accuracy: 0.643 +- 0.036
Gaussian NB Mean accuracy: 0.490 +- 0.033
SVM Mean accuracy: 0.713 +- 0.022


**TODO**: Present the per-fold accuracies of the models above as boxplots, one box per model.