In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tensorflow import keras
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Input, Dropout, BatchNormalization

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
import joblib
from tensorflow.keras.models import load_model

# 1 - Load dataset

In [5]:
data = pd.read_csv('./Data/final_dataset.csv')
data.head()

Unnamed: 0,id,seen_by_model,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label,createdAt,updatedAt,deletedAt
0,9470,1,0.441011,0.098403,0.308838,0.010569,3328.124539,810754.9,3382.69156,114583.284474,...,-0.968601,87.470032,2.991356,59.765133,-1.670792,59.670425,7,2021-11-22 17:52:43,2021-11-22 17:52:43,
1,9471,1,0.370265,0.083063,0.171207,0.006743,2915.715009,1556398.0,2599.689672,324970.562618,...,5.818873,138.011856,4.596396,144.546539,1.729125,211.818741,7,2021-11-22 17:52:43,2021-11-22 17:52:43,
2,9472,1,0.385647,0.071537,0.229402,0.015564,3880.270183,2722230.0,3186.927682,254165.389259,...,2.303481,64.394051,0.785539,59.500866,-2.959064,73.443031,7,2021-11-22 17:52:43,2021-11-22 17:52:43,
3,9473,1,0.407606,0.107868,0.079086,0.000538,2271.813617,639197.8,2707.634095,187545.070021,...,-3.473018,122.847267,-2.771165,68.30822,-0.633971,63.867836,7,2021-11-22 17:52:43,2021-11-22 17:52:43,
4,9474,1,0.397764,0.07233,0.242004,0.005612,2911.060481,548835.0,2956.11896,145420.851816,...,3.052081,117.361862,-1.91587,47.333481,2.078118,60.737377,7,2021-11-22 17:52:43,2021-11-22 17:52:43,


In [6]:
data.columns

Index(['id', 'seen_by_model', 'chroma_stft_mean', 'chroma_stft_var',
       'rms_mean', 'rms_var', 'spectral_centroid_mean',
       'spectral_centroid_var', 'spectral_bandwidth_mean',
       'spectral_bandwidth_var', 'rolloff_mean', 'rolloff_var',
       'zero_crossing_rate_mean', 'zero_crossing_rate_var', 'harmony_mean',
       'harmony_var', 'tempo', 'mfcc1_mean', 'mfcc1_var', 'mfcc2_mean',
       'mfcc2_var', 'mfcc3_mean', 'mfcc3_var', 'mfcc4_mean', 'mfcc4_var',
       'mfcc5_mean', 'mfcc5_var', 'mfcc6_mean', 'mfcc6_var', 'mfcc7_mean',
       'mfcc7_var', 'mfcc8_mean', 'mfcc8_var', 'mfcc9_mean', 'mfcc9_var',
       'mfcc10_mean', 'mfcc10_var', 'mfcc11_mean', 'mfcc11_var', 'mfcc12_mean',
       'mfcc12_var', 'mfcc13_mean', 'mfcc13_var', 'mfcc14_mean', 'mfcc14_var',
       'mfcc15_mean', 'mfcc15_var', 'mfcc16_mean', 'mfcc16_var', 'mfcc17_mean',
       'mfcc17_var', 'mfcc18_mean', 'mfcc18_var', 'mfcc19_mean', 'mfcc19_var',
       'mfcc20_mean', 'mfcc20_var', 'label', 'createdAt', 'upda

In [7]:
drop_layers = ['id', 'seen_by_model', 'createdAt', 'updatedAt', 'deletedAt']
data = data.drop(columns=drop_layers)
data.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,0.441011,0.098403,0.308838,0.010569,3328.124539,810754.9,3382.69156,114583.284474,7897.323031,1917331.0,...,42.023411,-5.012711,66.960938,-0.968601,87.470032,2.991356,59.765133,-1.670792,59.670425,7
1,0.370265,0.083063,0.171207,0.006743,2915.715009,1556398.0,2599.689672,324970.562618,5661.195497,5154728.0,...,94.394127,1.313031,144.447861,5.818873,138.011856,4.596396,144.546539,1.729125,211.818741,7
2,0.385647,0.071537,0.229402,0.015564,3880.270183,2722230.0,3186.927682,254165.389259,7703.815192,5101757.0,...,89.115379,5.156198,82.11869,2.303481,64.394051,0.785539,59.500866,-2.959064,73.443031,7
3,0.407606,0.107868,0.079086,0.000538,2271.813617,639197.8,2707.634095,187545.070021,5239.177972,3272871.0,...,58.049915,-11.866402,100.287689,-3.473018,122.847267,-2.771165,68.30822,-0.633971,63.867836,7
4,0.397764,0.07233,0.242004,0.005612,2911.060481,548835.0,2956.11896,145420.851816,6284.660711,2669257.0,...,50.164764,3.50523,64.7435,3.052081,117.361862,-1.91587,47.333481,2.078118,60.737377,7


In [15]:
corr = data.corr().label
d = corr[(corr < -0.05) | (corr >= 0)]
not_d_cols = list(corr[(corr > -0.1) & (corr < 0.1)].index) + ['label']
print(len(d))

51


# 2 - Pre-process data

In [16]:
data = data.sample(frac=1)

X = data.drop(columns=not_d_cols)
y = data.label

In [17]:
X.head()

Unnamed: 0,chroma_stft_mean,rms_mean,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,zero_crossing_rate_mean,zero_crossing_rate_var,...,mfcc8_mean,mfcc9_mean,mfcc10_mean,mfcc11_mean,mfcc12_mean,mfcc13_var,mfcc14_mean,mfcc16_mean,mfcc18_mean,mfcc20_var
11293,0.405778,0.137611,2778.788868,1006514.0,2729.904294,144891.922787,5906.192902,2679924.0,0.122871,0.107774,...,6.060346,5.871454,6.01303,-4.361856,8.058302,100.39476,9.925915,4.862923,3.597121,79.213356
12471,0.11713,0.159906,912.852102,18683.48,1249.50242,19498.686402,1625.357148,110360.8,0.04429,0.042329,...,3.412008,-2.406624,-3.820136,-12.571994,-4.409249,34.084389,-0.867796,-4.911867,-3.018001,22.610453
16297,0.310476,0.077152,1508.299992,92494.21,1888.571698,98090.452026,3129.508841,588767.6,0.065882,0.061542,...,4.498883,-7.883408,8.13259,-7.473206,0.264852,52.306938,-3.762148,3.227815,2.629616,63.270176
4512,0.144873,0.157399,2345.659311,180851.1,2285.741002,54629.904141,5062.339659,620310.9,0.123477,0.10823,...,19.18622,-19.09424,11.536619,-8.293885,17.819607,58.953903,1.110843,5.375949,6.028002,71.465797
17175,0.522142,0.105582,1198.589771,208658.2,1870.436167,178024.577045,2363.231572,1942687.0,0.039764,0.038183,...,1.919226,-7.23874,3.190215,-5.991389,2.171602,32.406166,5.710892,2.922934,5.966251,56.034161


In [19]:
count = []
tdf = pd.DataFrame(data=y, columns=['label'])

for i in range(10):
    count.append(len(tdf[tdf.label == i]))
    
count

[1450, 2994, 1396, 2423, 1919, 2237, 1529, 1475, 1550, 1201]

## 2.1 - Scale data

In [22]:
final_scaler = StandardScaler()
final_scaler.fit(X)
X_scaled = pd.DataFrame(data=final_scaler.transform(X), index=X.index, columns=X.columns)

X_scaled.head()

Unnamed: 0,chroma_stft_mean,rms_mean,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,zero_crossing_rate_mean,zero_crossing_rate_var,...,mfcc8_mean,mfcc9_mean,mfcc10_mean,mfcc11_mean,mfcc12_mean,mfcc13_var,mfcc14_mean,mfcc16_mean,mfcc18_mean,mfcc20_var
11293,0.185951,0.02426,0.80267,1.225241,0.813792,0.023571,0.822095,0.525752,0.581472,0.648594,...,-0.056145,1.354857,0.047227,-0.004411,0.686255,0.990533,1.444643,0.529465,0.389622,0.359136
12471,-1.840819,0.289567,-1.523312,-0.95386,-1.564394,-0.931789,-1.57556,-1.074266,-1.1768,-1.257116,...,-0.333932,0.28599,-1.195425,-1.238491,-1.187346,-0.705532,-0.492252,-1.374192,-0.979614,-0.908283
16297,-0.483221,-0.695175,-0.781057,-0.791038,-0.537764,-0.333005,-0.733099,-0.776371,-0.693674,-0.697647,...,-0.219929,-0.421173,0.315083,-0.472083,-0.484931,-0.239441,-1.011634,0.211025,0.189362,0.002146
4512,-1.64602,0.259733,0.262753,-0.596127,0.100267,-0.664127,0.349461,-0.75673,0.595036,0.661897,...,1.320639,-1.868717,0.745263,-0.595441,2.153166,-0.069428,-0.137192,0.629378,0.89278,0.185657
17175,1.003007,-0.35687,-1.167126,-0.534786,-0.566897,0.276006,-1.162283,0.066689,-1.278073,-1.377834,...,-0.490511,-0.337933,-0.309502,-0.249349,-0.198388,-0.748457,0.688271,0.151649,0.879998,-0.159879


In [223]:
X_test.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc8_mean,mfcc8_var,mfcc9_mean,mfcc10_mean,mfcc11_mean,mfcc12_mean,mfcc14_mean,mfcc16_mean,mfcc18_mean,mfcc20_mean
0,1.038016,0.437375,-0.018693,-0.420876,0.399244,1.272321,0.839106,0.383064,0.647317,1.180482,...,0.088448,0.148624,-0.852598,0.274901,0.151997,0.696389,0.877523,0.104179,-0.14586,0.493358
1,-0.804889,0.660655,-0.878917,-0.633717,-1.700373,-1.021189,-2.176274,-1.018944,-1.669184,-1.179531,...,-1.511542,-0.91468,-0.964445,-1.455015,0.119247,-1.286048,-0.54693,-1.381066,-1.673007,-0.836715
2,-1.025851,-0.40633,-0.731424,-0.505002,1.414666,0.245237,0.270836,-0.451969,0.860895,-0.384619,...,1.552458,0.468918,-1.405738,1.571759,-0.021693,-0.23566,-1.539155,-1.170925,0.308637,0.909053
3,-0.717744,-0.664326,-0.17544,-0.534129,0.348697,-0.076541,0.637386,0.040909,0.488984,0.275458,...,0.286607,-0.630285,0.885885,0.356427,-0.291013,0.290349,0.041648,0.068601,0.627952,1.063232
4,-1.784929,-0.545532,-1.052454,-0.538131,-1.900019,-1.035448,-2.67512,-1.086872,-1.926208,-1.201716,...,-1.224558,-0.747721,-0.056967,-2.124992,-1.764973,-2.225508,-0.10294,-0.209421,-0.592426,1.118572


## 2.2 - Detect and handle outliers

In [8]:
#outliers = {}
threshold = 3 # 3 standard deviations

for feature in data.drop(columns='label').columns:
    median = data[feature].median()
    std = data[feature].std()

    #outliers = training_set[(abs(training_set[feature] - median) / std) > threshold].index
    
    Q1 = data[feature].quantile(0.25)
    Q3 = data[feature].quantile(0.75)
    IQR = Q3 - Q1

    [MnIF, MxIF] = [Q1-1.5*IQR, Q3+1.5*IQR]
    [MnOF, MxOF] = [Q1-3*IQR, Q3+3*IQR]

    upper_outliers = data[data[feature] >= MxIF].index
    lower_outliers = data[data[feature] <= MnIF].index

    data.loc[data.index.isin(upper_outliers), feature] = data[feature].mean()
    data.loc[data.index.isin(lower_outliers), feature] = data[feature].mean()
    
    
    # Mean imputation
    #training_set.loc[training_set.index.isin(outliers), feature] = training_set[feature].mean()
    # Median imputation
    #training_set.loc[training_set.index.isin(outliers), feature] = training_set[feature].mean()
    # Mean imputation
    #training_set.loc[training_set.index.isin(outliers), feature] = training_set[feature].mean()


# 4 - Train final models

In [236]:
X.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc8_mean,mfcc8_var,mfcc9_mean,mfcc10_mean,mfcc11_mean,mfcc12_mean,mfcc14_mean,mfcc16_mean,mfcc18_mean,mfcc20_mean
5035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2552,0.222058,0.02972,0.260106,0.001125,2361.785928,395973.870836,2451.662615,160860.568466,5195.896965,2008947.0,...,12.749134,57.742821,-10.305162,8.566433,-0.597312,-1.269005,3.08984,2.168136,5.157101,7.099347
306,0.432984,0.116189,0.140183,0.001239,2522.644299,721467.30223,2483.697979,225216.839865,5116.209125,3071948.0,...,5.278366,57.57169,-6.27073,-0.955436,-11.8343,-4.025055,-6.103829,-4.921802,-1.081177,1.801386
1655,0.209843,0.035562,0.157989,0.000431,2533.643748,60045.283529,2436.029634,29063.936845,5117.657995,341132.8,...,10.422565,40.736332,2.626701,10.140564,2.578638,1.709377,4.367891,7.870467,8.787485,5.467753
7985,0.370385,0.121341,0.045105,0.00042,1705.805875,197029.361958,1804.509314,72157.000897,3373.243717,702802.0,...,-0.161508,109.61171,-5.889575,-3.246834,-10.343292,1.974046,-0.054398,0.725202,-3.010236,5.518209


In [10]:
X = data.drop(columns='label')
y = data.label

final_scaler = StandardScaler()
final_scaler.fit(X)
X_scaled = pd.DataFrame(data=final_scaler.transform(X), columns=X.columns)
X_scaled.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.480771,0.617136,0.176931,0.642355,1.513473,1.245885,1.996466,-0.125414,1.937967,0.129818,...,-1.889295,-0.346978,-0.348972,0.684966,-0.65736,1.539252,1.512059,0.215987,-0.612374,0.098847
1,-0.040184,0.18764,0.722057,0.642355,0.992917,0.164461,0.597723,2.121438,0.685216,2.434493,...,-1.1256,1.889008,1.179713,0.192624,0.992715,0.226831,1.943037,0.203691,0.167522,0.218413
2,0.073089,-0.135078,1.612301,0.642355,2.210409,0.164461,1.646756,1.365267,1.829558,2.396783,...,-0.0036,1.66363,2.108457,1.314194,0.138102,0.575868,0.919763,0.206027,-0.907886,0.568818
3,0.234783,0.88214,-0.687178,-0.824666,0.180164,0.730059,0.790553,0.653789,0.448788,1.094815,...,-1.020637,0.337279,-2.005242,2.068425,-1.266199,0.226831,-0.035267,0.537971,-0.374541,0.242078
4,0.16231,-0.112888,1.80508,2.639717,0.987042,0.458362,1.234443,0.203919,1.034501,0.665107,...,-0.399448,0.00062,1.709482,0.592915,0.320092,2.787186,0.194393,-0.252555,0.247577,0.135255


In [241]:
final_scaler.n_features_in_

35

In [230]:
len(X.columns)

35

## 4.1 - KNN

In [11]:
final_knn_model = KNeighborsClassifier(n_neighbors=2, weights='distance', p=1)
knn_pipeline = make_pipeline(StandardScaler(), final_knn_model)

knn_pipeline.fit(X, y)
accuracy_score(y, knn_pipeline.predict(X))

0.9996698580389568

In [12]:
# Save model
knn_model_file = "./model-server/ml-models/knn_v3.pkl"  
joblib.dump(knn_pipeline, knn_model_file)

['./model-server/ml-models/knn_v3.pkl']

## 4.2 - Neural Network

In [131]:
class NNWrapper:
    def __init__(self):
        self.model = keras.Sequential([
            Dense(600, activation='relu', input_shape=[len(X_train.columns)]),
            Dropout(0.5),
            Dense(300, activation='relu'),
            Dropout(0.5),
            Dense(128, activation='relu'),
            Dropout(0.4),
            Dense(10, activation='softmax')
        ])
        self.model.compile(metrics=['accuracy'], loss='sparse_categorical_crossentropy',
            optimizer='adam')
        
    def fit(self, X, y, X_val, y_val):
        early_stopping = EarlyStopping(
            patience=10,
            min_delta=0.001,
            restore_best_weights=True
        )
        return self.model.fit(
            X, y,
            validation_data=(X_val, y_val),
            epochs=1000,
            callbacks=[early_stopping],
            verbose=1)
    
    def predict(self, X):
        nn_prediction_distributions = self.model.predict(X)
        nn_prediction = []
        for i in range(len(nn_prediction_distributions)):
            nn_prediction.append(np.argmax(nn_prediction_distributions[i]))
            
        return nn_prediction
        

In [232]:
_X_scaled, X_val, _y_scaled, y_val = train_test_split(X_scaled, y, test_size=0.2)

nn_model = NNWrapper()

nn_model.fit(_X_scaled, _y_scaled, X_val, y_val)
nn_prediction = nn_model.predict(X_scaled)
 
#nn_pipeline = make_pipeline(StandardScaler(), nn_model)
#nn_pipeline.fit(X, y, nnwrapper__X_val=X, nnwrapper__y_val=y)
#nn_prediction = nn_pipeline.predict(X)
    
nn_acc = accuracy_score(y, nn_prediction)
print("Accuracy: " + str(nn_acc))

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Accuracy: 0.9576327617779258


In [242]:
# Save model
scaler_filename = "./model-server/ml-models/scaler_v2.pkl"  
neural_net_model_file = "./model-server/ml-models/neural_net_v4.h5"  
joblib.dump(final_scaler, scaler_filename)
nn_model.model.save(neural_net_model_file)