In [152]:
import os

import numpy as np
import pandas as pd

import glob

import time
from tqdm import tqdm

import librosa
import librosa.display
import soundfile as sf # librosa fails when reading files on Kaggle.


import matplotlib.pyplot as plt
import IPython.display as ipd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [153]:
file_names = glob.glob("./AnnotatedAudioFiles/*.wav")
file_names = sorted(file_names)

In [154]:
print(file_names)

['./AnnotatedAudioFiles/code0_dating-violence_01.wav', './AnnotatedAudioFiles/code0_dating-violence_02.wav', './AnnotatedAudioFiles/code0_dating-violence_03.wav', './AnnotatedAudioFiles/code0_dating-violence_04.wav', './AnnotatedAudioFiles/code0_dating-violence_05.wav', './AnnotatedAudioFiles/code0_dating-violence_06.wav', './AnnotatedAudioFiles/code0_dating-violence_07.wav', './AnnotatedAudioFiles/code0_dating-violence_08.wav', './AnnotatedAudioFiles/code0_dating-violence_09.wav', './AnnotatedAudioFiles/code0_dating-violence_10.wav', './AnnotatedAudioFiles/code0_dating-violence_11.wav', './AnnotatedAudioFiles/code0_dating-violence_12.wav', './AnnotatedAudioFiles/code0_dating-violence_13.wav', './AnnotatedAudioFiles/code0_dating-violence_14.wav', './AnnotatedAudioFiles/code0_dating-violence_15.wav', './AnnotatedAudioFiles/code0_dating-violence_16.wav', './AnnotatedAudioFiles/code0_dating-violence_17.wav', './AnnotatedAudioFiles/code0_dating-violence_18.wav', './AnnotatedAudioFiles/code

In [234]:
to_df = []
for f in file_names:
    tmp = f.split('/')[2].split('_')[0]
    if tmp in ('code0'):
        to_df.append([f, int(tmp[-1]), 'violence', 1])
    else:
        to_df.append([f, int(tmp[-1]), 'normal', 0])

In [235]:
df = pd.DataFrame(to_df, columns=['path', 'code', 'violence', 'label'])

In [236]:
df

Unnamed: 0,path,code,violence,label
0,./AnnotatedAudioFiles/code0_dating-violence_01...,0,violence,1
1,./AnnotatedAudioFiles/code0_dating-violence_02...,0,violence,1
2,./AnnotatedAudioFiles/code0_dating-violence_03...,0,violence,1
3,./AnnotatedAudioFiles/code0_dating-violence_04...,0,violence,1
4,./AnnotatedAudioFiles/code0_dating-violence_05...,0,violence,1
...,...,...,...,...
1667,./AnnotatedAudioFiles/code5_normal_3(time9000t...,5,normal,0
1668,./AnnotatedAudioFiles/code5_normal_3(time900to...,5,normal,0
1669,./AnnotatedAudioFiles/code5_normal_3(time9300t...,5,normal,0
1670,./AnnotatedAudioFiles/code5_normal_3(time9600t...,5,normal,0


In [237]:
df['violence'].value_counts()

normal      1244
violence     428
Name: violence, dtype: int64

In [159]:
def mean_mfccs(x):
    return [np.mean(feature) for feature in librosa.feature.mfcc(x)]

def parse_audio(x):
    return x.flatten('F')[:x.shape[0]] 

def get_audios():
    train_file_names = df['path']
    samples = []
    for file_name in tqdm(train_file_names):
        x, sr = sf.read(file_name, always_2d=True)
        if not len(x) == 0:
            x = parse_audio(x)
            samples.append(mean_mfccs(x))
        else:
            print(file_name)
    return np.array(samples)

def get_samples():
    return get_audios()

In [160]:
X = get_samples()

100%|██████████| 1672/1672 [07:05<00:00,  3.93it/s]


In [238]:
Y = df['code'].values

In [239]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [240]:
print(f'Shape: {x_train.shape}')
print(f'Observation: \n{x_train[0]}')
print(f'Labels: {y_train}')

Shape: (1254, 20)
Observation: 
[-239.98212318  148.60842833  -54.19502976   36.05074913  -11.94497875
  -10.65783343   19.2374783   -22.70349071   10.92902691    7.32373047
   -9.52565376   12.04628607   -6.07585011    0.61865474    4.04895975
   -6.96640298    6.52368805   -3.64322333   -3.59081268    3.1996127 ]
Labels: [1 2 2 ... 2 3 2]


In [241]:
from collections import Counter
counter = Counter(y_train)
print(counter)

Counter({2: 333, 0: 317, 1: 269, 3: 194, 5: 141})


In [242]:
# scaler = StandardScaler()
# scaler.fit(x_train)
# x_train_scaled = scaler.transform(x_train)
# x_test_scaled = scaler.transform(x_test)

# pca = PCA().fit(x_train_scaled)

# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel('Number of Components')
# plt.ylabel('Variance (%)')
# plt.show()

In [243]:
grid_params = {
    'n_neighbors': [3, 5, 7, 9, 11, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

model = GridSearchCV(KNeighborsClassifier(), grid_params, cv=5, n_jobs=-1)
model.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 7, 9, 11, 15],
                         'weights': ['uniform', 'distance']})

In [244]:
print(f'Model Score: {model.score(x_test, y_test)}')
>>> Model Score: 0.9497607655502392 
y_predict = model.predict(x_test)
print(f'Confusion Matrix: \n{confusion_matrix(y_predict, y_test)}')

Model Score: 0.9497607655502392
Confusion Matrix: 
[[106   0   0   0   0]
 [  2  87   7   1   0]
 [  3   1  92   7   0]
 [  0   0   0  70   0]
 [  0   0   0   0  42]]


In [245]:
set(y_predict)

{0, 1, 2, 3, 5}

In [246]:
model.best_params_

{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}

In [247]:
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors=5, metric='manhattan', weights='distance')
kn.fit(x_train, y_train)
print(kn.score(x_test, y_test))
>>> 0.9497607655502392
y_predict = kn.predict(x_test)
print(f'Confusion Matrix: \n{confusion_matrix(y_predict, y_test)}')

0.9497607655502392
Confusion Matrix: 
[[106   0   0   0   0]
 [  2  87   7   1   0]
 [  3   1  92   7   0]
 [  0   0   0  70   0]
 [  0   0   0   0  42]]


In [248]:
import joblib

In [249]:
joblib.dump(kn, 'knn_code_model_code.pkl')

['knn_code_model_code.pkl']