In [54]:
from numba.core.decorators import jit as optional_jit
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import csv 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
import subprocess

In [None]:
#deletes content of a given file
def deleteContent(fName):
    with open(fName, "w"):
        pass

In [56]:
#parses wav files from given directory(directory eval or directory containing non_target/target dirs)
#stores them in a dictionary where key is name of the recording and value is wav file's path
def parse_audio_files(directory):
    sound_files = {}
    for subdirectory in os.listdir(directory):
        if subdirectory.startswith('target') or subdirectory.startswith('non_target'):
            for filename in os.listdir(f''+directory+'/'+subdirectory):
                if filename.endswith('.wav'):
                    prefix = str(filename[0:7])
                    if prefix in sound_files:
                        sound_files[prefix].append(f''+directory+'/'+subdirectory+"/"+filename)
                    else:
                        sound_files[prefix] = [f''+directory+'/'+subdirectory+"/"+filename]
        if directory == 'eval':
            if subdirectory.endswith('.wav'):
                prefix = str(subdirectory[0:8])
                if prefix in sound_files:
                    sound_files[prefix].append(f''+directory+'/'+subdirectory)
                else:
                    sound_files[prefix] = [f''+directory+'/'+subdirectory]
    return sound_files

In [57]:
#get training and evaluating data
train_files = parse_audio_files('train')
eval_files = parse_audio_files('eval')

In [58]:
#extracting features from audio files using librosa library
#features are stored in csv 'dataset_file'
def get_features(sound_files, dataset_file):
    deleteContent(dataset_file)
    #header defines names of columns each representing audio feature/filename
    header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'
    for i in range(1, 21):
        header += f' mfcc{i}'
    header += ' person'
    header = header.split()
    #open csv file
    data_file = open(dataset_file, 'w', newline='')
    with data_file:
        writer = csv.writer(data_file)
        writer.writerow(header)
        for key in sound_files.keys():
            for file in sound_files[key]:
                songname = f'{file}'
                y, sr = librosa.load(songname, mono=True, duration=10)
                rmse = librosa.feature.rms(y=y)
                chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
                spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
                spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
                rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
                zcr = librosa.feature.zero_crossing_rate(y)
                mfcc = librosa.feature.mfcc(y=y, sr=sr)
                to_append = f'{file} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'
                for e in mfcc:
                    to_append += f' {np.mean(e)}'
                to_append += f' {key}'
                writer.writerow(to_append.split())
    

In [80]:
#divide csv file with features into data and labels
#Train_data consists of audio features
#Train_labels on training data can be 1(target) or 0 (non_target)
#             on eval data only 0
#person_list list of all filenames
def get_set_of_data(searched_person, dataset_file):
    data = pd.read_csv(dataset_file)
    data.head()
    data = data.drop(['filename'],axis=1)
    person_list = data.iloc[:, -1]
    encoder = LabelEncoder()
    #transforming recordings to labels(searched person: 1, else: 0)
    y = []
    for person in person_list:
        if(person.startswith(searched_person)):
            y.append(1)
        else:
            y.append(0)
    Train_labels = np.array(y)
    scaler = StandardScaler()
    Train_data = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))
    return Train_data, Train_labels, person_list

In [60]:
get_features(train_files, 'dataset_train2.csv')

In [61]:
get_features(eval_files, 'dataset_eval2.csv')

In [79]:
#visualisation of csv file
data = pd.read_csv('dataset_train2.csv')
data

Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,person
0,train/target/m429_02_p04_i0_0.wav,0.437627,0.017108,1133.585287,1342.870442,2204.655269,0.051885,-426.029663,152.454193,-0.387893,...,0.355462,6.232590,-2.498281,4.543469,-1.157447,2.582753,-2.306890,-0.752379,0.139055,m429_02
1,train/target/m429_02_p05_i0_0.wav,0.488476,0.015638,1196.235792,1474.417008,2525.255186,0.045872,-444.292419,136.366821,5.516912,...,4.421031,8.471156,0.243371,4.848452,-1.031055,2.642236,-0.940760,-0.744517,0.921006,m429_02
2,train/target/m429_02_r08_i0_0.wav,0.532765,0.010227,1095.392626,1576.900652,2572.560271,0.034561,-480.528381,128.166275,1.877639,...,1.939414,8.356565,-1.685953,6.528655,-1.978635,5.098059,-0.050975,0.920943,0.786369,m429_02
3,train/target/m429_02_r09_i0_0.wav,0.563168,0.008010,1164.922169,1669.079386,2808.926000,0.031068,-498.470154,118.372551,2.405291,...,3.168785,10.676485,-0.310895,8.069711,-0.620094,5.244664,-0.066192,2.563404,1.509386,m429_02
4,train/target/m429_02_p03_i0_0.wav,0.523891,0.011119,1225.185160,1605.391963,2799.491473,0.042050,-469.367767,126.998581,-3.237647,...,2.665504,9.049703,-2.149390,5.333035,-0.953992,5.498663,-0.468767,1.044797,-0.181955,m429_02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,train/non_target/f409_04_r09_i0_0.wav,0.433719,0.010284,1629.712892,1810.033895,3582.247254,0.069920,-475.415222,107.724800,-4.517683,...,-3.641893,7.991809,-2.306254,4.338410,-5.605099,1.041430,-4.969592,1.086847,-0.679279,f409_04
218,train/non_target/m416_03_r08_i0_0.wav,0.390816,0.022010,1139.500639,1351.845439,2325.061347,0.056134,-412.097778,146.255280,-1.434378,...,-6.124696,1.630619,-3.293826,5.138916,-0.963662,6.283566,-1.863594,-1.032099,0.849649,m416_03
219,train/non_target/m416_03_f19_i0_0.wav,0.393866,0.022176,990.463414,1296.273021,2097.813610,0.040298,-415.935577,148.220032,2.233160,...,-1.865255,4.605335,-1.322394,5.099552,2.599113,2.592728,-2.708725,-1.103034,1.052280,m416_03
220,train/non_target/f409_03_f19_i0_0.wav,0.352207,0.016815,1312.183207,1443.271615,2652.955574,0.062736,-428.817963,129.853821,-14.281751,...,-1.858095,-0.291935,-4.251650,6.764326,-1.781263,-1.137646,-2.254115,-1.329053,-6.488116,f409_03


In [81]:
#training data
Train_data, Train_labels, Person_list = get_set_of_data('m429', 'dataset_train2.csv')

In [82]:
#testing data(unseen)
Eval_data, Eval_labels, Eval_list = get_set_of_data('m429', 'dataset_eval2.csv')

In [100]:
#Model of the neural network
#Multi layer perceptron consisting of 5 dense layers
#Input shape is one-dimensional feature vector that contains 26 features
#Output is 0 or 1 (target/non_target)
model = Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(Train_data.shape[1],)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(2, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [102]:
#training of the model
#use of validation_split - validation data(10% of training data), that model is not trained on, but instead evaluates 
#the loss and any model metrics at the end of each epoch.
#use of shuffle - shuffles the training data before each epoch
classifier = model.fit(Train_data, Train_labels, validation_split= 0.10,  epochs=18, batch_size=128, 
                       shuffle=True, verbose=2)

Train on 199 samples, validate on 23 samples
Epoch 1/18
 - 0s - loss: 0.6630 - accuracy: 0.6734 - val_loss: 0.4316 - val_accuracy: 1.0000
Epoch 2/18
 - 0s - loss: 0.5114 - accuracy: 0.8492 - val_loss: 0.2354 - val_accuracy: 1.0000
Epoch 3/18
 - 0s - loss: 0.4049 - accuracy: 0.8492 - val_loss: 0.1168 - val_accuracy: 1.0000
Epoch 4/18
 - 0s - loss: 0.3515 - accuracy: 0.8492 - val_loss: 0.0621 - val_accuracy: 1.0000
Epoch 5/18
 - 0s - loss: 0.3149 - accuracy: 0.8492 - val_loss: 0.0429 - val_accuracy: 1.0000
Epoch 6/18
 - 0s - loss: 0.2659 - accuracy: 0.8492 - val_loss: 0.0413 - val_accuracy: 1.0000
Epoch 7/18
 - 0s - loss: 0.2254 - accuracy: 0.8643 - val_loss: 0.0464 - val_accuracy: 1.0000
Epoch 8/18
 - 0s - loss: 0.1954 - accuracy: 0.9296 - val_loss: 0.0468 - val_accuracy: 1.0000
Epoch 9/18
 - 0s - loss: 0.1714 - accuracy: 0.9447 - val_loss: 0.0418 - val_accuracy: 1.0000
Epoch 10/18
 - 0s - loss: 0.1419 - accuracy: 0.9698 - val_loss: 0.0341 - val_accuracy: 1.0000
Epoch 11/18
 - 0s - loss

In [146]:
#predicts if each record in Data is target or non_target
#Data - records of audio features
#Labels - used for comparison on training data(known labels) 
#List - list of names each representing one recording
def get_predictions(Data, Labels, List):
    scalar = MinMaxScaler()
    scalar.fit(Data)
    bla = scalar.transform(Data)
    #returns hard decision 0 or 1 
    predictions = model.predict_classes(bla,
                    verbose=2,
                    batch_size=len(Data))
    
    probabilities = model.predict_proba(bla,
                    verbose=2,
                    batch_size=len(Data))
    score = 0
    count = 0
    target = 0
    for i in range(len(Data)):
        if(Labels[i] == predictions[i]):
            score += 1
        probability = 0
        if probabilities[i,1] > 0.5:
            probability = probabilities[i,1]
        else:
            probability = probabilities[i,0]
        if(predictions[i]== 1):
            target += 1
            print("%s %s %s\n" % (List[i], probability, predictions[i]))
        count += 1
    print("SCORE=%s out of %s, TARGETS=%s" % (score, count, target))

In [147]:
get_predictions(Eval_data, Eval_labels, Eval_list)

eval_243 0.69315106 1

eval_055 0.53356904 1

eval_135 0.6324268 1

eval_323 0.5566871 1

eval_068 0.6050197 1

eval_242 0.60370344 1

eval_532 0.72776794 1

eval_320 0.52749234 1

eval_533 0.7218764 1

eval_279 0.5176868 1

eval_668 0.6746709 1

eval_520 0.59163636 1

eval_078 0.591121 1

eval_124 0.5973279 1

eval_332 0.6418101 1

eval_125 0.6854841 1

eval_397 0.6655462 1

eval_369 0.57362884 1

eval_155 0.6930598 1

eval_424 0.60904115 1

eval_225 0.6239472 1

eval_345 0.683635 1

eval_351 0.5636303 1

eval_422 0.69088125 1

eval_146 0.576161 1

eval_177 0.56895304 1

eval_349 0.67713565 1

eval_174 0.56489486 1

eval_404 0.5415197 1

eval_165 0.64923126 1

eval_159 0.66726553 1

eval_401 0.52365166 1

eval_366 0.62338245 1

eval_010 0.5675887 1

eval_204 0.65725833 1

eval_629 0.5130777 1

eval_563 0.5066342 1

eval_262 0.513682 1

eval_317 0.6292484 1

eval_129 0.5017138 1

eval_275 0.52195144 1

eval_665 0.51064235 1

eval_466 0.5265213 1

eval_076 0.5257037 1

eval_062 0.525129

In [117]:
get_predictions(Train_data, Train_labels, Person_list)

X=1, Name=m429_02 
 Predicted=1 Probability=0.5892102

X=1, Name=m429_02 
 Predicted=1 Probability=0.7076306

X=1, Name=m429_02 
 Predicted=1 Probability=0.5514344

X=1, Name=m429_02 
 Predicted=1 Probability=0.6406853

X=1, Name=m429_02 
 Predicted=1 Probability=0.5499695

X=1, Name=m429_02 
 Predicted=1 Probability=0.5898103

X=1, Name=m429_02 
 Predicted=1 Probability=0.58548665

X=1, Name=m429_02 
 Predicted=1 Probability=0.55405116

X=1, Name=m429_02 
 Predicted=1 Probability=0.61760294

X=1, Name=m429_01 
 Predicted=1 Probability=0.78405994

X=1, Name=m429_01 
 Predicted=1 Probability=0.5445395

X=1, Name=m429_01 
 Predicted=1 Probability=0.57566124

X=1, Name=m429_01 
 Predicted=1 Probability=0.7093321

X=1, Name=m429_01 
 Predicted=1 Probability=0.6806215

X=1, Name=m429_01 
 Predicted=1 Probability=0.5931771

X=1, Name=m429_01 
 Predicted=1 Probability=0.7694759

X=1, Name=m429_01 
 Predicted=1 Probability=0.66383076

X=1, Name=m429_01 
 Predicted=1 Probability=0.63953364

X=1

In [112]:
data

Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,person
0,train/target/m429_02_p04_i0_0.wav,0.437627,0.017108,1133.585287,1342.870442,2204.655269,0.051885,-426.029663,152.454193,-0.387893,...,0.355462,6.232590,-2.498281,4.543469,-1.157447,2.582753,-2.306890,-0.752379,0.139055,m429_02
1,train/target/m429_02_p05_i0_0.wav,0.488476,0.015638,1196.235792,1474.417008,2525.255186,0.045872,-444.292419,136.366821,5.516912,...,4.421031,8.471156,0.243371,4.848452,-1.031055,2.642236,-0.940760,-0.744517,0.921006,m429_02
2,train/target/m429_02_r08_i0_0.wav,0.532765,0.010227,1095.392626,1576.900652,2572.560271,0.034561,-480.528381,128.166275,1.877639,...,1.939414,8.356565,-1.685953,6.528655,-1.978635,5.098059,-0.050975,0.920943,0.786369,m429_02
3,train/target/m429_02_r09_i0_0.wav,0.563168,0.008010,1164.922169,1669.079386,2808.926000,0.031068,-498.470154,118.372551,2.405291,...,3.168785,10.676485,-0.310895,8.069711,-0.620094,5.244664,-0.066192,2.563404,1.509386,m429_02
4,train/target/m429_02_p03_i0_0.wav,0.523891,0.011119,1225.185160,1605.391963,2799.491473,0.042050,-469.367767,126.998581,-3.237647,...,2.665504,9.049703,-2.149390,5.333035,-0.953992,5.498663,-0.468767,1.044797,-0.181955,m429_02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,train/non_target/f409_04_r09_i0_0.wav,0.433719,0.010284,1629.712892,1810.033895,3582.247254,0.069920,-475.415222,107.724800,-4.517683,...,-3.641893,7.991809,-2.306254,4.338410,-5.605099,1.041430,-4.969592,1.086847,-0.679279,f409_04
218,train/non_target/m416_03_r08_i0_0.wav,0.390816,0.022010,1139.500639,1351.845439,2325.061347,0.056134,-412.097778,146.255280,-1.434378,...,-6.124696,1.630619,-3.293826,5.138916,-0.963662,6.283566,-1.863594,-1.032099,0.849649,m416_03
219,train/non_target/m416_03_f19_i0_0.wav,0.393866,0.022176,990.463414,1296.273021,2097.813610,0.040298,-415.935577,148.220032,2.233160,...,-1.865255,4.605335,-1.322394,5.099552,2.599113,2.592728,-2.708725,-1.103034,1.052280,m416_03
220,train/non_target/f409_03_f19_i0_0.wav,0.352207,0.016815,1312.183207,1443.271615,2652.955574,0.062736,-428.817963,129.853821,-14.281751,...,-1.858095,-0.291935,-4.251650,6.764326,-1.781263,-1.137646,-2.254115,-1.329053,-6.488116,f409_03


In [113]:
#saves the model with all trained weights locally
model.save("39TargetsEvalAUDIO.h5")