In [1]:
import pandas as pd
import numpy as np
import math as math

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.utils import shuffle
from sklearn import metrics

import matplotlib.pyplot as plt

from sklearn.metrics import balanced_accuracy_score


In [2]:
data_source_path = "./source/"
data_source_files = ["Female-PTDB.csv", "Female-TMIT.csv", "Male-PTDB.csv", "Male-TMIT.csv"]

data_source_files_with_noise = ["Noizeus-Babble.csv", "Noizeus-Car.csv", "Noizeus-NoNoise.csv", 
                                "Noizeus-Restaurant.csv", "Noizeus-Station.csv", "Noizeus-Street.csv", 
                                "Noizeus-Train.csv"]





In [3]:
# load datasets
n_babble_df = pd.read_csv(data_source_path + data_source_files_with_noise[0]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
n_car_df = pd.read_csv(data_source_path + data_source_files_with_noise[1]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
n_nonoise_df = pd.read_csv(data_source_path + data_source_files_with_noise[2]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
n_restaurant_df = pd.read_csv(data_source_path + data_source_files_with_noise[3]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
n_station_df = pd.read_csv(data_source_path + data_source_files_with_noise[4]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
n_street_df = pd.read_csv(data_source_path + data_source_files_with_noise[5]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
n_train_df = pd.read_csv(data_source_path + data_source_files_with_noise[6]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})

# combine datasets
full_withnoise_df = pd.concat([n_babble_df, n_car_df, n_nonoise_df, n_restaurant_df, n_station_df, n_street_df, n_train_df], ignore_index=True)
full_withnoise_df


Unnamed: 0,zcr,rms,spectral_cetroid,spectral_rolloff,mfcc 1,mfcc 2,mfcc 3,mfcc 4,mfcc 5,mfcc 6,mfcc 7,label
0,0.067480,0.014003,1252.072042,2353.579102,-400.27832,235.95964,-106.796005,0.056166,27.075779,-56.186157,-33.089775,False
1,0.056543,0.014956,1108.341028,2084.414063,-413.85336,264.32867,-105.791240,-4.294174,28.579357,-59.673830,-22.032665,False
2,0.046680,0.018747,1042.511071,1929.375000,-366.19098,220.37051,-78.318375,-7.798753,4.226619,-49.197327,-27.135250,False
3,0.039355,0.028922,965.001824,1914.301758,-368.97647,253.92090,-81.486330,-7.253069,31.278433,-45.258450,-23.949402,False
4,0.041016,0.027191,997.280091,1937.988281,-348.39618,230.37402,-76.895370,-3.285618,12.548368,-41.711308,-21.581696,False
...,...,...,...,...,...,...,...,...,...,...,...,...
12147,0.119141,0.020306,1789.718774,2829.462891,-343.20410,213.72339,-142.700620,10.520731,26.537085,-65.996840,-20.455010,False
12148,0.121777,0.025095,1762.664725,2790.703125,-353.88763,253.22595,-161.726970,13.808258,39.321660,-77.625656,-15.114021,False
12149,0.118555,0.029932,1834.853679,2846.689453,-278.39526,188.48898,-126.913960,2.296159,23.908407,-60.665160,-26.359623,False
12150,0.105176,0.032779,1699.167987,2741.176758,-310.85983,231.20401,-145.085710,5.652969,33.587585,-71.358570,-27.630926,False


In [4]:
# load datasets
female_ptdb_df = pd.read_csv(data_source_path + data_source_files[0]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
female_tmit_df = pd.read_csv(data_source_path + data_source_files[1]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
male_ptdb_df = pd.read_csv(data_source_path + data_source_files[2]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})
male_tmit_df = pd.read_csv(data_source_path + data_source_files[3]).iloc[:, 1:].rename(columns = {'ground_truth':'label'})


# combine datasets
full_nonoise_df = pd.concat([female_ptdb_df, female_tmit_df, male_ptdb_df, male_tmit_df], ignore_index=True)
full_nonoise_df

Unnamed: 0,zcr,rms,spectral_cetroid,spectral_rolloff,mfcc 1,mfcc 2,mfcc 3,mfcc 4,mfcc 5,mfcc 6,mfcc 7,label
0,0.078906,0.000226,3894.172396,7960.825195,-679.62380,34.850086,16.685300,22.100058,7.687350,14.396711,-2.478875,False
1,0.132422,0.000195,3818.758658,7913.452148,-676.12805,41.850094,18.115484,18.258718,5.341282,17.687515,1.568857,False
2,0.149023,0.000154,3667.845026,7952.211914,-680.80066,53.271057,16.051815,15.571219,7.883388,13.552220,4.892397,False
3,0.143066,0.000153,3638.891936,7934.985352,-682.80680,53.315132,15.652989,15.850302,9.741061,14.765318,5.449076,False
4,0.100879,0.000202,3477.161682,7797.172852,-683.41693,51.226890,18.926594,20.186535,6.267492,13.525415,7.289101,False
...,...,...,...,...,...,...,...,...,...,...,...,...
56105,0.098730,0.000358,2081.511061,4500.439453,-638.57650,112.903730,-30.076542,27.566034,-18.518550,14.489972,-4.514703,False
56106,0.109082,0.000305,2281.715277,4289.414062,-649.44460,106.873000,-41.345960,22.769505,-27.545710,14.155927,4.928344,False
56107,0.112109,0.000119,2578.327334,5320.854492,-712.06320,90.672500,-15.080589,18.369724,-23.264654,20.386620,-1.585826,False
56108,0.108594,0.000069,2910.688507,6199.409180,-754.02630,78.074430,-7.041000,15.133078,-23.212465,14.904060,-7.863165,False


In [5]:
full_df = pd.concat([full_withnoise_df, full_nonoise_df], ignore_index=True)
full_df



Unnamed: 0,zcr,rms,spectral_cetroid,spectral_rolloff,mfcc 1,mfcc 2,mfcc 3,mfcc 4,mfcc 5,mfcc 6,mfcc 7,label
0,0.067480,0.014003,1252.072042,2353.579102,-400.27832,235.959640,-106.796005,0.056166,27.075779,-56.186157,-33.089775,False
1,0.056543,0.014956,1108.341028,2084.414063,-413.85336,264.328670,-105.791240,-4.294174,28.579357,-59.673830,-22.032665,False
2,0.046680,0.018747,1042.511071,1929.375000,-366.19098,220.370510,-78.318375,-7.798753,4.226619,-49.197327,-27.135250,False
3,0.039355,0.028922,965.001824,1914.301758,-368.97647,253.920900,-81.486330,-7.253069,31.278433,-45.258450,-23.949402,False
4,0.041016,0.027191,997.280091,1937.988281,-348.39618,230.374020,-76.895370,-3.285618,12.548368,-41.711308,-21.581696,False
...,...,...,...,...,...,...,...,...,...,...,...,...
68257,0.098730,0.000358,2081.511061,4500.439453,-638.57650,112.903730,-30.076542,27.566034,-18.518550,14.489972,-4.514703,False
68258,0.109082,0.000305,2281.715277,4289.414062,-649.44460,106.873000,-41.345960,22.769505,-27.545710,14.155927,4.928344,False
68259,0.112109,0.000119,2578.327334,5320.854492,-712.06320,90.672500,-15.080589,18.369724,-23.264654,20.386620,-1.585826,False
68260,0.108594,0.000069,2910.688507,6199.409180,-754.02630,78.074430,-7.041000,15.133078,-23.212465,14.904060,-7.863165,False


In [6]:
positive_df = full_df.loc[full_df["label"] == True]
negative_df = full_df.loc[full_df["label"] == False]
print(positive_df.shape[0])
print(negative_df.shape[0])

31462
36800


In [7]:
# Random Forest with k-fold cross validation

X = full_df.iloc[:, :11]
y = full_df.iloc[:, 11]

# Evaluating the quality of a model’s predictions using scoring parameter
kf = KFold(n_splits=5, random_state=1, shuffle=True)
rf = RandomForestClassifier()
scores = cross_validate(rf, X, y, scoring=['accuracy','balanced_accuracy','precision'], cv=kf, n_jobs=-1)
a = np.average(scores['test_accuracy'])
ba = np.average(scores['test_balanced_accuracy'])
p = np.average(scores['test_precision'])
print("Average Accuracy: ", a, " Average Balanced Accuracy: ", ba, " Average Precision: ", p)


# Evaluating the quality of a model's predictions using metric functions
k = 5
n = math.floor(X.shape[0]/5)
train_array = []
test_array = []
result = np.zeros((k))
for i in range(1, k+1):
    s1 = ((i-1) * n+1)
    s2 = (i * n)
    train = X[s1:(int)((s1+s2)/2)]
    test = y[(int)((s1+s2)/2) + 1 : s2]
    train_array.append(train)
    test_array.append(test)
for i in range(k):
    s1 = (i * n+1)
    s2 = ((i+1) * n)
    rf.fit(train_array[i], test_array[i])
    result[i] = balanced_accuracy_score(test_array[i], rf.predict(train_array[i]))
print("Average Balanced Accuracy", np.average(result))
    

Average Accuracy:  0.9038557480822706  Average Balanced Accuracy:  0.905031876951203  Average Precision:  0.8771107597829524
Average Balanced Accuracy 1.0


In [None]:
# SVM with k-fold cross validation

X = full_df.iloc[:, :11]
y = full_df.iloc[:, 11]

kf = KFold(n_splits=5, random_state=1, shuffle=True)
s_vm = svm.SVC()
scores = cross_val_score(s_vm, X, y, scoring='accuracy', cv=kf, n_jobs=-1)
print(scores)

In [None]:
# Adaboost with k-fold cross validation

X = full_df.iloc[:, :11]
y = full_df.iloc[:, 11]

kf = KFold(n_splits=5, random_state=1, shuffle=True)
ab = AdaBoostClassifier()
scores = cross_val_score(ab, X, y, scoring='accuracy', cv=kf, n_jobs=-1)
print(scores)

In [None]:
# Decisiion Tree with k-fold cross validation

X = full_df.iloc[:, :11]
y = full_df.iloc[:, 11]

kf = KFold(n_splits=5, random_state=1, shuffle=True)
dt = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(dt, X, y, scoring='accuracy', cv=kf, n_jobs=-1)
print(scores)

In [None]:
'''
# separate positive and negative data
positive_df = full_df.loc[full_df["label"] == True]
negative_df = full_df.loc[full_df["label"] == False]
'''

In [None]:
'''
# combine positive data and a portion of (25000) negative data
full_even_df = pd.concat([positive_df, shuffle(negative_df).iloc[:25000, :]], ignore_index=True)
'''

In [None]:
'''
full_even_df = shuffle(full_even_df)
full_even_df
'''

In [None]:
# Logistic Regression with PCA and k-fold cross validation

X = full_even_df.iloc[:, :11]
y = full_even_df.iloc[:, 11]

kf = KFold(n_splits=10, shuffle=True, random_state=1)
pca = PCA(n_components=5, svd_solver='arpack')

accuracies = []
for train_ix, test_ix in kf.split(X):
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    
    X_train_1st = pca.fit_transform(X_train)
    X_test_1st = pca.fit_transform(X_test)
    lgr = LogisticRegression(random_state=0).fit(X_train_1st, y_train)
    y_pred = lgr.predict(X_test_1st)
    
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    
print(accuracies)

In [None]:
# Logistics Regression accuracy against PCA component number
X = full_even_df.iloc[:, :11]
y = full_even_df.iloc[:, 11]

kf = KFold(n_splits=10, shuffle=True, random_state=1)
accuracies = []

for i in range(10):
    accuracy = 0
    pca = PCA(n_components=i+1, svd_solver='arpack')
    for train_ix, test_ix in kf.split(X):
        X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

        X_train_ith = pca.fit_transform(X_train)
        X_test_ith = pca.fit_transform(X_test)
        lgr = LogisticRegression(random_state=0).fit(X_train_ith, y_train)
        y_pred = lgr.predict(X_test_ith)
        
        accuracy += metrics.accuracy_score(y_test, y_pred)
    accuracy /= 10
    accuracies.append(accuracy)
    
plt.figure()
plt.plot(list(range(1, 11)), accuracies)  

plt.xlabel("PCA component number")
plt.ylabel("Logistics Regression accuracy")
plt.title("Logistics Regression accuracy against PCA component number")

plt.show()

In [None]:
# Random Forest accuracy against PCA component number
X = full_even_df.iloc[:, :11]
y = full_even_df.iloc[:, 11]

kf = KFold(n_splits=10, shuffle=True, random_state=1)
accuracies = []

for i in range(10):
    accuracy = 0
    pca = PCA(n_components=i+1, svd_solver='arpack')
    for train_ix, test_ix in kf.split(X):
        X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

        X_train_ith = pca.fit_transform(X_train)
        X_test_ith = pca.fit_transform(X_test)
        rf = RandomForestClassifier().fit(X_train_ith, y_train)
        y_pred = rf.predict(X_test_ith)
        
        accuracy += metrics.accuracy_score(y_test, y_pred)
    accuracy /= 10
    accuracies.append(accuracy)
    
plt.figure()
plt.plot(list(range(1, 11)), accuracies)  

plt.xlabel("PCA component number")
plt.ylabel("Random Forest accuracy")
plt.title("Random Forest accuracy against PCA component number")

plt.show()