In [51]:
# If true, the WAV files will be read and their features will be saved in the CSV files
# As this is the most time consuming task, only enable it if you don't have the CSV files yet
CREATE_CSV_FILES = True

In [52]:
# Defines the names of the CSV files
TRAIN_CSV_FILE = "train.csv"
TEST_CSV_FILE = "test.csv"
MORE_TRAIN_CSV_FILE = "more_train.csv"
MORE_TEST_CSV_FILE = "more_test.csv"


In [53]:
import numpy as np
import librosa
import csv
import os
import librosa.display
if(os.path.exists(TRAIN_CSV_FILE) and os.path.isfile(TRAIN_CSV_FILE)):
    os.remove(TRAIN_CSV_FILE)
if(os.path.exists(TEST_CSV_FILE) and os.path.isfile(TEST_CSV_FILE)):
    os.remove(TEST_CSV_FILE)
def extractWavFeatures(soundFilesFolder, csvFileName,label):
    print("The features of the files in the folder "+soundFilesFolder+" will be saved to "+csvFileName)
    header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate '
    # header = 'filename '
    for i in range(1, 21):
        header += f'mfcc{i} '
    header += 'label '
    header = header.split()
    if not os.path.exists(csvFileName):
        file = open(csvFileName, 'a', newline='')
        writer = csv.writer(file)
        writer.writerow(header)
    else:
        file = open(csvFileName, 'a', newline='')
        writer = csv.writer(file)
    for filename in os.listdir(soundFilesFolder):
        number = f'{soundFilesFolder}/{filename}'
        y, sr = librosa.load(number, mono=True, duration=30)
        # remove leading and trailing silence
        y, index = librosa.effects.trim(y)
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        rmse = librosa.feature.rms(y=y)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'
        for e in mfcc:
            to_append += f' {np.mean(e)}'
        to_append+=f' {label}'
        writer.writerow(to_append.split())

if (CREATE_CSV_FILES == True):
    extractWavFeatures("dataset2/Dina", TRAIN_CSV_FILE,0)
    extractWavFeatures("dataset2/Kareeman", TRAIN_CSV_FILE,1)
    extractWavFeatures("dataset2/Mariam", TRAIN_CSV_FILE,2)
    extractWavFeatures("dataset2/Nada", TRAIN_CSV_FILE,3)
    extractWavFeatures("dataset2/others", TRAIN_CSV_FILE,4)
    extractWavFeatures("dataset2/test", TEST_CSV_FILE,0)
    print("CSV files are created")
else:
    print("CSV files creation is skipped")


The features of the files in the folder dataset2/Dina will be saved to train.csv
The features of the files in the folder dataset2/Kareeman will be saved to train.csv
The features of the files in the folder dataset2/Mariam will be saved to train.csv
The features of the files in the folder dataset2/Nada will be saved to train.csv
The features of the files in the folder dataset2/others will be saved to train.csv
The features of the files in the folder dataset2/test will be saved to test.csv
CSV files are created


In [54]:
#Reading a dataset and convert file name to corresponding number

import pandas as pd
import csv
from sklearn import preprocessing

def preProcessData(csvFileName):
    print(csvFileName+ " will be preprocessed")
    data = pd.read_csv(csvFileName, error_bad_lines=False)
    # data['number'] = data['filename'].str[:1]
    #Dropping unnecessary columns
    data = data.drop(['filename'],axis=1)
    # data = data.drop(['label'],axis=1)
    # data = data.drop(['chroma_stft'],axis=1)
    data.shape

    print("Preprocessing is finished")
    print(data.head())
    return data

trainData = preProcessData(TRAIN_CSV_FILE)
testData = preProcessData(TEST_CSV_FILE)
# moreTrainData = preProcessData(MORE_TRAIN_CSV_FILE)
# moreTestData = preProcessData(MORE_TEST_CSV_FILE)



train.csv will be preprocessed
Preprocessing is finished
   chroma_stft      rmse  spectral_centroid  spectral_bandwidth      rolloff  \
0     0.315417  0.042465        1164.384818         1423.455020  2393.137680   
1     0.294068  0.054845        1228.853812         1514.037504  2405.984364   
2     0.277608  0.047859        1295.236308         1618.044875  2674.067523   
3     0.285265  0.043748        1121.270245         1367.419572  2222.114702   
4     0.280511  0.039898        1251.483286         1576.828595  2678.325389   

   zero_crossing_rate       mfcc1       mfcc2      mfcc3      mfcc4  ...  \
0            0.052868 -389.890411  150.855774  11.183009  22.068670  ...   
1            0.053414 -345.394775  129.519958  13.182385  21.692543  ...   
2            0.056149 -383.368439  117.782349  17.631424  22.915812  ...   
3            0.052430 -384.875732  137.560730  18.019766  20.900148  ...   
4            0.048354 -391.633545  138.709213  21.949665  27.736458  ...   

     



  data = pd.read_csv(csvFileName, error_bad_lines=False)


  data = pd.read_csv(csvFileName, error_bad_lines=False)


## Section 2

There are 50 recordings for each digit for each speaker: Jackson, Nicolas and Theo (total 1500 recordings)

Training data has 49 recordings for each digit for each speaker: 1470 recordings total.
Test data has 1 recordings for each digit for each speaker: 30 recordings total.

The data used here comes from the recordings stored in:
* ../data/recordings/train
* ../data/recordings/test

The model will be trained to predict the spoken digit.

In [55]:
# Splitting the dataset into training, validation and testing dataset
from sklearn.model_selection import train_test_split
X = np.array(trainData.iloc[:, :-1], dtype = float)
y = trainData.iloc[:, -1]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_test = np.array(testData.iloc[:, :-1], dtype = float)
y_test = testData.iloc[:, -1]

print("Y from training data:", y_train.shape)
print("Y from validation data:", y_val.shape)
print("Y from test data:", y_test.shape)


Y from training data: (283,)
Y from validation data: (71,)
Y from test data: (11,)


In [56]:
# #Normalizing the dataset
# from sklearn.preprocessing import StandardScaler
# import numpy as np
# scaler = StandardScaler()
# X_train = scaler.fit_transform( X_train )
# X_val = scaler.transform( X_val )
# X_test = scaler.transform( X_test )

# print("X from training data", X_train.shape)
# print("X from validation data", X_val.shape)
# print("X from test data", X_test.shape)


In [57]:
# df = pd.DataFrame(X_train).set_index('filename')

In [58]:
# import SVC classifier
from sklearn.svm import SVC


# import metrics to compute accuracy
from sklearn.metrics import accuracy_score


# instantiate classifier with default hyperparameters
svc=SVC() 


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on test set
y_pred=svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

Model accuracy score with default hyperparameters: 0.3099


In [59]:
# instantiate classifier with rbf kernel and C=100
svc=SVC(C=10000.0) 


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on test set
y_pred=svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

Model accuracy score with rbf kernel and C=100.0 : 0.9014


In [60]:
# instantiate classifier with linear kernel and C=1.0
poly_svc=SVC(kernel='poly', C=1000.0) 


# fit classifier to training set
poly_svc.fit(X_train,y_train)


# make predictions on test set
y_pred_test=poly_svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with linear kernel and C=1.0 : {0:0.4f}'. format(accuracy_score(y_val, y_pred_test)))
print("Accuracy on training set: {:.3f}".format(poly_svc.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(poly_svc.score(X_val, y_val)))

Model accuracy score with linear kernel and C=1.0 : 0.9155
Accuracy on training set: 0.922
Accuracy on test set: 0.915


In [61]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
print("\nRandom Forests")
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_val, y_val)))


Random Forests
Accuracy on training set: 0.979
Accuracy on test set: 0.831


In [62]:
import pickle 
pickle.dump(forest ,open('../trained_speaker_model.sav' , 'wb'))
model= pickle.load(open('../trained_speaker_model.sav' , 'rb'))

In [63]:
from sklearn.tree import DecisionTreeClassifier
#Train decision tree model
tree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
print("\nDecision Tree")
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_val, y_val)))


Decision Tree
Accuracy on training set: 1.000
Accuracy on test set: 0.817


In [88]:
def extractWavFeatures():
    list_of_features=[]
    y, sr = librosa.load('dataset2/Kareeman/Recording(9).wav', mono=True, duration=30)
    # remove leading and trailing silence
    y, index = librosa.effects.trim(y)

    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    list_of_features.append(np.mean(chroma_stft))
    list_of_features.append(np.mean(rmse))
    list_of_features.append(np.mean(spec_cent))
    list_of_features.append(np.mean(spec_bw))
    list_of_features.append(np.mean(rolloff))
    list_of_features.append(np.mean(zcr))

    for e in mfcc:
            list_of_features.append(np.mean(e))
    
    return(list_of_features)
speech_features=[]
speech_features.append(extractWavFeatures())
print(forest.predict(speech_features))
print(svc.predict(speech_features))
print(tree.predict(speech_features))
print(poly_svc.predict(speech_features))

[1]
[1]
[1]
[1]
