In [6]:
# Switch for the (slow) feature-extraction step.
# If true, the WAV files will be read and their features will be saved in the CSV files.
# As this is the most time-consuming task, only enable it if you don't have the CSV files yet.
CREATE_CSV_FILES = True

In [7]:
# Names of the CSV files that hold the extracted audio features.
# In the visible part of the notebook only TRAIN_CSV_FILE is written and read back;
# TEST_CSV_FILE is removed before extraction, and the MORE_* names appear only in commented-out code.
TRAIN_CSV_FILE = "train.csv"
TEST_CSV_FILE = "test.csv"
MORE_TRAIN_CSV_FILE = "more_train.csv"
MORE_TEST_CSV_FILE = "more_test.csv"


In [8]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below —
# consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")

mfcc
stft
chroma_stft
mel
spectral_contrast
tonnetz

In [9]:

import numpy as np
import librosa
import csv
import os
import librosa.display
# extractWavFeatures() appends to its CSV file, so stale files must be removed before a
# fresh extraction run. Only do so when the files are actually going to be regenerated:
# deleting them while CREATE_CSV_FILES is False would leave the notebook without any data.
if CREATE_CSV_FILES:
    for staleCsv in (TRAIN_CSV_FILE, TEST_CSV_FILE):
        # isfile() is already False for a missing path, so no separate exists() check is needed
        if os.path.isfile(staleCsv):
            os.remove(staleCsv)
def _featureHeader():
    """Return the CSV column names, in the order the feature values are written."""
    columns = ['filename']
    columns += [f'chroma{i}' for i in range(1, 13)]
    columns += [f'mel{i}' for i in range(1, 129)]
    columns += [f'contrast{i}' for i in range(1, 8)]
    columns += [f'tonnetz{i}' for i in range(1, 7)]
    columns += [f'mfcc{i}' for i in range(1, 41)]
    columns.append('label')
    return columns


def extractWavFeatures(soundFilesFolder, csvFileName,label):
    """Append one feature row per audio file in ``soundFilesFolder`` to ``csvFileName``.

    Each row holds the file name, the time-averaged chroma (12), mel (128),
    spectral-contrast (7), tonnetz (6) and MFCC (40) values, and finally ``label``.
    The header row is written only when the CSV file does not exist yet (or is empty),
    so several folders can be accumulated into the same file by calling this repeatedly.

    Parameters
    ----------
    soundFilesFolder : str
        Folder whose files are loaded with ``librosa.load``; sub-directories are skipped.
    csvFileName : str
        CSV file to create or append to.
    label : any
        Value written to the ``label`` column of every row produced by this call.
    """
    print("The features of the files in the folder "+soundFilesFolder+" will be saved to "+csvFileName)

    needsHeader = not os.path.exists(csvFileName) or os.path.getsize(csvFileName) == 0

    # The context manager closes the file even when loading or analysing a recording fails
    with open(csvFileName, 'a', newline='') as file:
        writer = csv.writer(file)
        if needsHeader:
            writer.writerow(_featureHeader())

        for filename in os.listdir(soundFilesFolder):
            soundPath = os.path.join(soundFilesFolder, filename)
            if not os.path.isfile(soundPath):
                # a nested folder cannot be decoded as audio
                continue

            # Only the first 2.5 seconds of each recording are analysed
            X, sample_rate = librosa.load(soundPath, mono=True, duration=2.5)

            # Mel-frequency cepstral coefficients (MFCCs), averaged over the frames
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)

            # Magnitude of the short-time Fourier transform, shared by chroma and contrast
            stft = np.abs(librosa.stft(y=X))

            # Chromagram computed from the spectrogram
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

            # Mel-scaled spectrogram; the signal must be passed as y= because librosa >= 0.10
            # no longer accepts it positionally
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

            # Spectral contrast
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

            # Tonal centroid features (tonnetz) of the harmonic component
            tonnetz = np.mean(
                librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,
                axis=0,
            )

            # Build the row as a list rather than a space-joined string that is split again:
            # a file name containing whitespace would otherwise shift every column.
            row = [filename]
            for featureVector in (chroma, mel, contrast, tonnetz, mfccs):
                row.extend(f'{value}' for value in featureVector)
            row.append(f'{label}')
            writer.writerow(row)

# Truthiness test instead of comparing against True; the folder/label pairs live in one
# table so that adding a folder is a one-line change and the calls cannot drift apart.
if CREATE_CSV_FILES:
    # (folder, label) pairs, processed in this order; each named folder keeps the same
    # label in both datasets, and label 4 is used for the "others" folder.
    trainingFolders = [
        ("dataset2/Dina", 0),
        ("dataset2/Kareeman", 1),
        ("dataset2/Mariam", 2),
        ("dataset2/Nada", 3),
        ("dataset1/Dina", 0),
        ("dataset1/Kareeman", 1),
        ("dataset1/Mariam", 2),
        ("dataset1/Nada", 3),
        ("dataset2/others", 4),
    ]
    for soundFolder, folderLabel in trainingFolders:
        extractWavFeatures(soundFolder, TRAIN_CSV_FILE, folderLabel)
    # extractWavFeatures("dataset2/test", TEST_CSV_FILE,0)
    print("CSV files are created")
else:
    print("CSV files creation is skipped")


The features of the files in the folder dataset2/Dina will be saved to train.csv
The features of the files in the folder dataset2/Kareeman will be saved to train.csv
The features of the files in the folder dataset2/Mariam will be saved to train.csv
The features of the files in the folder dataset2/Nada will be saved to train.csv
The features of the files in the folder dataset1/Dina will be saved to train.csv
The features of the files in the folder dataset1/Kareeman will be saved to train.csv
The features of the files in the folder dataset1/Mariam will be saved to train.csv
The features of the files in the folder dataset1/Nada will be saved to train.csv
The features of the files in the folder dataset2/others will be saved to train.csv
CSV files are created


In [10]:
# Read a feature CSV and drop the filename column (the label column is kept as the target)

import pandas as pd
import csv
from sklearn import preprocessing

def preProcessData(csvFileName):
    """Load a feature CSV and return it without the ``filename`` column.

    Malformed lines (wrong number of fields) are skipped with a warning instead of
    aborting the load. All feature columns and the ``label`` column are kept.

    Parameters
    ----------
    csvFileName : str
        Path of the CSV file to read; it must contain a ``filename`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded data minus the ``filename`` column.
    """
    print(csvFileName+ " will be preprocessed")
    try:
        # on_bad_lines replaces error_bad_lines, which was removed in pandas 2.0;
        # 'warn' matches the old error_bad_lines=False behaviour (skip the line and report it)
        data = pd.read_csv(csvFileName, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines yet
        data = pd.read_csv(csvFileName, error_bad_lines=False)

    # The file name carries no information the classifiers should learn from
    data = data.drop(columns=['filename'])

    print("Preprocessing is finished")
    print(data.head())
    return data

# Load the training features; trainData is what the train/validation split cell consumes.
trainData = preProcessData(TRAIN_CSV_FILE)
# testData = preProcessData(TEST_CSV_FILE)
# moreTrainData = preProcessData(MORE_TRAIN_CSV_FILE)
# moreTestData = preProcessData(MORE_TEST_CSV_FILE)



train.csv will be preprocessed
Preprocessing is finished
    chroma1   chroma2   chroma3   chroma4   chroma5   chroma6   chroma7  \
0  0.435208  0.444573  0.507232  0.493202  0.403009  0.323319  0.335864   
1  0.558728  0.563941  0.495348  0.457669  0.405068  0.479695  0.518341   
2  0.457078  0.491001  0.538831  0.476358  0.454127  0.356903  0.327614   
3  0.482293  0.471162  0.500524  0.463510  0.415543  0.368639  0.385356   
4  0.470483  0.463248  0.499932  0.471146  0.406375  0.356658  0.377193   

    chroma8   chroma9  chroma10  ...    mfcc32    mfcc33    mfcc34    mfcc35  \
0  0.455849  0.493729  0.519837  ... -1.595734  4.662979 -0.072742  4.025115   
1  0.496340  0.528902  0.617000  ... -2.818082  3.372691 -1.455287  1.969472   
2  0.409163  0.427747  0.490952  ...  0.817995  2.196296 -0.189620  2.099922   
3  0.503881  0.511764  0.581159  ...  0.331302  1.773823  1.733602  3.554198   
4  0.485798  0.527508  0.539757  ...  0.124184  4.012275  3.699968  3.839281   

     mfcc36

## Section 2

There are 50 recordings for each digit for each speaker: Jackson, Nicolas and Theo (total 1500 recordings)

Training data has 49 recordings for each digit for each speaker: 1470 recordings total.
Test data has 1 recording for each digit for each speaker: 30 recordings total.

The data used here comes from the recordings stored in:
* ../data/recordings/train
* ../data/recordings/test

The model will be trained to predict the spoken digit.

In [11]:
# Splitting the dataset into training, validation and testing dataset
from sklearn.model_selection import train_test_split
# Every column except the last one is a feature; the last column holds the class label.
X = trainData.iloc[:, :-1].to_numpy(dtype=float)
y = trainData.iloc[:, -1]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# X_test = np.array(testData.iloc[:, :-1], dtype = float)
# y_test = testData.iloc[:, -1]

print(f"Y from training data: {y_train.shape}")
print(f"Y from validation data: {y_val.shape}")
# print("Y from test data:", y_test.shape)


Y from training data: (536,)
Y from validation data: (134,)


In [12]:
# #Normalizing the dataset
# from sklearn.preprocessing import StandardScaler
# import numpy as np
# scaler = StandardScaler()
# X_train = scaler.fit_transform( X_train )
# X_val = scaler.transform( X_val )
# X_test = scaler.transform( X_test )

# print("X from training data", X_train.shape)
# print("X from validation data", X_val.shape)
# print("X from test data", X_test.shape)


In [13]:
# df = pd.DataFrame(X_train).set_index('filename')

In [14]:
# import SVC classifier
from sklearn.svm import SVC


# import metrics to compute accuracy
from sklearn.metrics import accuracy_score


# Build a support-vector classifier with the default hyperparameters and train it
# (fit() returns the estimator itself, so construction and training can be chained)
svc = SVC().fit(X_train, y_train)

# Predict the labels of the held-out validation samples
y_pred = svc.predict(X_val)

# Report the share of validation samples that were classified correctly
print(f'Model accuracy score with default hyperparameters: {accuracy_score(y_val, y_pred):0.4f}')

Model accuracy score with default hyperparameters: 0.7090


In [15]:
# instantiate classifier with the default (rbf) kernel and C=10000
# NOTE: this rebinds `svc`, replacing the default-hyperparameter model trained in the previous cell
svc=SVC(C=10000.0)


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on the validation set (the 20% hold-out from train_test_split)
y_pred=svc.predict(X_val)


# compute and print accuracy score; kernel and C are read back from the estimator so the
# message cannot drift from the hyperparameters that were actually used
print('Model accuracy score with {} kernel and C={} : {:0.4f}'.format(svc.kernel, svc.C, accuracy_score(y_val, y_pred)))

Model accuracy score with rbf kernel and C=100.0 : 0.9328


In [16]:
# instantiate classifier with polynomial kernel and C=1000
poly_svc=SVC(kernel='poly', C=1000.0)


# fit classifier to training set
poly_svc.fit(X_train,y_train)


# make predictions on the validation set (the 20% hold-out from train_test_split)
y_pred_test=poly_svc.predict(X_val)


# compute and print accuracy scores; kernel and C are read back from the estimator so the
# message cannot drift from the hyperparameters that were actually used
print('Model accuracy score with {} kernel and C={} : {:0.4f}'.format(poly_svc.kernel, poly_svc.C, accuracy_score(y_val, y_pred_test)))
print("Accuracy on training set: {:.3f}".format(poly_svc.score(X_train, y_train)))
# X_val/y_val are validation data, not a separate test set, so label them as such
print("Accuracy on validation set: {:.3f}".format(poly_svc.score(X_val, y_val)))

Model accuracy score with linear kernel and C=1.0 : 0.9328
Accuracy on training set: 1.000
Accuracy on test set: 0.933


In [17]:
# instantiate classifier with rbf kernel and C=1000
rbf_svc=SVC(kernel='rbf', C=1000.0)


# fit classifier to training set
rbf_svc.fit(X_train,y_train)


# make predictions on the validation set (the 20% hold-out from train_test_split)
y_pred_test=rbf_svc.predict(X_val)


# compute and print accuracy scores; kernel and C are read back from the estimator so the
# message cannot drift from the hyperparameters that were actually used
print('Model accuracy score with {} kernel and C={} : {:0.4f}'.format(rbf_svc.kernel, rbf_svc.C, accuracy_score(y_val, y_pred_test)))
print("Accuracy on training set: {:.3f}".format(rbf_svc.score(X_train, y_train)))
# X_val/y_val are validation data, not a separate test set, so label them as such
print("Accuracy on validation set: {:.3f}".format(rbf_svc.score(X_val, y_val)))

Model accuracy score with linear kernel and C=1.0 : 0.9328
Accuracy on training set: 1.000
Accuracy on test set: 0.933


In [18]:
from sklearn.ensemble import RandomForestClassifier
# Small random forest (5 trees); the fixed seed makes the fitted model reproducible
forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
print("\nRandom Forests")
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
# X_val/y_val are the 20% hold-out from train_test_split, i.e. validation data, not a test set
print("Accuracy on validation set: {:.3f}".format(forest.score(X_val, y_val)))


Random Forests
Accuracy on training set: 0.991
Accuracy on test set: 0.881


In [19]:
from sklearn.tree import DecisionTreeClassifier
# Train a decision tree model; the fixed seed makes the fitted tree reproducible
tree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
print("\nDecision Tree")
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
# X_val/y_val are the 20% hold-out from train_test_split, i.e. validation data, not a test set
print("Accuracy on validation set: {:.3f}".format(tree.score(X_val, y_val)))


Decision Tree
Accuracy on training set: 1.000
Accuracy on test set: 0.739


In [20]:
import pickle 
# Persist the fitted random forest, then read it back to confirm the file round-trips.
# Context managers close both file handles deterministically instead of leaving them
# to the garbage collector.
modelPath = '../trained_speaker_model.sav'
with open(modelPath, 'wb') as modelFile:
    pickle.dump(forest, modelFile)
# NOTE: unpickling can execute arbitrary code — only load files produced by this notebook
with open(modelPath, 'rb') as modelFile:
    model = pickle.load(modelFile)

In [21]:
def extractWavFeatures(audioPath='../audio/audio.wav'):
    """Return the feature vector of a single recording as a flat list.

    The values are the time-averaged chroma (12), mel (128), spectral-contrast (7),
    tonnetz (6) and MFCC (40) features, in that order — the same order as the feature
    columns written to the training CSV.

    NOTE: this definition reuses the name of the three-argument ``extractWavFeatures``
    defined earlier in the notebook and shadows it once this cell has run.

    Parameters
    ----------
    audioPath : str, optional
        Recording to analyse; defaults to the path the notebook originally hard-coded.

    Returns
    -------
    list
        The feature values, one scalar per entry.
    """
    # Only the first 2.5 seconds of the recording are analysed
    X, sample_rate = librosa.load(audioPath, mono=True, duration=2.5)

    # Mel-frequency cepstral coefficients (MFCCs), averaged over the frames
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)

    # Magnitude of the short-time Fourier transform, shared by chroma and contrast
    stft = np.abs(librosa.stft(y=X))

    # Chromagram computed from the spectrogram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Mel-scaled spectrogram; the signal must be passed as y= because librosa >= 0.10
    # no longer accepts it positionally
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # Tonal centroid features (tonnetz) of the harmonic component
    tonnetz = np.mean(
        librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,
        axis=0,
    )

    # Concatenate in training-column order; iterating a 1-D array yields its scalars,
    # which is what the per-element np.mean() of a scalar produced before
    list_of_features = []
    for featureVector in (chroma, mel, contrast, tonnetz, mfccs):
        list_of_features.extend(featureVector)

    return list_of_features


# Wrap the single feature vector in a list so the classifiers receive one sample (one row)
speech_features = [extractWavFeatures()]

# Print every fitted classifier's prediction for that recording, one line per model
for classifier in (forest, svc, tree, poly_svc, rbf_svc):
    print(classifier.predict(speech_features))

[2]
[4]
[2]
[4]
[4]


In [15]:
import librosa
import numpy as np
# Load at most the first 2.5 seconds of one recording as a mono signal.
# NOTE(review): this rebinds X, the name the train/validation split cell uses for the feature matrix.
X, sample_rate = librosa.load('dataset1/Nada/Recording.wav', mono=True, duration=2.5)

# 40 Mel-frequency cepstral coefficients per frame, averaged over the frames
mfccs = np.mean(
    librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,
    axis=0,
)

# Collect the averaged coefficients in a plain list and display it as the cell output
list_of_features = [np.mean(e) for e in mfccs]
list_of_features

[-510.5929,
 88.87672,
 31.873777,
 17.811573,
 3.0700498,
 4.767426,
 -5.014845,
 -2.5228958,
 -7.2420354,
 1.643732,
 -1.8049115,
 -7.4008164,
 -6.4478846,
 3.066079,
 -2.176268,
 -4.572109,
 -6.106249,
 -8.015044,
 -2.8094447,
 -6.4979606,
 -1.7176459,
 -4.9890847,
 -0.52704436,
 2.449116,
 1.0694981,
 -0.5652438,
 1.8900669,
 1.4032432,
 -2.0896835,
 -1.0223411,
 -0.4114135,
 4.6742864,
 4.0699544,
 4.6058636,
 4.795103,
 3.6240046,
 1.9721831,
 0.77030706,
 -0.7970575,
 0.53832895]