In [None]:
# Switch for the feature-extraction step: when True, the WAV files are read and
# their features are written to the CSV files.
# Extraction is the most time-consuming part of the notebook, so only enable it
# when the CSV files do not exist yet.
CREATE_CSV_FILES = True

In [None]:
# CSV files the extracted features are written to and read back from,
# grouped as (train, test) pairs.
TRAIN_CSV_FILE, TEST_CSV_FILE = "train.csv", "test.csv"
MORE_TRAIN_CSV_FILE, MORE_TEST_CSV_FILE = "more_train.csv", "more_test.csv"


In [None]:
import numpy as np
import librosa
import csv
import os
import librosa.display
# Start from clean CSV files, but only when they are about to be regenerated:
# extractWavFeatures opens its CSV in append mode, so leftovers would be
# duplicated, while deleting the files with CREATE_CSV_FILES switched off would
# leave the later cells with nothing to read.
if CREATE_CSV_FILES:
    for staleCsvFile in (TRAIN_CSV_FILE, TEST_CSV_FILE):
        # isfile() is already False for a missing path, so no exists() check is needed.
        if os.path.isfile(staleCsvFile):
            os.remove(staleCsvFile)
def extractWavFeatures(soundFilesFolder, csvFileName,label):
    """Append one row of audio features per file in a folder to a CSV file.

    Every regular file found directly inside ``soundFilesFolder`` is loaded
    with librosa (mono, at most 30 seconds). Five feature matrices are computed
    and each is averaged along its last axis, giving one vector per feature
    family. The row written is: file name, chroma, mel, spectral contrast,
    tonnetz, MFCC values, and finally ``label``.

    The header declares 12 chroma, 128 mel, 7 contrast, 6 tonnetz and 40 mfcc
    columns. Only the MFCC count is set explicitly (``n_mfcc=40``); the other
    counts are assumed to match librosa's defaults.

    Parameters
    ----------
    soundFilesFolder : str
        Folder whose files are read. Sub-directories are skipped.
    csvFileName : str
        CSV file to append to. The header row is written only when the file
        does not exist yet, so several folders can share one CSV.
    label
        Value stored in the ``label`` column of every row written by this call.
    """
    print("The features of the files in the folder "+soundFilesFolder+" will be saved to "+csvFileName)

    # (column prefix, number of columns), in the order the values are written.
    featureLayout = (('chroma', 12), ('mel', 128), ('contrast', 7), ('tonnetz', 6), ('mfcc', 40))
    header = ['filename']
    for prefix, count in featureLayout:
        header.extend(f'{prefix}{i}' for i in range(1, count + 1))
    header.append('label')

    # Decide before opening: opening in append mode creates the file.
    writeHeader = not os.path.exists(csvFileName)

    # The context manager closes the file even if a recording fails to load.
    with open(csvFileName, 'a', newline='') as file:
        writer = csv.writer(file)
        if writeHeader:
            writer.writerow(header)

        # os.listdir order is arbitrary; sorting makes the CSV reproducible.
        for filename in sorted(os.listdir(soundFilesFolder)):
            soundPath = os.path.join(soundFilesFolder, filename)
            if not os.path.isfile(soundPath):
                continue
            X, sample_rate = librosa.load(soundPath, mono=True, duration=30)

            # Generate Mel-frequency cepstral coefficients (MFCCs) from a time series
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)

            # Magnitude of the short-time Fourier transform, shared by chroma and contrast
            stft = np.abs(librosa.stft(X))

            # Computes a chromagram from the magnitude spectrogram
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)

            # Computes a mel-scaled spectrogram. The audio must be passed as y=...:
            # recent librosa releases no longer accept it positionally.
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)

            # Computes spectral contrast
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)

            # Computes the tonal centroid features (tonnetz) on the harmonic component
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
                                                      sr=sample_rate).T,axis=0)

            # Build the row as a list rather than a space-joined string, so a
            # file name containing spaces stays in a single column.
            row = [filename]
            for featureVector in (chroma, mel, contrast, tonnetz, mfccs):
                row.extend(str(value) for value in featureVector)
            row.append(str(label))
            writer.writerow(row)

if CREATE_CSV_FILES == True:
    # Each training folder is appended to the same CSV; its position in this
    # list is the label stored with its rows.
    trainFolders = [
        "dataset2/Dina",
        "dataset2/Kareeman",
        "dataset2/Mariam",
        "dataset2/Nada",
        "dataset2/others",
    ]
    for folderLabel, trainFolder in enumerate(trainFolders):
        extractWavFeatures(trainFolder, TRAIN_CSV_FILE, folderLabel)

    # The test folder goes to its own CSV, with label 0 on every row.
    extractWavFeatures("dataset2/test", TEST_CSV_FILE, 0)
    print("CSV files are created")
else:
    print("CSV files creation is skipped")


In [None]:
# Sanity check on one recording: compute a couple of the feature vectors and
# look at their lengths.
X, sample_rate = librosa.load("dataset2/Dina/h_close_(1).wav", mono=True, duration=30)
stft = np.abs(librosa.stft(X))

# chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
# print(chroma)

# mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)
# mel.shape

mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)

# contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
# contrast.shape

tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),sr=sample_rate).T,axis=0)

# A notebook cell only displays its last expression, so a bare `mfccs.shape`
# in the middle of the cell is silently discarded. Show both shapes together.
mfccs.shape, tonnetz.shape

In [None]:
# melspec_mean_list=[]
# melspec_var_list=[]
# for i in range(1,16):
#     y, sr = librosa.load(f'..\\data\\recordings\\Dina\\open\\h_open_({i}).wav', mono=True, duration=30)
#     # this is the number of samples in a window per fft
#     # n_fft = 2048
#     # # The amount of samples we are shifting after each fft
#     # hop_length = 512
#     mel_signal = librosa.feature.melspectrogram(y=y, sr=sr)
#     melspec_mean = np.mean(mel_signal)
#     melspec_var = np.var(mel_signal)
#     melspec_mean_list.append(melspec_mean)
#     melspec_var_list.append(melspec_var)
#     # print(f'mean{i}:{melspec_mean}')
#     # print(f'var{i}:{melspec_var}')
#     # spectrogram = np.abs(mel_signal)
#     # power_to_db = librosa.power_to_db(spectrogram, ref=np.max)
#     # librosa.display.specshow(power_to_db, sr=sr, x_axis='time', y_axis='mel', cmap='magma', hop_length=hop_length)
# print(f'mean_max{max(melspec_mean_list)},mean_min:{min(melspec_mean_list)},mean_mean{np.mean(melspec_mean_list)}')
# print(f'var_max{max(melspec_var_list)},var_min:{min(melspec_var_list)},var_mean{np.mean(melspec_var_list)}')

In [None]:
# # melspec_mean_list=[]
# # melspec_var_list=[]
# # for i in range(1,16):
# y, sr = librosa.load(f'..\\data\\recordings\\Mariam\\close\\b_close_(1).wav', mono=True, duration=30)
# # this is the number of samples in a window per fft
# # n_fft = 2048
# # # The amount of samples we are shifting after each fft
# # hop_length = 512
# mel_signal = librosa.feature.melspectrogram(y=y, sr=sr)
# melspec_mean = np.mean(mel_signal)
# melspec_var = np.var(mel_signal)
# melspec_mean_list.append(melspec_mean)
# melspec_var_list.append(melspec_var)
# # print(f'mean{i}:{melspec_mean}')
# # print(f'var{i}:{melspec_var}')
# spectrogram = np.abs(mel_signal)
# power_to_db = librosa.power_to_db(spectrogram, ref=np.max)
# librosa.display.specshow(power_to_db, sr=sr, x_axis='time', y_axis='mel', cmap='magma', hop_length=hop_length)
# # print(f'mean_max{max(melspec_mean_list)},mean_min:{min(melspec_mean_list)},mean_mean{np.mean(melspec_mean_list)}')
# # print(f'var_max{max(melspec_var_list)},var_min:{min(melspec_var_list)},var_mean{np.mean(melspec_var_list)}')

In [None]:
#Reading a dataset and convert file name to corresponding number

import pandas as pd
import csv
from sklearn import preprocessing

def preProcessData(csvFileName):
    """Load a feature CSV and drop the column that is not used for training.

    Parameters
    ----------
    csvFileName : str
        Path of a CSV file that contains a ``filename`` column.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the ``filename`` column. Rows with too
        many fields are skipped instead of aborting the load.
    """
    print(csvFileName+ " will be preprocessed")
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines="skip"` is its replacement (requires pandas >= 1.3).
    data = pd.read_csv(csvFileName, on_bad_lines="skip")
    # The file name identifies the recording but is not a feature.
    data = data.drop(['filename'],axis=1)

    print("Preprocessing is finished")
    print(data.head())
    return data

# Load the training CSV first, then the test CSV.
trainData, testData = [
    preProcessData(csvPath) for csvPath in (TRAIN_CSV_FILE, TEST_CSV_FILE)
]
# The MORE_TRAIN_CSV_FILE / MORE_TEST_CSV_FILE files are currently not loaded:
# moreTrainData = preProcessData(MORE_TRAIN_CSV_FILE)
# moreTestData = preProcessData(MORE_TEST_CSV_FILE)



## Section 2

There are 50 recordings for each digit for each speaker: Jackson, Nicolas and Theo (total 1500 recordings)

Training data has 49 recordings for each digit for each speaker: 1470 recordings in total.
Test data has 1 recording for each digit for each speaker: 30 recordings in total.

The data used here comes from the recordings stored in:
* ../data/recordings/train
* ../data/recordings/test

The model will be trained to predict the spoken digit.

In [None]:
# Build the feature matrices / label vectors and hold out 20% of the training
# rows as a validation split.
from sklearn.model_selection import train_test_split

# In both frames the last column is the label; everything before it is a feature.
trainFeatures, trainLabels = trainData.iloc[:, :-1], trainData.iloc[:, -1]
testFeatures, testLabels = testData.iloc[:, :-1], testData.iloc[:, -1]

X = np.array(trainFeatures, dtype = float)
y = trainLabels
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_test = np.array(testFeatures, dtype = float)
y_test = testLabels

for splitName, splitLabels in (("training", y_train), ("validation", y_val), ("test", y_test)):
    print(f"Y from {splitName} data:", splitLabels.shape)


In [None]:
# #Normalizing the dataset
# from sklearn.preprocessing import StandardScaler
# import numpy as np
# scaler = StandardScaler()
# X_train = scaler.fit_transform( X_train )
# X_val = scaler.transform( X_val )
# X_test = scaler.transform( X_test )

# print("X from training data", X_train.shape)
# print("X from validation data", X_val.shape)
# print("X from test data", X_test.shape)


In [None]:
# Baseline: a support vector classifier with scikit-learn's default
# hyperparameters, scored on the validation split.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svc = SVC()
svc.fit(X_train, y_train)

# Predict the held-out validation rows and compare with their true labels.
y_pred = svc.predict(X_val)
defaultAccuracy = accuracy_score(y_val, y_pred)
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(defaultAccuracy))

In [None]:
# instantiate classifier with the default (rbf) kernel and C=10000
svc=SVC(C=10000.0)


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on the validation set
y_pred=svc.predict(X_val)


# compute and print accuracy score; the kernel and C are read back from the
# estimator so the message always matches the configuration actually used
# (it previously claimed C=100.0 while the model was built with C=10000.0)
print('Model accuracy score with {} kernel and C={} : {:0.4f}'.format(svc.kernel, svc.C, accuracy_score(y_val, y_pred)))

In [65]:
# instantiate classifier with polynomial kernel and C=1000.0
poly_svc=SVC(kernel='poly', C=1000.0)


# fit classifier to training set
poly_svc.fit(X_train,y_train)


# make predictions on the validation set (variable name kept as y_pred_test
# in case other cells refer to it)
y_pred_test=poly_svc.predict(X_val)


# compute and print accuracy score; the kernel and C are read back from the
# estimator so the message matches the model (it previously said
# "linear kernel and C=1.0" for a poly kernel with C=1000.0)
print('Model accuracy score with {} kernel and C={} : {:0.4f}'.format(poly_svc.kernel, poly_svc.C, accuracy_score(y_val, y_pred_test)))
print("Accuracy on training set: {:.3f}".format(poly_svc.score(X_train, y_train)))
# X_val / y_val are the validation split, not the separate test set
print("Accuracy on validation set: {:.3f}".format(poly_svc.score(X_val, y_val)))

Model accuracy score with linear kernel and C=1.0 : 0.9730
Accuracy on training set: 1.000
Accuracy on test set: 0.973


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 5 trees and a fixed seed, for comparison with the SVC models
forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
print("\nRandom Forests")
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
# X_val / y_val are the validation split, not the separate test set
print("Accuracy on validation set: {:.3f}".format(forest.score(X_val, y_val)))

In [66]:
import pickle

# Single source of truth for where the trained model lives, so the save and the
# reload below cannot point at different files (the reload previously read
# 'trainedModel.sav', which this notebook never writes).
MODEL_PATH = '../trained_speaker_model.sav'

# Persist the polynomial-kernel SVC; the context manager closes the file handle.
with open(MODEL_PATH, 'wb') as modelFile:
    pickle.dump(poly_svc, modelFile)

# Reload the file that was just written to confirm it round-trips.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(MODEL_PATH, 'rb') as modelFile:
    model = pickle.load(modelFile)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Train a decision tree with a fixed seed, for comparison with the other models
tree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
print("\nDecision Tree")
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
# X_val / y_val are the validation split, not the separate test set
print("Accuracy on validation set: {:.3f}".format(tree.score(X_val, y_val)))