In [1]:
# If true, the WAV files will be read and their features will be saved in the CSV files
# As this is the most time consuming task, only enable it if you don't have the CSV files yet
CREATE_CSV_FILES = True

In [2]:
# Defines the names of the CSV files
TRAIN_CSV_FILE = "train.csv"
TEST_CSV_FILE = "test.csv"
MORE_TRAIN_CSV_FILE = "more_train.csv"
MORE_TEST_CSV_FILE = "more_test.csv"


In [3]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
import librosa
import csv
import os
import chromaFeatures 
import librosa.display
if(os.path.exists(TRAIN_CSV_FILE) and os.path.isfile(TRAIN_CSV_FILE)):
    os.remove(TRAIN_CSV_FILE)
if(os.path.exists(TEST_CSV_FILE) and os.path.isfile(TEST_CSV_FILE)):
    os.remove(TEST_CSV_FILE)
def extractWavFeatures(soundFilesFolder, csvFileName,label):
    print("The features of the files in the folder "+soundFilesFolder+" will be saved to "+csvFileName)
    header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate '
    # header = 'filename '
    for i in range(1, 21):
        header += f'mfcc{i} '
    header += 'label '
    header = header.split()
    if not os.path.exists(csvFileName):
        file = open(csvFileName, 'a', newline='')
        writer = csv.writer(file)
        writer.writerow(header)
    else:
        file = open(csvFileName, 'a', newline='')
        writer = csv.writer(file)
    genres = '1 2 3 4 5 6 7 8 9 0'.split()
    for filename in os.listdir(soundFilesFolder):
        number = f'{soundFilesFolder}/{filename}'
        y, sr = librosa.load(number, mono=True, duration=30)
        # remove leading and trailing silence
        y, index = librosa.effects.trim(y)
        # chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_stft = chromaFeatures.chroma_stft(y=y, sr=sr)
        rmse = librosa.feature.rms(y=y)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'
        for e in mfcc:
            to_append += f' {np.mean(e)}'
        to_append+=f' {label}'
        writer.writerow(to_append.split())
        # writer.writerow(to_append.split())
    file.close()

if (CREATE_CSV_FILES == True):
    extractWavFeatures("recordings/close", TRAIN_CSV_FILE,1)
    extractWavFeatures("recordings/open", TRAIN_CSV_FILE,0)
    extractWavFeatures("recordings/open", TEST_CSV_FILE,0)
    print("CSV files are created")
else:
    print("CSV files creation is skipped")


The features of the files in the folder recordings/close will be saved to train.csv
The features of the files in the folder recordings/open will be saved to train.csv
The features of the files in the folder recordings/open will be saved to test.csv
CSV files are created


In [4]:
# melspec_mean_list=[]
# melspec_var_list=[]
# for i in range(1,16):
#     y, sr = librosa.load(f'..\\data\\recordings\\Dina\\open\\h_open_({i}).wav', mono=True, duration=30)
#     # this is the number of samples in a window per fft
#     # n_fft = 2048
#     # # The amount of samples we are shifting after each fft
#     # hop_length = 512
#     mel_signal = librosa.feature.melspectrogram(y=y, sr=sr)
#     melspec_mean = np.mean(mel_signal)
#     melspec_var = np.var(mel_signal)
#     melspec_mean_list.append(melspec_mean)
#     melspec_var_list.append(melspec_var)
#     # print(f'mean{i}:{melspec_mean}')
#     # print(f'var{i}:{melspec_var}')
#     # spectrogram = np.abs(mel_signal)
#     # power_to_db = librosa.power_to_db(spectrogram, ref=np.max)
#     # librosa.display.specshow(power_to_db, sr=sr, x_axis='time', y_axis='mel', cmap='magma', hop_length=hop_length)
# print(f'mean_max{max(melspec_mean_list)},mean_min:{min(melspec_mean_list)},mean_mean{np.mean(melspec_mean_list)}')
# print(f'var_max{max(melspec_var_list)},var_min:{min(melspec_var_list)},var_mean{np.mean(melspec_var_list)}')

In [5]:
# # melspec_mean_list=[]
# # melspec_var_list=[]
# # for i in range(1,16):
# y, sr = librosa.load(f'..\\data\\recordings\\Mariam\\close\\b_close_(1).wav', mono=True, duration=30)
# # this is the number of samples in a window per fft
# # n_fft = 2048
# # # The amount of samples we are shifting after each fft
# # hop_length = 512
# mel_signal = librosa.feature.melspectrogram(y=y, sr=sr)
# melspec_mean = np.mean(mel_signal)
# melspec_var = np.var(mel_signal)
# melspec_mean_list.append(melspec_mean)
# melspec_var_list.append(melspec_var)
# # print(f'mean{i}:{melspec_mean}')
# # print(f'var{i}:{melspec_var}')
# spectrogram = np.abs(mel_signal)
# power_to_db = librosa.power_to_db(spectrogram, ref=np.max)
# librosa.display.specshow(power_to_db, sr=sr, x_axis='time', y_axis='mel', cmap='magma', hop_length=hop_length)
# # print(f'mean_max{max(melspec_mean_list)},mean_min:{min(melspec_mean_list)},mean_mean{np.mean(melspec_mean_list)}')
# # print(f'var_max{max(melspec_var_list)},var_min:{min(melspec_var_list)},var_mean{np.mean(melspec_var_list)}')

In [6]:
# y, sr = librosa.load('..\\recordings\\test\\close(13).wav', mono=True, duration=30)
# mfcc = librosa.feature.mfcc(y=y, sr=sr)
# mfcc
# mfcc_list=[]
# for e in mfcc:
#             mfcc_list.append(e)
# print(len(mfcc_list))
# print(mfcc_list)
# # mfcc.shape



# import freature_ex as ex

# mfcc=ex.get_mfcc(y,sr)
# mfcc_list=[]
# for e in mfcc:
#             mfcc_list.append(e)
# print(len(mfcc_list))
# print(mfcc_list)

In [7]:
# y, sr = librosa.load('..\\recordings\\test\\close(13).wav', mono=True, duration=30)
# op=np.mean(librosa.feature.chroma_stft(y=y,sr=sr))
# op

In [8]:
# import chromaFeatures as ft 
# op= np.mean(ft.chroma_stft(y=y,sr=sr))
# op

In [9]:
#Reading a dataset and convert file name to corresponding number

import pandas as pd
import csv
from sklearn import preprocessing

def preProcessData(csvFileName):
    print(csvFileName+ " will be preprocessed")
    data = pd.read_csv(csvFileName, error_bad_lines=False)
    # data['number'] = data['filename'].str[:1]
    #Dropping unnecessary columns
    data = data.drop(['filename'],axis=1)
    # data = data.drop(['label'],axis=1)
    # data = data.drop(['chroma_stft'],axis=1)
    data.shape

    print("Preprocessing is finished")
    print(data.head())
    return data

trainData = preProcessData(TRAIN_CSV_FILE)
testData = preProcessData(TEST_CSV_FILE)
# moreTrainData = preProcessData(MORE_TRAIN_CSV_FILE)
# moreTestData = preProcessData(MORE_TEST_CSV_FILE)



train.csv will be preprocessed
Preprocessing is finished
   chroma_stft      rmse  spectral_centroid  spectral_bandwidth      rolloff  \
0     0.292520  0.019473        1567.592213         1884.101824  3277.399331   
1     0.302334  0.023669        1891.507695         2064.112689  4235.807720   
2     0.348398  0.020244        1533.034578         1912.124666  3618.361151   
3     0.299812  0.015033        1157.071959         1460.133266  2096.174504   
4     0.321138  0.023743        1227.748763         1635.783312  2557.649850   

   zero_crossing_rate       mfcc1       mfcc2      mfcc3      mfcc4  ...  \
0            0.081148 -459.894409  115.394875  24.466005  19.742487  ...   
1            0.100894 -399.525299  105.455101  32.384209  39.936787  ...   
2            0.066713 -436.080231  117.834846  23.701845  33.740929  ...   
3            0.061080 -465.447052  134.356430  25.707594  32.438938  ...   
4            0.055948 -426.604523  132.837311  16.490423  40.396248  ...   

     



  data = pd.read_csv(csvFileName, error_bad_lines=False)


  data = pd.read_csv(csvFileName, error_bad_lines=False)


## Section 2

There are 50 recordings for each digit for each speaker: Jackson, Nicolas and Theo (total 1500 recordings)

Training data has 49 recordings for each digit for each speaker: 1470 recordings total.
Test data has 1 recordings for each digit for each speaker: 30 recordings total.

The data used here comes from the recordings stored in:
* ../data/recordings/train
* ../data/recordings/test

The model will be trained to predict the spoken digit.

In [10]:
# Splitting the dataset into training, validation and testing dataset
from sklearn.model_selection import train_test_split
X = np.array(trainData.iloc[:, :-1], dtype = float)
y = trainData.iloc[:, -1]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_test = np.array(testData.iloc[:, :-1], dtype = float)
y_test = testData.iloc[:, -1]

print("Y from training data:", y_train.shape)
print("Y from validation data:", y_val.shape)
print("Y from test data:", y_test.shape)


Y from training data: (256,)
Y from validation data: (64,)
Y from test data: (160,)


In [11]:
# #Normalizing the dataset
# from sklearn.preprocessing import StandardScaler
# import numpy as np
# scaler = StandardScaler()
# X_train = scaler.fit_transform( X_train )
# X_val = scaler.transform( X_val )
# X_test = scaler.transform( X_test )

# print("X from training data", X_train.shape)
# print("X from validation data", X_val.shape)
# print("X from test data", X_test.shape)


In [12]:
# import SVC classifier
from sklearn.svm import SVC


# import metrics to compute accuracy
from sklearn.metrics import accuracy_score


# instantiate classifier with default hyperparameters
svc=SVC() 


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on test set
y_pred=svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

Model accuracy score with default hyperparameters: 0.5781


In [13]:
# instantiate classifier with rbf kernel and C=100
svc=SVC(C=10000.0) 


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on test set
y_pred=svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

Model accuracy score with rbf kernel and C=100.0 : 0.7969


In [14]:
# instantiate classifier with linear kernel and C=1.0
linear_svc=SVC(kernel='poly', C=1000.0) 


# fit classifier to training set
linear_svc.fit(X_train,y_train)


# make predictions on test set
y_pred_test=linear_svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with linear kernel and C=1.0 : {0:0.4f}'. format(accuracy_score(y_val, y_pred_test)))
print("Accuracy on training set: {:.3f}".format(linear_svc.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(linear_svc.score(X_val, y_val)))

Model accuracy score with linear kernel and C=1.0 : 0.7500
Accuracy on training set: 0.773
Accuracy on test set: 0.750


In [15]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
print("\nRandom Forests")
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_val, y_val)))


Random Forests
Accuracy on training set: 0.977
Accuracy on test set: 0.734


In [16]:
import pickle 
pickle.dump(forest ,open('../trainedModel.sav' , 'wb'))
model= pickle.load(open('../trainedModel.sav' , 'rb'))

In [17]:
from sklearn.tree import DecisionTreeClassifier
#Train decision tree model
tree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
print("\nDecision Tree")
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_val, y_val)))


Decision Tree
Accuracy on training set: 1.000
Accuracy on test set: 0.766
