In [25]:
# If true, the WAV files will be read and their features will be saved in the CSV files
# As this is the most time consuming task, only enable it if you don't have the CSV files yet
CREATE_CSV_FILES = True

In [26]:
# Defines the names of the CSV files
TRAIN_CSV_FILE = "train.csv"
TEST_CSV_FILE = "test.csv"
MORE_TRAIN_CSV_FILE = "more_train.csv"
MORE_TEST_CSV_FILE = "more_test.csv"


In [85]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
import librosa
import csv
import os
import chromaFeatures 
import librosa.display
if(os.path.exists(TRAIN_CSV_FILE) and os.path.isfile(TRAIN_CSV_FILE)):
    os.remove(TRAIN_CSV_FILE)
def extractWavFeatures(soundFilesFolder, csvFileName,label):
    print("The features of the files in the folder "+soundFilesFolder+" will be saved to "+csvFileName)
    header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate '
    # header = 'filename '
    for i in range(1, 21):
        header += f'mfcc{i} '
    header += 'label '
    header = header.split()
    if not os.path.exists(csvFileName):
        file = open(csvFileName, 'a', newline='')
        writer = csv.writer(file)
        writer.writerow(header)
    else:
        file = open(csvFileName, 'a', newline='')
        writer = csv.writer(file)
    for filename in os.listdir(soundFilesFolder):
        number = f'{soundFilesFolder}/{filename}'
        y, sr = librosa.load(number, mono=True, duration=2.5)
        # remove leading and trailing silence
        y, index = librosa.effects.trim(y)
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        # chroma_stft = chromaFeatures.chroma_stft(y=y, sr=sr)
        rmse = librosa.feature.rms(y=y)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'
        for e in mfcc:
            to_append += f' {np.mean(e)}'
        to_append+=f' {label}'
        writer.writerow(to_append.split())
        # writer.writerow(to_append.split())
    file.close()

if (CREATE_CSV_FILES == True):
    extractWavFeatures("rec/others", TRAIN_CSV_FILE,2)
    extractWavFeatures("rec/close", TRAIN_CSV_FILE,1)
    extractWavFeatures("rec/open", TRAIN_CSV_FILE,0)
    print("CSV files are created")
else:
    print("CSV files creation is skipped")


The features of the files in the folder rec/others will be saved to train.csv
The features of the files in the folder rec/close will be saved to train.csv
The features of the files in the folder rec/open will be saved to train.csv
CSV files are created


In [86]:
#Reading a dataset and convert file name to corresponding number

import pandas as pd
import csv
from sklearn import preprocessing

def preProcessData(csvFileName):
    print(csvFileName+ " will be preprocessed")
    data = pd.read_csv(csvFileName, error_bad_lines=False)
    # data['number'] = data['filename'].str[:1]
    #Dropping unnecessary columns
    data = data.drop(['filename'],axis=1)
    # data = data.drop(['label'],axis=1)
    # data = data.drop(['chroma_stft'],axis=1)
    data.shape

    print("Preprocessing is finished")
    # print(data.head())
    return data

trainData = preProcessData(TRAIN_CSV_FILE)
# testData = preProcessData(TEST_CSV_FILE)
# moreTrainData = preProcessData(MORE_TRAIN_CSV_FILE)
# moreTestData = preProcessData(MORE_TEST_CSV_FILE)



train.csv will be preprocessed
Preprocessing is finished




  data = pd.read_csv(csvFileName, error_bad_lines=False)


In [87]:
# Splitting the dataset into training, validation and testing dataset
from sklearn.model_selection import train_test_split
X = np.array(trainData.iloc[:, :-1], dtype = float)
y = trainData.iloc[:, -1]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# X_test = np.array(testData.iloc[:, :-1], dtype = float)
# y_test = testData.iloc[:, -1]

print("Y from training data:", y_train.shape)
print("Y from validation data:", y_val.shape)
# print("Y from test data:", y_test.shape)


Y from training data: (576,)
Y from validation data: (144,)


In [30]:
# import SVC classifier
from sklearn.svm import SVC


# import metrics to compute accuracy
from sklearn.metrics import accuracy_score


# instantiate classifier with default hyperparameters
svc=SVC() 


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on test set
y_pred=svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

Model accuracy score with default hyperparameters: 0.3819


In [31]:
# instantiate classifier with rbf kernel and C=100
svc=SVC(C=10000.0) 


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on test set
y_pred=svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

Model accuracy score with rbf kernel and C=100.0 : 0.8333


In [32]:
# instantiate classifier with linear kernel and C=1.0
poly_svc=SVC(kernel='poly', C=1000000.0) 


# fit classifier to training set
poly_svc.fit(X_train,y_train)


# make predictions on test set
y_pred_test=poly_svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with linear kernel and C=1.0 : {0:0.4f}'. format(accuracy_score(y_val, y_pred_test)))
print("Accuracy on training set: {:.3f}".format(poly_svc.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(poly_svc.score(X_val, y_val)))

Model accuracy score with linear kernel and C=1.0 : 0.8194
Accuracy on training set: 0.931
Accuracy on test set: 0.819


In [88]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=25, random_state=0).fit(X_train, y_train)
print("\nRandom Forests")
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_val, y_val)))


Random Forests
Accuracy on training set: 1.000
Accuracy on test set: 0.868


In [34]:
def extractWavFeatures():
    list_of_features=[]
    y, sr = librosa.load('../audio/audio.wav', mono=True, duration=2.5)
    # remove leading and trailing silence
    y, index = librosa.effects.trim(y)

    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    list_of_features.append(np.mean(chroma_stft))
    list_of_features.append(np.mean(rmse))
    list_of_features.append(np.mean(spec_cent))
    list_of_features.append(np.mean(spec_bw))
    list_of_features.append(np.mean(rolloff))
    list_of_features.append(np.mean(zcr))

    for e in mfcc:
            list_of_features.append(np.mean(e))
    
    return(list_of_features)

In [35]:
speech_features=[]
speech_features.append(extractWavFeatures())

In [36]:
forest.predict(speech_features)

array([1], dtype=int64)

In [37]:
svc.predict(speech_features)

array([0], dtype=int64)

In [38]:
from sklearn.tree import DecisionTreeClassifier
#Train decision tree model
tree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
print("\nDecision Tree")
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_val, y_val)))


Decision Tree
Accuracy on training set: 1.000
Accuracy on test set: 0.799


In [89]:
import pickle 
pickle.dump(forest ,open('../trainedModel.sav' , 'wb'))
model= pickle.load(open('../trainedModel.sav' , 'rb'))

In [40]:
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
gmm=GaussianMixture(n_components=5)
gmm.fit(X_train,y_train)
ygmm_pred_class = gmm.predict(X_val)
print(accuracy_score(y_val, ygmm_pred_class))
print(confusion_matrix(y_val, ygmm_pred_class))

0.24305555555555555
[[14  2 12 12]
 [20 11 12  5]
 [29  0 10 17]
 [ 0  0  0  0]]


In [41]:
# instantiate classifier with linear kernel and C=1.0
rbf_svc=SVC(kernel='rbf', C=500000.0) 


# fit classifier to training set
rbf_svc.fit(X_train,y_train)


# make predictions on test set
y_pred_test=rbf_svc.predict(X_val)


# compute and print accuracy score
print('Model accuracy score with linear kernel and C=1.0 : {0:0.4f}'. format(accuracy_score(y_val, y_pred_test)))
print("Accuracy on training set: {:.3f}".format(rbf_svc.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rbf_svc.score(X_val, y_val)))

Model accuracy score with linear kernel and C=1.0 : 0.7917
Accuracy on training set: 0.943
Accuracy on test set: 0.792


In [42]:
def extractWavFeatures():
    list_of_features=[]
    y, sr = librosa.load('../audio/audio.wav', mono=True, duration=2.5)
    # remove leading and trailing silence
    y, index = librosa.effects.trim(y)

    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    list_of_features.append(np.mean(chroma_stft))
    list_of_features.append(np.mean(rmse))
    list_of_features.append(np.mean(spec_cent))
    list_of_features.append(np.mean(spec_bw))
    list_of_features.append(np.mean(rolloff))
    list_of_features.append(np.mean(zcr))

    for e in mfcc:
            list_of_features.append(np.mean(e))
    
    return(list_of_features)
speech_features=[]
speech_features.append(extractWavFeatures())
print(forest.predict(speech_features))
print(svc.predict(speech_features))
print(tree.predict(speech_features))
print(poly_svc.predict(speech_features))
print(rbf_svc.predict(speech_features))
# print(gmm.predict(speech_features))

[1]
[0]
[2]
[0]
[1]


In [43]:
from scipy.io.wavfile import read

In [44]:
def extractFeatures(y,sr):
    list_of_features=[]
    # remove leading and trailing silence
    y, index = librosa.effects.trim(y)
    print(y)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    list_of_features.append(np.mean(chroma_stft))
    list_of_features.append(np.mean(rmse))
    list_of_features.append(np.mean(spec_cent))
    list_of_features.append(np.mean(spec_bw))
    list_of_features.append(np.mean(rolloff))
    list_of_features.append(np.mean(zcr))
    mfccs=[]
    for e in mfcc:
            mfccs.append(np.mean(e))
    
    return np.hstack((list_of_features,mfccs))

In [45]:
def train_model(soundFilesFolder):

	feature = np.asarray(())
	for filename in os.listdir(soundFilesFolder):    

		number = f'{soundFilesFolder}/{filename}'
		audio, sr = librosa.load(number, mono=True, duration=2.5)
		# sr,audio = read(number)
		print(sr)
		vector   = extractFeatures(audio,sr)
		
		
		if feature.size == 0:
			print('vector')
			feature = vector

		else:
			feature = np.vstack((feature, vector))


	gmm = GaussianMixture(n_components = 6, max_iter = 200, covariance_type='diag',n_init = 3)
	gmm.fit(feature)

	# dumping the trained gaussian model
	picklefile = soundFilesFolder.split("/")[-1]+".gmm"
	pickle.dump(gmm,open(f'../{picklefile}','wb'))
	print('+ modeling completed for speaker:',picklefile," with data point = ",feature.shape)   
	feature = np.asarray(())

In [75]:
train_model("newrec/open")
train_model("newrec/others")
#train_model("rec/others")

22050
[-0.00018254 -0.00018562 -0.0001748  ...  0.06947094  0.06782467
  0.09004407]
vector
22050
[ 1.40542426e-04  1.12225935e-04  8.89846779e-05 ... -3.23765154e-04
 -2.35908476e-04 -1.05013169e-04]
22050
[ 0.00033856  0.00036505  0.00040724 ... -0.00921643 -0.00875335
 -0.00563499]
22050
[ 0.00044324  0.00047742  0.00049586 ... -0.00029099 -0.0003286
 -0.00056356]
22050
[ 0.00024133  0.00025687  0.00029747 ... -0.01811349 -0.01845731
 -0.02007313]
22050
[-0.00036922 -0.00039672 -0.00039657 ... -0.00183985 -0.00182444
 -0.00188741]
22050
[0.00015304 0.00017658 0.00021368 ... 0.0001656  0.00016813 0.00014476]
22050
[-0.00052244 -0.00051186 -0.00048813 ... -0.00131245 -0.0011854
 -0.00123955]
22050
[-0.0002768  -0.00025817 -0.00021809 ... -0.00155373 -0.00142404
 -0.00184758]
22050
[ 0.00013073  0.00016686  0.0001575  ... -0.00010608 -0.00036912
 -0.00056906]
22050
[0.00030923 0.00033778 0.00036373 ... 0.00091748 0.00088759 0.00085802]
22050
[ 3.4500910e-05  2.2717488e-05  1.4706326e-0

In [84]:
def test_model():

	

	gmm_files = ['../open.gmm','../others.gmm']

	#Load the Gaussian gender Models
	models    = [pickle.load(open(fname,'rb')) for fname in gmm_files]
	words   = ['open','others']
	  

	audio, sr = librosa.load('../audio/audio.wav', mono=True, duration=2.5)
	vector   = extractFeatures(audio,sr)

	log_likelihood = np.zeros(len(models)) 

	for i in range(len(models)):
		gmm    = models[i]  #checking with each model one by one
		scores = np.array(gmm.score(vector.reshape(1,-1)))
		log_likelihood[i] = scores.sum()

	print(log_likelihood)
	winner = np.argmax(log_likelihood)
	print("\tdetected as - ", words[winner])
test_model()

[-8.2107275e-05 -8.8565568e-05 -9.3367438e-05 ...  3.0893902e-04
  2.7509200e-04  2.1881795e-04]
[-91.73662548 -91.5272346 ]
	detected as -  others


In [48]:
def extractWavFeatures():
    list_of_features=[]
    y, sr = librosa.load('../audio/audio.wav', mono=True, duration=2.5)
    # remove leading and trailing silence
    y, index = librosa.effects.trim(y)

    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    list_of_features.append(np.mean(chroma_stft))
    list_of_features.append(np.mean(rmse))
    list_of_features.append(np.mean(spec_cent))
    list_of_features.append(np.mean(spec_bw))
    list_of_features.append(np.mean(rolloff))
    list_of_features.append(np.mean(zcr))

    for e in mfcc:
            list_of_features.append(np.mean(e))
    
    return(list_of_features)
speech_features=[]
speech_features.append(extractWavFeatures())
print(forest.predict(speech_features))
print(svc.predict(speech_features))
print(tree.predict(speech_features))
print(poly_svc.predict(speech_features))
print(rbf_svc.predict(speech_features))
# print(gmm.predict(speech_features))

[0]
[1]
[1]
[1]
[0]
