In [2]:
import tensorflow as tf
import os
import sys
from os import listdir
from os.path import isfile, join
import IPython.display as ipd
import librosa 
import librosa.display
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from scipy.io import wavfile as wav
import numpy as np
from timeit import default_timer as timer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Implementing CNN with MFCC feature only

In [3]:
filename='D:/UrbanSound8K/audio/fold1/103258-5-0-12.wav'
ipd.Audio(filename)

In [4]:
def extract_MFCC(file_name):
    audio, sample_rate=librosa.load(file_name, res_type='kaiser_fast')
    mfccs=np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate , n_mfcc=100).T,axis=0)
    
    return mfccs

In [5]:
start_time=timer()
file_name='D:/UrbanSound8K/audio/fold1/103258-5-0-12.wav'
a=extract_MFCC(file_name)
end_time=timer()
print('Time to extract MFCC from one file: {:.3f}sec'.format((end_time-start_time)/60))
print(a.shape)

Time to extract MFCC from one file: 0.059sec
(100,)


In [6]:
metadata=pd.read_csv('D:/UrbanSound8K/metadata/UrbanSound8K.csv')
metadata.tail()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class_name
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.20985,255.741948,2,7,1,car_horn
8731,99812-1-6-0.wav,99812,332.289233,334.821332,2,7,1,car_horn


In [8]:
fold_list = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8', 'fold9', 'fold10']

In [9]:
stacked_features = []
exceptions=0
count=0

start_time = timer()
for i in range(10):
    # get file names
    mypath = 'D:/UrbanSound8K/audio/'+ fold_list[i] + '/'
    files = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
    
    for fn in files:
        try: # extract features
            mfccs = extract_MFCC(fn)
            features=np.reshape((mfccs),(20,5))
            count+=1
            if(count%500 == 0):
                print(count)
        
            
            
        except: # else exception (.ds_store files are part of mac file systems)
            print(fn)
            exceptions += 1
            continue
            
        l_row = metadata.loc[metadata['slice_file_name']==fn.split('/')[-1]].values.tolist()
        label = l_row[0][-1]
        fold = i+1
    
        stacked_features.append([features, features.shape, label, fold])
        
            #print(f,old_samplerate,ss)
        
print("Exceptions: ", exceptions)
end_time = timer()
print(print("time taken: {0} minutes {1:.1f} seconds".format((end_time - start_time)//60, (end_time - start_time)%60)))
print('Finished feature extraction from all folder')

D:/UrbanSound8K/audio/fold1/.DS_Store
500
D:/UrbanSound8K/audio/fold2/.DS_Store
1000
1500
D:/UrbanSound8K/audio/fold3/.DS_Store
2000
2500
D:/UrbanSound8K/audio/fold4/.DS_Store
3000
3500
D:/UrbanSound8K/audio/fold5/.DS_Store
4000
4500
D:/UrbanSound8K/audio/fold6/.DS_Store
5000
D:/UrbanSound8K/audio/fold7/.DS_Store
5500
6000
D:/UrbanSound8K/audio/fold8/.DS_Store
6500
7000
D:/UrbanSound8K/audio/fold9/.DS_Store
7500
D:/UrbanSound8K/audio/fold10/.DS_Store
8000
8500
Exceptions:  10
time taken: 24.0 minutes 25.5 seconds
None
Finished feature extraction from all folder


In [10]:
cols=['Stacked_Features', 'Matrix_Shape', 'Label', 'Fold']
Stacked_feature_pd=pd.DataFrame(data=stacked_features , columns=cols)
Stacked_feature_pd.head()
#Stacked_feature_pd.to_csv('C:/Users/16301148/Desktop/tHESIS/MFCC_features_CNN/MFCC_features_CNN.csv', index=False)

Unnamed: 0,Stacked_Features,Matrix_Shape,Label,Fold
0,"[[-403.96760320653647, 94.39807027241612, 17.7...","(20, 5)",dog_bark,1
1,"[[-368.11002233002216, 128.7448108062032, 28.9...","(20, 5)",dog_bark,1
2,"[[-422.4790151421462, 70.38346134423621, 9.934...","(20, 5)",dog_bark,1
3,"[[-260.24002598569047, 109.35180735253135, -51...","(20, 5)",dog_bark,1
4,"[[-386.78483744318396, 132.62961490772713, 25....","(20, 5)",gun_shot,1


In [None]:
#metadata_mfcc=pd.read_csv('C:/Users/16301148/Desktop/tHESIS/MFCC_features_CNN/MFCC_features_CNN.csv' )
#metadata_mfcc.head()

In [11]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical



X = np.array(Stacked_feature_pd.Stacked_Features.tolist())
y = np.array(Stacked_feature_pd.Label.tolist())


le = LabelEncoder()
yy = to_categorical(le.fit_transform(y))

Using TensorFlow backend.


In [11]:
X.shape

(8732, 20, 5)

In [12]:
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state=42)

In [13]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, yy.shape)

(6985, 20, 5) (1747, 20, 5) (6985, 10) (1747, 10) (8732, 10)


In [13]:
x_train=np.reshape(x_train,(x_train.shape[0], 20,5))
x_test=np.reshape(x_test,(x_test.shape[0], 20,5))
print(x_train.shape, x_test.shape)

(6985, 20, 5) (1747, 20, 5)


In [20]:
x_train.shape[1]

20

In [21]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D, LSTM, TimeDistributed
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics

In [22]:
model = Sequential()
model.add(LSTM(128, return_sequences =True, input_shape=(20,5)))
model.add(LSTM(128, return_sequences =True, dropout=0.3))
           
model.add(TimeDistributed(Dense(256, activation='relu')))
model.add(TimeDistributed(Dense(512, activation='relu')))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))

In [23]:
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 20, 128)           68608     
_________________________________________________________________
lstm_4 (LSTM)                (None, 20, 128)           131584    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 20, 256)           33024     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 20, 512)           131584    
_________________________________________________________________
flatten_2 (Flatten)          (None, 10240)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                102410    
Total params: 467,210
Trainable params: 467,210
Non-trainable params: 0
________________________________________________

In [24]:
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Pre-training accuracy: 9.0441%


In [25]:
start_time=timer()
model.fit(x_train,y_train,batch_size=50,epochs=30,validation_data=(x_test,y_test))
end_time=timer()
print(print("time taken: {0} minutes {1:.1f} seconds".format((end_time - start_time)//60, (end_time - start_time)%60)))

Instructions for updating:
Use tf.cast instead.
Train on 6985 samples, validate on 1747 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
time taken: 4.0 minutes 30.5 seconds
None


In [26]:
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.9957050681114197
Testing Accuracy:  0.9307383894920349


In [29]:
def print_prediction(file_name):
    a=extract_MFCC(file_name) 
    prediction_feature=np.reshape((a),(20,5))
            
    prediction_feature = prediction_feature.reshape(1, 20, 5)

    predicted_vector = model.predict_classes(prediction_feature)
    predicted_class = le.inverse_transform(predicted_vector) 
    print("The predicted class is:", predicted_class[0], '\n') 

    predicted_proba_vector = model.predict_proba(prediction_feature) 
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)): 
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f') )

In [30]:
filename= "D:/UrbanSound8K/audio/fold5/100852-0-0-0.wav"
print_prediction(filename)
ipd.Audio(filename)

The predicted class is: air_conditioner 

air_conditioner 		 :  0.99978858232498168945312500000000
car_horn 		 :  0.00000000000128836503097973231746
children_playing 		 :  0.00000001198026389204187580617145
dog_bark 		 :  0.00000000027511870559493445398402
drilling 		 :  0.00000962483863986562937498092651
engine_idling 		 :  0.00000001779418035141588916303590
gun_shot 		 :  0.00000000120495802224951376047102
jackhammer 		 :  0.00020094323554076254367828369141
siren 		 :  0.00000086150907918636221438646317
street_music 		 :  0.00000000000208954424943397221170


In [31]:
filename= "D:/UrbanSound8K/audio/fold3/103199-4-0-0.wav"
print_prediction(filename)
ipd.Audio(filename)

The predicted class is: drilling 

air_conditioner 		 :  0.00000000000015416902238336860353
car_horn 		 :  0.00000000000003212525596018797613
children_playing 		 :  0.00000115431566882762126624584198
dog_bark 		 :  0.00000985870246950071305036544800
drilling 		 :  0.99998843669891357421875000000000
engine_idling 		 :  0.00000000323638382759838805213803
gun_shot 		 :  0.00000029766641773676383309066296
jackhammer 		 :  0.00000000004729663188873800550027
siren 		 :  0.00000000000028327101948832922318
street_music 		 :  0.00000041441660414420766755938530


In [32]:
filename= "D:/UrbanSound8K/audio/fold10/100648-1-0-0.wav"
print_prediction(filename)
ipd.Audio(filename)

The predicted class is: car_horn 

air_conditioner 		 :  0.00000000003441445045604396568706
car_horn 		 :  1.00000000000000000000000000000000
children_playing 		 :  0.00000000000000000106722925349653
dog_bark 		 :  0.00000000000000047873021041480333
drilling 		 :  0.00000000000002806529321279244843
engine_idling 		 :  0.00000000000000000000930366547208
gun_shot 		 :  0.00000000000000012882784822119731
jackhammer 		 :  0.00000000000005566565681710058944
siren 		 :  0.00000000000000000003220921751834
street_music 		 :  0.00000000000052086915509896858367


In [33]:
Class_Label = {'Air_conditioner':0, 'Car_horn':1, 'Children_playing':2, 'Dog_Bark':3, 
               'Drilling':4,'Engine_idling':5, 'Gun_Shot' :6, 'Jackhammer': 7 ,'Siren':8,'Street_Music':9 }

In [34]:
from sklearn.metrics import classification_report

y_true = np.argmax(y_test, axis = 1)
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
labels = [0,1,2,3,4,5,6,7,8,9]
target_names = Class_Label.keys()

print(y_true.shape, y_pred.shape)
print(classification_report(y_true, y_pred, target_names=target_names))

(1747,) (1747,)
                  precision    recall  f1-score   support

 Air_conditioner       0.95      0.98      0.97       195
        Car_horn       0.93      0.93      0.93        73
Children_playing       0.87      0.92      0.89       198
        Dog_Bark       0.85      0.86      0.85       190
        Drilling       0.97      0.91      0.94       222
   Engine_idling       0.98      0.96      0.97       197
        Gun_Shot       0.85      0.82      0.84        57
      Jackhammer       0.96      0.99      0.98       217
           Siren       0.94      0.97      0.95       204
    Street_Music       0.94      0.88      0.91       194

        accuracy                           0.93      1747
       macro avg       0.92      0.92      0.92      1747
    weighted avg       0.93      0.93      0.93      1747



In [35]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_true, y_pred)*100)

93.07384087006297
