# IMPORT MODULES

In [5]:
import numpy as np
import pandas as pd
import librosa
import librosa.display
import os
import IPython.display as ipd
%pylab inline

import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


### LOADING OF DATASET


In [6]:
# Location of the UrbanSound8K metadata CSV.
dataset_path = '/content/drive/MyDrive/datasets/UrbanSound8K/metadata/UrbanSound8K.csv'
# Read the metadata table into a DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [7]:
df

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [46]:
df['class'].value_counts()

air_conditioner     1000
engine_idling       1000
street_music        1000
dog_bark            1000
drilling            1000
children_playing    1000
jackhammer          1000
siren                929
car_horn             429
gun_shot             374
Name: class, dtype: int64

In [44]:
df['class'].nunique()

10

# Extract features

Here we extract Mel-Frequency Cepstral Coefficients (MFCCs) from the audio samples. MFCCs summarise the frequency distribution within each analysis window, so it is possible to analyse both the frequency and time characteristics of the sound. In the code below the coefficients are averaged over time, giving one fixed-length vector per clip. These audio representations will allow us to identify features for classification.


In [10]:
audio_dataset_path = '/content/drive/MyDrive/datasets/UrbanSound8K/audio'

In [11]:
### Extracting MFCC's for each file
def features_extractor(file, n_mfcc=50):
  """Return the time-averaged MFCC vector for one audio file.

  Parameters
  ----------
  file : str
      Path of the audio file; passed straight to ``librosa.load``.
  n_mfcc : int, optional
      Number of MFCC coefficients to compute. Defaults to 50, the feature
      length the rest of the notebook is built around.

  Returns
  -------
  numpy.ndarray
      1-D array of length ``n_mfcc``: each coefficient averaged over all frames.
  """
  # Load from the `file` argument. The previous version read the global
  # `file_name` instead, so it only worked when the caller happened to have
  # assigned that global beforehand.
  audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
  mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
  # Transpose so frames are on axis 0, then average over them.
  mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

  return mfccs_scaled_features


In [12]:
from tqdm import tqdm
## iterating through each audio file and extracting features

# Resolve the audio root once instead of on every iteration.
audio_root = os.path.abspath(audio_dataset_path)

extracted_features = []
# `total` lets tqdm show a percentage and ETA; iterrows() alone is a generator
# with no length, so the bar could only count iterations.
for index_num, row in tqdm(df.iterrows(), total=len(df)):
  # Each slice lives in <audio root>/fold<N>/<slice_file_name>.
  file_name = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))
  final_class_labels = row['class']
  data = features_extractor(file_name)
  # Keep the feature vector together with its class label.
  extracted_features.append([data, final_class_labels])

8732it [1:40:13,  1.45it/s]


In [15]:
# Turn the list of [feature_vector, label] pairs into a two-column DataFrame.
feature_columns = ['feature', 'class']
extracted_features_df = pd.DataFrame(data=extracted_features, columns=feature_columns)
extracted_features_df

Unnamed: 0,feature,class
0,"[-215.79301, 71.66612, -131.81377, -52.09133, ...",dog_bark
1,"[-424.68677, 110.56227, -54.148235, 62.01074, ...",children_playing
2,"[-459.56467, 122.800354, -47.92471, 53.265705,...",children_playing
3,"[-414.55377, 102.896904, -36.66495, 54.18041, ...",children_playing
4,"[-447.397, 115.0954, -53.809113, 61.60859, 1.6...",children_playing
...,...,...
8727,"[-399.2257, 136.81903, -51.964222, 37.02399, -...",car_horn
8728,"[-346.72733, 87.48847, -46.265022, 52.748833, ...",car_horn
8729,"[-304.61316, 112.6199, -47.161945, 37.00349, -...",car_horn
8730,"[-344.71423, 126.75814, -56.17717, 36.070927, ...",car_horn


# Split the dataset into independent (features) and dependent (labels) variables


In [16]:
# Collect the per-clip feature vectors into one array (X) and the class
# names into another (y).
X = np.array(list(extracted_features_df['feature']))
y = np.array(list(extracted_features_df['class']))

In [17]:
X.shape

(8732, 50)

In [21]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Encode the string class names as integer ids, then one-hot encode them.
# The labels are re-read from `extracted_features_df` rather than from `y`,
# so re-running this cell does not feed the already one-hot `y` back into
# the encoder.
class_names = np.array(extracted_features_df['class'].tolist())
labelencoder = LabelEncoder()
y = to_categorical(labelencoder.fit_transform(class_names))


In [22]:
#train test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
# NOTE(review): a random split can place slices of the same recording (same fsID)
# in both sets; consider splitting on the metadata `fold` column instead — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
X_train.shape

(6985, 50)

In [24]:
X_test.shape

(1747, 50)

In [25]:
y_train.shape

(6985, 10)

In [26]:
y_test.shape

(1747, 10)

In [27]:
X_train

array([[-1.31836136e+02,  1.13974640e+02, -2.39568615e+01, ...,
        -3.21555883e-01, -2.60312963e+00, -5.28483009e+00],
       [-1.40742197e+01,  9.19169388e+01, -8.67872047e+00, ...,
         8.18721175e-01, -1.85458887e+00, -1.51959229e+00],
       [-4.95320282e+01,  1.55218571e-01, -2.03691101e+01, ...,
         1.09204650e+00, -1.28535736e+00, -4.24328409e-02],
       ...,
       [-4.26993286e+02,  9.28906479e+01,  3.02333689e+00, ...,
         9.76593256e-01,  4.03740501e+00,  3.50559616e+00],
       [-1.46070236e+02,  1.37094589e+02, -3.42983437e+01, ...,
         1.07936025e-01, -9.57257211e-01,  2.96485841e-01],
       [-4.21674500e+02,  2.11690323e+02,  2.68203044e+00, ...,
        -2.79134536e+00, -2.42685723e+00, -1.11503243e+00]], dtype=float32)

In [28]:
y_train

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

# Model Creation

In [29]:
import tensorflow as tf


In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import  Adam
from sklearn import metrics

In [31]:
### no of classes
num_labels = y.shape[1]  

In [32]:
# Fully connected classifier: three Dense -> ReLU -> Dropout stages followed by
# a softmax output with one unit per class. The input length (50) matches the
# number of MFCC coefficients extracted per clip.
model = Sequential([
    # FIRST LAYER
    Dense(100, input_shape=(50,)),
    Activation('relu'),
    Dropout(0.5),

    # SECOND LAYER
    Dense(200),
    Activation('relu'),
    Dropout(0.5),

    # THIRD LAYER
    Dense(100),
    Activation('relu'),
    Dropout(0.5),

    # FINAL LAYER
    Dense(num_labels),
    Activation('softmax'),
])

In [33]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               5100      
_________________________________________________________________
activation (Activation)      (None, 100)               0         
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               20200     
_________________________________________________________________
activation_1 (Activation)    (None, 200)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               2

In [34]:
# Multi-class setup: one-hot targets with categorical cross-entropy, tracked by accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Training the model


In [48]:
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

epoch = 100
num_batch_size = 64

# Save the model to disk only when the monitored validation loss improves.
checkpointer = ModelCheckpoint(
    filepath='saved_models/audio_classification.hdf5',
    verbose=1,
    save_best_only=True,
)

# NOTE(review): the test split is also used as validation data here, so the
# checkpoint is selected on the same samples that are later used for the
# reported test accuracy — consider a separate validation split.
start = datetime.now()
model.fit(
    X_train,
    y_train,
    batch_size=num_batch_size,
    epochs=epoch,
    validation_data=(X_test, y_test),
    callbacks=[checkpointer],
    verbose=1,
)

duration = datetime.now() - start
print('Training completed in time:', duration)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.57943, saving model to saved_models/audio_classification.hdf5
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.57943
Epoch 3/100

Epoch 00003: val_loss improved from 0.57943 to 0.57316, saving model to saved_models/audio_classification.hdf5
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.57316
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.57316
Epoch 6/100

Epoch 00006: val_loss improved from 0.57316 to 0.57117, saving model to saved_models/audio_classification.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.57117 to 0.56776, saving model to saved_models/audio_classification.hdf5
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.56776
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.56776
Epoch 10/100

Epoch 00010: val_loss improved from 0.56776 to 0.56328, saving model to saved_models/audio_classification.hdf5
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.5632

In [49]:
# evaluate() returns [loss, accuracy] here, since accuracy is the only metric
# passed to compile(); index 1 is therefore the accuracy.
test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(test_accuracy[1])

0.8225529193878174


# Testing on new audio data
### Preprocess the new audio data
### Predict the class
### Inverse-transform the predicted label







In [51]:
# Classify a single audio file with the trained model.
example_filename = '/content/drive/MyDrive/datasets/UrbanSound8K/audio/fold1/24074-1-0-12.wav'

# Same preprocessing as for the training data: 50 MFCCs averaged over time.
audio, sample_rate = librosa.load(example_filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=50)
mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

print(mfccs_scaled_features)
# The model expects a batch dimension, so reshape the vector to (1, n_features).
mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)
print(mfccs_scaled_features)
print(mfccs_scaled_features.shape)

# `Sequential.predict_classes` has been removed from Keras; the class index is
# the argmax of the softmax probabilities returned by `predict`.
predicted_label = np.argmax(model.predict(mfccs_scaled_features, verbose=0), axis=-1)
print(predicted_label)

# Map the integer class id back to its original class name.
prediction_class = labelencoder.inverse_transform(predicted_label)
prediction_class

[-165.20825     128.73132     -59.58641      19.64161      -3.2478533
    7.555708      1.2540799    24.023901    -18.398355     16.389967
   -4.5462475    -6.0966473   -11.555423      8.079683      2.1120331
   14.360278      6.1467032    11.176002      2.435846      3.2601304
   -8.622552    -12.907682     -9.517707      3.297364      6.9596996
   -6.3065743    -9.559114     11.680233     20.759855     -0.25232798
  -23.255766    -15.243338     10.933839     15.625636      8.458781
    3.82432      -2.643291    -19.205194    -18.88522      10.794391
   23.793644      2.095411     -5.9919915     4.143923     -0.6015227
  -10.747797     -1.9943365    -5.1680174    -5.362899      4.6944914 ]
[[-165.20825     128.73132     -59.58641      19.64161      -3.2478533
     7.555708      1.2540799    24.023901    -18.398355     16.389967
    -4.5462475    -6.0966473   -11.555423      8.079683      2.1120331
    14.360278      6.1467032    11.176002      2.435846      3.2601304
    -8.622552    

array(['car_horn'], dtype='<U16')