In [1]:
# Uncomment on a fresh environment; %pip installs into the running kernel's environment.
# %pip install librosa
# %pip install tqdm

In [8]:
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display as ipd
import librosa
import librosa.display
import pandas as pd
import os
import numpy as np
import tqdm as tqdm
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten,Input
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

## Data Preprocessing

In [6]:
def features_extractor(file, n_mfcc=40):
    """Summarise an audio file as a fixed-length vector of time-averaged MFCCs.

    Parameters
    ----------
    file : path-like or file object
        Audio source; handed unchanged to ``librosa.load``, which is called
        with its default arguments.
    n_mfcc : int, default 40
        Number of MFCC coefficients to compute. The default reproduces the
        value that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        One value per coefficient: the MFCC matrix averaged over its second
        axis (librosa documents that axis as the frame/time axis).
    """
    data, sample_rate = librosa.load(file)
    mfccs_features = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so axis 0 runs over the second (frame) axis, then average it
    # away; clips of any duration therefore yield a vector of the same length.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features

In [7]:
# NOTE(review): `metadata` and `audio_dataset` are not defined in any cell shown
# in this notebook — confirm they are created before this cell runs.
extracted_features = []

# Passing total= lets tqdm draw a real progress bar with an ETA; a bare
# iterrows() generator has no length, so only a running count was shown.
for index_num, row in tqdm.tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips are laid out as <audio_dataset>/fold<N>/<slice_file_name>.
    file = os.path.join(
        os.path.abspath(audio_dataset),
        'fold' + str(row["fold"]),
        str(row["slice_file_name"]),
    )
    final_class_labels = row["class"]
    data = features_extractor(file)
    extracted_features.append([data, final_class_labels])

8732it [02:43, 53.34it/s]


In [10]:
# Two-column frame: each clip's feature vector next to its class name.
extracted_features_df = pd.DataFrame(
    data=extracted_features,
    columns=['feature', 'class'],
)

In [11]:
# Turn each DataFrame column into a NumPy array: X from the per-clip feature
# vectors, y from the class names.
X, y = (
    np.array(extracted_features_df[column].tolist())
    for column in ('feature', 'class')
)

In [12]:
# Map class names to integer ids, then one-hot encode the ids.
# The names are read from the DataFrame rather than from `y` itself, so
# re-running this cell does not feed the already one-hot matrix back into
# the encoder.
labelencoder = LabelEncoder()
y = to_categorical(
    labelencoder.fit_transform(
        np.array(extracted_features_df['class'].tolist())
    )
)

In [24]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
# NOTE(review): the metadata carries a `fold` column, and a random split ignores
# it — slices cut from one recording may land on both sides. Confirm this is
# intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Model Creation

In [25]:
# Width of the one-hot label matrix, i.e. how many outputs the network needs.
num_labels = np.shape(y)[1]

In [26]:
# Fully connected classifier: three hidden ReLU blocks (100 -> 200 -> 100 units),
# each followed by 50% dropout, then a softmax layer with one output per class.
model = Sequential()
# Take the input width from the training matrix instead of hard-coding 40, so
# the network stays in sync with the length of the extracted feature vectors.
model.add(Input(shape=(X_train.shape[1],)))

for hidden_units in (100, 200, 100):
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [27]:
# Print a summary of the network assembled above as a quick sanity check.
model.summary()

In [28]:
# Training setup: Adam optimiser, categorical cross-entropy (the labels are
# one-hot encoded), and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [29]:
num_epochs = 100
num_batch_size = 32

# save_best_only=True: the checkpoint file is only rewritten when the monitored
# validation loss improves (see the "val_loss improved" log lines).
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.keras',
                               verbose=1, save_best_only=True)
start = datetime.now()

# Keep the History object returned by fit() so the per-epoch loss/accuracy
# values stay available for plotting or inspection after training.
# Note: the held-out test split is also used as validation data here, so it
# drives checkpoint selection as well as the final evaluation.
history = model.fit(
    X_train,
    y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    callbacks=[checkpointer],
    verbose=1,
)

duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1161 - loss: 23.6143
Epoch 1: val_loss improved from inf to 2.25927, saving model to saved_models/audio_classification.keras
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.1161 - loss: 23.5409 - val_accuracy: 0.1434 - val_loss: 2.2593
Epoch 2/100
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1228 - loss: 3.8874
Epoch 2: val_loss did not improve from 2.25927
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1228 - loss: 3.8838 - val_accuracy: 0.1175 - val_loss: 2.2830
Epoch 3/100
[1m122/137[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 2ms/step - accuracy: 0.1327 - loss: 2.6014
Epoch 3: val_loss did not improve from 2.25927
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1328 - loss: 2.5943 - val_accuracy: 0.1205 - v

In [30]:
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy. This scores the in-memory model as left by the last epoch; the
# saved checkpoint is not reloaded here.
test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(test_accuracy[1])

0.7430142164230347


In [31]:
filename = "UrbanSound8K/9031-3-1-0.wav"

# The network expects a batch, so give the single feature vector a leading
# axis of length one.
mfccs_scaled_features = features_extractor(filename)[np.newaxis, :]

# Class probabilities for that one clip.
predicted_label = model.predict(mfccs_scaled_features)

# Pick the most probable class index and map it back to its original name.
predicted_class_index = predicted_label.argmax(axis=1)
prediction_class = labelencoder.inverse_transform(predicted_class_index)
print(prediction_class)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
['gun_shot']
