# Audio Classification Using Machine Learning

### Importing the libraries

In [236]:
import os

import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

## Part 1 - Data Preprocessing

### Importing the dataset

In [237]:
# Folder that contains the per-fold audio sub-directories (fold<N>/)
audio_dataset_path = 'UrbanSound8K/audio/'

# One metadata row per audio slice: file name, fold and class label
metadata_csv_path = 'UrbanSound8K/metadata/UrbanSound8K.csv'
metadata = pd.read_csv(metadata_csv_path)
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


### Extract Features

<small>Here we extract Mel-Frequency Cepstral Coefficients (MFCCs) from the audio samples. MFCCs summarise the frequency distribution within each analysis window, so they capture both the frequency and the time characteristics of a sound. These representations provide the features used for classification.</small>

In [238]:
def features_extractor(file_name, n_mfcc = 40):
    """Return a fixed-length MFCC summary vector for one audio file.

    The file is loaded with ``librosa.load`` (library defaults), its MFCC
    matrix is computed, and every coefficient is averaged over all frames.

    Parameters
    ----------
    file_name : str
        Path of the audio file; passed straight to ``librosa.load``.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 40, the value
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc`` holding the per-coefficient mean.
    """
    audio, sample_rate = librosa.load(file_name)
    mfccs_features = librosa.feature.mfcc(y = audio, sr = sample_rate, n_mfcc = n_mfcc)
    # The MFCC matrix is (n_mfcc, n_frames); transposing and averaging over
    # axis 0 collapses the time axis and leaves one value per coefficient.
    mfccs_mean_features = np.mean(mfccs_features.T, axis = 0)

    return mfccs_mean_features

In [239]:
# Extract one MFCC feature vector per audio clip listed in the metadata.
# Clips live under <audio_dataset_path>/fold<fold>/<slice_file_name>.
audio_root = os.path.abspath(audio_dataset_path)  # loop-invariant, resolve once
extracted_features = []
# iterrows() is a generator without a length, so total= is passed explicitly to
# give tqdm a percentage and ETA instead of a bare iteration counter.
for _, row in tqdm(metadata.iterrows(), total = len(metadata)):
    file_name = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    extracted_features.append([data, final_class_labels])

8732it [04:09, 34.97it/s]


In [240]:
# Wrap the (feature vector, label) pairs in a two-column DataFrame
feature_columns = ['feature', 'class']
extracted_features_df = pd.DataFrame(data = extracted_features, columns = feature_columns)
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-211.93698, 62.581207, -122.81315, -60.74528,...",dog_bark
1,"[-417.0052, 99.336624, -42.995586, 51.073326, ...",children_playing
2,"[-452.39316, 112.36253, -37.578068, 43.195866,...",children_playing
3,"[-406.47922, 91.1966, -25.043556, 42.78452, 11...",children_playing
4,"[-439.63873, 103.86223, -42.658787, 50.690277,...",children_playing


In [292]:
# Separate the model inputs (X: MFCC vectors) from the targets (y: class names)
feature_vectors = extracted_features_df['feature'].tolist()
class_labels = extracted_features_df['class'].tolist()
X = np.array(feature_vectors)
y = np.array(class_labels)

### Encoding categorical data

#### Encoding the Dependent Variable

In [308]:
# Integer-encode the class names, then one-hot encode them for the softmax /
# categorical cross-entropy setup used by the network.
# The labels are read from extracted_features_df rather than from `y`, so this
# cell can be re-run safely: feeding the already one-hot `y` back into
# LabelEncoder would raise because it is no longer 1-D.
labelencoder = LabelEncoder()
class_names = np.array(extracted_features_df['class'].tolist())
y = to_categorical(labelencoder.fit_transform(class_names))

In [309]:
# Show the one-hot encoded label matrix produced by to_categorical
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


### Splitting the dataset into the Training set and Test set

In [310]:
# Hold out 20% of the clips as the test set; random_state pins the shuffle so
# the split is reproducible.
# NOTE(review): rows sharing an fsID appear to be slices of the same source
# recording, so a purely random split may put near-identical audio in both
# sets — confirm whether the predefined `fold` column should drive the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Feature Scaling

In [311]:
# Standardise every feature column.
# The scaler is fitted on the training set only and then re-used on the test
# set, so no test-set statistics leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Part 2 - Building the ANN

### Initializing the ANN

In [321]:
# Empty sequential container; layers are appended one cell at a time
ann = tf.keras.Sequential()

### Adding the input layer and the first hidden layer

In [322]:
# Declare the input explicitly with an Input object instead of passing
# `input_shape` to the first Dense layer (recent Keras versions warn about the
# latter), and take the width from the data so it follows the number of
# extracted features rather than a hard-coded 40.
ann.add(tf.keras.Input(shape = (X_train.shape[1],)))
ann.add(tf.keras.layers.Dense(units = 100, activation = 'relu'))
# Dropout randomly zeroes half of the activations during training, which
# discourages memorising the training data (overfitting).
ann.add(tf.keras.layers.Dropout(0.5))

### Adding the second hidden layer

In [323]:
# Second hidden block: 200 ReLU units followed by 50% dropout
second_hidden_layer = tf.keras.layers.Dense(200, activation = 'relu')
ann.add(second_hidden_layer)
ann.add(tf.keras.layers.Dropout(rate = 0.5))

### Adding the third hidden layer

In [324]:
# Third hidden block: 100 ReLU units followed by 50% dropout
third_hidden_layer = tf.keras.layers.Dense(100, activation = 'relu')
ann.add(third_hidden_layer)
ann.add(tf.keras.layers.Dropout(rate = 0.5))

### Adding the output layer

In [325]:
# One softmax unit per class; the count is taken from the width of the one-hot
# targets so it tracks the label encoding instead of a hard-coded 10.
ann.add(tf.keras.layers.Dense(units = y_train.shape[1], activation = 'softmax'))

In [326]:
# Print a summary of the network's layers
ann.summary()

## Part 3 - Training the ANN

### Compiling the ANN

In [334]:
# Adam optimiser with categorical cross-entropy (the targets are one-hot),
# reporting accuracy during training
compile_options = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
ann.compile(**compile_options)

### Training the ANN on the Training set

In [335]:
num_epochs = 100
num_batch_size = 32

# Keep the returned History object so the per-epoch loss/accuracy values can be
# inspected or plotted afterwards; assigning it also stops its repr from being
# echoed as the cell output.
# NOTE(review): the test set doubles as validation data here, so the val_*
# metrics are not an independent check of the final test score — confirm this
# is intended.
history = ann.fit(X_train, y_train, batch_size = num_batch_size, epochs = num_epochs, validation_data = (X_test, y_test), verbose = 1)

Epoch 1/100


[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.1497 - loss: 2.3561 - val_accuracy: 0.4219 - val_loss: 1.8222
Epoch 2/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3400 - loss: 1.8972 - val_accuracy: 0.5232 - val_loss: 1.4853
Epoch 3/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4239 - loss: 1.6472 - val_accuracy: 0.5764 - val_loss: 1.3258
Epoch 4/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4688 - loss: 1.5264 - val_accuracy: 0.6153 - val_loss: 1.2315
Epoch 5/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5122 - loss: 1.4433 - val_accuracy: 0.6440 - val_loss: 1.1505
Epoch 6/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5394 - loss: 1.3655 - val_accuracy: 0.6657 - val_loss: 1.0833
Epoch 7/100
[1m219/219[0m [32m━

<keras.src.callbacks.history.History at 0x16febed0f50>

## Part 4 - Making the predictions and evaluating the model

### Predicting the Test set results

In [379]:
no_of_datasets = X_test.shape[0]
predicted_probabilities = ann.predict(X_test)

# argmax along axis 1 converts each row of class probabilities (or one-hot
# targets) into a class index for all samples at once, replacing the
# per-row Python loop.
predicted_label_index_array = np.argmax(predicted_probabilities, axis = 1)
actual_label_index_array = np.argmax(y_test, axis = 1)

# Side-by-side view: column 0 = predicted class index, column 1 = actual class index
print(np.column_stack((predicted_label_index_array, actual_label_index_array)))

[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[[5 5]
 [9 1]
 [4 4]
 ...
 [1 1]
 [2 2]
 [2 2]]


In [387]:
# Turn both index vectors into (n_samples, 1) column vectors
predicted_label_index_array = np.reshape(predicted_label_index_array, (-1, 1))
actual_label_index_array = np.reshape(actual_label_index_array, (-1, 1))

### Making the Confusion Matrix

In [388]:
# Confusion matrix: the first argument (actual labels) indexes the rows, the
# second (predicted labels) indexes the columns.
cm = confusion_matrix(actual_label_index_array, predicted_label_index_array)
print(cm)
# Overall fraction of correctly classified test clips (shown as the cell output)
accuracy_score(actual_label_index_array, predicted_label_index_array)

[[193   0   0   1   0   0   0   0   0   1]
 [  0  71   2   1   1   2   0   2   0  12]
 [  6   1 165   8   1   5   1   2   0  16]
 [  7   0  12 136   4   3   4   0   3  13]
 [  0   0   2   0 173   0   0   9   0  18]
 [  2   0   5   1   1 202   0   0   2   3]
 [  0   0   4   4   2   0  72   0   1   4]
 [  1   0   0   0   4   0   0 174   0   8]
 [  2   0   5   3   0   2   0   0 185   2]
 [  2   1  25   2   5   4   0   5   0 139]]


0.864338866628506