# **Preprocess audio files**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import librosa
import librosa.display
from IPython.display import Audio
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, LSTM, Bidirectional, GRU, BatchNormalization, LeakyReLU
from keras.utils import to_categorical
import os
import math
import json
import random

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Path to the dataset and JSON output file

DATASET_PATH = '/content/drive/My Drive/242B/genres_original'
JSON_PATH = '/content/drive/My Drive/242B/mfcc.json'
'''
DATASET_PATH = '/content/drive/My Drive/242B Final Project/Data/genres_original'
JSON_PATH = '/content/drive/My Drive/242B Final Project/Data/mfcc.json'
'''
# Audio configuration parameters
SAMPLE_RATE = 22050
DURATION = 30  # Duration in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

In [6]:
def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=4084, hop_length=1024, num_segments=10):
    #dictionary to store data
    data = {
        'mapping' : [],
        'mfcc' : [],
        'labels' : []
    }

    count = 0
    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment / hop_length)


    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):

        if dirpath not in dataset_path:

            dirpath_components = dirpath.split('/')
            semantic_label = dirpath_components[-1]
            data['mapping'].append(semantic_label)
            print('\nProcessing {}'.format(semantic_label))

            for f in filenames:
                if f.endswith('.wav') and f != 'jazz.00054.wav': # jazz.00054.wav is an empty file

                    file_path = os.path.join(dirpath,f)

                    #loading the audio file
                    signal, sr = sf.read(file_path)
                    # len(signal) = 661794  # sr is 22050 by default

                    #process segments extracting mfcc and storing data
                    for s in range(num_segments):
                        # Since num_segments is defined as 5. Every 30 sec file is divided into 5 segments of length 6sec
                        # Start sample would keep track of the index of the first element of each 6 second batch
                        # finish sample would keep track of the index of the last element of each 6 second batch
                        # And then with the help of python's slice functionality we will extract that 6 second batch from every 30 sec signal
                        start_sample = num_samples_per_segment * s
                        finish_sample = num_samples_per_segment + start_sample

                        # we need to extract, Usually n_mfcc is set b/w 13 to 40. The other parameters n_fft and hop length are

                        mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample],sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)

                        mfcc = mfcc.T
                        if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                            print(mfcc.shape)
                            data['mfcc'].append(mfcc.tolist())
                            data['labels'].append(i)
                            print('Processing {}, segment:{}'.format(file_path, s))
                            count += 1
                            print(count)
    with open(json_path, 'w') as fp:
        json.dump(data, fp, indent=4)

In [7]:
save_mfcc(DATASET_PATH, JSON_PATH, num_segments=10)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00042.wav, segment:4
8324
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00042.wav, segment:5
8325
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00042.wav, segment:6
8326
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00042.wav, segment:7
8327
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00042.wav, segment:8
8328
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00042.wav, segment:9
8329
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00036.wav, segment:0
8330
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00036.wav, segment:1
8331
(65, 13)
Processing /content/drive/My Drive/242B/genres_original/reggae/reggae.00036.wav, segme

In [8]:
# loading the saved Json file
def load_data(path):
    with open(path, 'r') as fp:
        data = json.load(fp)

    #Convert lists into numpy arrays
    inputs = data['mfcc']
    targets = data['labels']
    return np.array(inputs), np.array(targets)

In [9]:
inputs, targets = load_data('/content/drive/My Drive/242B/mfcc.json')
#inputs, targets = load_data('/content/drive/My Drive/242B Final Project/Data/mfcc.json')

In [10]:
print(inputs)

[[[-8.21261387e+01  6.23185694e+01  4.08765672e+01 ... -1.11768220e+00
   -6.75119693e+00 -8.29599796e+00]
  [-6.53184250e+01  5.62508257e+01  3.39085210e+01 ... -7.61673737e-01
   -5.05098528e+00 -4.51203877e+00]
  [ 1.71854014e+01  4.35625807e+01  1.36640143e+01 ... -1.05185946e+00
   -8.01901117e+00  5.85106676e+00]
  ...
  [ 4.67913523e+01  4.54541846e+01  7.96396950e+00 ... -8.57797524e-01
   -1.08835093e+01 -2.25940695e+00]
  [ 3.30745841e+01  4.77469152e+01  1.90245913e+01 ...  6.02257979e-02
   -4.38458415e+00  1.12224241e+00]
  [ 4.00164263e+01  3.70185163e+01  2.62073849e+01 ...  2.18496714e+00
    4.28447677e+00  3.48629305e+00]]

 [[ 2.14170308e+01  5.28911029e+01  3.85110835e+01 ...  7.27950403e-01
   -3.21191446e+00 -4.06761140e+00]
  [ 4.13162442e+01  5.69841166e+01  3.89146620e+01 ...  1.69706868e+00
   -6.62406097e+00 -4.45890494e+00]
  [ 2.51622196e+01  6.08877918e+01  3.51484240e+01 ...  6.07210975e+00
   -9.16378795e+00 -6.12219623e+00]
  ...
  [ 1.64248372e+01  6.4

In [11]:
print(targets)

[ 1  1  1 ... 10 10 10]


In [12]:
inputs.shape

(9989, 65, 13)

In [13]:
inputs_train, inputs_test, targets_train, targets_test = train_test_split(inputs, targets, test_size=0.2)

Add Noise

In [14]:
for i in range(inputs_train.shape[0]):
    s = np.random.rand(inputs_train.shape[1], inputs_train.shape[2])
    inputs_train[i] = inputs_train[i] + s

In [15]:
targets_train -= 1
targets_test -= 1

# **LSTM**

In [44]:
lstm_model_1 = Sequential([
    LSTM(32, return_sequences=True, input_shape=(inputs.shape[1], inputs.shape[2])),
    LSTM(64, return_sequences=True),
    LSTM(128),
    LeakyReLU(),
    Flatten(),
    Dense(128),
    LeakyReLU(),
    Dense(10, activation='softmax')
])

lstm_model_1.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model_1.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_9 (LSTM)               (None, 65, 32)            5888      
                                                                 
 lstm_10 (LSTM)              (None, 65, 64)            24832     
                                                                 
 lstm_11 (LSTM)              (None, 128)               98816     
                                                                 
 leaky_re_lu_6 (LeakyReLU)   (None, 128)               0         
                                                                 
 flatten_14 (Flatten)        (None, 128)               0         
                                                                 
 dense_29 (Dense)            (None, 128)               16512     
                                                                 
 leaky_re_lu_7 (LeakyReLU)   (None, 128)             

In [45]:
history = lstm_model_1.fit(inputs_train, targets_train,validation_data=(inputs_test, targets_test),epochs = 20,batch_size=80)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [46]:
#add batchnormalization
lstm_model_2 = Sequential([
    LSTM(32, return_sequences=True, input_shape=(inputs.shape[1], inputs.shape[2])),
    LSTM(64, return_sequences=True),
    LSTM(128),
    LeakyReLU(),
    BatchNormalization(),
    Flatten(),
    Dense(128),
    LeakyReLU(),
    Dense(10, activation='softmax')
])

lstm_model_2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model_2.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_12 (LSTM)              (None, 65, 32)            5888      
                                                                 
 lstm_13 (LSTM)              (None, 65, 64)            24832     
                                                                 
 lstm_14 (LSTM)              (None, 128)               98816     
                                                                 
 leaky_re_lu_8 (LeakyReLU)   (None, 128)               0         
                                                                 
 batch_normalization_13 (Ba  (None, 128)               512       
 tchNormalization)                                               
                                                                 
 flatten_15 (Flatten)        (None, 128)               0         
                                                     

In [48]:
history_2 = lstm_model_2.fit(inputs_train, targets_train,validation_data=(inputs_test, targets_test),epochs = 20,batch_size=80)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
#add dropout layer
lstm_model_3 = Sequential([
    LSTM(32, return_sequences=True, input_shape=(inputs.shape[1], inputs.shape[2])),
    LSTM(64, return_sequences=True),
    LSTM(128),
    LeakyReLU(),
    Dropout(0.5),
    BatchNormalization(),
    Flatten(),
    Dense(128),
    LeakyReLU(),
    Dense(10, activation='softmax')
])

lstm_model_3.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model_3.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_15 (LSTM)              (None, 65, 32)            5888      
                                                                 
 lstm_16 (LSTM)              (None, 65, 64)            24832     
                                                                 
 lstm_17 (LSTM)              (None, 128)               98816     
                                                                 
 leaky_re_lu_10 (LeakyReLU)  (None, 128)               0         
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 batch_normalization_14 (Ba  (None, 128)               512       
 tchNormalization)                                               
                                                     

In [50]:
history_3 = lstm_model_3.fit(inputs_train, targets_train,validation_data=(inputs_test, targets_test),epochs = 20,batch_size=80)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# **CNN**

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

cnn_model_1 = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(inputs.shape[1], inputs.shape[2], 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')
])

cnn_model_1.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cnn_model_1.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_25 (Conv2D)          (None, 63, 11, 32)        320       
                                                                 
 max_pooling2d_25 (MaxPooli  (None, 31, 5, 32)         0         
 ng2D)                                                           
                                                                 
 conv2d_26 (Conv2D)          (None, 29, 3, 64)         18496     
                                                                 
 max_pooling2d_26 (MaxPooli  (None, 14, 1, 64)         0         
 ng2D)                                                           
                                                                 
 flatten_17 (Flatten)        (None, 896)               0         
                                                                 
 dense_35 (Dense)            (None, 128)             

In [52]:
history_4 = cnn_model_1.fit(inputs_train, targets_train,validation_data=(inputs_test, targets_test),epochs = 20,batch_size=80)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [53]:
#add batchnormalization
cnn_model_2 = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(inputs.shape[1], inputs.shape[2], 1)),
    BatchNormalization(),  # Helps to normalize the inputs of the activation layer to improve training
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')
])

cnn_model_2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cnn_model_2.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_27 (Conv2D)          (None, 63, 11, 32)        320       
                                                                 
 batch_normalization_15 (Ba  (None, 63, 11, 32)        128       
 tchNormalization)                                               
                                                                 
 max_pooling2d_27 (MaxPooli  (None, 31, 5, 32)         0         
 ng2D)                                                           
                                                                 
 conv2d_28 (Conv2D)          (None, 29, 3, 64)         18496     
                                                                 
 batch_normalization_16 (Ba  (None, 29, 3, 64)         256       
 tchNormalization)                                               
                                                     

In [54]:
history_5 = cnn_model_2.fit(inputs_train, targets_train,validation_data=(inputs_test, targets_test),epochs = 20,batch_size=80)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [55]:
#add dropout layer
cnn_model_3 = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(inputs.shape[1], inputs.shape[2], 1)),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),  # Add dropout to reduce overfitting
    Dense(10, activation='softmax')
])

cnn_model_3.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cnn_model_3.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_29 (Conv2D)          (None, 63, 11, 32)        320       
                                                                 
 batch_normalization_17 (Ba  (None, 63, 11, 32)        128       
 tchNormalization)                                               
                                                                 
 max_pooling2d_29 (MaxPooli  (None, 31, 5, 32)         0         
 ng2D)                                                           
                                                                 
 conv2d_30 (Conv2D)          (None, 29, 3, 64)         18496     
                                                                 
 batch_normalization_18 (Ba  (None, 29, 3, 64)         256       
 tchNormalization)                                               
                                                     

In [56]:
history_6 = cnn_model_3.fit(inputs_train, targets_train,validation_data=(inputs_test, targets_test),epochs = 20,batch_size=80)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [59]:
data = pd.DataFrame({'Model': [ 'LSTM_1','LSTM_2_BN','LSTM_3_BN_DROPOUT','CNN_1', 'CNN_2_BN', 'CNN_3_BN_DROPOUT'],
        'Test Accuracy': [history.history['val_accuracy'][-1], history_2.history['val_accuracy'][-1], history_3.history['val_accuracy'][-1],
                          history_4.history['val_accuracy'][-1],history_5.history['val_accuracy'][-1],history_6.history['val_accuracy'][-1]],
         'Test Loss': [history.history['val_loss'][-1], history_2.history['val_loss'][-1], history_3.history['val_loss'][-1],
                          history_4.history['val_loss'][-1],history_5.history['val_loss'][-1],history_6.history['val_loss'][-1]]        })
data

Unnamed: 0,Model,Test Accuracy,Test Loss
0,LSTM_1,0.70971,0.895483
1,LSTM_2_BN,0.740741,0.906434
2,LSTM_3_BN_DROPOUT,0.693694,0.940629
3,CNN_1,0.662162,1.458174
4,CNN_2_BN,0.711211,1.375765
5,CNN_3_BN_DROPOUT,0.733233,0.909082


In [60]:
#classification report for the best lstm and cnn
predictions_lstm = lstm_model_2.predict(inputs_test)
predicted_classes_lstm = predictions_lstm.argmax(axis=1)

predictions_cnn = cnn_model_3.predict(inputs_test)
predicted_classes_cnn = predictions_cnn.argmax(axis=1)

true_classes = targets_test


from sklearn.metrics import classification_report
report_lstm = classification_report(true_classes, predicted_classes_lstm)
print("Classification Report for LSTM Model:")
print(report_lstm)
report_cnn = classification_report(true_classes, predicted_classes_cnn)
print("\nClassification Report for CNN Model:")
print(report_cnn)


Classification Report for LSTM Model:
              precision    recall  f1-score   support

           0       0.65      0.80      0.71       210
           1       0.86      0.95      0.90       216
           2       0.72      0.69      0.70       210
           3       0.90      0.85      0.87       170
           4       0.65      0.50      0.56       199
           5       0.87      0.66      0.75       216
           6       0.84      0.77      0.80       208
           7       0.81      0.76      0.78       188
           8       0.67      0.72      0.70       191
           9       0.55      0.72      0.62       190

    accuracy                           0.74      1998
   macro avg       0.75      0.74      0.74      1998
weighted avg       0.75      0.74      0.74      1998


Classification Report for CNN Model:
              precision    recall  f1-score   support

           0       0.70      0.70      0.70       210
           1       0.93      0.90      0.92       216
  