# Plan
Five models. Each one is hypertuned with respect to learning rate, unit number, optimizer, batch size, epoch, data normalization, dropout?, dropout rate, activation choice, 

The four models are 
- 1 mlp model
- 1 vgg16 model
- 1 yamnet
- 1 LSTM
- a hybrid of yamnet and LSTM

The feature used
- MFCCs for mlp
- Mel Spectrogram for vgg16
- raw audio waves for sound models

Tuned hyper parameters
- learning rate
- unit number
- structure, e.g. whether to contain one more layer, whether to contain dropout, whether to do data normalization
- dropout rate
- activation choice
- batch size
- epochs

# One MLP model as baseline model
three layers

# Corresponding to MLPs section from the document

- A baseline model
- Hypertuning of the baseline model
- A more complicated MLP model
- Some other experimenting models with 30-second length audio or 3-second length audio

In [1]:
!pip install keras-tuner -q
import os
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
import keras_tuner
from tensorflow.keras import layers
import librosa

from tensorflow.keras.utils import to_categorical, plot_model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten, Conv2D,MaxPooling2D
from tensorflow.keras.optimizers import Adam

from sklearn import metrics

from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from datetime import datetime 
from sklearn.model_selection import train_test_split

# for manipulate the mel spectrographs
from PIL import Image

import tensorflow_hub as hub

from google.colab import drive
drive.mount('/content/drive')


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/176.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


## Load Data

## Data Exploration

In [2]:
# the root of the data
dataRoot = "drive/MyDrive/DeepLearningProject/datasets/Data/"

In [3]:
'''
the pre-extracted MFCCs that comes with the dataset. 
One is of lengh 3-second.
'''
feturefile = dataRoot+"features_3_sec.csv"

In [4]:
# read them into memory
mfccDF = pd.read_csv(feturefile)

In [5]:
mfccDF.shape

(9990, 60)

In [6]:
mfccDF.columns

Index(['filename', 'length', 'chroma_stft_mean', 'chroma_stft_var', 'rms_mean',
       'rms_var', 'spectral_centroid_mean', 'spectral_centroid_var',
       'spectral_bandwidth_mean', 'spectral_bandwidth_var', 'rolloff_mean',
       'rolloff_var', 'zero_crossing_rate_mean', 'zero_crossing_rate_var',
       'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var', 'tempo',
       'mfcc1_mean', 'mfcc1_var', 'mfcc2_mean', 'mfcc2_var', 'mfcc3_mean',
       'mfcc3_var', 'mfcc4_mean', 'mfcc4_var', 'mfcc5_mean', 'mfcc5_var',
       'mfcc6_mean', 'mfcc6_var', 'mfcc7_mean', 'mfcc7_var', 'mfcc8_mean',
       'mfcc8_var', 'mfcc9_mean', 'mfcc9_var', 'mfcc10_mean', 'mfcc10_var',
       'mfcc11_mean', 'mfcc11_var', 'mfcc12_mean', 'mfcc12_var', 'mfcc13_mean',
       'mfcc13_var', 'mfcc14_mean', 'mfcc14_var', 'mfcc15_mean', 'mfcc15_var',
       'mfcc16_mean', 'mfcc16_var', 'mfcc17_mean', 'mfcc17_var', 'mfcc18_mean',
       'mfcc18_var', 'mfcc19_mean', 'mfcc19_var', 'mfcc20_mean', 'mfcc20_var',
  

In [7]:
mfccDF.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [8]:
# fileNames = mfcc3Df['filename']

In [9]:
mfccFeatures = mfccDF.iloc[:, 2:-1]

In [10]:
mfccFeatures.shape

(9990, 57)

In [11]:
mfccLabels = mfccDF['label']

In [12]:
# balanced dataset
mfccLabels.value_counts()

blues        1000
jazz         1000
metal        1000
pop          1000
reggae       1000
disco         999
classical     998
hiphop        998
rock          998
country       997
Name: label, dtype: int64

In [13]:
X_mfcc = np.array(mfccFeatures)

In [14]:
scaler = StandardScaler()
X_mfcc_scaled = scaler.fit_transform(X_mfcc)

In [15]:
### Label Encoding
y_mfcc = np.array(pd.get_dummies(mfccLabels))

In [16]:
X_mfcc.shape

(9990, 57)

In [17]:
X_mfcc_train, X_mfcc_test, y_mfcc_train, y_mfcc_test = train_test_split(X_mfcc_scaled, y_mfcc, test_size=0.2, random_state=0)

In [18]:
X_mfcc_train.shape

(7992, 57)

In [19]:
X_mfcc_test.shape

(1998, 57)

In [20]:
y_mfcc_train.shape

(7992, 10)

In [21]:
y_mfcc_test.shape

(1998, 10)

### Model Creation

In [22]:
### Number of classes
num_labels=y_mfcc.shape[1]

In [23]:
X_mfcc_train.shape[1]

57

In [24]:
X_mfcc_train.shape[1]

57

In [25]:
class MLPHyperModel(keras_tuner.HyperModel):
    def build(self, hp):
        model = keras.Sequential()
        activatoin = hp.Choice("activation", ["relu", "tanh"])

        model.add(Dense(
              units = hp.Int("units_1", 64, 1024, step = 64),
              activation = activatoin)
        )
        if hp.Boolean("dropout_1"):
            model.add(Dropout(0.25))
        model.add(Dense(
              units = hp.Int("units_2", 32, 1024, step = 64),
              activation = activatoin)
        )
        if hp.Boolean("dropout_2"):
            model.add(Dropout(0.25))
        model.add(Dense(
              units = hp.Int("units_3", 32, 512, step = 64),
              activation = activatoin)
        )
        if hp.Boolean("dropout_3"):
            model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation = "softmax"))  

        # Define the optimizer learning rate as a hyperparameter.
        learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3])

        # Compile the model.
        model.compile(
            loss="categorical_crossentropy", metrics=["accuracy"], optimizer = Adam(learning_rate=learning_rate),
        )

        return model

    def fit(self, hp, model, X, y, **kwargs):

        return model.fit(
            X,
            y,
            validation_split=0.2,
            batch_size = hp.Int("batch_size", 4,64,step=8),
            **kwargs,
        )   
        
mlp_tuner = keras_tuner.RandomSearch(
    MLPHyperModel(),
    objective="val_accuracy",
    # overwrite=True,
    executions_per_trial = 2,
    max_trials=30,
    directory="mlp_tuner",
    project_name='mlp'
)

print(mlp_tuner.search_space_summary())

## uncomment this
# mlp_tuner.search(X_mfcc_train, y_mfcc_train, epochs=5)
early_stop = EarlyStopping(monitor='val_loss', patience=3)
mlp_tuner.search(X_mfcc_train, y_mfcc_train,callbacks = [keras.callbacks.TensorBoard("mlp_tuner")], epochs = 5)

Trial 30 Complete [00h 00m 23s]
val_accuracy: 0.7889305949211121

Best val_accuracy So Far: 0.8533458411693573
Total elapsed time: 00h 19m 46s


In [26]:
# %load_ext tensorboard
# %tensorboard --logdir mlp_tuner

In [27]:
mlp_hp_model = MLPHyperModel()
best_mlp_hp = mlp_tuner.get_best_hyperparameters()[0]
best_mlp_model = mlp_hp_model.build(best_mlp_hp)

# baseline_model.fit(X_3mfcc_train, y_3mfcc_train, validation_split=0.2, batch_size=num_batch_size, epochs=num_epochs, callbacks=[early_stop], verbose=1)

early_stop = EarlyStopping(monitor='val_loss', patience=5)
mlp_hp_model.fit(best_mlp_hp, best_mlp_model, X_mfcc_train, y_mfcc_train, verbose=1, epochs = 50, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


<keras.callbacks.History at 0x7f52a10a9900>

In [28]:
best_mlp_model.save_weights("best_mlp_model.h5")

In [29]:
best_mlp_hp.values

{'activation': 'relu',
 'units_1': 448,
 'dropout_1': False,
 'units_2': 672,
 'dropout_2': False,
 'units_3': 32,
 'dropout_3': False,
 'learning_rate': 0.001,
 'batch_size': 12}

In [30]:
# X_mfcc_test
test_accuracy = best_mlp_model.evaluate(X_mfcc_test,y_mfcc_test,verbose=0)
print(test_accuracy[1])

0.8893893957138062


# Summary for MLP

Best accuracy is 88.9% on the best following hyperparameters
```python
{
 'activation': 'relu', 'units_1': 448, 'dropout_1': False,
 'units_2': 672, 'dropout_2': False, 'units_3': 32,
 'dropout_3': False, 'learning_rate': 0.001, 'batch_size': 12,
 'epochs': 11
 }
