# Plan
Five models. Each one is hypertuned with respect to learning rate, unit number, optimizer, batch size, epoch, data normalization, dropout?, dropout rate, activation choice, 

The four models are 
- 1 mlp model
- 1 vgg16 model
- 1 yamnet
- 1 LSTM
- a hybrid of yamnet and LSTM

The feature used
- MFCCs for mlp
- Mel Spectrogram for vgg16
- raw audio waves for sound models

Tuned hyper parameters
- learning rate
- unit number
- structure, e.g. whether to contain one more layer, whether to contain dropout, whether to do data normalization
- dropout rate
- activation choice
- batch size
- epochs

In [1]:
!pip install keras-tuner -q
import os
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
import keras_tuner
from tensorflow.keras import layers
import librosa

from tensorflow.keras.utils import to_categorical, plot_model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten, Conv2D,MaxPooling2D,LSTM
from tensorflow.keras.optimizers import Adam

from sklearn import metrics

from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from datetime import datetime 
from sklearn.model_selection import train_test_split

# for manipulate the mel spectrographs
from PIL import Image

import tensorflow_hub as hub

from google.colab import drive
drive.mount('/content/drive')


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


## Load Data

## Data Exploration

In [2]:
# the root of the data
dataRoot = "drive/MyDrive/DeepLearningProject/datasets/Data/"

# LSTM with MFCCs

### Load the audio wave files

In [3]:
'''
@fileName - file name of a music
return - the label
e.g. label = extract_label("blues0000.png")
'''
def extract_audio_label(fileName):
  import re

  match = re.search(r'([a-zA-Z ]+).(\d+)', fileName)

  if match:
      # text = match.group(1).strip()
      text = match.group(1)
  else:
    raise ValueError("Failed to extracte labels from Music file name, "+fileName)

  return text
extract_audio_label("blues0000.png")

'blues'

In [4]:
'''
Load audio wave files
'''

# Define the root directory
root_dir = os.path.join(dataRoot, "genres_original")

# Load the images and labels
waves = []
labels = []

# Iterate through all directories under the root directory
for dirpath, dirnames, filenames in os.walk(root_dir):
    # dirpath is the path of the current directory
    # dirnames is a list of subdirectories in the current directory
    # filenames is a list of files in the current directory
    
    # Do something with the directory path, such as print it
    
    # print(dirnames)
    
    for fname in filenames:
      # deal with the music
      musicPath = os.path.join(dirpath,fname)
      # when have an exception, do not use this sample
      try:
        data,sample_rate=librosa.load(musicPath, sr=16000)
        waves.append(data)
        # get its label
        labels.append(extract_audio_label(fname))
      except Exception as e:
        print("Error with ",fname, " ",e)
    

# Convert the lists to NumPy arrays
waves = np.array(waves)
labels = np.array(labels)

  data,sample_rate=librosa.load(musicPath, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error with  jazz.00054.wav   


  waves = np.array(waves)


## Make a 10 times larger dataset by extracting 10 3-second sub waves from the original 30-second wave

In [5]:
def Split(audio, sr=16000):
  # Get number of samples for 2 seconds; replace 2 by any number
  buffer = 3 * sr

  samples_total = len(audio)
  samples_wrote = 0

  splits = []
  while samples_wrote < samples_total:

      #check if the buffer is not exceeding total samples 
      if buffer > (samples_total - samples_wrote):
          buffer = samples_total - samples_wrote

      block = audio[samples_wrote : (samples_wrote + buffer)]
      samples_wrote += buffer
      splits.append(block)
  return splits

In [6]:
waves_3sec = []
labels_3sec = []
for wave, label in zip(waves, labels):
  splits = []
  splits = Split(wave)
  for split in splits:
    waves_3sec.append(split)
    labels_3sec.append(label)

waves_3sec = np.array(waves_3sec)
labels_3sec = np.array(labels_3sec)

  waves_3sec = np.array(waves_3sec)


In [7]:
labels_3sec.shape

(10979,)

In [8]:
waves_3sec.shape

(10979,)

In [9]:
waves_3sec[0].shape

(48000,)

### get rid of those of different shapes

In [10]:
# first count how many of them have different shapes
count = 0
for wave in waves_3sec:
  if wave.shape[0] != 48000:
    count+=1
print(count)
# only 1/10 of them are of other shapes, we can safely remove them without worrying about the balance of dataset

998


In [12]:
# check the distribution of the corrupted data
distribution_corrupted = {}
for i in range(waves_3sec.shape[0]):
  if waves_3sec[i].shape[0] != 48000:
    if labels_3sec[i] in distribution_corrupted:
      distribution_corrupted[labels_3sec[i]] += 1
    else:
      distribution_corrupted[labels_3sec[i]] = 1
print(distribution_corrupted)
# they are equally distributed thus safe to delete

{'blues': 100, 'reggae': 100, 'classical': 100, 'hiphop': 100, 'jazz': 99, 'rock': 99, 'pop': 100, 'disco': 100, 'country': 100, 'metal': 100}


In [None]:
waves_3sec_clear = []
labels_3sec_clear = []

for wave,label in zip(waves_3sec, labels_3sec):
  if wave.shape[0] == 48000:
    waves_3sec_clear.append(wave)
    labels_3sec_clear.append(label)
    
waves_3sec_clear = np.array(waves_3sec_clear)
labels_3sec_clear = np.array(labels_3sec_clear)

In [None]:
waves_3sec_clear.shape

(9981, 48000)

In [None]:
labels_3sec_clear.shape

(9981,)

In [None]:
waves_raw = waves_3sec_clear
labels_raw = labels_3sec_clear

In [None]:
waves_raw.shape

(9981, 48000)

### One note, when taking the a portion to do the hypertuning, shuffle the dataset.

### Get MFCCs features

In [None]:
import librosa

mfcc_features = []

for wave in waves_raw:
  # Extract MFCCs
  mfccs = librosa.feature.mfcc(y=wave, sr=16000, n_mfcc=13)
  mfcc_features.append(mfccs.T)

In [None]:
mfcc_features = np.array(mfcc_features)

In [None]:
mfcc_features.shape

(9981, 94, 13)

In [None]:
mfcc_features[0][0][0]

-64.33838

In [None]:
# encode the labels for large dataset
labelDf_large = pd.DataFrame({"label":labels_3sec_clear})
labelDf_large['label'].value_counts()
label_dict_large = {}
code = 0
for _label in labelDf_large['label'].value_counts().index:
  label_dict_large[_label] = code
  code += 1

labels_large_encoded = []
labels_large = labelDf_large.values
for _l in labels_large:
  labels_large_encoded.append(label_dict_large[_l[0]])
labels_large_encoded = np.array(labels_large_encoded)

labels_large_encoded_ct = to_categorical(labels_large_encoded)
labels_large_encoded_ct.shape

(9981, 10)

In [None]:
class LSTMHyperModel(keras_tuner.HyperModel):
    def build(self, hp):
        model = keras.Sequential()
        # activatoin = hp.Choice("activation", ["relu", "tanh"])
        activatoin = "tanh"

        model.add(LSTM(
              units = hp.Int("units_1", 32, 512, step = 32),
              activation = activatoin,
              input_shape=(mfcc_features.shape[1], mfcc_features.shape[2]), 
              return_sequences=True)
        )

        model.add(LSTM(
              units = hp.Int("units_2", 32, 256, step = 32),
              activation = activatoin,
              return_sequences=True)
        )
        
        if hp.Boolean("dropout_2"):
            model.add(Dropout(0.25))

        model.add(LSTM(
              units = hp.Int("units_3", 16, 128, step = 32),
              activation = activatoin)
        )

        model.add(Dropout(hp.Float("dropout_rate2", 0.1, 0.26, step = 0.05)))

        model.add(Dense(10, activation = "softmax"))  

        # Define the optimizer learning rate as a hyperparameter.
        learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3])

        # Compile the model.
        model.compile(
            loss="categorical_crossentropy", metrics=["accuracy"], optimizer = Adam(learning_rate=learning_rate),
        )

        return model

    def fit(self, hp, model, X, y, **kwargs):

        return model.fit(
            X,
            y,
            validation_split=0.2,
            batch_size = hp.Int("batch_size", 4, 64,step = 8),
            **kwargs,
        )   
        
tuner_lstm = keras_tuner.RandomSearch(
    LSTMHyperModel(),
    objective="val_accuracy",
    overwrite=True,
    executions_per_trial=2,
    max_trials=10,
    directory="lstm_tuner",
    project_name='lstm'
)

print(tuner_lstm.search_space_summary())

Search space summary
Default search space size: 6
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
units_2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': 'linear'}
dropout_2 (Boolean)
{'default': False, 'conditions': []}
units_3 (Int)
{'default': None, 'conditions': [], 'min_value': 16, 'max_value': 128, 'step': 32, 'sampling': 'linear'}
dropout_rate2 (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.26, 'step': 0.05, 'sampling': 'linear'}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001], 'ordered': True}
None


In [None]:
X_train,X_test,y_train,y_test = train_test_split(mfcc_features,labels_large_encoded_ct,test_size=0.2,random_state=0)

In [None]:
mfcc_features.shape

(9981, 94, 13)

In [None]:
# # a quick test
# hp = keras_tuner.HyperParameters()
# hypermodel = LSTMHyperModel()
# model = hypermodel.build(hp)
# hypermodel.fit(hp, model, X_train, y_train, epochs = 5)

In [None]:
# early_stop = EarlyStopping(monitor='val_loss', patience=3)

# uncomment this
# tuner_lstm.search(X_train, y_train,callbacks = [keras.callbacks.TensorBoard("lstm_tuner")], epochs = 5)
# uncomment this
tuner_lstm.search(X_train, y_train, epochs = 5, callbacks = [keras.callbacks.TensorBoard("lstm_tuner")])

Trial 10 Complete [00h 00m 42s]
val_accuracy: 0.4038822799921036

Best val_accuracy So Far: 0.5961177051067352
Total elapsed time: 00h 15m 22s


In [None]:
# %load_ext tensorboard
# %tensorboard --logdir mlp_tuner

In [None]:
lstm_hp_model = LSTMHyperModel()
best_lstm_hp = tuner_lstm.get_best_hyperparameters()[0]
best_lstm_model = lstm_hp_model.build(best_lstm_hp)

early_stop = EarlyStopping(monitor='val_loss', patience=5)
lstm_hp_model.fit(best_lstm_hp, best_lstm_model, X_train, y_train, verbose=1, epochs = 50, callbacks = [early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


<keras.callbacks.History at 0x7f4ec1832fe0>

In [None]:
best_lstm_model.save_weights("best_lstm_model.h5")

In [None]:
# Accuracy on the test set
test_accuracy_lstm = best_lstm_model.evaluate(X_test,y_test,verbose=1)
print(test_accuracy_lstm[1])

0.7731597423553467


In [None]:
best_lstm_hp.values

{'units_1': 416,
 'units_2': 32,
 'dropout_2': True,
 'units_3': 48,
 'dropout_rate2': 0.2,
 'learning_rate': 0.001,
 'batch_size': 12}

# LSTM Summary
Best accuracy, *77.3%*, at epochs of 10 on,
```python
{
 'units_1': 416, 'units_2': 32, 'dropout_2': True,
 'units_3': 48, 'dropout_rate2': 0.2, 'learning_rate': 0.001,
 'batch_size': 12, 'epochs': 16
 }
```