# Plan
Five models. Each one is hypertuned with respect to learning rate, unit number, optimizer, batch size, epoch, data normalization, dropout?, dropout rate, activation choice, 

The four models are 
- 1 mlp model
- 1 vgg16 model
- 1 yamnet
- 1 LSTM
- a hybrid of yamnet and LSTM

The feature used
- MFCCs for mlp
- Mel Spectrogram for vgg16
- raw audio waves for sound models

Tuned hyper parameters
- learning rate
- unit number
- structure, e.g. whether to contain one more layer, whether to contain dropout, whether to do data normalization
- dropout rate
- activation choice
- batch size
- epochs

In [1]:
!pip install keras-tuner -q
import os
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
import keras_tuner
from tensorflow.keras import layers
import librosa

from tensorflow.keras.utils import to_categorical, plot_model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten, Conv2D,MaxPooling2D
from tensorflow.keras.optimizers import Adam

from sklearn import metrics

from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from datetime import datetime 
from sklearn.model_selection import train_test_split

# for manipulate the mel spectrographs
from PIL import Image

import tensorflow_hub as hub

from google.colab import drive
drive.mount('/content/drive')


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/176.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


## Load Data

## Data Exploration

In [2]:
# the root of the data
dataRoot = "drive/MyDrive/DeepLearningProject/datasets/Data/"

# YAMNet

### Load the audio wave files

In [3]:
'''
@fileName - file name of a music
return - the label
e.g. label = extract_label("blues0000.png")
'''
def extract_audio_label(fileName):
  import re

  match = re.search(r'([a-zA-Z ]+).(\d+)', fileName)

  if match:
      # text = match.group(1).strip()
      text = match.group(1)
  else:
    raise ValueError("Failed to extracte labels from Music file name, "+fileName)

  return text
extract_audio_label("blues0000.png")

'blues'

In [4]:
'''
Load audio wave files
'''

# Define the root directory
root_dir = os.path.join(dataRoot, "genres_original")

# Load the images and labels
waves = []
labels = []

# Iterate through all directories under the root directory
for dirpath, dirnames, filenames in os.walk(root_dir):
    # dirpath is the path of the current directory
    # dirnames is a list of subdirectories in the current directory
    # filenames is a list of files in the current directory
    
    # Do something with the directory path, such as print it
    
    # print(dirnames)
    
    for fname in filenames:
      # deal with the music
      musicPath = os.path.join(dirpath,fname)
      # when have an exception, do not use this sample
      try:
        data,sample_rate=librosa.load(musicPath, sr=16000)
        waves.append(data)
        # get its label
        labels.append(extract_audio_label(fname))
      except Exception as e:
        print("Error with ",fname, " ",e)
    

# Convert the lists to NumPy arrays
waves = np.array(waves)
labels = np.array(labels)

  data,sample_rate=librosa.load(musicPath, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error with  jazz.00054.wav   


  waves = np.array(waves)


In [5]:
labels.shape

(999,)

In [6]:
waves.shape

(999,)

In [7]:
waves[0].shape

(480214,)

## Make a 10 times larger dataset by extracting 10 3-second sub waves from the original 30-second wave

In [8]:
def Split(audio, sr=16000):
  # Get number of samples for 2 seconds; replace 2 by any number
  buffer = 3 * sr

  samples_total = len(audio)
  samples_wrote = 0

  splits = []
  while samples_wrote < samples_total:

      #check if the buffer is not exceeding total samples 
      if buffer > (samples_total - samples_wrote):
          buffer = samples_total - samples_wrote

      block = audio[samples_wrote : (samples_wrote + buffer)]
      samples_wrote += buffer
      splits.append(block)
  return splits

In [9]:
waves_3sec = []
labels_3sec = []
for wave, label in zip(waves, labels):
  splits = []
  splits = Split(wave)
  for split in splits:
    waves_3sec.append(split)
    labels_3sec.append(label)

waves_3sec = np.array(waves_3sec)
labels_3sec = np.array(labels_3sec)

  waves_3sec = np.array(waves_3sec)


In [10]:
labels_3sec.shape

(10979,)

In [11]:
waves_3sec.shape

(10979,)

In [12]:
waves_3sec[0].shape

(48000,)

### get rid of those of different shapes

In [13]:
# first count how many of them have different shapes
count = 0
for wave in waves_3sec:
  if wave.shape[0] != 48000:
    count+=1
print(count)
# only 1/10 of them are of other shapes, we can safely remove them without worrying about the balance of dataset

998


In [14]:
waves_3sec_clear = []
labels_3sec_clear = []

for wave,label in zip(waves_3sec, labels_3sec):
  if wave.shape[0] == 48000:
    waves_3sec_clear.append(wave)
    labels_3sec_clear.append(label)
    
waves_3sec_clear = np.array(waves_3sec_clear)
labels_3sec_clear = np.array(labels_3sec_clear)

In [15]:
waves_3sec_clear.shape

(9981, 48000)

In [16]:
labels_3sec_clear.shape

(9981,)

In [17]:
labels_3sec_clear

array(['blues', 'blues', 'blues', ..., 'metal', 'metal', 'metal'],
      dtype='<U9')

In [18]:
waves_raw = waves_3sec_clear
labels_raw = labels_3sec_clear

### One note, when taking the a portion to do the hypertuning, shuffle the dataset.

# YAMNet connected to a MLP end

In [19]:
# load the pretrained model
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

In [20]:
# make inference to extract yamnet features
yamnet_features = []
for wave in waves_raw:
  _,embeddings,_ = yamnet_model(wave)
  yamnet_features.append(embeddings)

In [21]:
yamnet_features = np.array(yamnet_features)

In [22]:
yamnet_features.shape

(9981, 6, 1024)

In [23]:
# since some of the data are of shape (63,1024) and some of that are (62,1024). I must truncate them all into (62,1024)
yamnet_features_fixed = []
# fix the inconsistant data
for row in yamnet_features:
  yamnet_features_fixed.append(row[:62])
yamnet_features_fixed = np.array(yamnet_features_fixed)

In [24]:
# summarize the feature by taking mean and std
yam_summary_fea = []
for row in yamnet_features:
  feature = []
  feature.extend(np.mean(row, axis=0))
  feature.extend(np.std(row, axis=0))
  yam_summary_fea.append(feature)
yam_summary_fea = np.array(yam_summary_fea)

In [25]:
yam_summary_fea.shape

(9981, 2048)

In [26]:
# encode the labels for large dataset
labelDf_large = pd.DataFrame({"label":labels_3sec_clear})
labelDf_large['label'].value_counts()
label_dict_large = {}
code = 0
for _label in labelDf_large['label'].value_counts().index:
  label_dict_large[_label] = code
  code += 1

labels_large_encoded = []
labels_large = labelDf_large.values
for _l in labels_large:
  labels_large_encoded.append(label_dict_large[_l[0]])
labels_large_encoded = np.array(labels_large_encoded)

labels_large_encoded_ct = to_categorical(labels_large_encoded)
labels_large_encoded_ct.shape

(9981, 10)

In [27]:
class YAMNetHyperModel(keras_tuner.HyperModel):
    def build(self, hp):
        model = keras.Sequential()
        activatoin = hp.Choice("activation", ["relu", "tanh"])

        model.add(Dense(
              units = hp.Int("units_1", 256, 1024, step = 256),
              activation = activatoin)
        )
        model.add(Dense(
              units = hp.Int("units_2", 128, 512, step = 128),
              activation = activatoin)
        )
        model.add(Dense(
              units = hp.Int("units_3", 64, 256, step = 64),
              activation = activatoin)
        )

        if hp.Boolean("dropout"):
            model.add(Dropout(0.25))

        model.add(Dense(10, activation = "softmax"))  

        # Define the optimizer learning rate as a hyperparameter.
        learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3])

        # Compile the model.
        model.compile(
            loss="categorical_crossentropy", metrics=["accuracy"], optimizer = Adam(learning_rate=learning_rate),
        )

        return model

    def fit(self, hp, model, X, y, **kwargs):

        return model.fit(
            X,
            y,
            validation_split=0.2,
            batch_size = hp.Int("batch_size", 4, 64,step = 8),
            **kwargs,
        )   
        
tuner_yamnet = keras_tuner.RandomSearch(
    YAMNetHyperModel(),
    objective="val_accuracy",
    # overwrite=True,
    executions_per_trial=1,
    max_trials=20,
    directory="yamnet_tuner",
    project_name='yamnet'
)

print(tuner_yamnet.search_space_summary())

Search space summary
Default search space size: 6
activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh'], 'ordered': False}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 256, 'max_value': 1024, 'step': 256, 'sampling': 'linear'}
units_2 (Int)
{'default': None, 'conditions': [], 'min_value': 128, 'max_value': 512, 'step': 128, 'sampling': 'linear'}
units_3 (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': 'linear'}
dropout (Boolean)
{'default': False, 'conditions': []}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001], 'ordered': True}
None


In [28]:
X_yam_train,X_yam_test,y_yam_train,y_yam_test = train_test_split(yam_summary_fea,labels_large_encoded_ct,test_size=0.2,random_state=0)

In [29]:
# early_stop = EarlyStopping(monitor='val_loss', patience=3)

# uncomment this
tuner_yamnet.search(X_yam_train, y_yam_train,callbacks = [keras.callbacks.TensorBoard("yamnet_tuner")], epochs = 5)

Trial 10 Complete [00h 00m 12s]
val_accuracy: 0.651847243309021

Best val_accuracy So Far: 0.8353162407875061
Total elapsed time: 00h 04m 50s


In [30]:
# %load_ext tensorboard
# %tensorboard --logdir mlp_tuner

In [31]:
yamnet_hp_model = YAMNetHyperModel()
best_yamnet_hp = tuner_yamnet.get_best_hyperparameters()[0]
best_yamnet_model = yamnet_hp_model.build(best_yamnet_hp)

early_stop = EarlyStopping(monitor='val_loss', patience=3)
yamnet_hp_model.fit(best_yamnet_hp, best_yamnet_model, X_yam_train, y_yam_train, verbose=1, epochs = 15, callbacks=[early_stop])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15


<keras.callbacks.History at 0x7fb12ff1e2c0>

In [32]:
best_yamnet_model.save_weights('best_yamnet_model.h5')

In [33]:
# Accuracy on the test set
test_accuracy_yamnet = best_yamnet_model.evaluate(X_yam_test,y_yam_test,verbose=0)
print(test_accuracy_yamnet[1])

0.844266414642334


In [34]:
best_yamnet_hp.values

{'activation': 'relu',
 'units_1': 512,
 'units_2': 256,
 'units_3': 192,
 'dropout': False,
 'learning_rate': 0.001,
 'batch_size': 12}

# YAMNet Summary
Best accuracy, *84.9%*, at epochs of 10, on the hyper parameters of
```python
{
 'activation': 'relu', 'units_1': 1024, 'units_2': 512,
 'units_3': 128, 'dropout': True, 'learning_rate': 0.001,
 'batch_size': 52, 'epochs' : 5
 }
```