# Plan
Five models. Each one is hypertuned with respect to learning rate, unit number, optimizer, batch size, epoch, data normalization, dropout?, dropout rate, activation choice, 

The four models are 
- 1 mlp model
- 1 vgg16 model
- 1 yamnet
- 1 LSTM
- a hybrid of yamnet and LSTM

The feature used
- MFCCs for mlp
- Mel Spectrogram for vgg16
- raw audio waves for sound models

Tuned hyper parameters
- learning rate
- unit number
- structure, e.g. whether to contain one more layer, whether to contain dropout, whether to do data normalization
- dropout rate
- activation choice
- batch size
- epochs

In [1]:
!pip install keras-tuner -q
import os
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
import keras_tuner
from tensorflow.keras import layers
import librosa

from tensorflow.keras.utils import to_categorical, plot_model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten, Conv2D,MaxPooling2D
from tensorflow.keras.optimizers import Adam

from sklearn import metrics

from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from datetime import datetime 
from sklearn.model_selection import train_test_split

# for manipulate the mel spectrographs
from PIL import Image

import tensorflow_hub as hub

from google.colab import drive
drive.mount('/content/drive')


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/176.1 kB[0m [31m30.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


## Load Data

## Data Exploration

In [2]:
# the root of the data
dataRoot = "drive/MyDrive/DeepLearningProject/datasets/Data/"

# VGG16

In [3]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

In [4]:
'''
A helper function to extract the labels from the file names
@fileName - file name of a music
return - the label
e.g. label = extract_label("blues0000.png")
'''
def extract_label(fileName):
  import re

  match = re.search(r'([a-zA-Z ]+)(\d+)', fileName)

  if match:
      # text = match.group(1).strip()
      text = match.group(1)
  else:
    raise ValueError("Failed to extracte labels from Music file name, "+fileName)

  return text
extract_label("blues0000.png")

'blues'

In [5]:
# Define the root directory
root_dir = os.path.join(dataRoot, "images_original")

target_size = (224, 224)

# save the images and labels
images = []
labels = []

# Iterate through all directories under the root directory
for dirpath, dirnames, filenames in os.walk(root_dir):
    # dirpath is the path of the current directory
    # dirnames is a list of subdirectories in the current directory
    # filenames is a list of files in the current directory
    
    # Do something with the directory path, such as print it
    
    # print(dirnames)
    
    for fname in filenames:
      # deal with the music
      musicPath = os.path.join(dirpath,fname)
      music = Image.open(musicPath)

      music = music.convert('RGB')
      # music = music.resize((150,150))
      music = np.array(music)
      images.append(music)

      # get its label
      labels.append(extract_label(fname))

In [6]:
# Convert the lists to NumPy arrays
images = np.array(images)
labels = np.array(labels)

In [7]:
images[10].shape

(288, 432, 3)

In [8]:
## Loading VGG16 model
vgg16_base_model = VGG16(weights="imagenet", include_top=False, input_shape=images[0].shape)

## will not train base mode
vgg16_base_model.trainable = False 

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [9]:
images[0].shape

(288, 432, 3)

In [10]:
images.shape

(999, 288, 432, 3)

In [11]:
labels.shape

(999,)

In [12]:
# check whether we got every music by comparing the total numbers
labelsDf = pd.DataFrame(columns=['label'])
for dirpath, dirnames, filenames in os.walk(root_dir):    
    for fname in filenames:
      newRow = pd.DataFrame({'label': [extract_label(fname)]})
      labelsDf = pd.concat([labelsDf, newRow])
labelsDf['label'].value_counts()

blues        100
metal        100
hiphop       100
rock         100
disco        100
country      100
pop          100
reggae       100
classical    100
jazz          99
Name: label, dtype: int64

In [13]:
# this dict is used to convert string labels to numerical labels
labelsDf['label'].value_counts()
label_dict = {}
code = 0
for _label in labelsDf['label'].value_counts().index:
  label_dict[_label] = code
  code += 1
label_dict

{'blues': 0,
 'metal': 1,
 'hiphop': 2,
 'rock': 3,
 'disco': 4,
 'country': 5,
 'pop': 6,
 'reggae': 7,
 'classical': 8,
 'jazz': 9}

In [14]:
# encoding
labels_encoded = []
for _l in labels:
  labels_encoded.append(label_dict[_l])
labels_encoded = np.array(labels_encoded)

In [15]:
labels_encoded.shape

(999,)

In [16]:
# check the distribution of the data. One short but it's okay.
dfLabel = pd.DataFrame({"label_code": labels_encoded})
dfLabel[['label_code']].value_counts()

label_code
0             100
1             100
2             100
3             100
4             100
5             100
6             100
7             100
8             100
9              99
dtype: int64

In [17]:
labels_encoded_ct = to_categorical(labels_encoded)
labels_encoded_ct.shape

(999, 10)

In [18]:
### Number of classes
num_labels=labels_encoded_ct.shape[1]

In [19]:
imgShape = np.array(images[0]).shape

In [20]:
images_cleaned = preprocess_input(images)
images_cleaned.shape

images_scaled = images/255

In [21]:
# prepare the training and testing data

method = 0;

# extract features
if method == 0:
  vgg16features = vgg16_base_model.predict(images)
if method == 1:
  vgg16features = vgg16_base_model.predict(images_scaled)
if method == 2:
  vgg16features = vgg16_base_model.predict(images_cleaned)


### Train Test Split
X_cnn_train, X_cnn_test, y_cnn_train, y_cnn_test = train_test_split(vgg16features,labels_encoded_ct,test_size=0.2,random_state=0)



In [22]:
# do inference, i.e. get features from vgg16

In [23]:
class VGG16HyperModel(keras_tuner.HyperModel):
    def build(self, hp):
        model = keras.Sequential()

        model.add(Flatten())
        
        activatoin = hp.Choice("activation", ["relu", "tanh"])

        model.add(Dense(
              units = hp.Int("units_1", 1024, 3072, step = 512),
              activation = activatoin)
        )
        if hp.Boolean("dropout_1"):
            model.add(Dropout(0.25))

        model.add(Dense(
              units = hp.Int("units_2", 512, 3072, step = 512),
              activation = activatoin)
        )
        if hp.Boolean("dropout_2"):
            model.add(Dropout(0.25))

        model.add(Dense(
              units = hp.Int("units_3", 32, 1024, step = 64),
              activation = activatoin)
        )
        if hp.Boolean("dropout_3"):
            model.add(Dropout(0.25))

        if hp.Boolean("has_forth_layer"):
              model.add(Dense(
                units = hp.Int("units_4", 32, 1024, step = 64),
                activation = activatoin)
        )
        if hp.Boolean("dropout_4"):
              model.add(Dropout(0.25))

        model.add(Dense(num_labels, activation = "softmax"))  

        # Define the optimizer learning rate as a hyperparameter.
        learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3])

        # Compile the model.
        model.compile(
            loss="categorical_crossentropy", metrics=["accuracy"], optimizer = Adam(learning_rate=learning_rate),
        )

        return model

    def fit(self, hp, model, X, y, **kwargs):

        return model.fit(
            X,
            y,
            validation_split=0.2,
            batch_size = hp.Int("batch_size", 4,36,step=8),
            **kwargs,
        )   
        
tuner = keras_tuner.RandomSearch(
    VGG16HyperModel(),
    objective="val_accuracy",
    overwrite=True,
    executions_per_trial = 2,
    max_trials=20,
    directory="vgg16_tuner",
    project_name='vgg16'
)
print(tuner.search_space_summary())

Search space summary
Default search space size: 10
activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh'], 'ordered': False}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 1024, 'max_value': 3072, 'step': 512, 'sampling': 'linear'}
dropout_1 (Boolean)
{'default': False, 'conditions': []}
units_2 (Int)
{'default': None, 'conditions': [], 'min_value': 512, 'max_value': 3072, 'step': 512, 'sampling': 'linear'}
dropout_2 (Boolean)
{'default': False, 'conditions': []}
units_3 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 1024, 'step': 64, 'sampling': 'linear'}
dropout_3 (Boolean)
{'default': False, 'conditions': []}
has_forth_layer (Boolean)
{'default': False, 'conditions': []}
dropout_4 (Boolean)
{'default': False, 'conditions': []}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001], 'ordered': True}
None


In [24]:
# # a quick test
# hp = keras_tuner.HyperParameters()
# hypermodel = VGG16HyperModel()
# model = hypermodel.build(hp)
# hypermodel.fit(hp, model, vgg16features, y_cnn_train, epochs = 5)

In [25]:
# search begain
early_stop = EarlyStopping(monitor='val_loss', patience=3)
# uncomment this
tuner.search(X_cnn_train, y_cnn_train,callbacks = [keras.callbacks.TensorBoard("vgg16_tuner")], epochs = 5)
# tuner.search(vgg16features, y_cnn_train,callbacks = [keras.callbacks.TensorBoard("mlp_tuner")], epochs = 3)

Trial 20 Complete [00h 00m 27s]
val_accuracy: 0.12187500298023224

Best val_accuracy So Far: 0.6968750059604645
Total elapsed time: 00h 07m 10s


In [26]:
vgg16_hp_model = VGG16HyperModel()
best_vgg16_hp = tuner.get_best_hyperparameters()[0]
best_vgg16_model = vgg16_hp_model.build(best_vgg16_hp)

early_stop = EarlyStopping(monitor='val_loss', patience=5)
vgg16_hp_model.fit(best_vgg16_hp, best_vgg16_model, X_cnn_train, y_cnn_train, verbose=1, epochs = 50, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


<keras.callbacks.History at 0x7fc5b83fa3e0>

In [27]:
best_vgg16_model.save_weights("best_vgg16_model.h5")

In [28]:
# Accuracy on the test set
test_accuracy_vgg16 = best_vgg16_model.evaluate(X_cnn_test,y_cnn_test,verbose=1)
print(test_accuracy_vgg16[1])

0.675000011920929


In [29]:
best_vgg16_hp.values

{'activation': 'relu',
 'units_1': 2560,
 'dropout_1': False,
 'units_2': 1536,
 'dropout_2': False,
 'units_3': 416,
 'dropout_3': False,
 'has_forth_layer': False,
 'dropout_4': False,
 'learning_rate': 0.001,
 'batch_size': 20}

# Summary for VGG16
Accuracy is 65.5% on the test set. Run 9 epochs on,
```python
{
 'activation': 'relu', 'units_1': 3072, 'dropout_1': True,
 'units_2': 1536, 'dropout_2': False, 'units_3': 288,
 'dropout_3': False, 'has_forth_layer': False, 'dropout_4': False,
 'learning_rate': 0.001, 'units_4': 736, 'batch_size': 28, 
 'epochs': 6
 }
```
Try two ways to scale data, 1. images/255; 2. use vgg16 preprocess, but no preprocess gave the best result, this.