In [12]:
# packages
import os
import cv2
from tensorflow import keras
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
import sklearn
import tensorflow as tf
import pandas as pd

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import PReLU
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import GlobalMaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Embedding

In [3]:
GTZAN = {'nGenres': 10, 'genres': ['blues', 'classical', 'country', 'disco', 'hiphop', 
                                   'jazz', 'metal', 'pop', 'reggae', 'rock']}
EXTENDED_BALLROOM = {'nGenres': 9, 'genres': ['chacha', 'foxtrot', 'jive', 'quickstep', 'rumba',
                                              'samba', 'tango', 'viennesewaltz', 'waltz']}
FEATURES = ('stft', 'mfcc', 'melspectrogram', 'fouriertempogram')
RNG_STATE = 422
TEST_SIZE = 0.2
EPOCHS = 50
BATCH_SIZE = 64

# Processing

Librosa for feature extraction of audio files\
STFT | MFCC | Melspectrogram | Fourier Tempogram

GTZAN (10 Genres)\
Extended Ballroom (9 Genres - removed 4 for having less than 100 samples)

In [None]:
def process(dataset, feature, time_split, verbose = False, save = False):
    N_FFT = 2048
    HOP_LENGTH = 1024
    N_MELS = 64
    N_MFCC = 13
    
    processed = []
    processed_genres = []
    
    feature = feature.lower()
    aud_path = os.path.join(os.getcwd(), "datasets", dataset, "audio")
    if not os.path.exists(aud_path):
        print("Dataset not found:", aud_path)
        return
        
    features = ("stft", "melspectrogram", "mfcc", "fouriertempogram")
    TOTAL_GENRES = len(os.listdir(aud_path))

    if feature not in features:
        print(f"{feature} not available for extraction")
        return
    
    feat_path = os.path.join(os.getcwd(), "datasets", dataset, "features")
    os.path.isdir(feat_path) or os.makedirs(feat_path)
    
    currentGenre = 0
    for folder in os.listdir(aud_path):
        folder_path = os.path.join(aud_path, folder)
        for audio in os.listdir(folder_path):  
            audio_path = os.path.join(folder_path, audio)
            y, sr = librosa.load(audio_path) # can user offset and duration for 5 second intervals
            if feature == "stft":
                feat = np.abs(librosa.stft(y, hop_length = HOP_LENGTH, n_fft = N_FFT))
            elif feature == "mfcc":
                feat = librosa.feature.mfcc(y = y, sr = sr, hop_length = HOP_LENGTH, 
                                                      n_fft = N_FFT)
            elif feature == "melspectrogram":
                feat = librosa.feature.melspectrogram(y = y, sr = sr, n_mels = N_MELS, n_fft = N_FFT,
                                                     hop_length = HOP_LENGTH)
            elif feature == "fouriertempogram":
                oenv = librosa.onset.onset_strength(y = y, sr = sr, hop_length = HOP_LENGTH)
                feat = librosa.feature.fourier_tempogram(onset_envelope = oenv, sr = sr, hop_length = HOP_LENGTH)
                #feat = librosa.feature.tempogram(onset_envelope = oenv, sr = sr, hop_length = HOP_LENGTH)
            feat = librosa.power_to_db(feat ** 2) # feat ** 2 for amplitude to db
            #if len(feat[0]) != 647:
            #    print(len(feat[0]))
            #print(feat.shape)
            #print(feat[:2, :512].shape)
            #librosa.display.specshow(feat[:, :], sr=sr, hop_length = HOP_LENGTH, x_axis = 'time', y_axis = 'mel')
            #print(feat.shape)
            
            # max is (x, 647) or (x, 648) -- imma treat max as 640 -- can be //10,8,5,4,2 -> 3, 3.75, 6, 7.5, 15
            out = feat[:, :640]

            # split into segments -- 3, 3.75, 6, 7.5, 15 seconds
            out_split = np.hsplit(out, time_split)
            for split in out_split:
                #print(split.shape)
                processed.append(split)
                # one hot encoding
                processed_genres.append([1 if i == currentGenre else 0 for i in range(TOTAL_GENRES)])
                #librosa.display.specshow(split[:, :], sr=sr, hop_length = HOP_LENGTH, x_axis = 'time', y_axis = 'mel')
            #print(len(out_split))
            #librosa.display.specshow(out[:, :], sr=sr, hop_length = HOP_LENGTH, x_axis = 'time', y_axis = 'mel')
        currentGenre += 1
        if verbose:
            print(folder, "complete;", feature, processed[0].shape)
    
    X = np.array(processed)
    Y = np.array(processed_genres)
    
    
    if save:
        np.save(os.path.join(feat_path, f"{feature + str(time_split)}-X.npy"), X)
        np.save(os.path.join(feat_path, f"{feature + str(time_split)}-Y.npy"), Y)
    
    return X, Y

### Run to extract features if have not done so already

Extracting features from Extended Ballroom taks a while, 10x+ longer than GTZAN

In [None]:
#features = ("stft", "melspectrogram", "mfcc", "fouriertempogram",)
features = ("mfcc",)
#datasets = ("GTZAN", "Extended Ballroom",)
datasets = ("Extended Ballroom",)
splits = (10, 8, 5, 4, 2, 1)
for dataset in datasets:
    print("Processing", dataset, "with total genres of", 
              len(os.listdir(os.path.join(os.getcwd(), "datasets", dataset, "audio"))))
    for feature in features:
        start = time.perf_counter()
        for split in splits:
            process(dataset, feature, split, verbose = True, save = True)
        print(f"\t{feature} with splits", str(splits), "took", time.perf_counter() - start, "seconds")

# Training

### References

yang2020parallel
![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)

![image-3.png](attachment:image-3.png)

In [4]:
def getFeatures(dataset, feature, split):
    # LOADING
    X = np.load(os.path.join(os.getcwd(), "datasets", dataset, "features", f"{feature}{split}-X.npy"))
    Y = np.load(os.path.join(os.getcwd(), "datasets", dataset, "features", f"{feature}{split}-Y.npy"))
    #SPLITTING
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = TEST_SIZE, random_state = RNG_STATE)
    
    return (X_train, X_test, Y_train, Y_test)

In [5]:
def cnf_matrix():
    plt.figure(figsize=(11,11))
    cm = sklearn.metrics.confusion_matrix(np.argmax(Y_test, axis=-1), np.argmax(m.predict(X_test), axis=-1))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.xticks(list(range(10)), ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock'])
    plt.yticks(list(range(10)), ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock'])
    plt.title('Confusion matrix for DCNN model')
    plt.xlabel('Predicted genre')
    plt.ylabel('True genre')
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.show()

## Base CNN

Explain general structure of what i did

### STFT (1025, 640)

In [None]:
def conv_block(x, n_filters, kernel_size, conv_strides, pool_strides, pool_size):
    x = Conv2D(n_filters, kernel_size, strides = conv_strides, padding='same')(x)
    #x = BatchNormalization()(x) # should do this
    x = Activation('relu')(x)
    
    # reduce overfitting by oversampling? extracts most prominent features
    x = MaxPooling2D(pool_size=pool_size, strides=pool_size)(x)
    x = Dropout(0.25)(x) # to reduce overfitting # shouldnt do this when its first level
    return x

In [None]:
def CNN(input_shape, num_genres):
    inpt = Input(shape=input_shape)
    
    x = conv_block(inpt, n_filters = 16, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = conv_block(x, n_filters = 32, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = conv_block(x, n_filters = 64, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = conv_block(x, n_filters = 128, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (4, 4), pool_size = (4, 4))
    
    x = conv_block(x, n_filters = 64, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(256, activation='relu', 
              kernel_regularizer = tf.keras.regularizers.l2(0.01))(x)
    x = Dropout(0.25)(x)
    predictions = Dense(num_genres, 
                        activation='softmax', 
                        kernel_regularizer = tf.keras.regularizers.l2(0.02))(x)
    
    model = Model(inputs=inpt, outputs=predictions)
    return model

#### GTZAN

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("GTZAN", "stft", 10)

# Model
m = CNN(X_train.shape[1:] + (1,), 10)
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = EPOCHS, batch_size = BATCH_SIZE)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/gtzan-cnn-stft.h5')

#### Extended Ballroom

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("Extended Ballroom", "stft", 10)

In [None]:
# Model
m = CNN(X_train.shape[1:] + (1,), EXTENDED_BALLROOM['nGenres'])
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 10, batch_size = BATCH_SIZE) 
# shuffle = True and use callbacks to perform early stopping or save model each epoch or save history, loss, acc, each epoch

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/extended_ballroom-cnn-stft.h5')

### MFCC (20, 640)

In [None]:
def MFCC_block(x, n_filters, kernel_size, conv_strides, pool_strides, pool_size):
    x = Conv2D(n_filters, kernel_size, strides = conv_strides, padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    
    # reduce overfitting by oversampling? extracts most prominent features
    x = MaxPooling2D(pool_size=pool_size, strides=pool_size)(x)
    x = Dropout(0.25)(x) # to reduce overfitting # shouldnt do this when its first level
    return x

In [None]:
def MFCC_CNN(input_shape, num_genres):
    inpt = Input(shape=input_shape)
    
    x = MFCC_block(inpt, n_filters = 32, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = MFCC_block(x, n_filters = 64, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = MFCC_block(x, n_filters = 128, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = MFCC_block(x, n_filters = 64, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = Flatten()(x)
    x = Dropout(0.25)(x)
    x = Dense(256, activation='relu', 
              kernel_regularizer = tf.keras.regularizers.l2(0.01))(x)
    x = Dropout(0.25)(x)
    predictions = Dense(num_genres, 
                        activation='softmax', 
                        kernel_regularizer = tf.keras.regularizers.l2(0.02))(x)
    
    model = Model(inputs=inpt, outputs=predictions)
    return model

#### GTZAN

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("GTZAN", "mfcc", 8)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = MFCC_CNN(X_train.shape[1:] + (1,), 10)
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 75, batch_size = BATCH_SIZE)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/gtzan-cnn-mfcc.h5')

#### Extended Ballroom

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("Extended Ballroom", "mfcc", 8)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = MFCC_CNN(X_train.shape[1:] + (1,), EXTENDED_BALLROOM['nGenres'])
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 75, batch_size = BATCH_SIZE)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/extended_ballroom-cnn-mfcc.h5')

### Melspectrogram (64, 640)

In [None]:
def mel_block(x, n_filters, kernel_size, conv_strides, pool_strides, pool_size):
    x = Conv2D(n_filters, kernel_size, strides = conv_strides, padding='same')(x)
    #x = BatchNormalization()(x)
    x = Activation('relu')(x)
    
    # reduce overfitting by oversampling? extracts most prominent features
    x = MaxPooling2D(pool_size=pool_size, strides=pool_size)(x)
    x = Dropout(0.25)(x) # to reduce overfitting # shouldnt do this when its first level
    return x

In [None]:
def MEL_CNN(input_shape, num_genres):
    inpt = Input(shape=input_shape)
    
    x = mel_block(inpt, n_filters = 16, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = mel_block(x, n_filters = 32, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = mel_block(x, n_filters = 64, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = mel_block(x, n_filters = 128, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (4, 4), pool_size = (4, 4))
    
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(256, activation='relu', 
              kernel_regularizer = tf.keras.regularizers.l2(0.01))(x)
    x = Dropout(0.25)(x)
    
    predictions = Dense(num_genres, 
                        activation='softmax', 
                        kernel_regularizer = tf.keras.regularizers.l2(0.02))(x)
    
    model = Model(inputs=inpt, outputs=predictions)
    return model

#### GTZAN

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("GTZAN", "melspectrogram", 4)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = MEL_CNN(X_train.shape[1:] + (1,), 10)
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 100, batch_size = BATCH_SIZE)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/gtzan-cnn-melspectrogram.h5')

#### Extended Ballroom

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("Extended Ballroom", "melspectrogram", 4)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = MEL_CNN(X_train.shape[1:] + (1,), EXTENDED_BALLROOM['nGenres'])
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 20, batch_size = BATCH_SIZE)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/extended_ballroom-cnn-melspectrogram.h5')

### Fourier Tempogram (193, 640)

In [None]:
def temp_block(x, n_filters, kernel_size, conv_strides, pool_strides, pool_size, dropout = None):
    x = Conv2D(n_filters, kernel_size, strides = conv_strides, padding='same')(x)
    #x = BatchNormalization()(x)
    x = Activation('relu')(x)
    
    # reduce overfitting by oversampling? extracts most prominent features
    x = MaxPooling2D(pool_size=pool_size, strides=pool_size)(x)
    if dropout:
        x = Dropout(dropout)(x) # to reduce overfitting # shouldnt do this when its first level
    return x

In [None]:
def TEMP_CNN(input_shape, num_genres):
    inpt = Input(shape=input_shape)
    
    x = temp_block(inpt, n_filters = 16, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2))
    
    x = temp_block(x, n_filters = 32, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2), dropout = 0.1)
    
    x = temp_block(x, n_filters = 64, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2), dropout = 0.1)
    
    x = temp_block(x, n_filters = 128, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2), dropout = 0.1)
    
    x = temp_block(x, n_filters = 64, kernel_size = (3, 3), 
                   conv_strides = (1, 1), pool_strides = (2, 2), pool_size = (2, 2), dropout = 0.1)
    
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(256, activation='relu', 
              kernel_regularizer = tf.keras.regularizers.l2(0.01))(x)
    x = Dropout(0.25)(x)
    
    predictions = Dense(num_genres, 
                        activation='softmax', 
                        kernel_regularizer = tf.keras.regularizers.l2(0.02))(x)
    
    model = Model(inputs=inpt, outputs=predictions)
    return model

#### GTZAN

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("GTZAN", "fouriertempogram", 8)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = TEMP_CNN(X_train.shape[1:] + (1,), 10)
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 50, batch_size = BATCH_SIZE)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
#print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/gtzan-cnn-fouriertempogram.h5')

#### Extended Ballroom

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("Extended Ballroom", "fouriertempogram", 8)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = TEMP_CNN(X_train.shape[1:] + (1,), 9)
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 3, batch_size = BATCH_SIZE)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
#print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/extended_ballroom-cnn-fouriertempogram.h5')

## Base RNN

RNN or Bi-RNN

Explain general structure

### STFT (1025, 640)

In [None]:
def STFT_RNN(input_shape, num_genres):
    model = Sequential()
    
    # 1st layer
    #model.add(Bidirectional(LSTM(input_shape[1], return_sequences = True), input_shape = input_shape))
    model.add(LSTM(128, return_sequences = True, input_shape = input_shape))
    
    # 2nd layer
    #model.add(Bidirectional(LSTM(input_shape[1] // 2)))
    model.add(LSTM(32, dropout = 0.05, recurrent_dropout = 0.05))
    
    model.add(Dense(num_genres, activation = 'softmax'))
    return model

#### GTZAN

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("GTZAN", "stft", 4)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = TEMP_RNN(X_train.shape[1:], GTZAN['nGenres'])
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 50, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
#print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/gtzan-rnn-fouriertempogram.h5')

#### Extended Ballroom

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("Extended Ballroom", "stft", 5)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = TEMP_RNN(X_train.shape[1:], EXTENDED_BALLROOM['nGenres'])
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 50, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
#print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/extended_ballroom-rnn-fouriertempogram.h5')

### Fourier Tempogram (193, 640)

In [None]:
def TEMP_RNN(input_shape, num_genres):
    model = Sequential()
    
    # 1st layer
    #model.add(Bidirectional(LSTM(input_shape[1], return_sequences = True), input_shape = input_shape))
    model.add(LSTM(input_shape[1], return_sequences = True, input_shape = input_shape))
    
    # 2nd layer
    #model.add(Bidirectional(LSTM(input_shape[1] // 2)))
    model.add(LSTM(input_shape[1] // 2))
    
    model.add(Dense(num_genres))
    model.add(Activation('softmax'))
    return model

#### GTZAN

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("GTZAN", "fouriertempogram", 8)
print(X_train.shape)
plt.imshow(X_train[1])
# Model
m = TEMP_RNN(X_train.shape[1:], 10)
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 50, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
#print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/gtzan-rnn-fouriertempogram.h5')

#### Extended Ballroom

In [None]:
# Get Features
X_train, X_test, Y_train, Y_test = getFeatures("Extended Ballroom", "fouriertempogram", 8)
print(X_train.shape)
#plt.imshow(X_train[1])
# Model
m = TEMP_RNN(X_train.shape[1:], EXTENDED_BALLROOM['nGenres'])
opt = keras.optimizers.Adam(lr = 0.001)
m.compile(optimizer = opt, loss = "categorical_crossentropy", metrics = ['categorical_accuracy'])
m.summary()

In [None]:
m.fit(X_train, Y_train, epochs = 50, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
preds = m.evaluate(X_train, Y_train)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
preds = m.evaluate(X_test, Y_test)
#print(X_test.shape)
print("Loss:", preds[0])
print("Train Accuracy:", preds[1])

In [None]:
m.save('models/extended_ballroom-rnn-fouriertempogram.h5')

# Base Results

In [34]:
models = pd.DataFrame(columns = ['Model', 'Dataset', 'Feature', 'Loss', 'Accuracy'])
for model in os.listdir(os.path.join(os.getcwd(), 'models')):
    info = model[:-3].split('-')
    data = info[0].split('_')
    data = ' '.join([s.capitalize() for s in data])
    if data.lower() == 'gtzan':
        data = data.upper()
    mo = info[1]
    split = int(info[2])
    feat = info[3]
    #print(data, mo, split, feat)
    m = keras.models.load_model(os.path.join(os.getcwd(), 'models', model))
    X_train, X_test, Y_train, Y_test = getFeatures(data, feat, split)
    a, l = m.evaluate(X_test, Y_test)
    models = models.append(pd.DataFrame({'Model': [mo.upper() + '-' + str(split)], 'Dataset': [data],
                                         'Feature': [feat], 'Loss': [l], 'Accuracy': [a]}))
models = models.set_index('Model')
models = models.sort_values(by = ['Accuracy'])



In [35]:
models

Unnamed: 0_level_0,Dataset,Feature,Loss,Accuracy
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CNN-10,Extended Ballroom,stft,0.850827,0.609495
CNN-4,GTZAN,melspectrogram,0.8275,0.619323
CNN-8,Extended Ballroom,mfcc,0.829054,0.628138
CNN-4,Extended Ballroom,melspectrogram,0.855667,0.674813
CNN-10,GTZAN,stft,0.8465,0.739848
CNN-8,Extended Ballroom,fouriertempogram,0.813244,0.756606
CNN-8,GTZAN,mfcc,0.728125,0.936257
CNN-8,GTZAN,fouriertempogram,0.635,1.33154


# Parallel

CNN + RNN

CNN + CNN

One or multiple combinations, possibly with different inputs (features)

jeong2016learning
![image.png](attachment:image.png)

# Test

Use recent songs and test