In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import (Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization, 
Permute, Reshape, GRU)
from tensorflow.python.keras import utils
import json
import tensorflow as tf
from tensorflow.keras import layers
from keras.models import Model
from keras.layers import Input
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers.merge import concatenate
from keras.utils import plot_model
import tifffile 

# Load the genre dictionary (path is relative to the notebook's working directory).
with open("../Genre_Track_Id_Dict.json", 'r') as genre_file:
    id_genre_dict = json.load(genre_file)

# Map every key of the dictionary to an integer class index, in file order.
# enumerate() sizes the mapping from the data itself, so no key is silently
# dropped when the file does not hold exactly 8 entries, and the indices are
# plain Python ints rather than NumPy scalars.
numerical_labels = {genre: index for index, genre in enumerate(id_genre_dict)}


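### The idea of this notebook is to fuse the methods presented in:
- https://arxiv.org/pdf/1901.04555.pdf
- Bottom-up Broadcast Neural Network

The two branches are built by `crnn_module` and `broadcast_module` (defined further down) and are concatenated in front of the final softmax layer.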

In [None]:
# Load the pre-computed spectrogram array and its label array from the
# notebook's working directory.
# NOTE(review): a later cell reshapes every sample to (128, 647, 1), so each
# entry of `data` presumably holds 128 * 647 values — confirm against the
# script that wrote these files.
data = np.load("Spectrogram_Data.npy")
labels = np.load("Spectrogram_Data_Labels.npy")

In [None]:

# Fixed seed so the train / test / validation partition is identical on every
# "Restart & Run All"; without it each run trains and evaluates on different
# samples and results cannot be compared between runs.
SPLIT_SEED = 42

# Hold out 30% of the samples; the remaining 70% is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.30, shuffle=True, random_state=SPLIT_SEED)

# Add the trailing channel axis expected by the Conv2D layers.
X_train = X_train.reshape(X_train.shape[0], 128, 647, 1)
X_test = X_test.reshape(X_test.shape[0], 128, 647, 1)

# Split the held-out 30% again: 80% of it stays as the test set and 20% becomes
# the validation set (24% and 6% of all samples respectively).
# X_val inherits the 4-D shape because X_test was reshaped above.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.20, shuffle=True, random_state=SPLIT_SEED)

print("Training Shape: {} ... {}".format(X_train.shape,y_train.shape))
print("Testing Shape: {} ... {}".format(X_test.shape,y_test.shape))
print("Validation Shape: {} ... {}".format(X_val.shape,y_val.shape))

In [None]:
# Assemble the two-branch network: a single spectrogram input feeds both the
# CRNN branch and the broadcast branch.
# NOTE: `crnn_module` and `broadcast_module` are defined in a later cell of
# this notebook; that cell has to be executed before this one, otherwise the
# calls below raise NameError on a fresh kernel.
visible = Input(shape=(128,647,1))

rcnn_res = crnn_module(visible)
broadcast_res = broadcast_module(visible)

# Join the feature vectors returned by the two branches into one vector.
both_modules = concatenate([rcnn_res,broadcast_res])

# Softmax classifier with 8 output units.
# NOTE(review): 8 presumably matches the number of genre labels — confirm
# against the label data.
out = layers.Dense(8, activation="softmax")(both_modules)

model = Model(inputs=visible, outputs=out)


In [None]:
def crnn_module(input_tensor):
    """
    Build the convolutional-recurrent (CRNN) branch on top of ``input_tensor``.

    The layout follows the music tagger CRNN in:
    https://github.com/keunwoochoi/music-auto_tagging-keras/blob/master/music_tagger_crnn.py

    The input is batch-normalised along the frequency axis, passed through four
    Conv2D -> BatchNormalization -> MaxPooling2D -> Dropout stages, rearranged
    into a (time, features) sequence and summarised by two stacked GRU layers.

    Args:
        input_tensor: 4-D channels-last Keras tensor laid out as
            (batch, frequency, time, channels). The frequency, time and channel
            sizes left after pooling must be statically known, because they
            are used to size the Reshape layer.

    Returns:
        A 2-D tensor of shape (batch, 32): the final output of the second GRU
        followed by dropout.
    """
    # One entry per convolutional stage; zip() below keeps the two lists in step.
    nb_filters = [64, 128, 128, 128]  # number of filters per stage
    pool_size = [(2, 2), (4, 2), (4, 2), (4, 2)]  # (frequency, time) pooling per stage
    kernel_size = (3, 3)  # convolution kernel size
    activation = 'elu'  # activation used by every convolution
    conv_dropout = 0.1  # dropout rate after every pooling step

    # Axis positions in the (batch, frequency, time, channels) layout.
    frequency_axis = 1
    time_axis = 2
    channel_axis = 3

    # Normalise each frequency bin of the raw input. No `input_shape` argument
    # is passed: the layer is called directly on a tensor, and the tensor's
    # shape includes the batch dimension, so it is not a valid `input_shape`.
    x = BatchNormalization(axis=frequency_axis)(input_tensor)

    for filters, pool in zip(nb_filters, pool_size):
        x = Conv2D(filters, kernel_size=kernel_size, padding='same',
                   data_format="channels_last", activation=activation)(x)
        x = BatchNormalization(axis=channel_axis)(x)
        # Non-overlapping pooling: the stride equals the window.
        x = MaxPooling2D(pool_size=pool, strides=pool)(x)
        x = Dropout(conv_dropout)(x)

    # Read the static sizes as plain ints before reshaping; shape entries may
    # be Dimension objects on TF 1.x, which Reshape does not accept reliably.
    freq_bins = int(x.shape[frequency_axis])
    time_steps = int(x.shape[time_axis])
    channels = int(x.shape[channel_axis])

    # Permute takes 1-based axes that exclude the batch axis, so this swaps
    # frequency and time: (frequency, time, channels) -> (time, frequency, channels).
    x = Permute((time_axis, frequency_axis, channel_axis))(x)
    # Flatten frequency and channels into one feature vector per time step.
    x = Reshape((time_steps, freq_bins * channels))(x)

    x = GRU(32, return_sequences=True)(x)
    x = GRU(32, return_sequences=False)(x)
    output = Dropout(0.3)(x)

    return output


def broadcast_module(input_tensor):
    """
    Build the broadcast branch on top of ``input_tensor``.

    This is the "Bottom-up Broadcast Neural Network" part of the notebook: a
    convolutional stem, three inception modules whose inputs and outputs are
    concatenated so that every module also sees the feature maps produced
    before it, and a transition stage that reduces the result to one vector.

    Args:
        input_tensor: 4-D channels-last Keras tensor
            (batch, height, width, channels).

    Returns:
        A 2-D tensor of shape (batch, 32) produced by global average pooling
        over the 32-filter transition convolution.
    """
    # Stem. Zero-pad height and width by one pixel on each side so the
    # unpadded 3x3 convolution keeps the spatial size of the original input.
    # ZeroPadding2D is used instead of a raw tf.pad call so that the padding
    # is an ordinary Keras layer in the model graph.
    padded_input = layers.ZeroPadding2D(padding=(1, 1))(input_tensor)
    x = Conv2D(32, (3, 3), activation='relu')(padded_input)
    x = BatchNormalization(axis=-1)(x)
    # Pool along the second spatial axis only (window 1 x 4).
    incept_1_input = MaxPooling2D(pool_size=(1, 4), name='incept_1_input')(x)

    # Filter counts shared by all three inception modules.
    f1 = 32       # 1x1 branch
    f2_in = 64    # 1x1 reduction in front of the 3x3 branch
    f2_out = 32   # 3x3 branch
    f3_in = 16    # 1x1 reduction in front of the 5x5 branch
    f3_out = 32   # 5x5 branch
    f4_out = 32   # 1x1 convolution after the pooling branch

    # Inception modules: each module's output is concatenated with its own
    # input before being handed to the next module.
    incept_1 = inception_module(incept_1_input, f1, f2_in, f2_out, f3_in, f3_out, f4_out, 1)

    incept_2_input = concatenate([incept_1, incept_1_input], name='incept_2_input')
    incept_2 = inception_module(incept_2_input, f1, f2_in, f2_out, f3_in, f3_out, f4_out, 2)

    incept_3_input = concatenate([incept_2_input, incept_2], name='incept_3_input')
    incept_3 = inception_module(incept_3_input, f1, f2_in, f2_out, f3_in, f3_out, f4_out, 3)

    incept_3_output = concatenate([incept_3, incept_3_input], name='incept_3_output')

    # Transition layers: 1x1 convolution down to 32 channels, batch norm,
    # 2x2 pooling, then one averaged value per channel.
    x = BatchNormalization(axis=-1)(Conv2D(32, (1, 1))(incept_3_output))
    x = layers.MaxPooling2D(pool_size=(2, 2), strides=2)(x)
    x = layers.GlobalAveragePooling2D()(x)

    return x


# function for creating a projected inception module
def inception_module(layer_in, f1, f2_in, f2_out, f3_in, f3_out, f4_out, n):
    """
    Build one projected inception module on top of ``layer_in``.

    Four parallel branches are computed and joined on the last (channel) axis.
    Every convolution is a ReLU Conv2D with 'same' padding whose input is
    first batch-normalised on the last axis.

    Args:
        layer_in: input tensor, channels last.
        f1: filters of the 1x1 branch.
        f2_in: filters of the 1x1 reduction in front of the 3x3 convolution.
        f2_out: filters of the 3x3 convolution.
        f3_in: filters of the 1x1 reduction in front of the 5x5 convolution.
        f3_out: filters of the 5x5 convolution.
        f4_out: filters of the 1x1 convolution that follows the max pooling.
        n: index of this module, used in the names of its 1x1, 3x3 and 5x5
            convolutions.

    Returns:
        The concatenation of the four branch outputs, in the order
        1x1, 3x3, 5x5, pooling.
    """
    def normalised_conv(tensor, filters, kernel, name=None):
        # Batch-normalise `tensor`, then apply a ReLU convolution to it.
        convolution = Conv2D(filters, kernel, padding='same',
                             activation='relu', name=name)
        return convolution(BatchNormalization(axis=-1)(tensor))

    # Branch 1: plain 1x1 convolution.
    branch_1x1 = normalised_conv(layer_in, f1, (1, 1), name=f'1x1_conv__{n}')

    # Branch 2: 1x1 reduction followed by a 3x3 convolution.
    branch_3x3 = normalised_conv(layer_in, f2_in, (1, 1))
    branch_3x3 = normalised_conv(branch_3x3, f2_out, (3, 3), name=f'3x3_conv__{n}')

    # Branch 3: 1x1 reduction followed by a 5x5 convolution.
    branch_5x5 = normalised_conv(layer_in, f3_in, (1, 1))
    branch_5x5 = normalised_conv(branch_5x5, f3_out, (5, 5), name=f'5x5_conv__{n}')

    # Branch 4: 3x3 max pooling with stride 1, then a 1x1 convolution.
    pooled = MaxPooling2D((3, 3), strides=(1, 1), padding='same')(layer_in)
    branch_pool = normalised_conv(pooled, f4_out, (1, 1))

    # Join the branches on the channel axis (assumes channels last).
    return concatenate([branch_1x1, branch_3x3, branch_5x5, branch_pool], axis=-1)