In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical
import scipy.stats.stats as stats
from sklearn.metrics import mean_squared_error

In [2]:
from tensorflow.keras.models import Sequential

In [3]:
from tensorflow.keras.preprocessing import image

In [4]:
import os

In [5]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image

In [79]:
from tensorflow.keras.constraints import max_norm

In [215]:
# load all images into memory
def load_dataset(path):
    photos = list()
    targets = list()
    genres = "Folk Rock Instrumental Pop Hip-Hop Electronic International Experimental"
    genres = genres.split()
    # enumerate files in the directory
    for g in genres:
        for filename in os.listdir(path+g):
            # load image
            photo = image.load_img(path + g + "/" + filename, target_size=(72,108,3), color_mode='rgb')
            # convert to numpy array
            photo = image.img_to_array(photo, dtype='uint8')

            # store
            photos.append(photo)
            targets.append(g)

    X = np.asarray(photos, dtype='uint8')
    y = np.asarray(targets)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    return X[indices], y[indices]


In [216]:
X_train, y_train = load_dataset('./content/spectrograms3sec/train/')

In [217]:
X_test, y_test = load_dataset('./content/spectrograms3sec/test/')

In [171]:
def calculate_mean_std(x, channels_first=False, verbose=0):
    """
    Calculates channel-wise mean and std
    
    Parameters
    ----------
    x : array
        Array representing a collection of images (frames) or
        collection of collections of images (frames) - namely video
    channels_first : bool, optional
        Leave False, by default False
    verbose : int, optional
        1-prints out details, 0-silent mode, by default 0
    
    Returns
    -------
    array of shape [2, num_channels]
        Array with per channel mean and std for all the frames
    """
    ndim = x.ndim
    assert ndim in [5,4]
    assert channels_first == False
    all_mean = []
    all_std = []    
    num_channels = x.shape[-1]
    
    for c in range(0, num_channels):
        if ndim ==5: # videos
            mean = x[:,:,:,:,c].mean()
            std = x[:,:,:,:,c].std()
        elif ndim ==4: # images rgb or grayscale
            mean = x[:,:,:,c].mean()
            std = x[:,:,:,c].std()
        if verbose:
            print("Channel %s mean before: %s" % (c, mean))   
            print("Channel %s std before: %s" % (c, std))
            
        all_mean.append(mean)
        all_std.append(std)
    
    return np.stack((all_mean, all_std))


def preprocess_input(x, mean_std, divide_std=False, channels_first=False, verbose=0):
    """
    Channel-wise substraction of mean from the input and optional division by std
    
    Parameters
    ----------
    x : array
        Input array of images (frames) or videos
    mean_std : array
        Array of shape [2, num_channels] with per-channel mean and std
    divide_std : bool, optional
        Add division by std or not, by default False
    channels_first : bool, optional
        Leave False, otherwise not implemented, by default False
    verbose : int, optional
        1-prints out details, 0-silent mode, by default 0
    
    Returns
    -------
    array
        Returns input array after applying preprocessing steps
    """
    x = np.asarray(x, dtype=np.float32)    
    ndim = x.ndim
    assert ndim in [5,4]
    assert channels_first == False
    num_channels = x.shape[-1]
    
    for c in range(0, num_channels):  
        if ndim ==5: # videos
            x[:,:,:,:,c] -= mean_std[0][c]
            if divide_std:
                x[:,:,:,:,c] /= mean_std[1][c]
            if verbose:
                print("Channel %s mean after preprocessing: %s" % (c, x[:,:,:,:,c].mean()))    
                print("Channel %s std after preprocessing: %s" % (c, x[:,:,:,:,c].std()))
        elif ndim ==4: # images rgb or grayscale
            x[:,:,:,c] -= mean_std[0][c]
            if divide_std:
                x[:,:,:,c] /= mean_std[1][c]   
            if verbose:        
                print("Channel %s mean after preprocessing: %s" % (c, x[:,:,:,c].mean()))    
                print("Channel %s std after preprocessing: %s" % (c, x[:,:,:,c].std()))            
    return x


In [218]:
mean_std = calculate_mean_std(X_train)
X_train = preprocess_input(X_train, mean_std)

In [219]:
mean_std = calculate_mean_std(X_test)
X_test = preprocess_input(X_test, mean_std)

In [220]:
converter = LabelEncoder()
converter.fit(y_train)
y_train = converter.transform(y_train)
y_train = to_categorical(y_train)

In [221]:
y_test = converter.transform(y_test)
y_test = to_categorical(y_test)

In [231]:
tf.keras.backend.clear_session()

In [232]:
conv_net = Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(72, 108, 3)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer='l2'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2, 2)),
    
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    #tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer='l2'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2, 2)),
    
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
    tf.keras.layers.Conv2D(256, (3, 3), activation='relu', kernel_regularizer='l2'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2, 2)),
    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),

    tf.keras.layers.Dense(8, activation='softmax'),
])
print(conv_net.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 70, 106, 32)       896       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 68, 104, 64)       18496     
_________________________________________________________________
batch_normalization (BatchNo (None, 68, 104, 64)       256       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 34, 52, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 50, 64)        36928     
_________________________________________________________________
batch_normalization_1 (Batch (None, 32, 50, 64)        256       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 16, 25, 64)        0

In [233]:
tf.config.run_functions_eagerly(True)

In [234]:
# create a callback to prevent overfitting
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

In [235]:

optimizer = tf.keras.optimizers.Adam(learning_rate=0.00005)
loss = tf.keras.losses.CategoricalCrossentropy()
conv_net.compile(optimizer=optimizer,
              loss=loss,
              metrics='accuracy')

In [236]:
conv_net.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=50, batch_size=128)#, callbacks=[callback])

  "Even though the tf.config.experimental_run_functions_eagerly "


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1ab1ff91c08>

In [237]:
pred = conv_net.predict(X_test)

In [238]:
# print a calssification report for the test dataset
print(classification_report(converter.inverse_transform(np.argmax(y_test, axis=1)), converter.inverse_transform(np.argmax(pred, axis=1))))

               precision    recall  f1-score   support

   Electronic       0.57      0.59      0.58       128
 Experimental       0.30      0.59      0.39        44
         Folk       0.55      0.60      0.57       140
      Hip-Hop       0.58      0.69      0.63       140
 Instrumental       0.67      0.61      0.64       134
International       0.46      0.42      0.44        92
          Pop       0.51      0.19      0.28       146
         Rock       0.67      0.75      0.71       170

     accuracy                           0.56       994
    macro avg       0.54      0.56      0.53       994
 weighted avg       0.57      0.56      0.55       994



In [239]:
# print a confusion matric for the test dataset
print(confusion_matrix(converter.inverse_transform(np.argmax(y_test, axis=1)), converter.inverse_transform(np.argmax(pred, axis=1))))

[[ 75   7   7  20   6   4   4   5]
 [  3  26   3   5   3   1   1   2]
 [  4  14  84   3  12  13   4   6]
 [ 12   4   5  97   3   9   6   4]
 [ 10  14  14   4  82   3   3   4]
 [  9   8  11  12   3  39   3   7]
 [ 14   6  21  23   8  11  28  35]
 [  5   9   9   4   5   5   6 127]]
