# Load Data

In [75]:
import pandas as pd

data = pd.read_csv('data_processed.csv', encoding="latin-1")
data.head()

Unnamed: 0,drummer,session,id,style,bpm,beat_type,time_signature,midi_filename,audio_filename,duration,...,sambareggae,sangueo,secondline,shuffle,slow,soft,songo,soul,swing,venezuelan
0,drummer1,drummer1/eval_session,drummer1/eval_session/1,funk/groove1,138,1,4-4,drummer1/eval_session/1_funk-groove1_138_beat_...,drummer1/eval_session/1_funk-groove1_138_beat_...,27.872308,...,0,0,0,0,0,0,0,0,0,0
1,drummer1,drummer1/eval_session,drummer1/eval_session/10,soul/groove10,102,1,4-4,drummer1/eval_session/10_soul-groove10_102_bea...,drummer1/eval_session/10_soul-groove10_102_bea...,37.691158,...,0,0,0,0,0,0,0,1,0,0
2,drummer1,drummer1/eval_session,drummer1/eval_session/2,funk/groove2,105,1,4-4,drummer1/eval_session/2_funk-groove2_105_beat_...,drummer1/eval_session/2_funk-groove2_105_beat_...,36.351218,...,0,0,0,0,0,0,0,0,0,0
3,drummer1,drummer1/eval_session,drummer1/eval_session/3,soul/groove3,86,1,4-4,drummer1/eval_session/3_soul-groove3_86_beat_4...,drummer1/eval_session/3_soul-groove3_86_beat_4...,44.716543,...,0,0,0,0,0,0,0,1,0,0
4,drummer1,drummer1/eval_session,drummer1/eval_session/4,soul/groove4,80,1,4-4,drummer1/eval_session/4_soul-groove4_80_beat_4...,drummer1/eval_session/4_soul-groove4_80_beat_4...,47.9875,...,0,0,0,0,0,0,0,1,0,0


In [76]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 100 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   drummer                  1090 non-null   object 
 1   session                  1090 non-null   object 
 2   id                       1090 non-null   object 
 3   style                    1090 non-null   object 
 4   bpm                      1090 non-null   int64  
 5   beat_type                1090 non-null   int64  
 6   time_signature           1090 non-null   object 
 7   midi_filename            1090 non-null   object 
 8   audio_filename           1090 non-null   object 
 9   duration                 1090 non-null   float64
 10  split                    1090 non-null   object 
 11  onset_env_mean           1090 non-null   float64
 12  onset_env_std            1090 non-null   float64
 13  mfcc_mean                1090 non-null   float64
 14  mfcc_std               

# Prepare Data

In [77]:
# add the column spectrum_filename to the dataframe data. But not at the end, instead it should follow the column audio_filename
data.insert(data.columns.get_loc('audio_filename') + 1, 'spectrum_filename', '')

In [78]:
data['spectrum_filename'] = data.audio_filename.str.replace('.wav', '.png')

In [79]:
data['spectrum_filename'][0]

'drummer1/eval_session/1_funk-groove1_138_beat_4-4.png'

In [80]:
from sklearn.model_selection import train_test_split

# Split the data
data_train, data_test, = train_test_split(data, test_size=0.3, random_state=42)

# Split again into validation and test (Split is now 70/15/15)
data_test, data_validation = train_test_split(data_train, test_size=0.5, random_state=42,)

In [81]:
# check the distribution after the split
print(data_train.shape, data_test.shape, data_validation.shape, )
print("Train data share: ", data_train.shape[0] / data.shape[0])
print("Validation data share: ", data_validation.shape[0]/ data.shape[0])
print("Test data share: ", data_test.shape[0] / data.shape[0])

(763, 101) (381, 101) (382, 101)
Train data share:  0.7
Validation data share:  0.3504587155963303
Test data share:  0.3495412844036697


# Convolutional Neural Network

Image Generators are the dataloaders for the CNN. They just define how the images are "fed" to the cnn, like where is the path to the images, what are the labels to predict

In [82]:
label_columns = data.columns[32:]
label_columns

Index(['afrobeat', 'afrocuban', 'ando', 'baiao', 'bembe', 'blues', 'bomba',
       'bossa', 'brazilian', 'breakbeat', 'calypso', 'chacarera', 'chacha',
       'country', 'dance', 'disco', 'dominican', 'fast', 'folk', 'frevo',
       'funk', 'fusion', 'gospel', 'groove1', 'groove10', 'groove2', 'groove3',
       'groove4', 'groove5', 'groove6', 'groove7', 'groove8', 'groove9',
       'halftime', 'hiphop', 'ijexa', 'indie', 'jazz', 'joropo', 'klezmer',
       'latin', 'linear', 'maracatu', 'march', 'mediumfast', 'merengue',
       'middleeastern', 'motown', 'neworleans', 'pop', 'prog', 'punk',
       'purdieshuffle', 'reggae', 'reggaeton', 'rhumba', 'rock', 'rockabilly',
       'samba', 'sambareggae', 'sangueo', 'secondline', 'shuffle', 'slow',
       'soft', 'songo', 'soul', 'swing', 'venezuelan'],
      dtype='object')

In [83]:
label_columns.shape

(69,)

In [84]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os

path_images = os.path.join('Datasets', 'spectrums')

# Normalize images
image_generator = ImageDataGenerator(
    rescale=1.0/255
)

# Define the data generators
train_generator = image_generator.flow_from_dataframe(
    dataframe=data_train,
    directory=path_images,
    x_col="spectrum_filename",
    y_col=label_columns,
    target_size=(250, 100),
    batch_size=32,
    class_mode="raw",
    color_mode="rgb" # add color mode
)

val_generator = image_generator.flow_from_dataframe(
    dataframe=data_validation,
    directory=path_images,
    x_col="spectrum_filename",
    y_col=label_columns,
    target_size=(250, 100),
    batch_size=32,
    class_mode="raw",
    color_mode="rgb", #add color mode,
    shuffle=False,  # this is crucial for later evaluation!
)

test_generator = image_generator.flow_from_dataframe(
    dataframe=data_test,
    directory=path_images,
    x_col="spectrum_filename",
    y_col=label_columns,
    target_size=(250, 100),
    batch_size=32,
    class_mode="raw",
    color_mode="rgb", #add color mode,
    shuffle=False,  # this is crucial for later evaluation!
)

Found 162 validated image filenames.
Found 88 validated image filenames.
Found 74 validated image filenames.




In [85]:
next(train_generator)[0].shape

(32, 250, 100, 3)

In [86]:
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator

# Define the input shape
inputs = Input(shape=(250, 100, 3))

# Define the CNN architecture
x = Conv2D(32, (3, 3), activation='relu')(inputs)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
outputs = Dense(69, activation="sigmoid")(x)

# Create the model
cnn_model = Model(inputs=inputs, outputs=outputs)

cnn_model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 250, 100, 3)]     0         
                                                                 
 conv2d_25 (Conv2D)          (None, 248, 98, 32)       896       
                                                                 
 max_pooling2d_25 (MaxPoolin  (None, 124, 49, 32)      0         
 g2D)                                                            
                                                                 
 conv2d_26 (Conv2D)          (None, 122, 47, 64)       18496     
                                                                 
 max_pooling2d_26 (MaxPoolin  (None, 61, 23, 64)       0         
 g2D)                                                            
                                                                 
 conv2d_27 (Conv2D)          (None, 59, 21, 128)       7385

In [87]:
import tensorflow as tf

def compileCNN(cnn):
    metrics = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]

    # Compile the model
    cnn.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics="accuracy") #metrics)
    
    return cnn

In [88]:
def createCallbacks(path):
    # Create a callback that saves the model's weights
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath=path,
                                                    save_weights_only=True,    # saving only the weights, because we have the architecture of the model
                                                    verbose=1, 
                                                    monitor='val_accuracy',    # we are monitoring the accuracy on the validation set
                                                    mode='max',                # the greatest accuracy on the validation is the best outcome
                                                    save_best_only=True)       # we only want to save the best model. The other chechpoints are not interesting to us
    
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

    return checkpoint_cb, early_stopping_cb

In [89]:
def trainCNN(cnn, train_generator, validation_generator, checkpoint_callback, early_stopping_callback):
    history = cnn.fit(
        train_generator,
        epochs=100,
        validation_data=validation_generator,
        verbose=1,
        callbacks=[checkpoint_callback, early_stopping_callback],  # Pass callback to training
    )
    return cnn

In [90]:
cnn_path = os.path.join('data', 'models', 'first_cnn.ckpt')

try:
    cnn_model.load_weights(cnn_path)
except:
    cnn_model = compileCNN(cnn_model)
    checkpoint_callback, early_stopping_callback = createCallbacks(cnn_path)
    cnn_model = trainCNN(cnn_model, train_generator, val_generator, checkpoint_callback, early_stopping_callback)

Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.00000, saving model to data\models\first_cnn.ckpt
Epoch 2/100
Epoch 2: val_accuracy improved from 0.00000 to 0.06818, saving model to data\models\first_cnn.ckpt
Epoch 3/100
Epoch 3: val_accuracy improved from 0.06818 to 0.13636, saving model to data\models\first_cnn.ckpt
Epoch 4/100
Epoch 4: val_accuracy improved from 0.13636 to 0.36364, saving model to data\models\first_cnn.ckpt
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.36364
Epoch 6/100
Epoch 6: val_accuracy did not improve from 0.36364
Epoch 7/100
Epoch 7: val_accuracy did not improve from 0.36364


In [124]:
y_pred = cnn_model.predict(test_generator)



In [125]:
y_pred_df = pd.DataFrame(y_pred)

In [126]:
y_pred_df[y_pred_df > 0.1] = 1


In [128]:
from sklearn.metrics import multilabel_confusion_matrix
from evaluation import evaluate_model, plot_confusion_matrix

multi_conf_matrix = multilabel_confusion_matrix(data_test[label_columns], y_pred_df)
plot_confusion_matrix(multi_conf_matrix, list(label_columns), label_columns[:10])
plot_confusion_matrix(multi_conf_matrix, list(label_columns), label_columns[10:20])
plot_confusion_matrix(multi_conf_matrix, list(label_columns), label_columns[20:30])
plot_confusion_matrix(multi_conf_matrix, list(label_columns), label_columns[30:40])
plot_confusion_matrix(multi_conf_matrix, list(label_columns), label_columns[40:50])
plot_confusion_matrix(multi_conf_matrix, list(label_columns), label_columns[50:60])
plot_confusion_matrix(multi_conf_matrix, list(label_columns), label_columns[60:69])

ValueError: Found input variables with inconsistent numbers of samples: [381, 74]