In [1]:
import os
import glob
from sklearn.model_selection import train_test_split
from mido import MidiFile
import pretty_midi
import IPython.display
from midi2audio import FluidSynth
import numpy as np
import matplotlib.pyplot as plt

In [2]:
genres = ["funk", "jazz", "rock", "latin", "hiphop"]

In [19]:
files = []
labels = []
for i, genre in enumerate(genres):
    for file in glob.glob(r'C:\Users\nooro\Documents\Robotikk\MCT4052\MCT_4052_Project\data\drummer*\session*\*%s*.mid'%genre, recursive=True):
        files.append(file)
        labels.append(i)


#Split data into training, validation and testing sets
tr_data, test_data, tr_labels, test_labels = train_test_split(files, labels, test_size=0.2, random_state=42)
print("All files:", len(files))
print(f"Training set: {len(tr_data)}, testing set: {len(test_data)}")

All files: 868
Training set: 694, testing set: 174


<h2>Preprocessing</h2>

Here we change the pitch values of each recording to lower the amount of features used and reduce redundancy, as well as simplifying reading and understanding the results of the project, the reasoning for these exact changes are discussed in the "Learning to Groove" paper, accompanying the dataset.

In [4]:
def convert_to_simple(list, pitch, convert_from, convert_to):
    if pitch in convert_from:
        list.append(convert_to)
    return list
def feature_extraction(input):
    #These are the pitches representing the main groups of drum instruments used in the project (bass, snare, hi-hat, etc...)
    pitch_values = [36, 38, 50, 47, 43, 46, 42, 49, 51]
    pitches_list = []
    for data in input:
        mid = pretty_midi.PrettyMIDI(data)
        instrument = mid.instruments[0]
        simple_pitches = []
        for note in instrument.notes:
            pitch = note.pitch
            simple_pitches = convert_to_simple(simple_pitches, pitch, [36], 36)
            simple_pitches = convert_to_simple(simple_pitches, pitch, [37, 38, 40], 38)
            simple_pitches = convert_to_simple(simple_pitches, pitch, [48, 50], 50)
            simple_pitches = convert_to_simple(simple_pitches, pitch, [45, 47], 47)
            simple_pitches = convert_to_simple(simple_pitches, pitch, [43, 58], 43)
            simple_pitches = convert_to_simple(simple_pitches, pitch, [26, 46], 46)
            simple_pitches = convert_to_simple(simple_pitches, pitch, [22, 42, 44], 42)
            simple_pitches = convert_to_simple(simple_pitches, pitch, [49, 52, 55, 57], 49)
            simple_pitches = convert_to_simple(simple_pitches, pitch, [51, 53, 59], 51)

        if len(simple_pitches) != len(instrument.notes):
            print("This midi file contains missing notes!", data)
        pitches_list.append(simple_pitches)
    pitch_frequency = [] #Frequency refers to amount of occurences, not audio frequency
    for pitches in pitches_list:
        freq = []
        for i in range(len(pitch_values)):
            freq.append(0)
        for pitch in pitches:
            for i in range(len(pitch_values)):
                if pitch == pitch_values[i]:
                    freq[i] += 1
        pitch_frequency.append(freq)
    return pitch_frequency

In [5]:
tr_features = feature_extraction(tr_data)
test_features = feature_extraction(test_data)

<h2>Training phase

In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

##Creating an instance of a MLP classifier
#and setting it some option (max mum epoch, verbose on, activation of neurons)
mlp = MLPClassifier(hidden_layer_sizes=(10,5), max_iter=2000, activation='relu', verbose=True)

#train the model
mlp.fit(tr_features, tr_labels)

Iteration 1, loss = 8.81332372
Iteration 2, loss = 8.44425760
Iteration 3, loss = 8.23405195
Iteration 4, loss = 8.02183578
Iteration 5, loss = 7.82787262
Iteration 6, loss = 7.58856105
Iteration 7, loss = 7.36171360
Iteration 8, loss = 7.11350545
Iteration 9, loss = 6.83955761
Iteration 10, loss = 6.58588927
Iteration 11, loss = 6.36345452
Iteration 12, loss = 6.11175899
Iteration 13, loss = 5.94302352
Iteration 14, loss = 5.76179493
Iteration 15, loss = 5.62702856
Iteration 16, loss = 5.52521483
Iteration 17, loss = 5.38663700
Iteration 18, loss = 5.22421081
Iteration 19, loss = 5.02218560
Iteration 20, loss = 4.83446014
Iteration 21, loss = 4.59444063
Iteration 22, loss = 4.40637500
Iteration 23, loss = 4.22443663
Iteration 24, loss = 4.10946134
Iteration 25, loss = 4.06477673
Iteration 26, loss = 4.00180829
Iteration 27, loss = 3.95592110
Iteration 28, loss = 3.91863212
Iteration 29, loss = 3.90010124
Iteration 30, loss = 3.87001794
Iteration 31, loss = 3.83347069
Iteration 32, los

Iteration 271, loss = 1.45985260
Iteration 272, loss = 1.45615333
Iteration 273, loss = 1.45721267
Iteration 274, loss = 1.45197518
Iteration 275, loss = 1.45192285
Iteration 276, loss = 1.45513533
Iteration 277, loss = 1.45380630
Iteration 278, loss = 1.44962031
Iteration 279, loss = 1.44725259
Iteration 280, loss = 1.44421262
Iteration 281, loss = 1.44560234
Iteration 282, loss = 1.44569417
Iteration 283, loss = 1.44685947
Iteration 284, loss = 1.44586715
Iteration 285, loss = 1.44502744
Iteration 286, loss = 1.45762936
Iteration 287, loss = 1.44289951
Iteration 288, loss = 1.44999835
Iteration 289, loss = 1.44411358
Iteration 290, loss = 1.44572502
Iteration 291, loss = 1.44351814
Iteration 292, loss = 1.43688935
Iteration 293, loss = 1.45436325
Iteration 294, loss = 1.44344411
Iteration 295, loss = 1.43475221
Iteration 296, loss = 1.44544496
Iteration 297, loss = 1.42906294
Iteration 298, loss = 1.42685266
Iteration 299, loss = 1.42702187
Iteration 300, loss = 1.42879673
Iteration 

MLPClassifier(hidden_layer_sizes=(10, 5), max_iter=2000, verbose=True)

In [16]:
#applying the the model on the test data (features)
pred_labels = mlp.predict(test_features)

In [17]:
print("Accuracy is", metrics.accuracy_score(test_labels, pred_labels))
print()
print(metrics.classification_report(test_labels, pred_labels))

Accuracy is 0.41954022988505746

              precision    recall  f1-score   support

           0       0.23      0.16      0.19        43
           1       0.25      0.04      0.07        23
           2       0.45      0.84      0.59        70
           3       0.83      0.24      0.37        21
           4       0.50      0.06      0.11        17

    accuracy                           0.42       174
   macro avg       0.45      0.27      0.27       174
weighted avg       0.42      0.42      0.35       174



Looking at these results we see that the training actually isn't too bad considering the very simple features used. As the accuracy is over 20% its a lot better than guessing, but of course not good enough yet. Moving forward, I will be adding more relevant features to the training, and trying different genres to train on.