In [13]:
import cPickle
import datetime
import numpy as np
from os import listdir
from os.path import isfile, join
from random import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory whose entries are listed and parsed by the cells below; the relative
# path is resolved against the kernel's working directory.
data_dir = 'ml_data'

In [3]:
# Collect the distinct source names found in data_dir: the first two
# dot-separated parts of every entry name identify the MIDI file it belongs to.
midi_files = {'.'.join(entry.split('.')[:2]) for entry in listdir(data_dir)}

print(len(midi_files))
for midi_name in sorted(midi_files):
    print(midi_name)

25
chpn-p10_format0.mid
chpn-p11_format0.mid
chpn-p12_format0.mid
chpn-p13_format0.mid
chpn-p14_format0.mid
chpn-p15_format0.mid
chpn-p16_format0.mid
chpn-p17_format0.mid
chpn-p18_format0.mid
chpn-p19_format0.mid
chpn-p1_format0.mid
chpn-p20_format0.mid
chpn-p21_format0.mid
chpn-p22_format0.mid
chpn-p23_format0.mid
chpn-p24_format0.mid
chpn-p2_format0.mid
chpn-p3_format0.mid
chpn-p4_format0.mid
chpn-p5_format0.mid
chpn-p6_format0.mid
chpn-p7_format0.mid
chpn-p8_format0.mid
chpn-p9_format0.mid
scale.mid


In [4]:
# Fixed hold-out split by MIDI file: the three files named below form the test
# set and every other name in midi_files is used for training.
#
# Disabled alternative -- random 80/20 split by file:
# midi_files_list = list(midi_files)
# offset = int(len(midi_files_list)*0.8)
# shuffle(midi_files_list)
# train_set = set(midi_files_list[:offset])
# test_set = set(midi_files_list[offset:])
test_set = {'chpn-p12_format0.mid', 'chpn-p14_format0.mid', 'chpn-p5_format0.mid'}

# Fail loudly if a hold-out name is misspelled or its files are absent;
# otherwise the set difference below would silently yield a smaller test set.
missing_from_data = test_set - midi_files
if missing_from_data:
    raise ValueError(
        'test files not present in midi_files: ' + ', '.join(sorted(missing_from_data))
    )

train_set = midi_files - test_set
print(train_set)
print(test_set)
print(len(train_set))
print(len(test_set))

set(['chpn-p6_format0.mid', 'chpn-p24_format0.mid', 'chpn-p1_format0.mid', 'chpn-p23_format0.mid', 'chpn-p17_format0.mid', 'chpn-p13_format0.mid', 'chpn-p18_format0.mid', 'chpn-p19_format0.mid', 'chpn-p21_format0.mid', 'scale.mid', 'chpn-p15_format0.mid', 'chpn-p2_format0.mid', 'chpn-p3_format0.mid', 'chpn-p7_format0.mid', 'chpn-p11_format0.mid', 'chpn-p16_format0.mid', 'chpn-p22_format0.mid', 'chpn-p20_format0.mid', 'chpn-p10_format0.mid', 'chpn-p9_format0.mid', 'chpn-p4_format0.mid', 'chpn-p8_format0.mid'])
set(['chpn-p14_format0.mid', 'chpn-p5_format0.mid', 'chpn-p12_format0.mid'])
22
3


In [5]:
def read_data(data_dir, train_set, test_set, note_from=0, note_to=127, take_every_nth_negative_sample=1, sep=';'):
    """Load the sample files in ``data_dir`` into train and test arrays.

    Every entry name is split on '.': the first two parts joined by '.' give
    the MIDI file name and the third part is parsed as an integer note. Entries
    whose note lies in ``[note_from, note_to]`` are read with ``read_file`` and
    routed to the train or test lists depending on which set contains the MIDI
    name; names in neither set are ignored.

    Parameters
    ----------
    data_dir : str
        Directory containing the sample files.
    train_set, test_set : collection of str
        MIDI file names assigned to each split. ``train_set`` is checked first.
    note_from, note_to : int
        Inclusive note range to load (defaults presumably cover the full MIDI
        note range -- confirm against the data producer).
    take_every_nth_negative_sample : int
        Negative-row subsampling step applied to training files only; test
        files are always read in full.
    sep : str
        Field separator used inside the sample files.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``.
    """
    X_train, y_train = [], []
    X_test, y_test = [], []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the returned arrays reproducible across runs and machines.
    for csv_file in sorted(listdir(data_dir)):
        name_parts = csv_file.split('.')
        try:
            note = int(name_parts[2])
        except (IndexError, ValueError):
            # Not a '<name>.<ext>.<note>...' sample file (e.g. OS/editor
            # metadata or a checkpoint directory): skip it instead of aborting
            # a long-running load.
            continue
        if not note_from <= note <= note_to:
            continue

        midi_file = '.'.join(name_parts[:2])
        path = join(data_dir, csv_file)
        if midi_file in train_set:
            read_file(path, X_train, y_train, take_every_nth_negative_sample, sep)
        elif midi_file in test_set:
            # The test split is never subsampled so metrics reflect the real
            # class balance.
            read_file(path, X_test, y_test, 1, sep)

    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)


def read_file(path_to_file, X, y, take_every_nth_negative_sample, sep):
    """Append the samples stored in one delimited text file to ``X`` and ``y``.

    Each non-blank line holds ``sep``-separated fields: all but the last are
    parsed as float features, the last is parsed as the integer label. Rows
    labelled 1 are always kept. Every other row counts as a negative, and only
    negatives number 0, n, 2n, ... (counted per file, in file order) are kept,
    where n is ``take_every_nth_negative_sample``.

    Parameters
    ----------
    path_to_file : str
        File to read.
    X : list
        Receives one list of floats per kept row (mutated in place).
    y : list
        Receives the integer label of each kept row (mutated in place).
    take_every_nth_negative_sample : int
        Subsampling step for negative rows; 1 keeps all of them.
    sep : str
        Field separator.

    Raises
    ------
    ValueError
        If ``take_every_nth_negative_sample`` is smaller than 1, or if a kept
        row (or any row's label) cannot be parsed.
    """
    if take_every_nth_negative_sample < 1:
        # A step of 0 would otherwise surface as ZeroDivisionError only once
        # the first negative row is reached.
        raise ValueError('take_every_nth_negative_sample must be >= 1')

    negatives_seen = 0
    with open(path_to_file) as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry no
                # sample; without this guard int('') would raise.
                continue
            fields = stripped.split(sep)
            label = int(fields[-1])
            if label != 1:
                keep = negatives_seen % take_every_nth_negative_sample == 0
                negatives_seen += 1
                if not keep:
                    # Skipped rows are discarded before their features are
                    # parsed, which avoids wasted float conversions when
                    # subsampling heavily.
                    continue
            X.append([float(value) for value in fields[:-1]])
            y.append(label)

In [9]:
# Load all notes (read_data's default range) and keep every negative sample.
# Disabled alternatives -- narrower note ranges with heavy negative subsampling:
#   read_data(data_dir, train_set, test_set,
#             note_from=51, note_to=72, take_every_nth_negative_sample=250)
#   read_data(data_dir, train_set, test_set,
#             note_from=24, note_to=50, take_every_nth_negative_sample=750)
print('read start: {}'.format(datetime.datetime.now()))
X_train, y_train, X_test, y_test = read_data(
    data_dir,
    train_set,
    test_set,
    take_every_nth_negative_sample=1,
)
print('read end: {}'.format(datetime.datetime.now()))

read start: 2016-12-18 16:08:55.649000
read end: 2016-12-18 16:18:35.413000


In [10]:
# Summarise both splits: shapes, number of positive labels, positive fraction.
# ndarray.sum() is a single vectorized reduction; the builtin sum() iterates the
# array element by element in Python, which is very slow on tens of millions of
# rows. Each count is computed once and reused for the fraction.
n_positive_train = int(y_train.sum())
n_positive_test = int(y_test.sum())

print(X_train.shape)
print(X_test.shape)
print(n_positive_train)
print(n_positive_test)
# Guard against an empty split (e.g. a note range that matches no files), which
# would otherwise raise ZeroDivisionError.
print(float(n_positive_train) / X_train.shape[0] if X_train.shape[0] else float('nan'))
print(float(n_positive_test) / X_test.shape[0] if X_test.shape[0] else float('nan'))

(51379380L, 10L)
(3180918L, 10L)
10952
824
0.000213159442562
0.000259044716022


In [12]:
# Fit a StandardScaler -> RandomForestClassifier pipeline on the training split
# and print a classification report for the training and test splits.
#
# random_state pins the forest's random sampling so that re-running this cell
# reproduces the same model (and therefore the same pickled artifact). Any
# output recorded from an earlier, unseeded run may differ slightly.
RANDOM_STATE = 0

print('fit start: ' + str(datetime.datetime.now()))
estimator = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_jobs=6, random_state=RANDOM_STATE),
)
estimator.fit(X_train, y_train)
print('fit end: ' + str(datetime.datetime.now()))
print(classification_report(y_train, estimator.predict(X_train)))
print(classification_report(y_test, estimator.predict(X_test)))
print('predict end: ' + str(datetime.datetime.now()))

fit start: 2016-12-18 16:20:11.970000
fit end: 2016-12-18 17:07:33.381000
             precision    recall  f1-score   support

          0       1.00      1.00      1.00  51368428
          1       1.00      0.85      0.92     10952

avg / total       1.00      1.00      1.00  51379380

             precision    recall  f1-score   support

          0       1.00      1.00      1.00   3180094
          1       0.81      0.50      0.62       824

avg / total       1.00      1.00      1.00   3180918

predict end: 2016-12-18 17:08:27.155000


In [14]:
# Persist the fitted pipeline. HIGHEST_PROTOCOL selects the binary pickle
# format; the Python 2 default (protocol 0) is an ASCII format that is slower
# to write and read and produces a larger file. cPickle.load detects the
# protocol automatically, so readers need no change.
with open('random_forest_22_midis.cpickle', 'wb') as f:
    cPickle.dump(estimator, f, cPickle.HIGHEST_PROTOCOL)

In [15]:
# Round-trip check: reload the pickled pipeline and report its test-split
# metrics. Only unpickle files you trust -- loading a pickle can execute
# arbitrary code.
with open('random_forest_22_midis.cpickle', 'rb') as model_file:
    loaded_estimator = cPickle.load(model_file)

reloaded_predictions = loaded_estimator.predict(X_test)
print(classification_report(y_test, reloaded_predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00   3180094
          1       0.81      0.50      0.62       824

avg / total       1.00      1.00      1.00   3180918

