# Data Preprocessing : This notebook is used to create a proper dataset for composer classification

In [None]:
!pip install pretty-midi;
!pip install pandas;

In [11]:
from processor import encode_midi
from processor import decode_midi
import pandas as pd
import numpy as np
maestro = pd.read_csv("maestro-v3.0.0/maestro-v3.0.0.csv")
maestro = maestro.drop(["audio_filename"], axis=1)
maestro = maestro.drop(["year"], axis=1)
maestro.head(3)

Unnamed: 0,canonical_composer,canonical_title,split,midi_filename,duration
0,Alban Berg,Sonata Op. 1,train,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116
1,Alban Berg,Sonata Op. 1,train,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471
2,Alban Berg,Sonata Op. 1,train,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433


We will now encode each file into a sequence of tokens and add it inot the dataset

In [12]:
from tqdm.notebook import tqdm
from ast import literal_eval

encoded_sequence = []
encoded_size = []
for ind in tqdm(maestro.index):
  try:
    new_sequence = encode_midi("maestro-v3.0.0/"+maestro["midi_filename"][ind])
    encoded_sequence.append(new_sequence)
    encoded_size.append(len(new_sequence))
  except:
    encoded_sequence.append(np.nan)
    encoded_size.append(0)
maestro["encoded_sequence"] = encoded_sequence
maestro["encoded_size"] = encoded_size
maestro.head(3)

  0%|          | 0/1276 [00:00<?, ?it/s]

Unnamed: 0,canonical_composer,canonical_title,split,midi_filename,duration,encoded_sequence,encoded_size
0,Alban Berg,Sonata Op. 1,train,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116,"[353, 369, 67, 335, 372, 72, 291, 372, 78, 256...",18318
1,Alban Berg,Sonata Op. 1,train,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471,"[355, 256, 369, 67, 323, 371, 72, 292, 373, 78...",18018
2,Alban Berg,Sonata Op. 1,train,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433,"[282, 366, 67, 325, 374, 72, 284, 374, 78, 256...",14613


In [42]:
def make_equalizer(data,sequence_size):
  """
  Transforms the dataset into row related to sequences pieces of size sequence_size
  Format : start_ind ; end_ind ; canonical_composer ; canonical_title ; split 
  """
  start_ind = []
  end_ind = []
  canonical_composer = []
  canonical_title = []
  midi_filename = []
  split = []

  for ind in tqdm(data.index):
    if "encoded_sequence" in data.keys():
      # The sequence is known
      try:
        # Sometimes arrays in encoded_sequence are interpreted as a string
        step = len(literal_eval(data["encoded_sequence"][ind]))//sequence_size
      except:
        step = len(data["encoded_sequence"][ind])//sequence_size
      for i in range(step-1):
        start_ind.append(i*sequence_size)
        end_ind.append((i+1)*sequence_size)
        canonical_composer.append(data["canonical_composer"][ind])
        canonical_title.append(data["canonical_title"][ind])
        midi_filename.append(data["midi_filename"][ind])
        split.append(data["split"][ind])
    else:
      # Error in the dataset
      raise Exception()
  
  df = pd.DataFrame()
  df["start_ind"] = start_ind
  df["end_ind"] = end_ind
  df["canonical_composer"] = canonical_composer
  df["canonical_title"] = canonical_title
  df["midi_filename"] = midi_filename
  df["split"] = split

  return df

We can now remove rows that have the same composer name, title and split, keeping the longest piece of music

In [43]:
# Remove every duplicate piece of music, keeping the longest
maestro2 = maestro.sort_values(by=['encoded_size'],ascending=False)
maestro2 = maestro2.drop_duplicates(subset=["canonical_composer","canonical_title","split"],keep='first')
maestro2.reset_index(drop=True,inplace=True)

Finaly we can split sequences and create our datasets

In [40]:
SEQUENCE_SIZE = 128
maestro3 = make_equalizer(maestro2,SEQUENCE_SIZE)
maestro3.head(10)

  0%|          | 0/861 [00:00<?, ?it/s]

Unnamed: 0,start_ind,end_ind,canonical_composer,canonical_title,midi_filename,split
0,0,128,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
1,128,256,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
2,256,384,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
3,384,512,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
4,512,640,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
5,640,768,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
6,768,896,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
7,896,1024,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
8,1024,1152,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train
9,1152,1280,Franz Schubert,"Sonata in D Major, D850",2018/MIDI-Unprocessed_Schubert10-12_MID--AUDIO...,train


In [41]:
maestro3.split.value_counts()

train         111030
test           20730
validation     18311
Name: split, dtype: int64

In [None]:
# Saves the dataset into csv
maestro3.to_csv(f"maestro-{SEQUENCE_SIZE}-unique-titles.csv",index=False)