<a href="https://colab.research.google.com/github/fazekas-gergo/composer-ai/blob/master/Create_DataFrame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create DataFrame
Create and save DataFrame from the following dataset: https://www.kaggle.com/soumikrakshit/classical-music-midi 

---
Download necessary files and install packages.

In [1]:
!git clone -l -s https://github.com/fazekas-gergo/composer-ai composer-ai
%cd composer-ai
!pip install pretty_midi

Cloning into 'composer-ai'...
remote: Enumerating objects: 420, done.[K
remote: Counting objects: 100% (420/420), done.[K
remote: Compressing objects: 100% (412/412), done.[K
remote: Total 420 (delta 42), reused 348 (delta 5), pack-reused 0[K
Receiving objects: 100% (420/420), 160.79 MiB | 14.10 MiB/s, done.
Resolving deltas: 100% (42/42), done.
/content/composer-ai
Collecting pretty_midi
  Downloading pretty_midi-0.2.9.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 5.3 MB/s 
Collecting mido>=1.1.16
  Downloading mido-1.2.10-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 6.6 MB/s 
Building wheels for collected packages: pretty-midi
  Building wheel for pretty-midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty-midi: filename=pretty_midi-0.2.9-py3-none-any.whl size=5591955 sha256=3c4efc8f3824c0d4692a8451e51a0093a40ee26b11b51f809331db7ef7915f53
  Stored in directory: /root/.cache/pip/wheels/ad/74/7c/a06473ca8dcb63efb98c1e6

In [5]:
import os
import pretty_midi
import pandas as pd

Create a list of the midi files.

In [4]:
def get_midi_file_paths(main_folder):
  files = []
  for dir in os.listdir(main_folder):
    for file in os.listdir(main_folder + os.path.sep + dir):
      files.append(main_folder + os.path.sep + dir + os.path.sep + file)
  return files

midi_file_paths = get_midi_file_paths('classical_music_midi')
print('Number of read MIDI files:', len(midi_file_paths))

Number of read MIDI files: 295


The following block creates one pandas dataframe from the midi files.<br>
Only the  *piano left* and *piano right* instruments are used. In case of the midi file does not contain these instruments, then that file is ignored .<br>
<br>
The alorighm groups the notes by their start to handle them together.<br>
Indexes:

*   name of the piano piece
*   number index of the note



In [6]:
from pretty_midi.instrument import Instrument

def midi_files_to_df(paths):
  data_frames = [midi_file_to_df(file_path) for file_path in paths]
  names = list(map(lambda p: p.replace("classical_music_midi/", ""), paths))
  return pd.concat(data_frames, keys=names)

def midi_file_to_df(midi_file_name: str):
  m = pretty_midi.PrettyMIDI(midi_file_name)
  piano_left = list(filter(lambda inst: inst.name.lower() == 'piano left', m.instruments))
  piano_right = list(filter(lambda inst: inst.name.lower() == 'piano right', m.instruments))
  if len(piano_left) == 1 and len(piano_right) == 1:
    print('use file:', midi_file_name)
    return __piano_to_df(piano_left[0], piano_right[0])
  else:
    print('ignore file:', midi_file_name)

def __piano_to_df(piano_left: Instrument, piano_right: Instrument):
  piano_notes_df = __create_df_from_piano_notes(piano_left, piano_right)
  return __group_notes(piano_notes_df)
  

def __create_df_from_piano_notes(piano_left: Instrument, piano_right: Instrument):
  piano_left_df = __create_df_from_instrument_notes(piano_left)
  piano_right_df = __create_df_from_instrument_notes(piano_right)
  return pd.concat([piano_left_df, piano_right_df]).sort_values(['start']).reset_index()

def __create_df_from_instrument_notes(instrument: Instrument):
  return pd.DataFrame([{
        'start': n.start, 
        'end': n.end, 
        'pitch': n.pitch, 
        'velocity': n.velocity
    } for n in instrument.notes])

def __group_notes(df: pd.DataFrame):
  dataset = []
  for start, note_df in df.groupby(['start'], sort=True):
    pitches = __create_pitch_arr(note_df)
    velocity = note_df['velocity'].mean()
    dataset.append([pitches, velocity, start])
  new_df = pd.DataFrame(dataset, columns=['pitches', 'velocity', 'start'])
  new_df['duration'] = new_df['start'].diff(periods=-1)*(-1)
  return new_df.drop(['start'], axis=1).iloc[:-1]

def __create_pitch_arr(note_df):
  return [int(d['pitch']) for _, d in note_df.sort_values(['pitch']).iterrows()]

dataset = midi_files_to_df(midi_file_paths)

use file: classical_music_midi/haydn/haydn_9_3.mid
use file: classical_music_midi/haydn/haydn_8_2.mid
use file: classical_music_midi/haydn/haydn_9_2.mid
use file: classical_music_midi/haydn/haydn_35_1.mid
use file: classical_music_midi/haydn/haydn_35_2.mid
use file: classical_music_midi/haydn/haydn_43_3.mid
use file: classical_music_midi/haydn/haydn_9_1.mid
use file: classical_music_midi/haydn/haydn_8_4.mid
use file: classical_music_midi/haydn/haydn_7_3.mid
use file: classical_music_midi/haydn/haydn_33_1.mid
use file: classical_music_midi/haydn/haydn_43_1.mid
use file: classical_music_midi/haydn/haydn_43_2.mid
use file: classical_music_midi/haydn/hay_40_2.mid
use file: classical_music_midi/haydn/haydn_8_3.mid
use file: classical_music_midi/haydn/haydn_33_2.mid
use file: classical_music_midi/haydn/hay_40_1.mid
use file: classical_music_midi/haydn/haydn_35_3.mid
use file: classical_music_midi/haydn/haydn_7_1.mid
use file: classical_music_midi/haydn/haydn_33_3.mid
use file: classical_musi

In [7]:
dataset

Unnamed: 0,Unnamed: 1,pitches,velocity,duration
haydn/haydn_9_3.mid,0,[72],40.000000,0.251446
haydn/haydn_9_3.mid,1,"[65, 69, 77]",35.666667,0.127866
haydn/haydn_9_3.mid,2,[72],40.000000,0.127866
haydn/haydn_9_3.mid,3,[77],41.000000,0.127866
haydn/haydn_9_3.mid,4,[72],42.000000,0.127866
...,...,...,...,...
liszt/liz_rhap12.mid,3105,"[44, 48, 51, 56, 75, 80, 84, 87]",77.375000,0.277136
liszt/liz_rhap12.mid,3106,"[37, 41, 44, 49, 77, 80, 85, 89]",77.375000,0.277136
liszt/liz_rhap12.mid,3107,"[32, 36, 39, 44, 80, 84, 87, 92]",77.375000,0.277137
liszt/liz_rhap12.mid,3108,"[37, 41, 44, 49, 85, 89, 92, 97]",77.375000,1.078652


In [14]:
dataset.index

MultiIndex([( 'haydn/haydn_9_3.mid',    0),
            ( 'haydn/haydn_9_3.mid',    1),
            ( 'haydn/haydn_9_3.mid',    2),
            ( 'haydn/haydn_9_3.mid',    3),
            ( 'haydn/haydn_9_3.mid',    4),
            ( 'haydn/haydn_9_3.mid',    5),
            ( 'haydn/haydn_9_3.mid',    6),
            ( 'haydn/haydn_9_3.mid',    7),
            ( 'haydn/haydn_9_3.mid',    8),
            ( 'haydn/haydn_9_3.mid',    9),
            ...
            ('liszt/liz_rhap12.mid', 3100),
            ('liszt/liz_rhap12.mid', 3101),
            ('liszt/liz_rhap12.mid', 3102),
            ('liszt/liz_rhap12.mid', 3103),
            ('liszt/liz_rhap12.mid', 3104),
            ('liszt/liz_rhap12.mid', 3105),
            ('liszt/liz_rhap12.mid', 3106),
            ('liszt/liz_rhap12.mid', 3107),
            ('liszt/liz_rhap12.mid', 3108),
            ('liszt/liz_rhap12.mid', 3109)],
           length=357130)

Save dataset

In [15]:
dataset.to_pickle('./dataset.pkl')