In [13]:
#installations
!pip install torchaudio
!sudo apt install -y fluidsynth
!pip install --upgrade pyfluidsynth
!pip install pretty_midi

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fluidsynth is already the newest version (2.2.5-1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
[31mERROR: Operation cancelled by user[0m[31m


In [2]:
#imports
import os
import music21
import pretty_midi
from music21 import midi
import pandas as pd
from IPython.display import Audio
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
import statistics
import math, random, time
from IPython.display import Audio
from google.colab import drive
from google.colab import files
import torchaudio
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
import scipy
import shutil

In [3]:
# Mount Google Drive into the Colab filesystem and record the dataset location.
# NOTE(review): `drive` is already imported in the imports cell above; the
# repeated import here is redundant but harmless.
from google.colab import drive
drive.mount('/content/drive')
# Root folder on the mounted Drive that is expected to hold the MAESTRO v2.0.0
# files (note the trailing slash).
data_directory = '/content/drive/MyDrive/maestro-v2.0.0/'

Mounted at /content/drive


In [4]:
# define midi reading function
def readMidi(filepath):
  """Parse a MIDI file from disk with music21 and return the parsed object.

  Args:
    filepath: Path of the MIDI file to read.

  Returns:
    The `midi.MidiFile` instance after `read()` has populated it.

  Raises:
    Whatever `open()` or `read()` raise (missing or malformed file). The
    underlying file handle is closed even when parsing fails.
  """
  mf = midi.MidiFile()
  # open() stays outside the try: if it fails there is no handle to close.
  mf.open(filepath)
  try:
    mf.read()
  finally:
    # Always release the handle, otherwise a corrupt file leaks it.
    mf.close()
  return mf

In [5]:
# Token vocabulary: every MIDI pitch (0-127) is its own id, followed by the
# three special tokens. The inverse table maps ids back to tokens.
token_dict = {pitch: pitch for pitch in range(128)}
token_dict.update({'<rest>': 128, '<bos>': 129, '<eos>': 130})

# Inverting the forward table keeps both directions in sync.
inverse_token_dict = {token_id: token for token, token_id in token_dict.items()}

In [18]:
import os
import shutil
from tqdm import tqdm
import pretty_midi
import numpy as np
import torch
from torch.utils.data import Dataset

class MaestroDataset(Dataset):
    """One split of the MAESTRO v2.0.0 dataset plus MIDI-to-token helpers.

    The constructor reads ``maestro-v2.0.0.csv`` from ``directory`` and keeps
    the rows whose ``split`` column equals ``split``. The remaining methods
    move that split's MIDI files into ``<directory>/<split>/``, convert them
    to fixed-length monophonic token sequences, optionally augment those by
    transposition, and write them to a text file.

    NOTE(review): only ``__len__`` is implemented here; ``__getitem__`` is not
    defined in this class, so instances cannot be indexed as-is.
    """

    def __init__(self, split, token_dict, directory='/content/drive/MyDrive/maestro-v2.0.0/'):
        """Load the dataset index and keep only the requested split.

        Args:
            split: Value matched against the CSV's ``split`` column; also the
                name of the sub-directory used by the other methods.
            token_dict: Mapping that must provide ids for ``'<rest>'``,
                ``'<bos>'`` and ``'<eos>'``.
            directory: Dataset root containing ``maestro-v2.0.0.csv``.
        """
        import pandas as pd

        self.split = split
        self.token_dict = token_dict
        # Normalise to exactly one trailing slash.
        self.directory = directory.rstrip('/') + '/'
        # Read the CSV and filter to our split.
        df = pd.read_csv(os.path.join(self.directory, 'maestro-v2.0.0.csv'))
        self.df = df[df['split'] == split].reset_index(drop=True)

    def __len__(self):
        """Return the number of CSV rows (files) in this split."""
        return len(self.df)

    def moveFilesToSplitDir(self):
        """Move this split's MIDI files into ``<directory>/<split>/``.

        Files are flattened: only the basename of each ``midi_filename`` entry
        is kept in the destination. The call is idempotent: a file that was
        already moved by an earlier run is skipped silently, while a file
        missing from both source and destination triggers a printed warning.
        """
        src_root = self.directory
        dst_root = os.path.join(self.directory, self.split)
        os.makedirs(dst_root, exist_ok=True)

        for fname in self.df['midi_filename']:
            src = os.path.join(src_root, fname)
            dst = os.path.join(dst_root, os.path.basename(fname))
            if not os.path.isfile(src):
                # Already relocated on a previous run: nothing to do or warn about.
                if os.path.isfile(dst):
                    continue
                print(f" ⚠️  Warning: MIDI file not found: {src}")
                continue
            shutil.move(src, dst)

    def create_token_sequences(self,
                               pm: "pretty_midi.PrettyMIDI",
                               fs: int = 16,
                               seq_len: int = 500,
                               add_bos_eos: bool = True):
        """Turn one MIDI object into non-overlapping monophonic token windows.

        The piano roll is sampled at ``fs`` frames per second. Each frame
        becomes the lowest sounding pitch, or ``'<rest>'`` when nothing sounds.
        Frames are cut into consecutive windows of ``seq_len``; a trailing
        partial window is dropped and so is any window with no note at all.

        Args:
            pm: Object exposing ``get_piano_roll(fs=...)`` that returns a
                (pitch, frame) array.
            fs: Piano-roll sampling rate passed to ``get_piano_roll``.
            seq_len: Number of frames per window.
            add_bos_eos: Wrap every window in ``'<bos>'`` / ``'<eos>'`` ids.

        Returns:
            List of token-id lists (plain Python ints).
        """
        active = pm.get_piano_roll(fs=fs) > 0
        has_note = active.any(axis=0)
        rest = self.token_dict['<rest>']
        # argmax over the pitch axis gives the first (lowest) active pitch;
        # frames without any note fall back to the rest token.
        frame_tokens = np.where(has_note, active.argmax(axis=0), rest)

        prefix = [self.token_dict['<bos>']] if add_bos_eos else []
        suffix = [self.token_dict['<eos>']] if add_bos_eos else []

        num_frames = frame_tokens.shape[0]
        seqs = []
        # Only full windows: the last start must leave room for seq_len frames.
        for start in range(0, num_frames - seq_len + 1, seq_len):
            end = start + seq_len
            # Skip windows that are completely silent.
            if not has_note[start:end].any():
                continue
            seqs.append(prefix + frame_tokens[start:end].tolist() + suffix)
        return seqs

    def augment_idxs(self, seq, include_identity: bool = True):
        """Transpose a token sequence by every shift from -12 to +12 semitones.

        Only ids in the MIDI range 0-127 are shifted (and clamped to that
        range); special tokens are left alone.

        Args:
            seq: Iterable of token ids.
            include_identity: When True (default) the zero shift, an
                unchanged copy of ``seq``, is part of the result.

        Returns:
            List of transposed sequences ordered by shift: 25 entries, or 24
            when ``include_identity`` is False.
        """
        augmented = []
        for shift in range(-12, 13):
            if shift == 0 and not include_identity:
                continue
            augmented.append([
                max(0, min(127, idx + shift)) if 0 <= idx < 128 else idx
                for idx in seq
            ])
        return augmented

    def create_all_token_sequences(self,
                                   fs: int = 16,
                                   seq_len: int = 500,
                                   augment: bool = True):
        """Tokenise every ``.midi`` file in ``<directory>/<split>/``.

        Files are visited in sorted order so the result is reproducible.
        Files that cannot be parsed are reported and skipped.

        Args:
            fs: Piano-roll sampling rate, forwarded to
                ``create_token_sequences``.
            seq_len: Window length in frames, forwarded likewise.
            augment: When True (default) each sequence is followed by its 24
                non-zero transpositions.

        Returns:
            Flat list of token-id lists.
        """
        split_dir = os.path.join(self.directory, self.split)
        all_seqs = []

        for fn in tqdm(sorted(os.listdir(split_dir))):
            if not fn.lower().endswith('.midi'):
                continue
            path = os.path.join(split_dir, fn)
            try:
                pm = pretty_midi.PrettyMIDI(path)
            except Exception as e:
                # Best effort: one unreadable file must not abort the split.
                print(f" ⚠️  Skipping {fn}: {e}")
                continue

            for s in self.create_token_sequences(pm, fs=fs, seq_len=seq_len):
                all_seqs.append(s)
                if augment:
                    # The zero shift would duplicate `s`, which is already stored.
                    all_seqs.extend(self.augment_idxs(s, include_identity=False))

        return all_seqs

    def tokens_seqs_to_txt(self, seqs, phase):
        """Write sequences to ``<directory>/<split>/<phase>.txt``.

        Each sequence becomes one line of space-separated token ids. The
        split directory is created if it does not exist yet.
        """
        split_dir = os.path.join(self.directory, self.split)
        os.makedirs(split_dir, exist_ok=True)
        out_path = os.path.join(split_dir, f"{phase}.txt")
        with open(out_path, 'w') as f:
            for seq in seqs:
                f.write(" ".join(map(str, seq)) + "\n")


In [20]:
# Build the token vocabulary: MIDI pitches 0-127 map to themselves and are
# followed by the three special tokens.
token_dict = dict(zip(range(128), range(128)))
token_dict['<rest>'] = 128
token_dict['<bos>'] = 129
token_dict['<eos>'] = 130

# For every MAESTRO split: gather its MIDI files into their own folder,
# tokenise them, and save the sequences as <split>/<split>.txt.
for phase in ('train', 'test', 'validation'):
    ds = MaestroDataset(split=phase,
                        token_dict=token_dict,
                        directory='/content/drive/MyDrive/maestro-v2.0.0/')

    ds.moveFilesToSplitDir()
    seqs = ds.create_all_token_sequences(16, 500)
    ds.tokens_seqs_to_txt(seqs, phase)



100%|██████████| 116/116 [01:12<00:00,  1.61it/s]




100%|██████████| 56/56 [00:41<00:00,  1.34it/s]




100%|██████████| 59/59 [00:45<00:00,  1.30it/s]
