# **Sample Testing**

In [None]:
!pip install pretty-midi muspy miditok

Collecting pretty-midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting muspy
  Downloading muspy-0.5.0-py3-none-any.whl.metadata (5.5 kB)
Collecting miditok
  Downloading miditok-3.0.4-py3-none-any.whl.metadata (10 kB)
Collecting mido>=1.1.16 (from pretty-midi)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Collecting bidict>=0.21 (from muspy)
  Downloading bidict-0.23.1-py3-none-any.whl.metadata (8.7 kB)
Collecting miditoolkit>=0.1 (from muspy)
  Downloading miditoolkit-1.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting pypianoroll>=1.0 (from muspy)
  Downloading pypianoroll-1.0.4-py3-none-any.whl.metadata (3.8 kB)
Collecting symusic>=0.5.0 (from miditok)
  Downloading symusic-0.5.6-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting pySmartDL (from

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import random
import muspy
import pretty_midi
import os
import pickle
import random

from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import GPT2LMHeadModel
from miditok import REMI, TokenizerConfig, TokSequence


from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the MAESTRO v3.0.0 metadata table. The dataset classes defined in this
# notebook iterate over its "midi_filename" column to locate the MIDI files.
metadata = pd.read_csv("/content/drive/MyDrive/Piano generation/Project/MAESTRO dataset/maestro-v3.0.0.csv")

In this notebook we are going to generate some samples from every model, so that we can then test their performance. First, we are going to begin with the Many-to-One models. Let's load the necessary dataset:

In [None]:
class MusicDatasetManyToOne(Dataset):
    """Pianoroll windows for next-step (many-to-one) prediction.

    Every file named in ``metadata["midi_filename"]`` is parsed once with muspy
    and kept in ``self.songs``. Indexing converts the selected song to a
    pianoroll and cuts a random window out of it, so the same index can yield
    a different sample on every access.
    """

    def __init__(self, metadata, seq_len):
        """Parse all listed MIDI files and remember the window length ``seq_len``."""
        super().__init__()
        self.seq_len = seq_len

        midi_root = "/content/drive/MyDrive/Piano generation/Project/MAESTRO dataset/maestro-v3.0.0-midi/maestro-v3.0.0/"
        self.songs = [
            muspy.read(os.path.join(midi_root, file))
            for file in metadata["midi_filename"]
        ]

    def __len__(self):
        """Number of songs (not the number of possible windows)."""
        return len(self.songs)

    def __getitem__(self, idx):
        """Return ``(input_seq, target)`` sampled at random from song ``idx``.

        ``input_seq`` holds ``seq_len`` consecutive pianoroll rows and ``target``
        is the single row that immediately follows them. Both are float32.
        """
        roll = torch.tensor(
            self.songs[idx].to_pianoroll_representation(), dtype=torch.float32
        )

        # Latest start that still leaves room for the window plus the target row.
        last_start = len(roll) - self.seq_len - 1
        begin = random.randint(0, last_start)
        end = begin + self.seq_len

        return roll[begin:end], roll[end]

In [None]:
# Load the pre-built test split from Drive. weights_only=False lets torch.load
# unpickle a full Python object instead of tensors only, so the file must come
# from a trusted source.
# NOTE(review): presumably a pickled MusicDatasetManyToOne, which would be why the
# class has to be defined before this cell runs — confirm.
music_dataset_test = torch.load("/content/drive/MyDrive/Piano generation/Project_draft/saved_data/datasets/test/music_dataset_test_many_to_one_muspy.pt", weights_only=False)

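This is the function that generates music. It runs a loop in which the model predicts the next time step from the previous `seq_len` (100 by default) time steps; each prediction is appended to the sequence and becomes part of the next input window.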

In [None]:
def generate_music(model, initial_sequence, seq_len=100, max_generate_len=5000):
    """Autoregressively extend ``initial_sequence`` with a many-to-one model.

    Args:
        model: Module called as ``model(batch)``; its output is concatenated
            to the running sequence along dim 0. It is switched to eval mode.
        initial_sequence: Seed tensor; a batch dimension is added before the
            first model call.
        seq_len: Number of most recent steps fed back to the model after the
            first prediction.
        max_generate_len: Number of steps to generate.

    Returns:
        The seed followed by all generated steps, concatenated along dim 0.
    """
    generated_music = initial_sequence.squeeze(0)
    window = initial_sequence.unsqueeze(0)

    model.eval()
    with torch.no_grad():
        progress = tqdm(range(max_generate_len))
        for _step in progress:
            prediction = model(window)
            generated_music = torch.cat((generated_music, prediction), dim=0)
            # Slide the context: only the latest seq_len steps are fed back in.
            window = generated_music[-seq_len:].unsqueeze(0)

    return generated_music

This is the function that converts a pianoroll into a MIDI file, using pretty_midi. It accepts `time_step` and `threshold` as hyperparameters, which are tuned separately for every model, based on subjective listening.
`time_step` is the duration of one pianoroll step and therefore sets the speed of the song, while `threshold` is the minimum value the model has to output for a pitch in order for that note to be played.

In [None]:
def pianoroll_to_midi(pianoroll, filepath, time_step=0.1, threshold=0.01):
    """Convert a pianoroll tensor into a single-instrument MIDI file.

    A pitch counts as sounding in a frame when its value is strictly greater
    than ``threshold``. Every maximal run of consecutive sounding frames of a
    pitch becomes one note with a fixed velocity of 100. The column index is
    used directly as the MIDI pitch number.

    Args:
        pianoroll: 2-D torch tensor indexed as ``[frame, pitch]``.
        filepath: Destination path handed to ``PrettyMIDI.write``.
        time_step: Duration of one frame, in the unit pretty_midi uses for
            note start/end times (seconds).
        threshold: Activation level above which a pitch is considered on.

    Returns:
        None. The MIDI file is written to ``filepath``.
    """
    roll = pianoroll.detach().cpu().numpy()
    n_frames, n_pitches = roll.shape

    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)

    # Onset frame of the note currently sounding on each pitch. None marks a
    # silent pitch: frame 0 is a legitimate onset, so it cannot be the sentinel.
    onset_frame = [None] * n_pitches

    def add_note(pitch, first_frame, end_frame):
        # A note covering frames [first_frame, end_frame) spans exactly
        # (end_frame - first_frame) * time_step.
        instrument.notes.append(
            pretty_midi.Note(
                velocity=100,
                pitch=pitch,
                start=first_frame * time_step,
                end=end_frame * time_step,
            )
        )

    for frame in range(n_frames):
        for pitch in range(n_pitches):
            sounding = roll[frame, pitch] > threshold
            if sounding:
                # Track state per pitch; only the first frame of a run is an onset.
                if onset_frame[pitch] is None:
                    onset_frame[pitch] = frame
            elif onset_frame[pitch] is not None:
                # First silent frame after a run: the note ended when it began.
                add_note(pitch, onset_frame[pitch], frame)
                onset_frame[pitch] = None

    # Close notes that are still sounding when the pianoroll ends.
    for pitch, first_frame in enumerate(onset_frame):
        if first_frame is not None:
            add_note(pitch, first_frame, n_frames)

    midi.instruments.append(instrument)

    midi.write(filepath)

Let's begin with the GRU model. Here we define it, initialize it and load the one after 150 epochs.

In [None]:
class ManyToOneGRU(nn.Module):
    """Stacked GRU whose output at the final time step is projected by a linear layer."""

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # The attribute names "gru" and "fc" are the state-dict key prefixes.
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first sequence ``x`` to one prediction per batch item."""
        batch_size = x.size(0)
        # Explicit all-zero initial hidden state, created on the input's device.
        initial_hidden = torch.zeros(
            self.num_layers, batch_size, self.hidden_size, device=x.device
        )

        sequence_out, _ = self.gru(x, initial_hidden)
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)

In [None]:
# Positional arguments follow ManyToOneGRU.__init__:
# input_size=128, hidden_size=256, output_size=128, num_layers=2.
many_to_one_gru = ManyToOneGRU(128, 256, 128, 2)
# map_location places the checkpoint's tensors on the CPU before they are copied into the model.
many_to_one_gru.load_state_dict(torch.load("/content/drive/MyDrive/Piano generation/Project_draft/saved_data/saved_models/many_to_one_gru_fourth_try_epoch_150.pt", map_location=torch.device("cpu")))

  many_to_one_gru.load_state_dict(torch.load("/content/drive/MyDrive/Piano generation/Project_draft/saved_data/saved_models/many_to_one_gru_fourth_try_epoch_150.pt", map_location=torch.device("cpu")))


<All keys matched successfully>

In this loop, we generate 10 songs, that go into the "/saved_data/samples/many_to_one_gru/" directory. The threshold value was changed in order to produce a somewhat acceptable result.

In [None]:
# Output directory for the GRU samples.
# NOTE(review): nothing here creates this directory — presumably it already exists on Drive; confirm.
common_path = "/content/drive/MyDrive/Piano generation/Project_draft/saved_data/samples/many_to_one_gru"
for i in range(10):
    # Seed each song with element [0] of a randomly chosen test item.
    initial_seq = music_dataset_test[random.randint(0, len(music_dataset_test) - 1)][0]
    # generate_music returns the seed followed by the 1000 generated steps.
    generated_music = generate_music(many_to_one_gru, initial_seq, seq_len=100, max_generate_len=1000)
    # threshold=0.1 overrides the function default of 0.01; files are numbered from 1.
    pianoroll_to_midi(generated_music, os.path.join(common_path, f"many_to_one_gru_generated_song_{i+1}.mid"), threshold=0.1)

100%|██████████| 1000/1000 [00:15<00:00, 63.17it/s]
100%|██████████| 1000/1000 [00:17<00:00, 58.02it/s]
100%|██████████| 1000/1000 [00:16<00:00, 61.47it/s]
100%|██████████| 1000/1000 [00:15<00:00, 64.71it/s]
100%|██████████| 1000/1000 [00:15<00:00, 64.71it/s]
100%|██████████| 1000/1000 [00:16<00:00, 61.19it/s]
100%|██████████| 1000/1000 [00:16<00:00, 61.30it/s]
100%|██████████| 1000/1000 [00:15<00:00, 64.92it/s]
100%|██████████| 1000/1000 [00:15<00:00, 64.35it/s]
100%|██████████| 1000/1000 [00:15<00:00, 65.17it/s]


Let's do exactly the same with the LSTM model.

In [None]:
class ManyToOneLSTM(nn.Module):
    """Stacked LSTM whose last layer's final hidden state is projected by a linear layer."""

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # The attribute names "lstm" and "fc" are the state-dict key prefixes.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first sequence ``x`` to one prediction per batch item."""
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        # Explicit all-zero initial hidden and cell states on the input's device.
        initial_hidden = torch.zeros(*state_shape, device=x.device)
        initial_cell = torch.zeros(*state_shape, device=x.device)

        _, (final_hidden, _final_cell) = self.lstm(x, (initial_hidden, initial_cell))
        # final_hidden[-1] is the last stacked layer's final hidden state.
        return self.fc(final_hidden[-1])

In [None]:
# Positional arguments follow ManyToOneLSTM.__init__:
# input_size=128, hidden_size=256, output_size=128, num_layers=2.
many_to_one_lstm = ManyToOneLSTM(128, 256, 128, 2)
# map_location places the checkpoint's tensors on the CPU before they are copied into the model.
many_to_one_lstm.load_state_dict(torch.load("/content/drive/MyDrive/Piano generation/Project_draft/saved_data/saved_models/many_to_one_lstm_third_try_epoch_150.pt", map_location=torch.device("cpu")))

Again, we generate 10 samples and save them in "saved_data/samples/many_to_one_lstm/".

In [None]:
# Output directory for the LSTM samples (rebinds the notebook-level common_path).
# NOTE(review): nothing here creates this directory — presumably it already exists on Drive; confirm.
common_path = "/content/drive/MyDrive/Piano generation/Project_draft/saved_data/samples/many_to_one_lstm"
for i in range(10):
    # Seed each song with element [0] of a randomly chosen test item.
    initial_seq = music_dataset_test[random.randint(0, len(music_dataset_test) - 1)][0]
    # generate_music returns the seed followed by the 1000 generated steps.
    generated_music = generate_music(many_to_one_lstm, initial_seq, seq_len=100, max_generate_len=1000)
    # threshold=0.1 overrides the function default of 0.01; files are numbered from 1.
    pianoroll_to_midi(generated_music, os.path.join(common_path, f"many_to_one_lstm_generated_song_{i+1}.mid"), threshold=0.1)

100%|██████████| 1000/1000 [00:12<00:00, 80.45it/s]
100%|██████████| 1000/1000 [00:12<00:00, 80.41it/s]
100%|██████████| 1000/1000 [00:12<00:00, 79.52it/s]
100%|██████████| 1000/1000 [00:12<00:00, 79.67it/s]
100%|██████████| 1000/1000 [00:13<00:00, 72.72it/s]
100%|██████████| 1000/1000 [00:12<00:00, 78.39it/s]
100%|██████████| 1000/1000 [00:12<00:00, 78.44it/s]
100%|██████████| 1000/1000 [00:12<00:00, 77.78it/s]
100%|██████████| 1000/1000 [00:12<00:00, 79.27it/s]
100%|██████████| 1000/1000 [00:12<00:00, 79.50it/s]


Now it's time for the Encoder-Decoder architecture. Let's again define and load the new dataset we have to use for this one.

In [None]:
class MusicDataset(Dataset):
    """Pianoroll window pairs for sequence-to-sequence (many-to-many) training.

    Every file named in ``metadata["midi_filename"]`` is parsed once with muspy
    and kept in ``self.songs``. Indexing converts the selected song to a
    pianoroll and cuts a random pair of overlapping windows out of it, so the
    same index can yield a different sample on every access.
    """

    def __init__(self, metadata, seq_len):
        """Parse all listed MIDI files and remember the window length ``seq_len``."""
        super().__init__()
        self.seq_len = seq_len

        midi_root = "/content/drive/MyDrive/Piano generation/Project/MAESTRO dataset/maestro-v3.0.0-midi/maestro-v3.0.0/"
        self.songs = [
            muspy.read(os.path.join(midi_root, file))
            for file in metadata["midi_filename"]
        ]

    def __len__(self):
        """Number of songs (not the number of possible windows)."""
        return len(self.songs)

    def __getitem__(self, idx):
        """Return ``(input_seq, target)`` sampled at random from song ``idx``.

        Both windows are ``seq_len`` rows long; ``target`` starts
        ``seq_len // 2`` rows after ``input_seq``. Both are float32.
        """
        roll = torch.tensor(
            self.songs[idx].to_pianoroll_representation(), dtype=torch.float32
        )

        window = self.seq_len
        shift = window // 2
        begin = random.randint(0, len(roll) - 2 * window)

        source = roll[begin:begin + window]
        shifted = roll[begin + shift:begin + window + shift]
        return source, shifted

In [None]:
# Load the pre-built many-to-many test split from Drive. weights_only=False is
# passed explicitly, as for the many-to-one split: the file is read as a pickled
# Python object rather than a plain tensor/state dict, and on PyTorch versions
# where weights_only defaults to True such a file is rejected.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
music_dataset_test_many_to_many = torch.load("/content/drive/MyDrive/Piano generation/Project_draft/saved_data/datasets/test/music_dataset_test_many_to_many_muspy.pt", weights_only=False)

Here we initialize and load the trained model:

In [None]:
class EncoderLSTM(nn.Module):
    """Bidirectional LSTM encoder that returns only its final states.

    The forward and backward directions of each layer are summed, so the
    returned states have ``num_layers`` entries along dim 0 rather than
    ``2 * num_layers``.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def _sum_directions(self, state, batch_size):
        """Split ``state`` into (layer, direction, batch, hidden) and add the two directions."""
        per_direction = state.view(self.num_layers, 2, batch_size, self.hidden_size)
        return per_direction.sum(dim=1)

    def forward(self, x):
        """Encode batch-first ``x`` and return ``(h_n, c_n)`` with directions summed."""
        _, (final_hidden, final_cell) = self.lstm(x)
        batch_size = x.size(0)

        return (
            self._sum_directions(final_hidden, batch_size),
            self._sum_directions(final_cell, batch_size),
        )

In [None]:
class DecoderLSTM(nn.Module):
    """LSTM decoder: advances the recurrent state and projects every step to ``output_size``."""

    def __init__(self, output_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # The decoder consumes its own output space, so the LSTM input size is output_size.
        self.lstm = nn.LSTM(output_size, hidden_size, num_layers, batch_first=True)
        self.ff = nn.Linear(hidden_size, output_size)

    def forward(self, x, h_n, c_n):
        """Run the LSTM from state ``(h_n, c_n)`` and return ``(projected_output, new_h, new_c)``."""
        lstm_out, (next_hidden, next_cell) = self.lstm(x, (h_n, c_n))
        return self.ff(lstm_out), next_hidden, next_cell

In [None]:
class Seq2SeqLSTM(nn.Module):
    """Encoder-decoder wrapper that decodes one step at a time."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, target_seq_len, teacher_forcing_ratio=0.5):
        """Encode ``x`` and decode ``target_seq_len`` steps.

        Decoding starts from an all-zero frame. After every step a random draw
        decides whether the next decoder input is frame ``t`` of the *input*
        ``x`` (teacher forcing) or the decoder's own output. Because ``x`` is
        indexed at ``t``, a non-zero ratio requires
        ``target_seq_len <= x.size(1)``.

        Returns:
            Tensor of shape ``(batch, target_seq_len, output_size)``.
        """
        n_batch = x.size(0)
        n_features = self.decoder.ff.out_features
        device = x.device

        outputs = torch.zeros(n_batch, target_seq_len, n_features).to(device)

        hidden, cell = self.encoder(x)

        step_input = torch.zeros(n_batch, 1, n_features).to(device)

        for t in range(target_seq_len):
            step_output, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[:, t, :] = step_output.squeeze(1)

            # Exactly one random draw per step, as in the training-time behaviour.
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            step_input = x[:, t, :].unsqueeze(1) if use_teacher else step_output

        return outputs

In [None]:
# Encoder and decoder use the same hidden_size and num_layers: the states returned
# by the encoder are passed directly to the decoder's LSTM as its initial state.
encoder = EncoderLSTM(input_size=128, hidden_size=1024, num_layers=2)
decoder = DecoderLSTM(output_size=128, hidden_size=1024, num_layers=2)

encoder_decoder_lstm = Seq2SeqLSTM(encoder, decoder)
# map_location places the checkpoint's tensors on the CPU before they are copied into the model.
encoder_decoder_lstm.load_state_dict(torch.load("/content/drive/MyDrive/Piano generation/Project_draft/saved_data/saved_models/encoder_decoder_lstm_epoch_120.pt", map_location=torch.device("cpu")))

This time, the generate_music function has to be different in order to work with the different nature of the model.

In [None]:
def generate_music_many_to_many(model, initial_sequence, seq_len=100, max_generate_len=5000):
    """Generate a continuation with an encoder-decoder model in a single call.

    Args:
        model: Module called as ``model(batch, n_steps, teacher_forcing_ratio)``.
            It is switched to eval mode.
        initial_sequence: Unbatched seed sequence; a batch dimension is added
            before it is passed to the model.
        seq_len: Unused; kept so the signature matches ``generate_music``.
        max_generate_len: Number of steps the model is asked to decode.

    Returns:
        The model output with the batch dimension removed. Only the decoded
        steps are returned; the seed itself is not part of the result.
    """
    input_seq = initial_sequence.unsqueeze(0)

    model.eval()
    with torch.no_grad():
        # Teacher forcing is disabled (0.0): the decoder consumes its own outputs.
        output = model(input_seq, max_generate_len, 0.0)

    # Drop only the batch axis. A bare squeeze() would also collapse the time
    # axis when a single step is generated, yielding a 1-D result.
    return output.squeeze(0)

In [None]:
common_path = "/content/drive/MyDrive/Piano generation/Project_draft/saved_data/samples/encoder_decoder_lstm"
# Generate 10 encoder-decoder samples; tqdm tracks whole songs here rather than single steps.
for i in tqdm(range(10)):
    # Seed each song with element [0] of a randomly chosen many-to-many test item.
    initial_seq = music_dataset_test_many_to_many[random.randint(0, len(music_dataset_test_many_to_many) - 1)][0]
    # Decode 500 steps in one model call; the returned pianoroll does not include the seed.
    generated_music = generate_music_many_to_many(encoder_decoder_lstm, initial_seq, seq_len=100, max_generate_len=500)
    # threshold=0.03 is specific to this model; time_step=0.1 equals the function default.
    pianoroll_to_midi(generated_music, os.path.join(common_path, f"encoder_decoder_lstm_generated_song_{i+1}.mid"), time_step=0.1, threshold=0.03)

100%|██████████| 10/10 [01:56<00:00, 11.64s/it]


At last, it's time to get samples from our GPT2 model. Let's again initialize the tokenizer and the model. From subjective listening to samples from both, I found the samples from the one after 10 epochs (the one that overfitted) to be better-sounding (although very far from good).

In [None]:
# REMI tokenizer settings for the GPT-2 samples.
# NOTE(review): presumably these must mirror the configuration used when the model
# was trained so that token ids line up with the embedding rows — confirm.
tokenizer_config = TokenizerConfig(
    num_velocities=16,
    use_chords=True,
    use_programs=False,
    use_rests=True,
    use_tempos=True,
    use_time_signatures=False,
    use_sustain_pedals=True,
)

# Module-level tokenizer; generate_music_gpt2 and generated_to_midi read it as a global.
tokenizer = REMI(tokenizer_config)

In [None]:
# Size of the tokenizer vocabulary; used below as the number of embedding rows.
vocab_size = len(tokenizer.vocab)
# Load the fine-tuned GPT-2 checkpoint saved after epoch 10.
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/Piano generation/Project_draft/saved_data/saved_models/pretrained_gpt2_seventh_try_epoch_10.pt")
# Align the embedding matrix with the tokenizer vocabulary; the recorded cell
# output shows the resulting layer as Embedding(346, 768).
model.resize_token_embeddings(vocab_size)

Embedding(346, 768)

In [None]:
def generate_music_gpt2(model, max_length=512, temperature=1.0, top_k=50, top_p=0.95):
    """Sample one token sequence from ``model``, starting from the BOS token.

    The special-token ids are read from the module-level ``tokenizer``.

    Args:
        model: Object exposing a Hugging Face style ``generate`` method.
        max_length: Value passed to ``generate`` as ``max_length``.
        temperature: Sampling temperature passed to ``generate``.
        top_k: Top-k cut-off passed to ``generate``.
        top_p: Nucleus (top-p) cut-off passed to ``generate``.

    Returns:
        The first generated sequence as a list of token ids.
    """
    vocab = tokenizer.vocab

    # Build the one-token prompt on the model's device when the model exposes
    # one; device=None falls back to the default device.
    input_ids = torch.tensor(
        [[vocab['BOS_None']]], device=getattr(model, "device", None)
    )

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=True,
            pad_token_id=vocab['PAD_None'],
            eos_token_id=vocab['EOS_None'],
            attention_mask=torch.ones_like(input_ids)
        )

    return output[0].tolist()

In [None]:
def generated_to_midi(generated_ids):
    """Decode a list of token ids with the module-level ``tokenizer``.

    Ids that are missing from the tokenizer vocabulary are mapped to the
    token string "[UNK]".

    Returns:
        Whatever ``tokenizer([...])`` returns for the decoded sequence; the
        caller in this notebook saves it with ``dump_midi``.
    """
    # tokenizer.vocab maps token string -> id; invert it for the lookup.
    id_to_token = {token_id: token for token, token_id in tokenizer.vocab.items()}
    token_strings = [id_to_token.get(token_id, "[UNK]") for token_id in generated_ids]

    sequence = TokSequence(tokens=token_strings, ids=generated_ids)
    # NOTE(review): presumably fills in the sequence's remaining representations — confirm against MidiTok.
    tokenizer.complete_sequence(sequence)

    return tokenizer([sequence])

In [None]:
# Generate 10 GPT-2 samples; tqdm tracks whole songs.
for i in tqdm(range(10)):
    # Sample a token-id sequence starting from the BOS token.
    generated_ids = generate_music_gpt2(model)
    # Decode the ids back into a score object with the REMI tokenizer.
    midi = generated_to_midi(generated_ids)
    # Files are numbered from 1.
    # NOTE(review): nothing here creates the target directory — presumably it already exists on Drive; confirm.
    midi.dump_midi(os.path.join("/content/drive/MyDrive/Piano generation/Project_draft/saved_data/samples/gpt2", f"gpt2_generated_song_{i+1}.mid"))

100%|██████████| 10/10 [02:15<00:00, 13.59s/it]
