In [1]:
# %pip install miditok

In [2]:
import os
from pathlib import Path

from miditok import REMI
from miditok import MusicTokenizer, TokSequence
from miditok.classes import TokenizerConfig

import torch
import random
from random import shuffle

from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# set the random seed
seed = 42
random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x207c0ddf2f0>

In [4]:
# set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [5]:
# download the dataset and set path
import kagglehub

# Download latest version
path = kagglehub.dataset_download("soumikrakshit/classical-music-midi")

print("Path to dataset files:", path)

artist = "chopin"
midi_folder = os.path.join(path, artist)

Path to dataset files: C:\Users\ianmh\.cache\kagglehub\datasets\soumikrakshit\classical-music-midi\versions\1


In [6]:
# Define the path to all midi files
cwd = os.getcwd()
midi_files = [
    Path(midi_folder) / file
    for file in os.listdir(midi_folder)
    if file.endswith(".mid")
]

In [7]:
# Define tokenizer to use REMI+
tokenizer: MusicTokenizer = REMI(
    tokenizer_config=TokenizerConfig(
        use_programs=True,
        one_token_stream_for_programs=True,
        use_time_signatures=True,
    )
)  # type: ignore

  super().__init__(tokenizer_config, params)


In [8]:
# Train the tokenizer on the midi files
tokenizer.train(
    vocab_size=tokenizer.vocab_size,
    model="BPE",
    files_paths=midi_files,
)

  tokenizer.train(


In [9]:
# Tokenize the midi files
tokenized_midi_files = []
for file in midi_files:
    tok_seq: TokSequence = tokenizer.encode(file) # type: ignore    
    tokenized_midi_files.append(tok_seq.ids) # using ids to get the integer representation of the tokens can covert back with decode

# Convert the list to a NumPy array if needed
tokenized_midi_files = np.array(tokenized_midi_files, dtype=object)
print(tokenized_midi_files[0])

[4, 483, 189, 346, 20, 104, 125, 190, 346, 27, 100, 125, 346, 39, 101, 129, 192, 346, 36, 101, 125, 346, 44, 105, 125, 193, 346, 48, 101, 125, 346, 51, 106, 127, 194, 346, 44, 103, 125, 196, 346, 41, 99, 125, 346, 53, 103, 125, 197, 346, 19, 105, 125, 198, 346, 27, 102, 125, 346, 39, 104, 129, 200, 346, 37, 101, 125, 346, 46, 105, 125, 201, 346, 49, 102, 125, 346, 51, 107, 127, 202, 346, 46, 104, 125, 204, 346, 41, 101, 125, 346, 53, 104, 125, 205, 346, 20, 106, 125, 206, 346, 27, 104, 125, 346, 39, 106, 129, 208, 346, 36, 104, 125, 346, 44, 107, 125, 209, 346, 48, 104, 125, 346, 51, 109, 127, 210, 346, 44, 108, 125, 212, 346, 41, 105, 125, 346, 53, 109, 125, 213, 346, 24, 112, 125, 214, 346, 36, 106, 125, 346, 43, 109, 129, 216, 346, 39, 106, 125, 346, 48, 110, 125, 217, 346, 51, 105, 125, 346, 55, 111, 127, 218, 346, 48, 110, 125, 220, 346, 44, 108, 125, 346, 56, 112, 125, 4, 483, 189, 346, 25, 113, 125, 190, 346, 36, 109, 125, 346, 48, 112, 129, 192, 346, 41, 107, 125, 346, 53, 111,

In [10]:
max_seq_len = 512
stride = max_seq_len // 2

def create_chunks(tokens: list[int], max_seq_len: int, stride: int) -> list[list[int]]:
    chunks = []
    for i in range(0, len(tokens) - max_seq_len, stride):
        chunk = tokens[i:i + max_seq_len]
        if len(chunk) != max_seq_len:
            print("error")
        chunks.append(chunk)
        
    return chunks

# get a list of all the chunks regardless of the midi file
tokenized_chunks = []
for tok_seq in tokenized_midi_files:
    chunks = create_chunks(tok_seq, max_seq_len, stride)
    
    # add the chunks to the list of chunks
    tokenized_chunks.extend(chunks)
    
# convert the list to a NumPy array
tokenized_chunks = np.array(tokenized_chunks)


In [11]:
# get the input and target data from the chunks using a autoregressive model
input_data = [chunk[:-1] for chunk in tokenized_chunks]
target_data = [chunk[1:] for chunk in tokenized_chunks]

# convert the input and target data to tensors
input_tensor = torch.tensor(input_data, dtype=torch.long)
target_tensor = torch.tensor(target_data, dtype=torch.long)

  input_tensor = torch.tensor(input_data, dtype=torch.long)


In [12]:
from torch.utils.data import Dataset

# Spilt the data into training and validation and testing sets for inputs and targets
train_input, vt_input, train_target, vt_target = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=seed)
val_input, test_input, val_target, test_target = train_test_split(vt_input, vt_target, test_size=0.5, random_state=seed)

# Create the dataset class
class MidiDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return {"input_ids": self.inputs[idx], "labels": self.targets[idx]}
    
# Create the datasets
train_dataset = MidiDataset(train_input, train_target)
val_dataset = MidiDataset(val_input, val_target)
test_dataset = MidiDataset(test_input, test_target)

In [13]:
from transformers import GPT2LMHeadModel, GPT2Config

# Define model configuration
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,  # Total number of unique tokens
    n_positions=max_seq_len,  # Max sequence length
    n_embd=768,  # Embedding size
    n_layer=12,  # Number of transformer layers
    n_head=12   # Number of attention heads
)

# Initialize the model
model = GPT2LMHeadModel(config)


In [14]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',  # Output directory to save model and logs
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size per device during training
    per_device_eval_batch_size=8,   # Batch size per device during evaluation
    warmup_steps=500,    # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,   # Strength of weight decay
    logging_dir='./logs', # Directory for storing logs
    logging_steps=10,    # Log every 10 steps
    eval_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model when finished training
    metric_for_best_model="loss",  # Metric to track for best model
)

trainer = Trainer(
    model=model,  # Your GPT2 model
    args=training_args,  # The training arguments
    train_dataset=train_dataset,  # The training set (TensorDataset)
    eval_dataset=val_dataset,  # The validation set (TensorDataset)
)


In [15]:
# define the final model path
final_model_path = f"./{artist}_final_model"

# check if the final model is saved
if not os.path.exists(final_model_path):
    trainer.train()
    trainer.save_model(final_model_path)  # Save the trained model

In [16]:
from transformers import GPT2LMHeadModel, Trainer

# load the final model from the saved model
model = GPT2LMHeadModel.from_pretrained(final_model_path)

# evaluate the model on the test dataset using the loaded model
trainer = Trainer(
    model=model,  # The model to be evaluated
    args=training_args,  # The evaluation arguments
    eval_dataset=test_dataset,  # The test dataset
)

# Evaluate the model on the test dataset
trainer.evaluate(test_dataset)

  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 2.226705551147461,
 'eval_model_preparation_time': 0.0,
 'eval_runtime': 5.1968,
 'eval_samples_per_second': 28.094,
 'eval_steps_per_second': 3.656}

In [82]:
# load the final model from the saved model
model = GPT2LMHeadModel.from_pretrained(final_model_path)

# get the first 512 tokens from the test dataset
seed_tokens = test_dataset[100]["input_ids"].unsqueeze(0)

# create the input_ids tensor
input_ids = seed_tokens

# function to generate 1 new id at end of the input_ids tensor
def generated_ids(ids):
    # Generate tokens
    generated_ids = model.generate(
        input_ids=ids,  # Your seed sequence
        max_length=max_seq_len,  # Maximum length of generated song (in tokens)
        num_return_sequences=1,  # Number of sequences to generate
        no_repeat_ngram_size=2,  # Prevent repeating n-grams
        do_sample=True,  # Enable sampling
        temperature=1.0,  # Sampling temperature for randomness (higher = more randomness)
        top_p=0.95,  # Use top-p sampling for diversity
        top_k=50,  # Use top-k sampling for diversity
        pad_token_id=tokenizer.pad_token_id,  # Padding token id if needed
    )

    return generated_ids[0].tolist()  # Get the generated tokens
    


In [83]:
from tqdm import tqdm

# for loop to generate new tokens
final_tokens = input_ids[0].tolist() # initial length of 511
new_ids = input_ids
for i in tqdm(range(100), desc="Generating tokens"):
    # get the new tokens
    new_tokens = generated_ids(new_ids)
    
    # check the length of the new tokens
    if len(new_tokens) != 512:
        print(f"error: {len(new_tokens)}")
        break
    
    # check the previous tokens equal the new tokens
    if final_tokens[i+1:] != new_tokens[1:-1]:
        print(final_tokens[i+1:])
        print(new_tokens[1:-1])
        print("error: tokens are not the same")
        break
    
    # get the new token
    new_token = new_tokens[-1]
    
    # add the new token to the final tokens
    final_tokens.append(new_token)
    
    # shift the window by one token
    new_ids = torch.tensor(final_tokens[(i+1):]).unsqueeze(0)
    
    # make sure the input_ids length is length 511
    if len(new_ids[0]) != 511:
        print(f"error: {len(new_ids)}")
        break
        
# print the final tokens length
print(len(final_tokens))
    

Generating tokens: 100%|██████████| 100/100 [00:30<00:00,  3.25it/s]

611





In [84]:
from symusic.core import ScoreTick
print(final_tokens)
print(input_ids[0].tolist())

decoded_tokens = []
for token in final_tokens:
    decoded_token = tokenizer.token_id_type(id_=token)
print(decoded_tokens)

# get the tokens from the generated tokens
midi_sequence: ScoreTick = tokenizer.decode(final_tokens) 
print(midi_sequence)

# get the path to the output file
midi_file = "multi_generated_midi.mid"
midi_path = os.path.join(cwd, midi_file)

# save the midi sequence to a midi file
midi_sequence.dump_midi(midi_path)

# re

# also decode the input_ids to get the original midi sequence
input_midi_sequence: ScoreTick = tokenizer.decode(seed_tokens[0].tolist())
print(input_midi_sequence)

# get the path to the input file
input_midi_file = "input_midi.mid"
input_midi_path = os.path.join(cwd, input_midi_file)

# save the input midi sequence to a midi file
input_midi_sequence.dump_midi(input_midi_path)


[105, 126, 209, 346, 52, 106, 126, 211, 346, 43, 105, 126, 213, 346, 48, 106, 126, 215, 346, 43, 105, 126, 217, 346, 40, 106, 126, 219, 346, 31, 105, 126, 4, 483, 189, 346, 17, 107, 156, 346, 29, 109, 156, 191, 346, 36, 105, 126, 193, 346, 41, 106, 126, 195, 346, 45, 105, 126, 197, 346, 51, 108, 126, 199, 346, 48, 105, 126, 201, 346, 53, 106, 126, 203, 346, 57, 105, 126, 205, 346, 63, 108, 126, 207, 346, 60, 105, 126, 209, 346, 65, 106, 126, 211, 346, 69, 105, 126, 213, 346, 75, 108, 126, 215, 346, 69, 105, 126, 217, 346, 65, 107, 126, 219, 346, 60, 105, 126, 4, 483, 189, 346, 75, 109, 126, 346, 22, 107, 156, 346, 34, 110, 156, 191, 346, 68, 105, 126, 193, 346, 65, 107, 126, 195, 346, 60, 105, 126, 197, 346, 63, 109, 126, 199, 346, 56, 106, 126, 201, 346, 53, 107, 126, 203, 346, 48, 106, 126, 205, 346, 51, 109, 126, 207, 346, 44, 106, 126, 209, 346, 41, 107, 126, 211, 346, 36, 106, 126, 213, 346, 50, 108, 126, 215, 346, 44, 106, 126, 217, 346, 41, 106, 126, 219, 346, 34, 105, 126, 4, 4