In [1]:
# %pip install miditok

In [2]:
import os
from pathlib import Path

from miditok import REMI
from miditok import MusicTokenizer, TokSequence
from miditok.classes import TokenizerConfig

import torch
import random
from random import shuffle

from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# Seed every global RNG the notebook has imported so that a fresh
# "Restart & Run All" repeats the same random draws.
seed = 42
random.seed(seed)        # Python's built-in RNG
np.random.seed(seed)     # NumPy's legacy global RNG (previously left unseeded)
torch.manual_seed(seed)  # PyTorch RNG; kept last so the cell still echoes the Generator

<torch._C.Generator at 0x207c0ddf2f0>

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [5]:
# download the dataset and set path
import kagglehub

artist = "chopin"  # sub-folder of the dataset this notebook works on

# Download the latest version of the dataset; `path` is whatever
# kagglehub returns for it.
dataset_id = "soumikrakshit/classical-music-midi"
path = kagglehub.dataset_download(dataset_id)
print("Path to dataset files:", path)

# Folder holding the selected artist's files
midi_folder = os.path.join(path, artist)

Path to dataset files: C:\Users\ianmh\.cache\kagglehub\datasets\soumikrakshit\classical-music-midi\versions\1


In [6]:
# Remember the notebook's working directory (the generated MIDI file is
# written there at the end of the notebook).
cwd = os.getcwd()

# Collect every MIDI file in the artist folder as a list of Path objects.
# - sorted(): directory listings come back in arbitrary order, and the seeded
#   train/val/test split further down depends on the order of this list.
# - the suffix test is case-insensitive and also accepts ".midi", so files
#   such as "Piece.MID" are no longer silently skipped.
# - is_file() keeps sub-directories out of the list.
midi_files = sorted(
    entry
    for entry in Path(midi_folder).iterdir()
    if entry.is_file() and entry.suffix.lower() in {".mid", ".midi"}
)

In [None]:
# Define tokenizer to use REMI+: a REMI tokenizer with the program,
# single-token-stream and time-signature options switched on.
remi_config = TokenizerConfig(
    use_programs=True,
    one_token_stream_for_programs=True,
    use_time_signatures=True,
)
tokenizer: MusicTokenizer = REMI(tokenizer_config=remi_config)  # type: ignore

In [None]:
# Train the tokenizer (BPE model) on the collected MIDI files.
# NOTE(review): the requested vocab_size is the tokenizer's *current*
# vocab_size -- confirm that training can actually learn new tokens with
# this target.
target_vocab_size = tokenizer.vocab_size
tokenizer.train(vocab_size=target_vocab_size, model="BPE", files_paths=midi_files)

In [None]:
# Encode each MIDI file and keep only the integer ids of its tokens
# (the ids can be converted back with tokenizer.decode).
tokenized_midi_files = [
    tokenizer.encode(midi_file_path).ids  # type: ignore
    for midi_file_path in midi_files
]

# One array entry per file; object dtype, presumably because the
# sequences have different lengths.
tokenized_midi_files = np.array(tokenized_midi_files, dtype=object)
print(tokenized_midi_files[0])

In [None]:
max_seq_len: int = 512  # number of tokens in every chunk
stride: int = max_seq_len // 2  # step between chunk starts; half a chunk, so consecutive chunks overlap

def create_chunks(tokens: list[int], max_seq_len: int, stride: int) -> list[list[int]]:
    """Cut a token sequence into fixed-length, possibly overlapping windows.

    Args:
        tokens: Token ids of one piece.
        max_seq_len: Exact length of every returned chunk.
        stride: Distance between the start indices of consecutive chunks.

    Returns:
        Every window ``tokens[i:i + max_seq_len]`` for ``i = 0, stride,
        2 * stride, ...`` that fits completely inside ``tokens``. A sequence
        shorter than ``max_seq_len`` yields an empty list, and a trailing
        remainder too short for a full window is dropped.
    """
    # The last valid start index is len(tokens) - max_seq_len, so the range
    # bound has to be one past it. Without the +1, a sequence of exactly
    # max_seq_len tokens produced no chunk and the final aligned window of
    # longer sequences was lost.
    last_start = len(tokens) - max_seq_len
    return [tokens[i:i + max_seq_len] for i in range(0, last_start + 1, stride)]

# Pool the chunks of every tokenized file into one flat collection
# (which file a chunk came from is not kept) and stack them into a
# NumPy array.
tokenized_chunks = np.array([
    chunk
    for tok_seq in tokenized_midi_files
    for chunk in create_chunks(tok_seq, max_seq_len, stride)
])


In [None]:
# Build the input/target pairs for an autoregressive model: the target is
# the input shifted left by one token.
if tokenized_chunks.ndim != 2:
    # Happens when no chunk was produced (e.g. every piece is shorter than
    # max_seq_len); fail here with a clear message instead of further down.
    raise ValueError("expected a 2-D array of equal-length chunks; no chunks were produced")

# Slice the 2-D chunk array directly instead of building Python lists of row
# arrays: torch.tensor() on a list of ndarrays is very slow and emits a
# warning, while the resulting tensors are the same.
input_data = tokenized_chunks[:, :-1]   # every token but the last
target_data = tokenized_chunks[:, 1:]   # every token but the first

# convert the input and target data to tensors
input_tensor = torch.tensor(input_data, dtype=torch.long)
target_tensor = torch.tensor(target_data, dtype=torch.long)

In [None]:
from torch.utils.data import Dataset

# Split the inputs and targets into training, validation and test sets:
# first hold out 20% of the pairs, then cut that hold-out in half.
# NOTE(review): the split is over individual chunks; since stride is smaller
# than max_seq_len, neighbouring chunks share tokens, so overlapping windows
# may end up in different splits -- confirm this is acceptable.
holdout_fraction = 0.2
train_input, vt_input, train_target, vt_target = train_test_split(
    input_tensor,
    target_tensor,
    test_size=holdout_fraction,
    random_state=seed,
)
val_input, test_input, val_target, test_target = train_test_split(
    vt_input,
    vt_target,
    test_size=0.5,
    random_state=seed,
)

# Create the dataset class
class MidiDataset(Dataset):
    """Dataset pairing each input sequence with its target sequence.

    Items are dictionaries with the keys ``"input_ids"`` and ``"labels"``.
    """

    def __init__(self, inputs, targets):
        """Store the two index-aligned collections."""
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        """Return the number of input sequences."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the input/target pair stored at position ``idx``."""
        return dict(input_ids=self.inputs[idx], labels=self.targets[idx])

# Wrap each split's inputs and targets in a MidiDataset.
train_dataset, val_dataset, test_dataset = (
    MidiDataset(split_inputs, split_targets)
    for split_inputs, split_targets in (
        (train_input, train_target),
        (val_input, val_target),
        (test_input, test_target),
    )
)

In [13]:
from transformers import GPT2LMHeadModel, GPT2Config

# GPT-2 architecture sized to the tokenizer's vocabulary and the chunk length.
config = GPT2Config(
    n_layer=12,  # number of transformer layers
    n_head=12,   # number of attention heads
    n_embd=768,  # embedding size
    n_positions=max_seq_len,  # max sequence length
    vocab_size=tokenizer.vocab_size,  # total number of unique tokens
)

# Build the model from the configuration above.
model = GPT2LMHeadModel(config)


In [14]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # where the model checkpoints and the logs are written
    output_dir='./results',
    logging_dir='./logs',
    # optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    # log every 10 steps; evaluate and save at the end of each epoch
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # reload the best model (tracked by "loss") when training finishes
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,  # MidiDataset built from the training split
    eval_dataset=val_dataset,     # MidiDataset built from the validation split
)


In [15]:
# Location of the finished model on disk.
final_model_path = f"./{artist}_final_model"

# Train and save only when nothing exists at that path yet; an existing
# path is taken to mean the model was already trained and saved.
needs_training = not os.path.exists(final_model_path)
if needs_training:
    trainer.train()
    trainer.save_model(final_model_path)

In [None]:
from transformers import GPT2LMHeadModel, Trainer

# Reload the model saved at final_model_path.
model = GPT2LMHeadModel.from_pretrained(final_model_path)

# Fresh Trainer around the reloaded model, reusing the training arguments,
# with the test split as its evaluation dataset.
trainer = Trainer(model=model, args=training_args, eval_dataset=test_dataset)

# Evaluate the reloaded model on the test dataset.
trainer.evaluate(test_dataset)

In [None]:
# Reload the model saved at final_model_path.
model = GPT2LMHeadModel.from_pretrained(final_model_path)

# Seed sequence: the input ids of the first test example with a leading
# dimension added by unsqueeze(0). The inputs are chunk[:-1], so this is
# max_seq_len - 1 tokens rather than a full 512.
input_ids = seed_tokens = test_dataset[0]["input_ids"].unsqueeze(0)

# function to generate 1 new id at end of the input_ids tensor
def generated_ids(ids):
    """Sample a continuation of ``ids`` with the module-level ``model``.

    Args:
        ids: Seed token ids, forwarded to ``model.generate`` as ``input_ids``.

    Returns:
        The first returned sequence converted to a plain Python list.

    Note:
        ``max_length`` is set to ``max_seq_len``.
        NOTE(review): presumably this caps the total length, seed included,
        so a seed of ``max_seq_len - 1`` tokens leaves room for a single new
        token -- confirm this is intended.
    """
    # Sampling behaviour: top-k / top-p sampling at temperature 1.0,
    # with repeated 2-grams disallowed.
    sampling_options = dict(
        do_sample=True,
        temperature=1.0,
        top_p=0.95,
        top_k=50,
        no_repeat_ngram_size=2,
    )
    sequences = model.generate(
        input_ids=ids,
        max_length=max_seq_len,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        **sampling_options,
    )

    # Only one sequence is requested, so take the first one.
    return sequences[0].tolist()



In [None]:
from symusic.core import ScoreTick

# Destination of the output file: the notebook's working directory.
midi_file = "multi_generated_midi.mid"
midi_path = os.path.join(cwd, midi_file)

# Generate token ids from the seed sequence.
generated_midi = generated_ids(input_ids)

# Decode the generated ids with the tokenizer and show the result.
midi_sequence: ScoreTick = tokenizer.decode(generated_midi)
print(midi_sequence)

# Save the decoded sequence as a MIDI file.
midi_sequence.dump_midi(midi_path)