<a href="https://colab.research.google.com/github/LeeScoresby69/GPT2_Music_Generation/blob/main/HuggingFace_GPT2_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![alt text](./LMU_Kopf.png "LMU")

# Masterarbeit

## im Studiengang
## Pädagogik mit Schwerpunkt Bildungsforschung und Bildungsmanagement
## an der LMU München


### Analyse und Generierung von motivischen Sequenzdaten – Eine Studie anhand von transformationsbasierten maschinellen Lernverfahren


4. Fachsemester


Verfasserin: |   | Betreuer:
--- | --- | ---
 Laura Katharina Achatz |       | Prof. Dr. Marcus Spies
 Martrikelnummer: 11729089 |     | Raum 3107
 Rosenstraße 6 |   | Leopoldstraße 13
 85778 Haimhausen |   | 80802 München
 laura.achatz@campus.lmu.de |    | marcus.spies@lmu.de


Abgabedatum: 11.09.2023

# HuggingFace GPT2

$$
@inproceedings{miditok2021,
    title={{MidiTok}: A Python package for {MIDI} file tokenization},
    author={Fradet, Nathan and Briot, Jean-Pierre and Chhel, Fabien and El Fallah Seghrouchni, Amal and Gutowski, Nicolas},
    booktitle={Extended Abstracts for the Late-Breaking Demo Session of the 22nd International Society for Music Information Retrieval Conference},
    year={2021},
    url={https://archives.ismir.net/ismir2021/latebreaking/000005.pdf},
}
$$

This notebook shows how to train a model (GPT2) and generate music from it, using the Hugging Face Transformers package.

## Setup Environment

***Install all dependencies (run only once per session)***

In [1]:

!nvidia-smi

!pip install miditok
!pip install miditoolkit
!pip install torch
!pip install torchtoolkit
!pip install transformers
!pip install evaluate
!pip install tqdm
!pip install accelerate -U


from typing import List, Tuple, Dict, Callable, Any, Union
from functools import partial
from pathlib import Path
from copy import deepcopy
import json

from torch import Tensor, LongTensor, stack, flip, cat, full, argmax
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtoolkit.data import create_subsets
from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, GenerationConfig
from transformers.data.data_collator import DataCollatorMixin
from evaluate import load as load_metric
from miditok import REMI, MIDITokenizer, TokenizerConfig
from miditok.constants import CHORD_MAPS
from miditoolkit import MidiFile
from tqdm import tqdm


Mon Aug 14 05:38:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    24W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import shutil
shutil.copytree('drive/MyDrive/Colab_Data/dataset','dataset/', )

'dataset/'

## Define Class for Dataset

In [4]:

class MIDIDataset(Dataset):
    r"""Dataset for generator training

    :param files_paths: list of paths to files to load.
    :param tokenizer: tokenizer object, to use to load MIDIs instead of tokens. (default: None)
    """

    def __init__(self, files_paths: List[Path], min_seq_len: int, max_seq_len: int, tokenizer: MIDITokenizer = None):
        samples = []

        for file_path in tqdm(files_paths, desc=f'Loading data: {files_paths[0].parent}'):
            if file_path.suffix in ["mid", "midi", "MID", "MIDI"]:
                midi = MidiFile(file_path)
                for _ in range(len(midi.instruments) - 1):
                    del midi.instruments[1]  # removes all tracks except first one
                tokens = tokenizer.midi_to_tokens(midi)[0].ids
            else:
                with open(file_path) as json_file:
                    tokens = json.load(json_file)['ids'][0]  # first track
            i = 0
            while i < len(tokens):
                if i >= len(tokens) - min_seq_len:
                    break  # last sample is too short
                samples.append(LongTensor(tokens[i:i + max_seq_len]))
                i += len(samples[-1])  # could be replaced with max_seq_len

        self.samples = samples

    def __getitem__(self, idx) -> Dict[str, LongTensor]:
        return {"input_ids": self.samples[idx], "labels": self.samples[idx]}

    def __len__(self) -> int: return len(self.samples)

    def __repr__(self): return self.__str__()

    def __str__(self) -> str: return 'No data loaded' if len(self) == 0 else f'{len(self.samples)} samples'


def _pad_batch(examples: List[Dict[str, LongTensor]], pad_token: int) -> LongTensor:
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""

    length_of_first = examples[0]["input_ids"].size(0)

    # Check if padding is necessary.
    are_tensors_same_length = all(x["input_ids"].size(0) == length_of_first for x in examples)
    if are_tensors_same_length:
        return stack([e["input_ids"] for e in examples], dim=0).long()

    # Creating the full tensor and filling it with our data.
    return pad_sequence([e["input_ids"] for e in examples], batch_first=True, padding_value=pad_token).long()


class DataCollatorGen(DataCollatorMixin):
    def __init__(self, pad_token: int, return_tensors: str = "pt"):
        """Collator that simply pad the input sequences.
        Input_ids will be padded with the pad token given, while labels will be
        padded with -100.

        :param pad_token: pas token
        :param return_tensors:
        """
        self.pad_token = pad_token
        self.return_tensors = return_tensors

    def __call__(self, batch: List[Dict[str, Any]], return_tensors=None) -> Dict[str, LongTensor]:
        x, y = _pad_batch(batch, self.pad_token), _pad_batch(batch, -100)
        return {"input_ids": x, "labels": y}  # will be shifted in GPT2LMHead forward

## Convert MIDI files to tokens, and load them for training

In [12]:
# Our parameters for Tokenizer
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "nb_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS", "MASK"],
    "use_chords": False,
    "use_rests": True,
    "use_tempos": True,
    "use_time_signatures": False,
    "use_programs": True,
    "nb_tempos": 32,  # nb of tempo bins
    "tempo_range": (40, 250),  # (min, max)
}
# Define config for Tokenizer
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer with REMI
tokenizer = REMI(config)

# config = TokenizerConfig(nb_velocities=16, use_chords=False, use_rests=True)

# Path to Midis w/o Byte Pair Encoding
tokens_no_bpe_path = Path('Midi_tokens_no_bpe')
# Path to Midis w Byte Pair Encoding
tokens_bpe_path = Path('Midi_tokens_bpe')

# Path to input Midi files
midi_path = Path('dataset')

# List of all input Midi files
midi_paths = list(midi_path.glob('**/*.mid')) + list(midi_path.glob('**/*.midi'))

# Perform data augmentation on a whole dataset
#data_augmentation_offsets = [2, 1, 1]  # data augmentation on 2 pitch octaves, 1 velocity and 1 duration values
data_augmentation_offsets = [3, 2, 2]  # data augmentation on 2 pitch octaves, 1 velocity and 1 duration values

# convert MIDIs to tokens
tokenizer.tokenize_midi_dataset(midi_paths, tokens_no_bpe_path,data_augment_offsets=data_augmentation_offsets)

# Learn and apply BPE to data we just tokenized
tokens_bpe_path.mkdir(exist_ok=True, parents=True)
tokenizer.learn_bpe(
    #vocab_size=10000,
    vocab_size=1000,
    tokens_paths=list(tokens_no_bpe_path.glob("**/*.json")),
    start_from_empty_voc=False,
)
tokenizer.apply_bpe_to_dataset(
    tokens_no_bpe_path,
    tokens_bpe_path,
)

# Saving our tokenizer, to retrieve it back later with the load_params method
tokenizer.save_params(Path("tokenizer", "tokenizer.json"))


# Loads tokens and create data loaders for training
tokens_paths = list(tokens_bpe_path.glob("**/*.json"))
dataset = MIDIDataset(
    tokens_paths, max_seq_len=256, min_seq_len=128,
)
subset_train, subset_valid = create_subsets(dataset, [0.3])

Tokenizing MIDIs (Midi_tokens_no_bpe): 100%|██████████| 108/108 [00:02<00:00, 36.42it/s]
Performing data augmentation: 100%|██████████| 108/108 [00:01<00:00, 85.30it/s]
Loading token files: 100%|██████████| 1425/1425 [00:00<00:00, 4158.06it/s]
Applying BPE to dataset: 100%|██████████| 1425/1425 [00:03<00:00, 421.82it/s]
Loading data: Midi_tokens_bpe: 100%|██████████| 1425/1425 [00:00<00:00, 6215.11it/s]


## Create the model

We will use the [GPT2 implementation of Hugging Face](https://huggingface.co/docs/transformers/model_doc/gpt2). This
Feel free to explore the documentation and source code to dig deeper.

In [13]:
# Creates model
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=2048,
    n_embd=512,
    n_layer=8,
    n_head=8,
    n_inner=2048,
    resid_pdrop=.1,
    embd_pdrop=.1,
    attn_pdrop=.1,
    padding_token_id=tokenizer['PAD_None'],
    bos_token_id=tokenizer['BOS_None'],
    eos_token_id=tokenizer['EOS_None'],
)
model = GPT2LMHeadModel(config)

## Train it

In [15]:
metrics = {metric: load_metric(metric) for metric in ["accuracy"]}

def compute_metrics(eval_pred):
    """Computes metrics for pretraining.
    Must use proprocess_logits function that converts logits to predictions (argmax or sampling).

    :param eval_pred: EvalPrediction containing predictions and labels
    :return: metrics
    """
    predictions, labels = eval_pred
    not_pad_mask = labels != -100
    labels, predictions = labels[not_pad_mask], predictions[not_pad_mask]
    return metrics["accuracy"].compute(predictions=predictions.flatten(), references=labels.flatten())

def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """Preprocesses the logits before accumulating them during evaluation.
    This allows to significantly reduce the memory usage and make the training tractable.
    """
    pred_ids = argmax(logits, dim=-1)  # long dtype
    return pred_ids

training_config = TrainingArguments(
    "runs", False, True, True, False, "steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=48,
    gradient_accumulation_steps=3,
    eval_accumulation_steps=None,
    eval_steps=1000,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=3.0,
    max_steps=100000,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.3,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=5,
    no_cuda=False,
    seed=444,
    fp16=False,
    load_best_model_at_end=True,
    label_smoothing_factor=0.,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_config,
    data_collator=DataCollatorGen(tokenizer["PAD_None"]),
    train_dataset=subset_train,
    eval_dataset=subset_valid,
    compute_metrics=compute_metrics,
    callbacks=None,
    preprocess_logits_for_metrics=preprocess_logits,
)

# Training
train_result = trainer.train()
trainer.save_model()  # Saves the tokenizer too
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Currently training with a batch size of: 16
***** Running training *****
  Num examples = 1,957
  Num Epochs = 2,440
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 3
  Total optimization steps = 100,000
  Number of trainable parameters = 26,780,672


Step,Training Loss,Validation Loss,Accuracy
1000,5.1044,4.73155,0.000662
2000,4.3613,3.97923,3.1e-05
3000,3.6414,3.383149,0.0
4000,2.9365,2.560403,2e-05
5000,2.1686,2.038081,1e-05
6000,1.659,1.762542,5e-06
7000,1.2639,1.553847,0.0
8000,0.9119,1.366958,0.0
9000,0.6134,1.21874,0.0
10000,0.3966,1.107481,0.0


***** Running Evaluation *****
  Num examples = 838
  Batch size = 48
Saving model checkpoint to runs/checkpoint-1000
Configuration saved in runs/checkpoint-1000/config.json
Configuration saved in runs/checkpoint-1000/generation_config.json
Model weights saved in runs/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 838
  Batch size = 48
Saving model checkpoint to runs/checkpoint-2000
Configuration saved in runs/checkpoint-2000/config.json
Configuration saved in runs/checkpoint-2000/generation_config.json
Model weights saved in runs/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 838
  Batch size = 48
Saving model checkpoint to runs/checkpoint-3000
Configuration saved in runs/checkpoint-3000/config.json
Configuration saved in runs/checkpoint-3000/generation_config.json
Model weights saved in runs/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 838
  Batch size = 48
Saving model checkpoi

***** train metrics *****
  epoch                    =     2439.02
  total_flos               = 172204656GF
  train_loss               =      0.2545
  train_runtime            =  9:42:32.98
  train_samples_per_second =     137.327
  train_steps_per_second   =       2.861


## Generate music

In [16]:
def collate_gen_left(batch: List[Dict[str, LongTensor]]) -> LongTensor:
    # Here the sequences are padded to the left, so that the last token along the time dimension
    # is always the last token of each seq, allowing to efficiently generate by batch
    bos_shape = (1,)
    batch = [flip(cat([full(bos_shape, tokenizer["BOS_None"]), seq["input_ids"]], dim=0), dims=(0,)) for seq in batch]
    batch = pad_sequence(batch, batch_first=True, padding_value=tokenizer["PAD_None"])  # (N,T) or (N,T,Z)
    batch = flip(batch, dims=(1,)).long()
    return batch  # (N,T)

generation_config = GenerationConfig(
    max_new_tokens=256,  # extends samples by 512 tokens
    num_beams=1,        # no beam search
    do_sample=True,     # but sample instead
    temperature=0.9,
    top_k=15,
    top_p=0.95,
    epsilon_cutoff=3e-4,
    eta_cutoff=1e-3,
    pad_token_id=config.padding_token_id,
)

(gen_results_path := Path('gen_res')).mkdir(parents=True, exist_ok=True)
dataloader_test = DataLoader(subset_valid, batch_size=16, collate_fn=collate_gen_left)
model.eval()
count = 0
for batch in tqdm(dataloader_test, desc='Testing model / Generating results'):  # (N,T)
    res = model.generate(batch.to(model.device), generation_config=generation_config)  # (N,T)

    # Saves the generated music, as MIDI files and tokens (json)
    for prompt, continuation in zip(batch, res):
        generated = continuation[len(prompt):]
        tokens = [generated, prompt, continuation]  # list compr. as seqs of dif. lengths
        tokens = [seq.tolist() for seq in tokens]
        midi = tokenizer.tokens_to_midi(deepcopy(tokens), time_division=384)
        midi.instruments[0].name = f'Continuation of original sample ({len(generated)} tokens)'
        midi.instruments[1].name = f'Original sample ({len(prompt)} tokens)'
        midi.instruments[2].name = f'Original sample and continuation'
        midi.dump(gen_results_path / f'{count}.mid')
        tokenizer.save_tokens(tokens, gen_results_path / f'{count}.json')

        count += 1

Testing model / Generating results: 100%|██████████| 53/53 [02:32<00:00,  2.87s/it]


In [17]:
shutil.copytree('gen_res/', 'drive/MyDrive/Colab_Data/gen_res_final')

'drive/MyDrive/Colab_Data/gen_res_final'

In [18]:
shutil.copytree('runs/', 'drive/MyDrive/Colab_Data/runs_final')

'drive/MyDrive/Colab_Data/runs_final'

In [19]:
shutil.copytree('Midi_tokens_bpe/', 'drive/MyDrive/Colab_Data/Midi_tokens_bpe_final')
shutil.copytree('Midi_tokens_no_bpe/', 'drive/MyDrive/Colab_Data/Midi_tokens_no_bpe_final')
shutil.copytree('tokenizer/', 'drive/MyDrive/Colab_Data/tokenizer_final')

'drive/MyDrive/Colab_Data/tokenizer_final'