# Train GPT-2 on music.

---

## Install dependencies.

In [2]:
!pip install transformers tokenizers datasets
!git config --global credential.helper store
!sudo apt-get install git-lfs


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
[sudo] password for sergi_carapuig: 


## Load the dataset from 🤗 Hub.

In [4]:
from datasets import load_dataset

# Fetch the dataset from the Hub by its repository id, then display the
# resulting DatasetDict (split names, features, row counts).
dataset_id = "TristanBehrens/js-fakes-4bars"
raw_datasets = load_dataset(dataset_id)
raw_datasets

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4016
    })
    test: Dataset({
        features: ['text'],
        num_rows: 463
    })
})

Let us look at an example.

In [6]:
# Display the record at index 0 of the training split.
first_example = raw_datasets["train"][0]
first_example

{'text': 'PIECE_START STYLE=JSFAKES GENRE=JSFAKES TRACK_START INST=0 BAR_START NOTE_ON=70 TIME_DELTA=4 NOTE_OFF=70 NOTE_ON=77 TIME_DELTA=4 NOTE_OFF=77 NOTE_ON=74 TIME_DELTA=4 NOTE_OFF=74 NOTE_ON=72 TIME_DELTA=2 NOTE_OFF=72 NOTE_ON=74 TIME_DELTA=2 NOTE_OFF=74 BAR_END BAR_START NOTE_ON=75 TIME_DELTA=4 NOTE_OFF=75 NOTE_ON=72 TIME_DELTA=8 NOTE_OFF=72 NOTE_ON=70 TIME_DELTA=4 NOTE_OFF=70 BAR_END BAR_START NOTE_ON=70 TIME_DELTA=4 NOTE_OFF=70 NOTE_ON=65 TIME_DELTA=4 NOTE_OFF=65 NOTE_ON=67 TIME_DELTA=4 NOTE_OFF=67 NOTE_ON=69 TIME_DELTA=4 NOTE_OFF=69 BAR_END BAR_START NOTE_ON=70 TIME_DELTA=2 NOTE_OFF=70 NOTE_ON=69 TIME_DELTA=2 NOTE_OFF=69 NOTE_ON=67 TIME_DELTA=8 NOTE_OFF=67 NOTE_ON=65 TIME_DELTA=4 NOTE_OFF=65 BAR_END TRACK_END TRACK_START INST=32 BAR_START NOTE_ON=58 TIME_DELTA=4 NOTE_OFF=58 NOTE_ON=57 TIME_DELTA=4 NOTE_OFF=57 NOTE_ON=58 TIME_DELTA=4 NOTE_OFF=58 NOTE_ON=53 TIME_DELTA=4 NOTE_OFF=53 BAR_END BAR_START NOTE_ON=51 TIME_DELTA=4 NOTE_OFF=51 NOTE_ON=53 TIME_DELTA=8 NOTE_OFF=53 NOTE_ON=4

## Train the tokenizer.

In [7]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer

from transformers import PreTrainedTokenizerFast


# Special tokens reserved in the vocabulary by the trainer.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# Word-level model with "[UNK]" as the unknown token; input is split on
# whitespace only, so each space-separated event string is one token.
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()
trainer = WordLevelTrainer(special_tokens=special_tokens)

def get_training_corpus(dataset=None, batch_size=1000):
    """Yield the training texts in consecutive batches for tokenizer training.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"text"`` entry. When omitted, the
            notebook-level ``raw_datasets["train"]`` split is used.
        batch_size: Number of rows per yielded batch. Must be at least 1.

    Yields:
        The ``"text"`` entry of each consecutive slice of ``dataset``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if dataset is None:
        # Fall back to the training split loaded earlier in the notebook.
        dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        yield dataset[start_idx : start_idx + batch_size]["text"]

# Fit the vocabulary on the batched training texts and persist it to disk.
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
tokenizer.save("tokenizer.json")

# Reload the saved file as a transformers fast tokenizer and register the
# padding token on the wrapper.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

0

Inspect the vocabulary.

In [8]:
# Display the token -> id mapping of the trained tokenizer.
vocab = tokenizer.get_vocab()
vocab

{'NOTE_ON=53': 45,
 'NOTE_ON=40': 102,
 'NOTE_ON=39': 105,
 'NOTE_OFF=51': 56,
 'TIME_DELTA=3': 108,
 'NOTE_ON=67': 12,
 'NOTE_ON=47': 67,
 'STYLE=JSFAKES': 79,
 'NOTE_OFF=50': 54,
 'NOTE_ON=68': 29,
 'NOTE_OFF=44': 86,
 'NOTE_ON=59': 39,
 'NOTE_ON=77': 92,
 'NOTE_ON=79': 100,
 'NOTE_OFF=75': 68,
 'NOTE_ON=81': 116,
 'NOTE_ON=58': 33,
 'NOTE_ON=56': 43,
 'NOTE_OFF=71': 46,
 'NOTE_ON=54': 51,
 'NOTE_OFF=49': 64,
 'NOTE_OFF=40': 101,
 'NOTE_ON=65': 14,
 'NOTE_OFF=59': 38,
 'NOTE_OFF=67': 11,
 'NOTE_OFF=76': 83,
 'TIME_DELTA=10': 118,
 'NOTE_OFF=38': 109,
 'NOTE_ON=71': 47,
 '[UNK]': 0,
 'NOTE_OFF=43': 89,
 'TIME_DELTA=2': 6,
 'NOTE_ON=74': 63,
 '[MASK]': 4,
 'NOTE_ON=50': 55,
 'NOTE_OFF=57': 40,
 'TIME_DELTA=16': 103,
 'BAR_END': 7,
 'NOTE_ON=63': 23,
 'TRACK_END': 9,
 'PIECE_START': 78,
 'NOTE_OFF=66': 24,
 'NOTE_ON=78': 94,
 'NOTE_OFF=41': 95,
 'NOTE_OFF=81': 115,
 'TIME_DELTA=8': 19,
 'NOTE_OFF=37': 111,
 '[CLS]': 1,
 'NOTE_OFF=60': 17,
 'NOTE_OFF=53': 44,
 'NOTE_ON=60': 18,
 'NOTE_ON

## Train the model.

Here are the hyper- and model-parameters.

In [None]:
# Central configuration: "model" holds the architecture sizes, "trainer"
# holds the training-loop settings. Later cells read values from here.
training_config = dict(
    training_name="jsfakes_mmmtrack_4bars_full",
    dataset_name="jsfakes_mmmtrack_4bars_full",
    model=dict(
        n_ctx=512,
        n_embd=512,
        n_head=8,
        n_layer=6,
        n_positions=512,
    ),
    trainer=dict(
        pad_length=512,
        shuffle_buffer_size=10000,
        batch_size=10,
        num_train_epochs=20,
        save_steps=300,
        save_total_limit=20,
    ),
)

Prepare the tokenization function.

In [None]:
def tokenize_function(example):
    """Convert the ``"text"`` field of ``example`` into token ids.

    The text is truncated to ``training_config["model"]["n_positions"]``
    tokens and is not padded. Only the ``"input_ids"`` entry of the
    tokenizer output is returned.
    """
    max_tokens = training_config["model"]["n_positions"]
    encoding = tokenizer(
        example["text"],
        truncation=True,
        padding=False,
        max_length=max_tokens,
    )
    return {"input_ids": encoding["input_ids"]}

# Sanity check: tokenizing one record must yield "input_ids" and nothing else.
tokenized = tokenize_function(raw_datasets["train"][0])
tokenized_keys = list(tokenized.keys())
assert tokenized_keys == ["input_ids"], tokenized_keys

Create the tokenized dataset.

In [None]:
# Tokenize every split in batches. The original columns are removed so that
# only the keys returned by tokenize_function remain.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# Sanity check: a tokenized record exposes "input_ids" and nothing else.
tokenized = tokenized_datasets["train"][0]
tokenized_keys = list(tokenized.keys())
assert tokenized_keys == ["input_ids"], tokenized_keys
print(tokenized)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'input_ids': [78, 79, 72, 10, 73, 8, 37, 5, 36, 92, 5, 91, 63, 5, 62, 49, 6, 48, 63, 6, 62, 7, 8, 69, 5, 68, 49, 19, 48, 37, 5, 36, 7, 8, 37, 5, 36, 14, 5, 13, 12, 5, 11, 27, 5, 26, 7, 8, 37, 6, 36, 27, 6, 26, 12, 19, 11, 14, 5, 13, 7, 9, 10, 75, 8, 33, 5, 32, 41, 5, 40, 33, 5, 32, 45, 5, 44, 7, 8, 57, 5, 56, 45, 19, 44, 71, 5, 70, 7, 8, 33, 5, 32, 41, 5, 40, 35, 5, 34, 45, 6, 44, 57, 6, 56, 7, 8, 55, 6, 54, 59, 6, 58, 71, 5, 70, 59, 5, 58, 96, 5, 95, 7, 9, 10, 76, 8, 14, 5, 13, 14, 5, 13, 14, 5, 13, 14, 5, 13, 7, 8, 12, 5, 11, 14, 19, 13, 14, 5, 13, 7, 8, 14, 5, 13, 14, 5, 13, 16, 6, 15, 21, 6, 20, 14, 5, 13, 7, 8, 14, 5, 13, 14, 5, 13, 21, 5, 20, 18, 5, 17, 7, 9, 10, 74, 8, 16, 5, 15, 18, 5, 17, 33, 5, 32, 41, 5, 40, 7, 8, 33, 5, 32, 33, 5, 32, 41, 5, 40, 16, 5, 15, 7, 8, 16, 5, 15, 18, 5, 17, 33, 5, 32, 18, 5, 17, 7, 8, 33, 6, 32, 18, 6, 17, 16, 5, 15, 18, 6, 17, 33, 6, 32, 41, 5, 40, 7, 9, 77]}


Instantiate a data collator.

In [None]:
from transformers import DataCollatorForLanguageModeling

# mlm=False turns off masked-language-model masking in the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Create the model.

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel

# Build a GPT-2 configuration sized from training_config["model"] and
# instantiate a freshly initialised (untrained) model from it.
model_config = GPT2Config(
    # len(tokenizer) includes added tokens, whereas tokenizer.vocab_size
    # reports only the base vocabulary. Sizing the embeddings from the full
    # count keeps every token id in range even if tokens are added later.
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
    n_ctx=training_config["model"]["n_ctx"],
    n_embd=training_config["model"]["n_embd"],
    n_head=training_config["model"]["n_head"],
    n_layer=training_config["model"]["n_layer"],
    n_positions=training_config["model"]["n_positions"],
)
model = GPT2LMHeadModel(model_config)

Test the data collator and the model.

In [None]:
# Collate the first five training records into a single batch.
train_split = tokenized_datasets["train"]
samples = [train_split[i] for i in range(5)]
inputs = data_collator(samples)
input_keys = list(inputs.keys())
assert input_keys == ["input_ids", "attention_mask", "labels"], input_keys

# Run one forward pass and check which fields the model returns.
outputs = model(**inputs)
output_keys = list(outputs.keys())
assert output_keys == ["loss", "logits", "past_key_values"], output_keys

---

Run the training.

In [None]:
from transformers import TrainingArguments, Trainer
import os

do_validation = True

# Checkpoints, logs, and the final tokenizer and model are written under this
# fixed directory.
output_path = "output"

trainer_config = training_config["trainer"]
# A single interval drives evaluation, checkpointing and logging.
steps = trainer_config["save_steps"]

# Create the trainer.
print("Creating trainer...")
training_args = TrainingArguments(
    output_dir=output_path,
    overwrite_output_dir=True,
    num_train_epochs=trainer_config["num_train_epochs"],
    per_device_train_batch_size=trainer_config["batch_size"],
    # Evaluation only runs when validation is enabled.
    evaluation_strategy="steps" if do_validation else "no",
    eval_steps=steps,
    save_strategy="steps",
    save_steps=steps,
    save_total_limit=trainer_config["save_total_limit"],
    logging_strategy="steps",
    logging_first_step=True,
    logging_steps=steps,
    logging_dir=os.path.join(output_path, "logs"),
    prediction_loss_only=False,
    # load_best_model_at_end is intentionally left disabled.
)

eval_dataset = tokenized_datasets["test"] if do_validation else None
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=eval_dataset,
)

# Train the model.
trainer.train()

# Persist the tokenizer and the trained model side by side in output_path.
tokenizer.save_pretrained(output_path)
model.save_pretrained(output_path)

Creating trainer...


***** Running training *****
  Num examples = 4016
  Num Epochs = 20
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 8040


Step,Training Loss,Validation Loss
300,1.8161,1.116024
600,1.0417,0.874907
900,0.8827,0.812713
1200,0.8157,0.761966
1500,0.7709,0.732586
1800,0.7419,0.711401
2100,0.719,0.69695
2400,0.6993,0.68244
2700,0.6805,0.666539
3000,0.6656,0.661676


***** Running Evaluation *****
  Num examples = 463
  Batch size = 8
Saving model checkpoint to output/checkpoint-300
Configuration saved in output/checkpoint-300/config.json
Model weights saved in output/checkpoint-300/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 463
  Batch size = 8
Saving model checkpoint to output/checkpoint-600
Configuration saved in output/checkpoint-600/config.json
Model weights saved in output/checkpoint-600/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 463
  Batch size = 8
Saving model checkpoint to output/checkpoint-900
Configuration saved in output/checkpoint-900/config.json
Model weights saved in output/checkpoint-900/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 463
  Batch size = 8
Saving model checkpoint to output/checkpoint-1200
Configuration saved in output/checkpoint-1200/config.json
Model weights saved in output/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples

Generate some tokens.

In [None]:
# Encode the prompt that generation is conditioned on and move it to the
# device the model currently lives on. A hard-coded "cuda" fails on CPU-only
# machines and can mismatch the model's device.
input_ids = tokenizer.encode("PIECE_START", return_tensors="pt")
input_ids = input_ids.to(model.device)
print(input_ids)

# Generate until the output length (which includes the prompt length) reaches 50.
generate = model.generate(input_ids, max_length=50)
generate

Thank you!