In [1]:
import numpy as np
import json
import torch
import torch.nn.functional as F

from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from torch.utils.data import Dataset

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
#device = "cpu"
device

'cuda'

# Data

In [3]:
with open('data_words.json', 'r') as fp:
    data = json.load(fp)

song_list = []
for song in data:
    song_list.append(data[song])

print("Data length:", len(song_list))

song_sentences = [" ".join(song) for song in song_list]
print("Example song sentence:", song_sentences[0][:100])

Data length: 803
Example song sentence: Bar_None Position_3/16 Note-On_76 Note-Duration_2 Position_4/16 Note-On_74 Note-Duration_2 Position_


# Tokeinzer

In [4]:
tokenizer = GPT2Tokenizer(
    vocab_file="vocab.json", 
    merges_file="merges.txt")

tokenizer.add_special_tokens({'pad_token': 'PAD', 'bos_token': 'BOS', 'eos_token': 'EOS',})

print("vocab size: ", tokenizer.vocab_size)
print("PAD token: ", tokenizer.pad_token_id)
print("BOS token: ", tokenizer.bos_token_id)
print("EOS token: ", tokenizer.eos_token_id)

vocab size:  120
PAD token:  120
BOS token:  121
EOS token:  122


# Dataset Split

In [5]:
split_train_test = int(0.9*len(song_sentences))
print("data length:", len(song_sentences))
print("90% at:     ", split_train_test)

data length: 803
90% at:      722


In [6]:
train_data = song_sentences[:split_train_test]
eval_data = song_sentences[split_train_test:]

In [7]:
class CustomDatasetNew(Dataset):
    def __init__(self, tokenizer, data, max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        inputs = self.tokenizer.encode_plus(
            self.data[index].split(" "),
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        labels = input_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100  # Set padding tokens to -100 for language modeling
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [8]:
# Create an instance of your custom Dataset
train_dataset = CustomDatasetNew(tokenizer=tokenizer, data=train_data, max_length=32)
eval_dataset = CustomDatasetNew(tokenizer=tokenizer, data=eval_data, max_length=32)

# Define your data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [9]:
train_dataset[5]

{'input_ids': tensor([[  0, 107, 118,   5,  38, 109,   8,  38, 111, 118,  10,  38, 113,  13,
           44, 101, 115, 118,  12,  52, 101,   0, 103, 118,  10,  38, 105,   8,
           38, 107, 118,  10]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([[  0, 107, 118,   5,  38, 109,   8,  38, 111, 118,  10,  38, 113,  13,
           44, 101, 115, 118,  12,  52, 101,   0, 103, 118,  10,  38, 105,   8,
           38, 107, 118,  10]])}

# Model

In [52]:
model_hyperparameters = {
    "max_length": 128,
    "emb_dim": 128,
    "attention_heads": 4,
    "layers": 6,
    "dropout": 0.01,
    "learning_rate": 1e-4,
    "epochs": 100,
    "batch_size": 4,
}

model_dirs = {
    "logging": "logs",
    "out": "out/model_test"
}

model_data = {
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "data_collator": data_collator,
}

In [10]:
def train_gpt2_transformer(hyperparameters, tokenizer, data, dirs):

    # define model config and model
    config = GPT2Config(
        vocab_size = tokenizer.vocab_size,
        n_positions = hyperparameters["max_length"], # max seq length
        n_embd = hyperparameters["emb_dim"],
        n_head = hyperparameters["attention_heads"], 
        n_layer = hyperparameters["layers"],
        dropout = hyperparameters["dropout"],
        bos_token_id = tokenizer.bos_token_id,
        eos_token_id = tokenizer.eos_token_id
    )

    model = GPT2LMHeadModel(config)

    # define training arguments
    training_args = TrainingArguments(
        output_dir = dirs["out"],
        overwrite_output_dir = True,
        num_train_epochs = hyperparameters["epochs"],
        per_device_train_batch_size = hyperparameters["batch_size"],
        save_steps = 1000,
        save_total_limit = 5, # maximum number of models to save
        learning_rate = hyperparameters["learning_rate"], 
        #weight_decay=0.01, # You can adjust the weight decay as needed
        #warmup_steps=1_000, # Number of warmup steps for learning rate scheduling
        logging_dir = dirs["logging"],
        logging_steps = 100,
        seed = 4711,
        evaluation_strategy = "epoch",
        logging_strategy = "epoch"
    )

    # Create and train  Trainer
    trainer = Trainer(
        model = model,
        args = training_args,
        data_collator = data["data_collator"],
        train_dataset = data["train_dataset"],
        eval_dataset = data["eval_dataset"]
    )
    
    return trainer

In [11]:
model_hyperparameters = {
    "max_length": 128,
    "emb_dim": 128,
    "attention_heads": 4,
    "layers": 6,
    "dropout": 0.01,
    "learning_rate": 1e-4,
    "epochs": 1,
    "batch_size": 4,
}

model_dirs = {
    "logging": "logs",
    "out": "out/model_test"
}

model_data = {
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "data_collator": data_collator,
}

In [12]:
trainer_test = train_gpt2_transformer(
    hyperparameters = model_hyperparameters,
    tokenizer = tokenizer,
    data = model_data,
    dirs = model_dirs,
)

In [13]:
trainer_test.train()



Epoch,Training Loss,Validation Loss
1,3.5417,3.195796


TrainOutput(global_step=181, training_loss=3.5416782442377417, metrics={'train_runtime': 5.994, 'train_samples_per_second': 120.453, 'train_steps_per_second': 30.197, 'total_flos': 164947034112.0, 'train_loss': 3.5416782442377417, 'epoch': 1.0})

In [14]:
inputs = tokenizer.encode("Bar_None".split(" "), return_tensors="pt")
inputs = inputs.to(device)
inputs

tensor([[0]], device='cuda:0')

In [15]:
outputs = trainer_test.model.generate(inputs, max_length=20, temperature=0.8)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:122 for open-end generation.


tensor([[  0, 104, 119,  17,  38,   0, 103, 118,  17,  38,   0, 103, 118,  17,
          38,   0, 103, 118,  17,  38]], device='cuda:0')

In [17]:
output = trainer_test.model.generate(
    input_ids=inputs,
    max_length=100,  # maximum length of generated text
    do_sample=True,  # enable random sampling
    num_return_sequences=3  # number of generated sequences
)
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:122 for open-end generation.


tensor([[  0, 103, 118,   8, 103, 102,  13,  48, 115,   5,  40, 114, 119,  18,
           8,  38, 109,  15,  52,  84, 112,  20,  44,  40, 111,  12,   8,  38,
         108, 119, 118, 114,  17, 111,  12, 119,  15,  40,  20,  38,  29,  38,
         117,  12,  78, 101, 107,  12,  29,  12,  68,  17,  40,  52, 103,  15,
          44,  13,  38,  22,  46,  15, 118, 104, 119,  11, 104,  25,  40,  77,
          65,  10,  40,  46,   8, 119,  60, 113,  19,  29,  38, 101, 117,  17,
          23,  25,  59,  52,  12, 100,  95,  10,  38, 108,  13, 115,  44, 101,
         113,  17],
        [  0,  94,  44, 101, 111, 119,  15,  52,  65,  38, 103,  22,  17, 101,
         116,  42, 113, 116,  13,  38, 114,  44, 115,  44, 101,  41,  13,  25,
         102, 116,  25,  42, 112, 119,   5,  44,  12, 102,  15,  29, 109,  10,
          10,  10, 105,  15, 112, 119,  26,  13, 107,  12,  59,  48, 101,  44,
         115,  64,  38, 110,  12,  19,  32,   0,  46,   2,   8,  19,  40, 111,
          92,  26, 104,  17, 103

In [18]:
tokenizer.decode(outputs[0], skip_special_tokens=False)

'Bar_NonePosition_3/16Position-Triole_2Note-On_76Note-Duration_2Bar_NonePosition_2/16Position-Triole_1Note-On_76Note-Duration_2Bar_NonePosition_2/16Position-Triole_1Note-On_76Note-Duration_2Bar_NonePosition_2/16Position-Triole_1Note-On_76Note-Duration_2'

In [19]:
# trainer.save_model("gpt_models/model1")

NameError: name 'trainer' is not defined

In [20]:
NOTE_TYPES_following = {
    "start": ["pos"],
    "start-pos": ["pitch", "ptriole"],
    "start-pos-ptriole": ["pitch"],
    "start-pos-pitch": ["duration"],
    "start-pos-ptriole-pitch": ["duration"],
    "start-pos-pitch-duration": ["dtriole"],
    "start-pos-pitch-duration-dtriole": [],
    "start-pos-ptriole-pitch-duration": ["dtriole"],
    "start-pos-ptriole-pitch-duration-dtriole": [],
}

def analyze_token_sequence(seq):
    counts = {note_type: 0 for note_type in NOTE_TYPES_following}
    current_note_type = "start"

    for token in seq:

        if token_type(token) in NOTE_TYPES_following[current_note_type]:
            current_note_type += "-" + token_type(token)
        else:
            counts[current_note_type] += 1
            if token_type(token) == "pos":
                current_note_type = "start-pos"
            else:
                current_note_type = "start"
    
    counts[current_note_type] += 1
    return counts

def token_type(token):
    if token in range(102, 118):
        return "pos"
    elif token in range(1, 37):
        return "pitch"
    elif token in range(118, 120):
        return "ptriole"
    elif token in range(37, 101):
        return "duration"
    elif token == 101:
        return "dtriole"
    elif token == 0:
        return "Bar"
    elif token in range(120, 123):
        return "Bar"
    else:
        raise ValueError("Invalid token: {}".format(token))

In [21]:
import pandas as pd

In [22]:
dic = {}
for idx, res in enumerate(output):
    an = analyze_token_sequence(res)
    dic[idx] = an

pd.DataFrame(dic)

Unnamed: 0,0,1,2
start,37,30,28
start-pos,2,9,11
start-pos-ptriole,1,1,1
start-pos-pitch,8,8,2
start-pos-ptriole-pitch,3,3,3
start-pos-pitch-duration,6,3,9
start-pos-pitch-duration-dtriole,1,0,2
start-pos-ptriole-pitch-duration,0,2,0
start-pos-ptriole-pitch-duration-dtriole,0,0,0


## Sequential Version

In [68]:
# Define GPT-2 model architecture
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=128, # max seq length
    n_embd=128,
    n_head=4, 
    n_layer=6,
    dropout=0.01 
)
model = GPT2LMHeadModel(config)

In [69]:
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "dropout": 0.01,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 128,
  "n_head": 4,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 128,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.27.4",
  "use_cache": true,
  "vocab_size": 120
}

In [71]:
# Define your training arguments
training_args = TrainingArguments(
    output_dir="out",
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_device_train_batch_size=4, # You can adjust the batch size per device as needed
    save_steps=1000,
    save_total_limit=5, # maximum number of models to save
    learning_rate=1e-4, # You can adjust the learning rate as needed
    #weight_decay=0.01, # You can adjust the weight decay as needed
    #warmup_steps=1_000, # Number of warmup steps for learning rate scheduling
    logging_dir='logs', # Directory to save the training logs
    logging_steps=100, # Number of steps to log training progress
    seed=4711, # Set a seed for reproducibility
    evaluation_strategy="epoch",
    logging_strategy="epoch"
)

# Create and train  Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [72]:
# Train the model
training = trainer.train()

Epoch,Training Loss,Validation Loss
1,3.3629,2.717453
2,2.4229,2.26406
3,2.0975,2.017088
4,1.9011,1.843353
5,1.766,1.729854
6,1.6763,1.656317
7,1.608,1.599752
8,1.5655,1.550471
9,1.5202,1.523684
10,1.4959,1.499394


In [73]:
training

TrainOutput(global_step=5430, training_loss=1.5485621555953615, metrics={'train_runtime': 162.8006, 'train_samples_per_second': 133.046, 'train_steps_per_second': 33.354, 'total_flos': 4948411023360.0, 'train_loss': 1.5485621555953615, 'epoch': 30.0})

In [74]:
inputs = tokenizer.encode("Bar_None".split(" "), return_tensors="pt")
inputs = inputs.to(device)
inputs

tensor([[0]], device='cuda:0')

In [18]:
######## Variante A ########

In [26]:
outputs = model(inputs)
outputs.logits.shape

torch.Size([1, 1, 120])

In [27]:
outputs.logits

tensor([[[ 0.2547, -2.1192, -7.2306, -2.7882, -7.2182, -1.6130, -3.2042,
          -7.0037, -1.7453, -3.5268, -1.4314, -3.0444, -2.2273, -1.1538,
          -3.5951, -1.4390, -3.5149, -1.0911, -3.2377, -3.2269, -2.1793,
          -7.2091, -1.7342, -3.2474, -3.2308, -2.2438, -7.3862, -2.9835,
          -7.5360, -3.4471, -7.1169, -7.0729, -3.7748, -7.2140, -7.0705,
          -7.2860, -7.2197, -3.6261, -0.6181, -7.1656, -0.8093, -7.2236,
          -3.1740, -7.2032, -1.3843, -7.2304, -2.8611, -7.3672, -2.5701,
          -7.2789, -3.4928, -7.2448, -2.5277, -7.3382, -2.8937, -7.1164,
          -3.2698, -7.4626, -3.0290, -7.2825, -4.7054, -7.3309, -3.6156,
          -3.8380, -3.5813, -4.7972, -3.9706, -7.2940, -1.2369, -7.3213,
          -3.5374, -4.9148, -4.2818, -7.1624, -4.3432, -7.1656, -2.3467,
          -7.1572, -3.5976, -4.0147, -3.6525, -7.1724, -7.3088, -7.2978,
          -4.0406, -7.5917, -7.1573, -7.2811, -7.2571, -7.2548, -7.3264,
          -7.1548, -2.0736, -7.1610, -7.2467, -7.28

In [28]:
# Temperature value
temperature = 13

# Convert logits to probabilities using softmax with temperature
probs = F.softmax(outputs.logits / temperature, dim=-1)

# Sample a token from the probability distribution for each position in the sequence
predicted_tokens = torch.multinomial(probs.view(-1, probs.shape[-1]), num_samples=1).view(*probs.shape[:-1])
predicted_tokens

tensor([[17]], device='cuda:0')

In [29]:
tokenizer.decode(predicted_tokens[0], skip_special_tokens=False)

'Note-On_76'

In [30]:
######## Variante B ########

In [79]:
#outputs = model.generate(inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
outputs = model.generate(inputs, max_length=20, temperature=0.8)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[  0, 104, 119,   8,  38, 106,   8,  44, 101, 107, 118,   8,  44, 101,
         108, 119,   8,  38, 110,   8]], device='cuda:0')

In [89]:
tokenizer.decode(outputs[0], skip_special_tokens=False)

'Bar_NonePosition_3/16Position-Triole_2Note-On_67Note-Duration_2Position_5/16Note-On_67Note-Duration_8Note-Duration_triolePosition_6/16Position-Triole_1Note-On_67Note-Duration_8Note-Duration_triolePosition_7/16Position-Triole_2Note-On_67Note-Duration_2Position_9/16Note-On_67'

In [35]:
trainer.save_model("gpt_models/model1")

## Load model

In [30]:
model = GPT2LMHeadModel.from_pretrained("gpt_models/model1")

In [31]:
inputs = tokenizer.encode("Bar_None".split(" "), return_tensors="pt")
inputs = inputs.to("cpu")
inputs

tensor([[0]])

In [32]:
outputs = model.generate(inputs, max_length=20, temperature=0.8)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[  0, 103, 118,  10,  40, 105,  10,  44, 101, 107, 118,  10,  38, 109,
          13,  38, 110,  10,  44, 101]])

In [33]:
output = model.generate(
    input_ids=inputs,
    max_length=100,  # maximum length of generated text
    do_sample=True,  # enable random sampling
    num_return_sequences=3  # number of generated sequences
)
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[  0, 105,  25,  38, 106,  24,  38, 107,  22,  42, 110,  20,  40, 113,
          22,  38, 114,  22,  68, 101,   0, 105,  22,  38, 106,  22,  38, 107,
          24,  40, 109,  22,  38, 109,  22,  38, 107,  24,  40, 109,  22,  40,
         109,  15,  38, 109,  20,  38, 109,  20,  38, 109,  22,  38, 109,  24,
          40, 113,  20,  42, 113,  20,  38,   0, 109,  22,  38, 101,   0, 103,
          22,  40, 109,  20,  38, 101,   0, 105,  20,  38, 101,   0, 105,  22,
          42, 113,  17,  38, 109,  22,  38,   0, 105,  24,  42, 113,  22,  38,
         101,   0],
        [  0, 100,  22,  52,   0, 105,  22,  40, 107,  17,  44, 111,  15,  44,
         115,  13,  44,   0, 103,  15,  44, 107,  13,  76,   0, 105,  17,  44,
         111,  20,  46, 115,  17,  44, 117,  20,  40, 113,  17,  44, 117,  20,
          44, 117,  20,  44,   0, 103,  22,  44,   0, 103,  15,  44, 107,   1,
          44, 111,  17,  44,   0, 103,  22,  44, 115,  13,  44,   0, 103,  10,
          44,   0, 103,  13,  52

In [34]:
dic = {}
for idx, res in enumerate(output):
    an = analyze_token_sequence(res)
    dic[idx] = an

pd.DataFrame(dic)

Unnamed: 0,0,1,2
start,2,5,1
start-pos,0,1,0
start-pos-ptriole,0,0,0
start-pos-pitch,0,0,0
start-pos-ptriole-pitch,0,0,1
start-pos-pitch-duration,24,26,2
start-pos-pitch-duration-dtriole,5,1,5
start-pos-ptriole-pitch-duration,0,1,12
start-pos-ptriole-pitch-duration-dtriole,0,0,4


In [11]:
tokens = [tensor.item() for tensor in list(outputs[0])]
tokens

[0,
 103,
 118,
 10,
 40,
 105,
 10,
 44,
 101,
 107,
 118,
 10,
 38,
 109,
 13,
 38,
 110,
 10,
 44,
 101]