In [1]:
import numpy as np
import json
import torch
import torch.nn.functional as F

from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from torch.utils.data import Dataset

# Data

In [2]:
with open('data_words.json', 'r') as fp:
    data = json.load(fp)

song_list = []
for song in data:
    song_list.append(data[song])

len(song_list)

803

In [3]:
song_sentences = [" ".join(song) for song in song_list]
song_sentences[0][:100]

'Bar_None Position_3/16 Note-On_76 Note-Duration_2 Position_4/16 Note-On_74 Note-Duration_2 Position_'

# Model

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
#device = "cpu"
device

'cuda'

In [5]:
tokenizer = GPT2Tokenizer(
    vocab_file="vocab.json", 
    merges_file="merges.txt")

In [6]:
tokenizer.vocab_size

120

In [7]:
tokenizer.add_special_tokens({'pad_token': 'PAD'})
tokenizer.pad_token_id

120

In [8]:
split_train_test = int(0.9*len(song_sentences))
print("data length:", len(song_sentences))
print("90% at:     ", split_train_test)

data length: 803
90% at:      722


In [9]:
train_data = song_sentences[:split_train_test]
eval_data = song_sentences[split_train_test:]

In [10]:
class CustomDatasetNew(Dataset):
    def __init__(self, tokenizer, data, max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        inputs = self.tokenizer.encode_plus(
            self.data[index].split(" "),
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        labels = input_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100  # Set padding tokens to -100 for language modeling
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [11]:
# Create an instance of your custom Dataset
train_dataset = CustomDatasetNew(tokenizer=tokenizer, data=train_data, max_length=32)
eval_dataset = CustomDatasetNew(tokenizer=tokenizer, data=eval_data, max_length=32)

# Define your data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
train_dataset[5]

{'input_ids': tensor([[  0, 107, 118,   5,  38, 109,   8,  38, 111, 118,  10,  38, 113,  13,
           44, 101, 115, 118,  12,  52, 101,   0, 103, 118,  10,  38, 105,   8,
           38, 107, 118,  10]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([[  0, 107, 118,   5,  38, 109,   8,  38, 111, 118,  10,  38, 113,  13,
           44, 101, 115, 118,  12,  52, 101,   0, 103, 118,  10,  38, 105,   8,
           38, 107, 118,  10]])}

In [13]:
# Define GPT-2 model architecture
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=512, # max seq length
    n_embd=32,
    n_head=2, 
    n_layer=3,
    dropout=0.1 
)
model = GPT2LMHeadModel(config)

In [14]:
# Define your training arguments
training_args = TrainingArguments(
    output_dir="out",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4, # You can adjust the batch size per device as needed
    save_steps=1000,
    save_total_limit=5, # maximum number of models to save
    learning_rate=1e-4, # You can adjust the learning rate as needed
    #weight_decay=0.01, # You can adjust the weight decay as needed
    #warmup_steps=1_000, # Number of warmup steps for learning rate scheduling
    logging_dir='logs', # Directory to save the training logs
    logging_steps=100, # Number of steps to log training progress
    seed=4711, # Set a seed for reproducibility
    evaluation_strategy="epoch",
    logging_strategy="epoch"
)

# Create and train  Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [15]:
# Train the model
training = trainer.train()



Epoch,Training Loss,Validation Loss
1,4.4072,4.123715
2,3.9152,3.762995
3,3.613,3.505984
4,3.3814,3.298367
5,3.1986,3.139761
6,3.0598,3.025322
7,2.9619,2.946981
8,2.8954,2.892172
9,2.8517,2.862579
10,2.8311,2.852664


In [16]:
training

TrainOutput(global_step=1810, training_loss=3.311536824769078, metrics={'train_runtime': 34.781, 'train_samples_per_second': 207.585, 'train_steps_per_second': 52.04, 'total_flos': 52921098240.0, 'train_loss': 3.311536824769078, 'epoch': 10.0})

In [17]:
inputs = tokenizer.encode("Bar_None".split(" "), return_tensors="pt")
inputs = inputs.to(device)
inputs

tensor([[0]], device='cuda:0')

In [18]:
######## Variante A ########

In [19]:
outputs = model(inputs)
outputs.logits.shape

torch.Size([1, 1, 120])

In [20]:
outputs.logits

tensor([[[ 1.8716, -0.8818, -1.2642, -0.9056, -1.1093, -0.4502, -0.9754,
          -1.2616, -0.1503, -1.2166, -0.1434, -1.4028, -0.4292, -0.1614,
          -1.2045, -0.2191, -1.3437, -0.1735, -1.0774, -1.2823, -0.4968,
          -1.2509, -0.7181, -1.1255, -0.9473, -0.8294, -1.2645, -0.9667,
          -1.2132, -1.1592, -1.2247, -1.3238, -1.1587, -1.3723, -1.3077,
          -1.3144, -1.3356, -1.2003,  0.3147, -1.1871,  0.2662, -1.3798,
          -0.4691, -1.3352,  0.1222, -1.3819, -0.8457, -1.2799, -0.8460,
          -1.2346, -1.0648, -1.1097, -0.5477, -1.1242, -1.2166, -1.3411,
          -1.1694, -1.2744, -1.0797, -1.3393, -1.1808, -1.2164, -1.1597,
          -1.2358, -1.2140, -1.4151, -1.1245, -1.1122, -0.9653, -1.2103,
          -1.2045, -1.1862, -1.2442, -1.3524, -1.3124, -1.3874, -1.1369,
          -1.2478, -1.4003, -1.3019, -1.2208, -1.4746, -1.3512, -1.4884,
          -1.1901, -1.4030, -1.3098, -1.3627, -1.2748, -1.1804, -1.2501,
          -1.2360, -1.1162, -1.4035, -1.1791, -1.42

In [21]:
# Temperature value
temperature = 13

# Convert logits to probabilities using softmax with temperature
probs = F.softmax(outputs.logits / temperature, dim=-1)

# Sample a token from the probability distribution for each position in the sequence
predicted_tokens = torch.multinomial(probs.view(-1, probs.shape[-1]), num_samples=1).view(*probs.shape[:-1])
predicted_tokens

tensor([[23]], device='cuda:0')

In [22]:
tokenizer.decode(predicted_tokens[0], skip_special_tokens=False)

'Note-On_82'

In [23]:
######## Variante B ########

In [24]:
#outputs = model.generate(inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
outputs = model.generate(inputs, max_length=20, temperature=0.8)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[  0,   0, 103,  17,  38,   0, 103,  17,  38,   0, 103,  17,  38,   0,
         103,  17,  38,   0, 103,  17]], device='cuda:0')

In [25]:
tokenizer.decode(outputs[0], skip_special_tokens=False)

'Bar_NoneBar_NonePosition_2/16Note-On_76Note-Duration_2Bar_NonePosition_2/16Note-On_76Note-Duration_2Bar_NonePosition_2/16Note-On_76Note-Duration_2Bar_NonePosition_2/16Note-On_76Note-Duration_2Bar_NonePosition_2/16Note-On_76'