In [1]:
import numpy as np
import json
import torch
import torch.nn.functional as F

## Data Prep

In [2]:
with open('words_to_tokens.json', 'r') as fp:
    tokens = json.load(fp)

list(tokens.keys())[:10]

['Bar_None',
 'Note-On_60',
 'Note-On_61',
 'Note-On_62',
 'Note-On_63',
 'Note-On_64',
 'Note-On_65',
 'Note-On_66',
 'Note-On_67',
 'Note-On_68']

In [3]:
#tokens["[UNK]"] = 120
#tokens["[CLS]"] = 121
#tokens["[SEP]"] = 122
#tokens["[PAD]"] = 123
#tokens["[MASK]"] = 124
tokens

{'Bar_None': 0,
 'Note-On_60': 1,
 'Note-On_61': 2,
 'Note-On_62': 3,
 'Note-On_63': 4,
 'Note-On_64': 5,
 'Note-On_65': 6,
 'Note-On_66': 7,
 'Note-On_67': 8,
 'Note-On_68': 9,
 'Note-On_69': 10,
 'Note-On_70': 11,
 'Note-On_71': 12,
 'Note-On_72': 13,
 'Note-On_73': 14,
 'Note-On_74': 15,
 'Note-On_75': 16,
 'Note-On_76': 17,
 'Note-On_77': 18,
 'Note-On_78': 19,
 'Note-On_79': 20,
 'Note-On_80': 21,
 'Note-On_81': 22,
 'Note-On_82': 23,
 'Note-On_83': 24,
 'Note-On_84': 25,
 'Note-On_85': 26,
 'Note-On_86': 27,
 'Note-On_87': 28,
 'Note-On_88': 29,
 'Note-On_89': 30,
 'Note-On_90': 31,
 'Note-On_91': 32,
 'Note-On_92': 33,
 'Note-On_93': 34,
 'Note-On_94': 35,
 'Note-On_95': 36,
 'Note-Duration_1': 37,
 'Note-Duration_2': 38,
 'Note-Duration_3': 39,
 'Note-Duration_4': 40,
 'Note-Duration_5': 41,
 'Note-Duration_6': 42,
 'Note-Duration_7': 43,
 'Note-Duration_8': 44,
 'Note-Duration_9': 45,
 'Note-Duration_10': 46,
 'Note-Duration_11': 47,
 'Note-Duration_12': 48,
 'Note-Duration_13

In [4]:
with open("vocab.txt", "w") as txt_file:
    for word in list(tokens.keys()) + ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]:
        txt_file.write("".join(word) + "\n")

In [5]:
with open('vocab.json', 'w') as fp:
    json.dump(tokens, fp, sort_keys=False)

In [6]:
with open('data_words.json', 'r') as fp:
    data = json.load(fp)

song_list = []
for song in data:
    song_list.append(data[song])

len(song_list)

803

In [7]:
with open('../data.json', 'r') as fp:
    data = json.load(fp)

token_list = []
for song in data:
    token_list.append(data[song])

len(token_list)

803

In [8]:
as_sentences = [" ".join(song) for song in song_list]
as_sentences[0][:100]

'Bar_None Position_3/16 Note-On_76 Note-Duration_2 Position_4/16 Note-On_74 Note-Duration_2 Position_'

In [9]:
with open("output.txt", "w", encoding='utf-8') as txt_file:
    for line in as_sentences[:-1]:
        txt_file.write("".join(line) + "\n")
    txt_file.write("".join(as_sentences[-1]))

## Test Tokenizer variations

### BPE und Whitespace

In [10]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace

In [11]:
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()

tokenizer.add_tokens(list(tokens.keys()))
tokenizer.add_special_tokens(["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

0

In [12]:
tokenizer.get_vocab_size()

125

In [13]:
# testing

output = tokenizer.encode(as_sentences[1], add_special_tokens=True)
output

Encoding(num_tokens=1352, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [14]:
output.ids[:20]

[0, 112, 5, 38, 113, 5, 38, 114, 5, 38, 116, 5, 38, 117, 5, 38, 0, 102, 5, 38]

In [15]:
tokenizer.id_to_token(123)

'[PAD]'

In [16]:
#tokenizer.enable_padding(pad_id=126, pad_token="[PAD]")

In [17]:
tokenizer.get_vocab_size()

125

In [18]:
tokenizer.save("tokenizer.pkl")

### custom

In [None]:
import torch
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel,PreTrainedTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Define your own vocabulary
vocabulary = ["this", "is", "sentence", "1", "2", "3", "."]

# Create a custom tokenizer by inheriting from PreTrainedTokenizer
class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab):
        self.vocab = vocab
        #self.vocab_size = len(vocab)
        self.pad_token = "<pad>"
        self.eos_token = "<eos>"
        self.unk_token = "<unk>"
        self.cls_token = "<cls>"
        self.mask_token = "<mask>"

        self.vocab_dict = {v: i for i, v in enumerate(self.vocab)}

    def __call__(self, text):
        # Implement tokenization logic here
        tokens = text.split()
        return tokens

    def convert_tokens_to_ids(self, tokens):
        # Implement conversion from tokens to ids here
        return [self.vocab_dict[t] for t in tokens]

    def convert_ids_to_tokens(self, ids):
        # Implement conversion from ids to tokens here
        return [self.vocab[i] for i in ids]

    def encode(self, text):
        tokens = self(text)
        ids = self.convert_tokens_to_ids(tokens)
        return ids

    def decode(self, ids):
        tokens = self.convert_ids_to_tokens(ids)
        text = " ".join(tokens)
        return text

# Create an instance of the custom tokenizer with your vocabulary
tokenizer = CustomTokenizer(vocabulary)

# Define your GPT-2 model configuration
model_config = GPT2Config(vocab_size=len(vocabulary))

# Create your GPT-2 model
model = GPT2LMHeadModel(config=model_config)

### pretrained gpt2

In [14]:
#tokenizer = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)
#tokenizer("Hello world")['input_ids']

## Test Data Variations

#### Test data via dataloader

In [33]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(songs_test, batch_size=4, shuffle=True)

In [42]:
for i in iter(train_dataloader):
    print(i)

['Bar P1 N61 D3 P2 N62 D2 P3 N63 D3 Bar', 'Bar P1 N60 D3 P2 N62 D3 P3 N63 D2 Bar', 'Bar P1 N60 D1 P2 N62 D2 P3 N63 D2 Bar', 'Bar P1 N62 D2 P2 N62 D1 P3 N63 D1 Bar']
['Bar P1 N61 D2 P2 N60 D3 P3 N60 D2 Bar', 'Bar P1 N62 D1 P2 N62 D3 P3 N63 D3 Bar', 'Bar P1 N60 D2 P2 N60 D1 P3 N60 D1 Bar', 'Bar P1 N60 D1 P2 N60 D1 P3 N60 D1 Bar']


### Test data via dataset

In [59]:
songs_test = [ 
    "Bar P1 N60 D1 P2 N60 D1 P3 N60 D1 Bar",
    "Bar P1 N62 D2 P2 N62 D1 P3 N61 D1 Bar",
    "Bar P1 N61 D3 P2 N62 D2 P3 N62 D3 Bar",
    "Bar P1 N62 D1 P2 N62 D3 P3 N61 D3 Bar",
    "Bar P1 N61 D2 P2 N60 D3 P3 N60 D2 Bar",
    "Bar P1 N60 D3 P2 N62 D3 P3 N62 D2 Bar",
    "Bar P1 N60 D1 P2 N62 D2 P3 N61 D2 Bar",
    "Bar P1 N60 D2 P2 N60 D1 P3 N60 D1 Bar"
]
test_train_dataset = StringListDataset(songs_test)

print(test_train_dataset[0])
print(len(test_train_dataset))

Bar P1 N60 D1 P2 N60 D1 P3 N60 D1 Bar
8


### LineByLineDataset

In [None]:
# Load and preprocess your training data
#train_dataset = LineByLineTextDataset(
#    tokenizer=tokenizer,
#    file_path="output3.txt",
#    block_size=4
#)
#
#data_collator = DataCollatorForLanguageModeling(
#    tokenizer=tokenizer, mlm=False
#)

## GPT Transformer


#### Test data via files

In [10]:
tokens_test = {
    "Bar": 0,
    "N60": 1,
    "N61": 2,
    "N62": 3,
    "D1": 4,
    "D2": 5,
    "D3": 6,
    "P1": 7,
    "P2": 8,
    "P3": 9,
    #"[UNK]": 10,
    #"[CLS]": 11,
    #"[SEP]": 12,
    "PAD": 10,
    #"[MASK]": 14
}

with open('vocab3.json', 'w') as fp:
    json.dump(tokens_test, fp, sort_keys=False)

In [11]:
songs_test = [ 
    "Bar P1 N60 D1 P2 N60 D1 P3 N60 D1 Bar",
    "Bar P1 N62 D2 P2 N62 D1 P3 N63 D1 Bar",
    "Bar P1 N61 D3 P2 N62 D2 P3 N63 D3 Bar",
    "Bar P1 N62 D1 P2 N62 D3 P3 N63 D3 Bar",
    "Bar P1 N61 D2 P2 N60 D3 P3 N60 D2 Bar",
    "Bar P1 N60 D3 P2 N62 D3 P3 N63 D2 Bar",
    "Bar P1 N60 D1 P2 N62 D2 P3 N63 D2 Bar",
    "Bar P1 N60 D2 P2 N60 D1 P3 N60 D1 Bar"
]

with open("output3.txt", "w", encoding='utf-8') as txt_file:
    for line in songs_test[:-1]:
        txt_file.write("".join(line) + "\n")
    txt_file.write("".join(songs_test[-1]))

# Working Code

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
#device = "cpu"
device

'cuda'

In [11]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from torch.utils.data import Dataset

In [12]:
# Create a GPT2 tokenizer
#tokenizer = GPT2Tokenizer(
#    vocab_file="vocab3.json", 
#    merges_file="merges3.txt")

tokenizer = GPT2Tokenizer(
    vocab_file="vocab.json", 
    merges_file="merges.txt")

In [13]:
tokenizer.vocab_size

120

In [14]:
tokenizer.add_special_tokens({'pad_token': 'PAD'})

1

In [15]:
tokenizer.pad_token_id

120

In [40]:
split_train_test = int(0.9*len(as_sentences))
print("data length:", len(as_sentences))
print("90% at:     ", split_train_test)

data length: 803
90% at:      722


In [41]:
"""
train_data = [ 
    "Bar P1 N60 D1 P2 N60 D1 P3 N60 D1 Bar",
    "Bar P1 N62 D2 P2 N62 D1 P3 N61 D1 Bar",
    "Bar P1 N61 D3 P2 N62 D2 P3 N62 D3 Bar",
    "Bar P1 N62 D1 P2 N62 D3 P3 N61 D3 Bar",
    "Bar P1 N61 D2 P2 N60 D3 P3 N60 D2 Bar",
    "Bar P1 N60 D3 P2 N62",
    "Bar P1 N60 D1 P2 N62 D2 P3 N61 D2 Bar",
    "Bar P1 N60 D2 P2 N60 D1 P3 N60"
]
"""
train_data = as_sentences[:split_train_test]
eval_data = as_sentences[split_train_test:]

In [42]:
#def pad_and_encode_data(data, max_length=128, padding_token="[PAD]"):
#    encoded_data = []
#    for song in data:
#        song = song.split(" ")
#        if len(song) < max_length:
#            remaining_length = max_length - len(song)
#            song = song + [padding_token] * remaining_length
#        else:
#            song = song[:max_length]
#        encoded_data.append(song)
#    
#    return encoded_data

In [43]:
#train_dataset = pad_and_encode_data(train_data, max_length=12)
#train_dataset[0]

In [44]:
class CustomDatasetNew(Dataset):
    def __init__(self, tokenizer, data, max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        inputs = self.tokenizer.encode_plus(
            self.data[index].split(" "),
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        labels = input_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100  # Set padding tokens to -100 for language modeling
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [45]:
# Create an instance of your custom Dataset
train_dataset = CustomDatasetNew(tokenizer=tokenizer, data=train_data, max_length=32)
eval_dataset = CustomDatasetNew(tokenizer=tokenizer, data=eval_data, max_length=32)

# Define your data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [46]:
train_dataset[5]

{'input_ids': tensor([[  0, 107, 118,   5,  38, 109,   8,  38, 111, 118,  10,  38, 113,  13,
           44, 101, 115, 118,  12,  52, 101,   0, 103, 118,  10,  38, 105,   8,
           38, 107, 118,  10]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([[  0, 107, 118,   5,  38, 109,   8,  38, 111, 118,  10,  38, 113,  13,
           44, 101, 115, 118,  12,  52, 101,   0, 103, 118,  10,  38, 105,   8,
           38, 107, 118,  10]])}

In [47]:
# Define GPT-2 model architecture
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=512, # max seq length
    n_embd=32,
    n_head=2, 
    n_layer=3,
    dropout=0.1 
)
model = GPT2LMHeadModel(config)

In [48]:
# Define your training arguments
training_args = TrainingArguments(
    output_dir="out",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4, # You can adjust the batch size per device as needed
    save_steps=1000,
    save_total_limit=5, # maximum number of models to save
    learning_rate=1e-4, # You can adjust the learning rate as needed
    #weight_decay=0.01, # You can adjust the weight decay as needed
    #warmup_steps=1_000, # Number of warmup steps for learning rate scheduling
    logging_dir='logs', # Directory to save the training logs
    logging_steps=100, # Number of steps to log training progress
    seed=4711, # Set a seed for reproducibility
    evaluation_strategy="epoch",
    logging_strategy="epoch"
)

# Create and train  Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [49]:
# Train the model
training = trainer.train()

Epoch,Training Loss,Validation Loss
1,4.3881,4.111854
2,3.915,3.767926
3,3.6253,3.520708
4,3.4017,3.32467
5,3.2275,3.17399
6,3.0928,3.062072
7,2.9968,2.983178
8,2.9304,2.928416
9,2.8863,2.898138
10,2.8665,2.888235


In [50]:
training

TrainOutput(global_step=1810, training_loss=3.33303306958952, metrics={'train_runtime': 33.4721, 'train_samples_per_second': 215.702, 'train_steps_per_second': 54.075, 'total_flos': 52921098240.0, 'train_loss': 3.33303306958952, 'epoch': 10.0})

In [53]:
inputs = tokenizer.encode("Bar_None".split(" "), return_tensors="pt")
inputs = inputs.to(device)
inputs

tensor([[0]], device='cuda:0')

In [54]:
######## Variante A ########

In [55]:
outputs = model(inputs)
outputs.logits.shape

torch.Size([1, 1, 120])

In [56]:
outputs.logits

tensor([[[ 1.8728, -0.9974, -1.3476, -0.8551, -1.4070, -0.4868, -0.9296,
          -1.4357, -0.2433, -1.2373, -0.3960, -1.4038, -0.5088, -0.1439,
          -1.3268, -0.2886, -1.2116, -0.2242, -0.9016, -1.1739, -0.6277,
          -1.2017, -0.5254, -1.3868, -1.0088, -0.9413, -1.2695, -1.1574,
          -1.2903, -1.0802, -1.3319, -1.1735, -1.2646, -1.4013, -1.1021,
          -1.2026, -1.1902, -1.2390,  0.6239, -1.4337,  0.5129, -1.3960,
          -0.2931, -1.3144,  0.4770, -1.1552, -0.8063, -1.3491, -0.7926,
          -1.1014, -1.1950, -1.3592, -0.3322, -1.3793, -1.2492, -1.5371,
          -1.2450, -1.2865, -1.4448, -1.2782, -1.1670, -1.4160, -1.1576,
          -1.3678, -1.2293, -1.1111, -1.0761, -1.2141, -0.8574, -1.3414,
          -1.3950, -1.2461, -1.1994, -1.3287, -1.2462, -1.1890, -1.0250,
          -1.3294, -1.4661, -1.4220, -1.4358, -1.2183, -1.1498, -1.3585,
          -1.2873, -1.5328, -1.1576, -1.3172, -1.2318, -1.4232, -1.3945,
          -1.3108, -1.0180, -1.2760, -1.4022, -1.23

In [57]:
# Temperature value
temperature = 13

# Convert logits to probabilities using softmax with temperature
probs = F.softmax(outputs.logits / temperature, dim=-1)

# Sample a token from the probability distribution for each position in the sequence
predicted_tokens = torch.multinomial(probs.view(-1, probs.shape[-1]), num_samples=1).view(*probs.shape[:-1])
predicted_tokens

tensor([[23]], device='cuda:0')

In [58]:
tokenizer.decode(predicted_tokens[0], skip_special_tokens=False)

'Note-On_82'

In [59]:
######## Variante B ########

In [66]:
#outputs = model.generate(inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
outputs = model.generate(inputs, max_length=20, temperature=0.8)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[  0,   0, 103,  17,  38,   0, 103,  17,  38,   0, 103,  17,  38,   0,
         103,  17,  38,   0, 103,  17]], device='cuda:0')

In [67]:
tokenizer.decode(outputs[0], skip_special_tokens=False)

'Bar_NoneBar_NonePosition_2/16Note-On_76Note-Duration_2Bar_NonePosition_2/16Note-On_76Note-Duration_2Bar_NonePosition_2/16Note-On_76Note-Duration_2Bar_NonePosition_2/16Note-On_76Note-Duration_2Bar_NonePosition_2/16Note-On_76'

In [None]:
#### Old version Dataset #####

In [38]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, data):
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data[index]
        # Tokenize the text
        input_ids = self.tokenizer.encode(text.split(" "), add_special_tokens=True)
        return torch.tensor(input_ids)

In [39]:
# Encode the training data using your custom tokenizer
train_dataset = CustomDataset(tokenizer, train_data)

# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [72]:
####### ALTERNATIVE PARAMETERS?? ##########

# Define the GPT2 model configuration
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=512, # You can adjust the maximum sequence length as needed
    n_ctx=512,
    n_embd=768, # You can adjust the model dimension as needed
    n_layer=12, # You can adjust the number of layers as needed
    n_head=12, # You can adjust the number of attention heads as needed
    dropout=0.1 # You can adjust the dropout rate as needed
)

# Instantiate the GPT2 model
model = GPT2LMHeadModel(config)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='out', # Directory to save the model and training logs
    overwrite_output_dir=True,
    num_train_epochs=10, # You can adjust the number of epochs as needed
    per_device_train_batch_size=4, # You can adjust the batch size per device as needed
    save_steps=10_000, # Number of steps to save the model during training
    save_total_limit=2, # Number of saved models to keep
    learning_rate=1e-4, # You can adjust the learning rate as needed
    weight_decay=0.01, # You can adjust the weight decay as needed
    warmup_steps=1_000, # Number of warmup steps for learning rate scheduling
    #logging_dir='./logs', # Directory to save the training logs
    #logging_steps=100, # Number of steps to log training progress
    #overwrite_cache=True,
    seed=42 # Set a seed for reproducibility
)

# Instantiate the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

# Train the model
trainer.train()

## only predict

In [68]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [69]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

In [70]:
sentence = "I love Paris"
input_ids = tokenizer.encode(sentence, return_tensors='pt')

In [71]:
input_ids

tensor([[  40, 1842, 6342]])

In [72]:
output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

In [73]:
tokenizer.decode(output[0], skip_special_tokens=False)

'I love Paris. It\'s a beautiful city, but it\'s also one of the most beautiful places I\'ve ever been to."\n\n"I\'m not sure if I\'ll ever be able to live in Paris again," he added. "I don\'t know what I\'m going to do with my life."<|endoftext|>'