In [1]:
import numpy as np
import json
import torch

In [2]:
with open('words_to_tokens.json', 'r') as fp:
    tokens = json.load(fp)

list(tokens.keys())[:10]

['Bar_None',
 'Note-On_60',
 'Note-On_61',
 'Note-On_62',
 'Note-On_63',
 'Note-On_64',
 'Note-On_65',
 'Note-On_66',
 'Note-On_67',
 'Note-On_68']

In [3]:
with open("vocab.txt", "w") as txt_file:
    for word in list(tokens.keys()) + ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]:
        txt_file.write("".join(word) + "\n")

In [4]:
with open('data_words.json', 'r') as fp:
    data = json.load(fp)

song_list = []
for song in data:
    song_list.append(data[song])

len(song_list)

803

In [5]:
with open('../data.json', 'r') as fp:
    data = json.load(fp)

token_list = []
for song in data:
    token_list.append(data[song])

len(token_list)

803

In [6]:
as_sentences = [" ".join(song) for song in song_list]
as_sentences[0][:100]

'Bar_None Position_3/16 Note-On_76 Note-Duration_2 Position_4/16 Note-On_74 Note-Duration_2 Position_'

In [7]:
with open("output.txt", "w") as txt_file:
    for line in as_sentences:
        txt_file.write("".join(line) + "\n")

## Tokenizer

In [8]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace

In [9]:
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()

tokenizer.add_tokens(list(tokens.keys()))
tokenizer.add_special_tokens(["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

5

In [10]:
tokenizer.get_vocab_size()

125

In [11]:
# testing

output = tokenizer.encode(as_sentences[1], add_special_tokens=True)
output

Encoding(num_tokens=1352, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [12]:
output.ids[:20]

[0, 112, 5, 38, 113, 5, 38, 114, 5, 38, 116, 5, 38, 117, 5, 38, 0, 102, 5, 38]

In [13]:
tokenizer.id_to_token(123)

'[PAD]'

In [14]:
#tokenizer.enable_padding(pad_id=126, pad_token="[PAD]")

In [15]:
tokenizer.get_vocab_size()

125

In [16]:
tokenizer.save("tokenizer.pkl")

## GPT Tokenizer

In [17]:
from transformers import GPT2Tokenizer

## Load the GPT-2 tokenizer
#tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

## Use the tokenizer for encoding text
#text = "This is an example sentence."
#encoded = tokenizer.encode(text, return_tensors="pt")  # Use return_tensors="pt" to get PyTorch tensors
#input_ids = encoded.input_ids
#attention_mask = encoded.attention_mask

## GPT Transformer


In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"
#device = "cpu"
device

'cuda'

In [19]:
from transformers import GPT2Tokenizer

In [20]:
# Create a GPT2 tokenizer
#tokenizer = GPT2Tokenizer(
#    vocab_file="vocab.txt", 
#    merges_file="merges.txt")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer("Hello world")['input_ids']

[15496, 995]

In [21]:
tokenizer.vocab_size

50257

In [22]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [23]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [24]:
# Define GPT-2 model architecture
config = GPT2Config(vocab_size=tokenizer.vocab_size,
                    n_embd=128,
                    n_head=4, 
                    n_layer=6)
model = GPT2LMHeadModel(config)

In [25]:
# Load your own tokenizer
#tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")

In [26]:
# Load and preprocess your training data
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="outputs_2.txt",
    block_size=4
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)



In [27]:
train_dataset[0]

{'input_ids': tensor([1212,  318,  262, 2420])}

In [28]:
type(model)

transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

In [29]:
# Define your training arguments
training_args = TrainingArguments(
    output_dir="out",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    #save_steps=500,
    save_total_limit=10, # maximum number of models to save
)

# Create and train your Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)



In [30]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=20, training_loss=9.551238250732421, metrics={'train_runtime': 0.9729, 'train_samples_per_second': 30.836, 'train_steps_per_second': 20.557, 'total_flos': 856719360.0, 'train_loss': 9.551238250732421, 'epoch': 10.0})

In [34]:
# variante A

In [35]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
inputs

{'input_ids': tensor([[15496,    11,   616,  3290,   318, 13779]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [37]:
inputs = inputs.to(device)

In [60]:
outputs = model(**inputs)
#outputs = model.generate(inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
outputs

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[-0.1101,  0.4144, -0.0388,  ..., -0.1137, -0.1126,  0.0085],
         [ 0.0663,  0.2577, -0.4054,  ..., -0.2555,  0.0845, -0.2936],
         [ 0.1112, -0.1064, -0.3851,  ..., -0.1803, -0.2140, -0.4597],
         [ 0.2386, -0.0066, -0.3311,  ..., -0.3309, -0.1144, -0.2563],
         [-0.3824,  0.2341, -0.0487,  ..., -0.3192,  0.0761,  0.1104],
         [-0.1939, -0.1035, -0.0943,  ..., -0.1890,  0.1804, -0.3052]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-1.8239e-01,  1.8107e-01, -1.0884e-02,  3.2554e-01,  1.2773e-01,
            3.0476e-01, -1.7105e-01, -1.9963e-01, -1.4855e-01, -2.4233e-01,
           -5.3293e-01,  2.5802e-01, -1.0833e-01,  5.0474e-01, -1.7720e-01,
           -1.0524e-02,  8.3446e-03,  5.2040e-02,  3.0944e-01,  3.1213e-01,
           -1.3998e-01, -1.9071e-02,  2.8709e-02, -7.0078e-02,  2.7054e-01,
            3.4329e-01,  1.4946e-01, -4.6643e-02, -8.4017e-02,  1.

In [61]:
last_hidden_states = outputs.last_hidden_state
last_hidden_states

AttributeError: 'CausalLMOutputWithCrossAttentions' object has no attribute 'last_hidden_state'

In [64]:
tokenizer.decode(outputs, skip_special_tokens=False)

ValueError: invalid literal for int() with base 10: 'logits'

## only predict

In [68]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [69]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

In [70]:
sentence = "I love Paris"
input_ids = tokenizer.encode(sentence, return_tensors='pt')

In [71]:
input_ids

tensor([[  40, 1842, 6342]])

In [72]:
output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

In [73]:
tokenizer.decode(output[0], skip_special_tokens=False)

'I love Paris. It\'s a beautiful city, but it\'s also one of the most beautiful places I\'ve ever been to."\n\n"I\'m not sure if I\'ll ever be able to live in Paris again," he added. "I don\'t know what I\'m going to do with my life."<|endoftext|>'