<a href="https://www.kaggle.com/code/moxxis/harry-potter-text-generator-transformers?scriptVersionId=107965045" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [2]:
# IMPORTS
import re
import os
import torch
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from collections import Counter
from torch.utils.data import Dataset, Subset
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel, TrainingArguments, Trainer, TrainerCallback

In [3]:
#DIRECTORIES
DATA_PATH = "../input/harry-potter-lstm/Harry_Potter_all_char.txt"
SAVED_MODEL_PATH = os.scandir('/kaggle/input/harry-potter-text-generator-transformers/weights').__next__().path

In [4]:
#Load the file
text = open(DATA_PATH, "r", encoding="utf-8").read()
sentences = re.split('\|', text)

#merge short sentences with larger next to them
for i in range(3):
    print(len(sentences))
    for idx, sentence in enumerate(sentences):
        if len(sentence.split()) < 3 and idx-1>=0:
            try:
                if len(sentences[idx-1]) > len(sentences[idx+1]):
                    sentences[idx+1] += sentence
                    sentences.pop(idx)
                else:
                    sentences[idx-1] += sentence
                    sentences.pop(idx)
            except:
                    sentences[idx-1] += sentence
                    sentences.pop(idx)
    print(len(sentences))                

79731
74391
74391
74289
74289
74289


In [None]:
lenghts = [len(sentence.split()) for sentence in sentences]
print(np.percentile(lenghts, 75))

plt.figure(figsize=(10,10))
plt.plot(lenghts)
plt.show()

In [None]:
words = text.split()
words_unique = Counter(words).most_common()
dictionary = {}
for word in words_unique:
    dictionary[word[0]] = word[1]
dict_values = list(dictionary.values())

plt.figure(figsize=(10,10))
plt.plot(dict_values)
plt.show()

In [5]:
#Transformers
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", 
                                          bos_token='<|startoftext|>', 
                                          eos_token='<|endoftext|>', 
                                          pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained(SAVED_MODEL_PATH, local_files_only=True)
#model = AutoModelForCausalLM.from_pretrained("distilgpt2") #weights for fine tuning
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [6]:
MAX_LENGTH = 100

class Harry_dataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for sentence in sentences:
            encodings_dict = tokenizer('<|startoftext|>' + sentence + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]
    

dataset = Harry_dataset(sentences, tokenizer, max_length=MAX_LENGTH)
train_dataset = dataset
#train_size = int(0.9 * len(dataset))
#train_dataset = Subset(dataset, list(range(0, train_size)))
#val_dataset = Subset(dataset, list(range(train_size, len(dataset))))

In [7]:
training_args = TrainingArguments(output_dir='./weights', num_train_epochs=30, logging_steps=1000,
                                  logging_strategy='steps', save_strategy='epoch',
                                  per_device_train_batch_size=32,
                                  warmup_steps=10, save_total_limit=1, weight_decay=0.05, report_to='none')

#Callback: after epoch generate new text examples
class DefaultFlowCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        now = datetime.now()
        references = sentences[50:60]
        examples = []
        for reference in references:
            generated = tokenizer.encode(reference, return_tensors='pt').cuda()
            attention_mask = torch.ones_like(generated)
            sample_outputs = model.generate(generated, do_sample=True, top_k=20, max_new_tokens=400, min_length=100, top_p=0.95, temperature=1.6, no_repeat_ngram_size=5, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id)[0]
            examples.append(tokenizer.decode(sample_outputs, skip_special_tokens=True))
        with open(f"example - {now.strftime('%d-%m||%H:%M')}.txt", 'w+') as file:
            for idx, ref in enumerate(references):
                file.write(f'{ref}\n\n')
                file.write(f'{examples[idx]}\n\n\n\n')  

In [1]:
Trainer(model=model,  args=training_args, callbacks=[DefaultFlowCallback], train_dataset=train_dataset, 
        data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train(resume_from_checkpoit=True)

NameError: name 'Trainer' is not defined

In [None]:
Trainer(model=model,  args=training_args, callbacks=[DefaultFlowCallback], train_dataset=train_dataset, 
        data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train(SAVED_MODEL_PATH)

Loading model from /kaggle/input/harry-potter-text-generator-transformers/weights/checkpoint-34830.
***** Running training *****
  Num examples = 74289
  Num Epochs = 30
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 69660
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 15
  Continuing training from global step 34830
  Will skip the first 15 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]

Step,Training Loss
35000,0.3432
36000,0.3409
37000,0.3412
38000,0.3305
39000,0.3292
40000,0.3211
41000,0.3186
42000,0.313
43000,0.3072
44000,0.3092


Saving model checkpoint to ./weights/checkpoint-37152
Configuration saved in ./weights/checkpoint-37152/config.json
Model weights saved in ./weights/checkpoint-37152/pytorch_model.bin
Deleting older checkpoint [weights/checkpoint-34830] due to args.save_total_limit
Saving model checkpoint to ./weights/checkpoint-39474
Configuration saved in ./weights/checkpoint-39474/config.json
Model weights saved in ./weights/checkpoint-39474/pytorch_model.bin
Deleting older checkpoint [weights/checkpoint-37152] due to args.save_total_limit
Saving model checkpoint to ./weights/checkpoint-41796
Configuration saved in ./weights/checkpoint-41796/config.json
Model weights saved in ./weights/checkpoint-41796/pytorch_model.bin
Deleting older checkpoint [weights/checkpoint-39474] due to args.save_total_limit
Saving model checkpoint to ./weights/checkpoint-44118
Configuration saved in ./weights/checkpoint-44118/config.json
Model weights saved in ./weights/checkpoint-44118/pytorch_model.bin
Deleting older che

In [None]:
Trainer(model=model,  args=training_args, callbacks=[DefaultFlowCallback], train_dataset=train_dataset, 
        data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

In [None]:
#Loss after training - 0.13

In [None]:
begin = 'Mr . and Mrs . Dursley of number four Privet Drive were proud to say that they were perfectly normal thank you very much . They were the last people youd expect to be involved in anything strange or mysterious because they just didnt hold with such nonsense .'
#generated = tokenizer.encode(begin, return_tensors='pt').cuda()
generated = tokenizer.encode(begin, return_tensors='pt').cuda()
attention_mask = torch.ones_like(generated)
sample_outputs = model.generate(generated, do_sample=True, top_k=20, max_new_tokens=400, min_length=200, top_p=1, temperature=1.6, no_repeat_ngram_size=5, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id)[0]
tokenizer.decode(sample_outputs, skip_special_tokens=True)

In [None]:
#compress folder to zip file
import shutil
shutil.make_archive("GPT2_weights", 'zip', "./checkpoint-25000")