<a href="https://www.kaggle.com/code/moxxis/harry-potter-text-generator-transformers?scriptVersionId=107371041" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [2]:
# IMPORTS
import re
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from torch.utils.data import Dataset, Subset
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel, TrainingArguments, Trainer

In [3]:
#DIRECTORIES

# Pre-processed corpus (single text file) used for exploration and fine-tuning.
DATA_PATH = "../input/harry-potter-lstm/Harry_Potter_all_books_preprocessed.txt"

# Folder that contains the saved checkpoint directory.
#SAVED_MODEL_PATH = os.scandir('/kaggle/input/harry-potter-text-generator-transformers/weights').__next__().path
WEIGHTS_DIR = '../input/distilweights-without-special-char/weights'

# Use the first entry of WEIGHTS_DIR as the checkpoint path. The scandir
# iterator is opened in a context manager so its directory handle is released
# right away, and an empty folder produces a readable error instead of a bare
# StopIteration.
# NOTE(review): os.scandir yields entries in arbitrary order; this assumes the
# folder holds exactly one checkpoint -- confirm.
with os.scandir(WEIGHTS_DIR) as _weight_entries:
    _first_entry = next(_weight_entries, None)
if _first_entry is None:
    raise FileNotFoundError(f"No saved checkpoint found in {WEIGHTS_DIR!r}")
SAVED_MODEL_PATH = _first_entry.path

In [7]:
#Load the file
# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the text is in memory; the text is lower-cased in one go.
with open(DATA_PATH, "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

# Split on sentence-ending punctuation. The delimiters are dropped and
# consecutive delimiters produce empty fragments.
sentences = re.split('[.!?]', text)

In [None]:
# Sentence lengths measured in whitespace-separated words.
lenghts = list(map(lambda sentence: len(sentence.split()), sentences))

# 75th percentile of the sentence lengths.
upper_quartile = np.percentile(lenghts, 75)
print(upper_quartile)

# Length of every sentence, in corpus order.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(lenghts)
plt.show()

In [None]:
# Word frequencies over the whole corpus, most frequent word first.
words = text.split()
words_unique = Counter(words).most_common()

# word -> count mapping, kept in most-frequent-first order.
dictionary = dict(words_unique)
dict_values = [count for _word, count in words_unique]

# Counts plotted by frequency rank.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(dict_values)
plt.show()

In [8]:
#Transformers
# Special tokens passed to the distilgpt2 tokenizer.
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<|pad|>',
}
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", **special_tokens)

# Load the previously saved weights from the local checkpoint directory.
model = AutoModelForCausalLM.from_pretrained(SAVED_MODEL_PATH, local_files_only=True)
#model = AutoModelForCausalLM.from_pretrained("distilgpt2") #weights for fine tuning

# Size the embedding matrix to the tokenizer's vocabulary. Left as the last
# expression so the cell displays the resulting embedding layer.
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [9]:
# Token length passed as max_length when the dataset below is built; each
# encoded sentence is truncated/padded to this many tokens.
# NOTE(review): 85 was presumably picked from the sentence-length exploration
# earlier in the notebook -- confirm.
MAX_LENGTH = 85

class Harry_dataset(Dataset):
    """Pre-tokenized sentences for causal language-model fine-tuning.

    Every sentence is wrapped in the '<|startoftext|>' / '<|endoftext|>'
    markers, encoded once at construction time with truncation and
    padding to ``max_length``, and stored as a pair of tensors.

    Args:
        sentences: iterable of sentence strings.
        tokenizer: callable returning a mapping with 'input_ids' and
            'attention_mask' for a given text.
        max_length: length each encoded sentence is truncated/padded to.
    """

    def __init__(self, sentences, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        #self.labels = []
        for raw_sentence in sentences:
            wrapped = ''.join(('<|startoftext|>', raw_sentence, '<|endoftext|>'))
            encoded = tokenizer(wrapped,
                                truncation=True,
                                max_length=max_length,
                                padding="max_length")
            ids_tensor = torch.tensor(encoded['input_ids'])
            self.input_ids.append(ids_tensor)
            mask_tensor = torch.tensor(encoded['attention_mask'])
            self.attn_masks.append(mask_tensor)

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask
    

# Encode every sentence once, up front.
dataset = Harry_dataset(sentences=sentences, tokenizer=tokenizer, max_length=MAX_LENGTH)

# The full dataset is used for training; the 90/10 split below is disabled.
train_dataset = dataset
#train_size = int(0.9 * len(dataset))
#train_dataset = Subset(dataset, list(range(0, train_size)))
#val_dataset = Subset(dataset, list(range(train_size, len(dataset))))

In [21]:
# Fine-tuning configuration, one option per line.
training_args = TrainingArguments(
    output_dir='./weights',
    # optimisation
    num_train_epochs=10,
    per_device_train_batch_size=32,
    learning_rate=1e-5,
    weight_decay=0.05,
    warmup_steps=10,
    # logging
    logging_strategy='steps',
    logging_steps=1000,
    report_to='none',
    # checkpointing
    save_strategy='epoch',
    save_total_limit=1,
)

PyTorch: setting up devices


In [None]:
def collate_for_causal_lm(batch):
    """Stack (input_ids, attention_mask) pairs into one batch dict.

    Labels are a copy of the input ids in which positions with attention
    mask 0 (padding) are set to -100 so they do not contribute to the loss.
    """
    input_ids = torch.stack([pair[0] for pair in batch])
    attention_mask = torch.stack([pair[1] for pair in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# Trainer.train is an instance method, so calling it on the class raises
# TypeError. Build a Trainer first, then resume from the saved checkpoint.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
                  data_collator=collate_for_causal_lm)
trainer.train(resume_from_checkpoint=SAVED_MODEL_PATH)

In [None]:
def collate_for_causal_lm(batch):
    """Stack (input_ids, attention_mask) pairs into one batch dict.

    Labels are a copy of the input ids in which positions with attention
    mask 0 (padding) are set to -100 so they do not contribute to the loss.
    """
    input_ids = torch.stack([pair[0] for pair in batch])
    attention_mask = torch.stack([pair[1] for pair in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# Trainer.train is an instance method, so calling it on the class raises
# TypeError. Build a Trainer first; resume_from_checkpoint=True then continues
# from the last checkpoint found in training_args.output_dir.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
                  data_collator=collate_for_causal_lm)
trainer.train(resume_from_checkpoint=True)

In [None]:
def collate_for_causal_lm(batch):
    """Stack (input_ids, attention_mask) pairs into one batch dict.

    Args:
        batch: list of ``(input_ids, attention_mask)`` tensor pairs as
            returned by the dataset's ``__getitem__``.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'. Labels are a
        copy of the input ids in which positions with attention mask 0
        (padding) are set to -100, so the loss is computed on real tokens
        only. Losses obtained with unmasked padding are not comparable
        with losses obtained through this collator.
    """
    input_ids = torch.stack([pair[0] for pair in batch])
    attention_mask = torch.stack([pair[1] for pair in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
                  data_collator=collate_for_causal_lm)
trainer.train()

***** Running training *****
  Num examples = 85593
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 26750


Step,Training Loss


In [None]:
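# Training loss after fine-tuning: 0.17 (measured with labels equal to the padded input_ids, so padding positions are included in this figure)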

In [12]:
# Prompt that the model continues.
begin = 'Mr . and Mrs . Dursley of number four Privet Drive were proud to say that they were perfectly normal thank you very much . They were the last people youd expect to be involved in anything strange or mysterious because they just didnt hold with such nonsense .'

# Put the prompt on the same device as the model instead of assuming CUDA, so
# the cell works on CPU-only sessions and does not depend on an earlier cell
# having moved the model to the GPU.
generated = tokenizer.encode(begin, return_tensors='pt').to(model.device)
attention_mask = torch.ones_like(generated)

# Switch off dropout for generation; this matters if a training cell ran
# earlier in the session and left the model in training mode.
model.eval()

# Sampling settings: top-k / temperature sampling with a 5-gram repetition ban.
sample_outputs = model.generate(
    generated,
    attention_mask=attention_mask,
    do_sample=True,
    top_k=20,
    top_p=1,
    temperature=1.6,
    max_new_tokens=400,
    min_length=200,
    no_repeat_ngram_size=5,
    pad_token_id=tokenizer.pad_token_id,
)[0]
tokenizer.decode(sample_outputs, skip_special_tokens=True)

'Mr. and Mrs. Dursley of number four Privet Drive were proud to say that they were perfectly normal thank you very much. They were the last people youd expect to be involved in anything strange or mysterious because they just didnt hold with such nonsense. ive no alternative ive always always been asked to approaching sorts of first year vernon dursley shacklebolt ive been saying the ministry these days went to neighbor three of wizarding past midnight when steak dursleys twin cores of the burrow vernons keeper is quiddly dies sirius black obviously knows they got the knocking over dinner poured vernon shortly vernon give us vernon are getting heavily wrong vernon are going to butterbeer vernons mum and phlegmarge are getting old family number two hours vernon vernon ive been quiddles all summer vernon ickle marge vernon comes too vernon give me this must die havent got ze n mum and ear deafening the hammering me '

In [None]:
#compress folder to zip file
import shutil

# Folder the checkpoints are written to; must match the output_dir given to
# TrainingArguments ('./weights' in this notebook).
CHECKPOINT_ROOT = "./weights"

# Checkpoint folders are named "checkpoint-<step>". Pick the one with the
# highest step rather than hard-coding a step number, which stops matching as
# soon as the epoch count, batch size or save strategy changes.
checkpoint_dirs = [
    os.path.join(CHECKPOINT_ROOT, name)
    for name in os.listdir(CHECKPOINT_ROOT)
    if re.fullmatch(r"checkpoint-\d+", name)
    and os.path.isdir(os.path.join(CHECKPOINT_ROOT, name))
]
if not checkpoint_dirs:
    raise FileNotFoundError(f"No checkpoint-<step> directory found in {CHECKPOINT_ROOT!r}")
latest_checkpoint = max(checkpoint_dirs, key=lambda path: int(path.rsplit("-", 1)[1]))

# Writes GPT2_weights.zip into the current working directory.
shutil.make_archive("GPT2_weights", 'zip', latest_checkpoint)