In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from fastai.text.all import (get_text_files, Transform, tensor, show_at,
                             Callback, CrossEntropyLossFlat, L, Learner, 
                             TitledStr, TfmdLists, LMDataLoader, Perplexity, torch, load_learner)
from typing import List

: 

In [3]:
pretrained_weights = 'gpt2'
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)

In [4]:
print(type(model))
print(type(tokenizer))

<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>


In [5]:
### Read Data

In [6]:
def read_data(path:str, folders:List[str])-> List[str]:
    file_paths = get_text_files(path, folders)
    data = [file.open().read() for file in file_paths] # to make things easy we will gather all texts in one numpy array
    return data

In [7]:
poems = read_data(path='../data/poems', folders = ['forms','topics'])
ballads = read_data('../data/poems/forms', folders = ['ballad'])

In [8]:
### Prepare Data

In [9]:
class TransformersTokenizer(Transform):
    def __init__(self, tokenizer): self.tokenizer = tokenizer
    def encodes(self, x): 
        tokens = self.tokenizer.tokenize(x)
        return tensor(self.tokenizer.convert_tokens_to_ids(tokens))
    def decodes(self, x): return TitledStr(self.tokenizer.decode(x.cpu().numpy()))


In [10]:
splits = [range(int(70*len(ballads)/100)), range(int(70*len(ballads)/100), len(ballads)-1)] # use a 70/30 split for training and validation
tls = TfmdLists(ballads, TransformersTokenizer(tokenizer), splits=splits, dl_type=LMDataLoader)

In [11]:
batch_size, sequence_lenght = 4, 256
dls = tls.dataloaders(bs=batch_size, seq_len=sequence_lenght, device=torch.device('cpu'))

Token indices sequence length is longer than the specified maximum sequence length for this model (2183 > 1024). Running this sequence through the model will result in indexing errors


In [12]:
### Fine Tuning 

In [13]:
class DropOutput(Callback):
    def after_pred(self): self.learn.pred = self.pred[0]

In [None]:
def fine_tune_model(dataloaders, model):
    learn = Learner(dataloaders, model, loss_func=CrossEntropyLossFlat(), cbs=[DropOutput], metrics=Perplexity()).to_fp16()
    learn.validate()
    # learn.lr_find()
    learn.fit_one_cycle(1, 1e-4)
    learn.export('gtp2-file.pkl')
    learn.save("gtp2-recreate", with_opt=False, pickle_protocol=2)

In [42]:
fine_tune_model(dls, model)



epoch,train_loss,valid_loss,perplexity,time


KeyboardInterrupt: 

In [43]:
def get_poem(tokenizer, learn_path, baseline:str):
    baseline_ids = tokenizer.encode(baseline) # input_ids
    inp = tensor(baseline_ids)[None] # .cdu().cpu().numpy()
    # learn = Learner.load(file=learn_path)
    learn = load_learner('gtp2-file.pkl', cpu=True)
    preds = learn.model.generate(inp, max_length=60, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
    print(tokenizer.decode(preds[0].numpy(), skip_special_tokens=True))

In [44]:
get_poem(tokenizer=tokenizer, learn_path="models/gtp2.pth", baseline="love is ridiculous")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


love is ridiculous."

"I don't know what you're talking about," she said. "It's not like I'm going to be able to do anything about it. I mean, I've been doing this for a long time, and it's just not something that I want to
