In [None]:
!pip install transformers
!rm -rf gpt-inference/
!git clone https://github.com/Mainakdeb/train-gpt.git
!cp -r /content/gpt-inference/gpt/ /content/

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!wget https://raw.githubusercontent.com/urschrei/lovecraft/master/lovecraft.txt

In [None]:
# Third-party libraries
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torch.nn import functional as F
from torch.utils.data import Dataset
from transformers import AutoTokenizer

# Project-local GPT implementation (the `gpt` package)
from gpt.model import GPT, GPTConfig, GPT1Config
from gpt.trainer import Trainer, TrainerConfig
from gpt.utils import sample, set_seed

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the random number generators so runs are repeatable.
set_seed(42)

In [None]:
# Load the pretrained GPT-2 tokenizer first so the vocabulary size can be read
# from it instead of being hard-coded (keeps this cell consistent with the later
# configuration that also uses `tokenizer.vocab_size`).
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# 12 layers, 12 attention heads, 768-dimensional embeddings, all dropout disabled.
mconf = GPTConfig(vocab_size=tokenizer.vocab_size,
                  block_size=128,
                  embd_pdrop=0.0,
                  resid_pdrop=0.0,
                  attn_pdrop=0.0,
                  n_layer=12,
                  n_head=12,
                  n_embd=768)

# NOTE(review): `mconf` and `model` are rebound by a later cell before training,
# so this instance looks unused -- confirm whether it is still needed.
model = GPT(mconf)
_ = model.eval()

In [None]:
class WordDataset(Dataset):
    """Sliding-window next-token dataset over a tokenized text.

    The whole text is tokenized once. Item ``i`` is the pair ``(x, y)`` where
    ``x`` holds ``block_size`` consecutive token ids starting at position ``i``
    and ``y`` holds the same window shifted one position to the right.

    Args:
        data: The raw text to tokenize.
        block_size: Number of tokens in each input window.
        encode: Optional callable mapping ``data`` to a list of integer token
            ids. When omitted, the notebook-level ``tokenizer`` is used, which
            matches the original behaviour.

    Attributes:
        tokenized_words: Token ids of the full text.
        block_size: Window length, as passed in.
        vocab_size: Number of *distinct* token ids that occur in ``data``
            (not the tokenizer's full vocabulary size).
    """

    def __init__(self, data, block_size, encode=None):
        if encode is None:
            # Default to the tokenizer created elsewhere in the notebook.
            encode = lambda text: tokenizer(text)['input_ids']

        self.tokenized_words = encode(data)
        print('tokenized words shape',len(self.tokenized_words))
        data_size = len(self.tokenized_words)
        # Only the count of distinct ids is needed, so a set is enough.
        vocab_size = len(set(self.tokenized_words))
        print('data has %d words, %d unique.' % (data_size, vocab_size))

        self.block_size = block_size
        self.vocab_size = vocab_size

    def __len__(self):
        # Each item needs block_size + 1 tokens. Clamp at zero so that a text
        # shorter than that yields an empty dataset instead of a negative length.
        return max(0, len(self.tokenized_words) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair of ``torch.long`` tensors for window ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        num_windows = len(self)
        if idx < 0:
            idx += num_windows
        if not 0 <= idx < num_windows:
            # Without this check an out-of-range index would silently return
            # truncated (or empty) tensors and plain iteration would never stop.
            raise IndexError('WordDataset index out of range')

        # grab a chunk of (block_size + 1) tokens from the data
        chunk = self.tokenized_words[idx:idx + self.block_size + 1]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)

        return x, y

In [None]:
block_size = 128 # number of tokens in each training window (the model's context length)

# Read the corpus inside a context manager so the file handle is closed as soon
# as the text is in memory.
# NOTE(review): assumes lovecraft.txt is UTF-8 encoded; the explicit encoding
# removes the dependency on the platform default.
with open('lovecraft.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

train_dataset = WordDataset(text, block_size)

In [None]:
# Configure the model that is actually trained: vocabulary size comes from the
# tokenizer, context length from the dataset.
mconf = GPTConfig(
    vocab_size=tokenizer.vocab_size,
    block_size=train_dataset.block_size,
    n_layer=8,
    n_head=8,
    n_embd=512,
)

# Build the network, then move it to the selected device.
model = GPT(mconf)
model = model.to(device)

In [None]:
# Set up the trainer configuration and start training.
# NOTE(review): final_tokens covers two passes over the dataset while
# max_epochs is 1 -- confirm that this is the intended schedule.
tokens_per_epoch = len(train_dataset) * block_size
tconf = TrainerConfig(
    max_epochs=1,
    batch_size=64,
    learning_rate=6e-4,
    lr_decay=True,
    warmup_tokens=512 * 20,
    final_tokens=2 * tokens_per_epoch,
    num_workers=2,
)

# The third positional argument is None (presumably the evaluation dataset slot).
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

In [None]:
# Generate a 100-step continuation of a short prompt with the trained model.
context = "The trees seemed"
# Add a leading batch dimension of size 1 and move the prompt to the trainer's device.
x = torch.tensor(tokenizer(context)['input_ids'], dtype=torch.long)[None,...].to(trainer.device)

# Put the model in evaluation mode and skip autograd bookkeeping while sampling,
# in case `sample` does not already do so itself.
model.eval()
with torch.no_grad():
    y = sample(model, x, 100, temperature=1.0, sample=True, top_k=10)[0]

# tokenizer.decode already returns a single string, so no join is needed.
completion = tokenizer.decode(y)
print(completion)