In [1]:
import re

import numpy as np
import pandas as pd
import torch
from torch import optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, AutoTokenizer

In [2]:
# Prefer the first CUDA GPU when one is visible; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [3]:
# Load the processed training split and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means the
# CSV was written together with its index — consider index_col=0; confirm against the file.
df = pd.read_csv("../data/processed/train.csv")
df.head()

Unnamed: 0,entry_name,name,description
0,209,snubbull,"By baring its fangs and making a scary face, t..."
1,39,jigglypuff,"If it inflates to SING a lullaby, it can perfo..."
2,490,manaphy,It starts its life with a wondrous power that ...
3,173,cleffa,"On nights with many shooting stars, this Pokém..."
4,144,articuno,"The magnificent, seemingly translucent wings o..."


In [4]:
# Plain Python list of the values in the "name" column; preview the first five.
pokemon_names = df.loc[:, "name"].tolist()
pokemon_names[:5]

['snubbull', 'jigglypuff', 'manaphy', 'cleffa', 'articuno']

In [5]:
# Mean number of whitespace-separated words per name.
avg_num_words = sum(len(n.split()) for n in pokemon_names) / len(pokemon_names)
avg_num_words

1.0

In [6]:
# NOTE(review): max_length is not referenced by any later cell shown in this notebook;
# the dataset is constructed with max_len=10 instead — confirm which value is intended.
max_length = 1

In [7]:
# Pretrained GPT-2 tokenizer and language-model weights.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# documented replacement for causal (left-to-right) language models such as GPT-2.
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Move the weights to the device chosen earlier in the notebook.
model = model.to(device)



In [8]:
# AdamW over every model parameter with a learning rate of 3e-4;
# the bare name on the last line displays the resulting configuration.
optimizer = optim.AdamW(params=model.parameters(), lr=3e-4)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.0003
    weight_decay: 0.01
)

In [9]:
# Inspect how the prompt prefix tokenizes on its own.
# NOTE(review): the trailing space is its own token (220) here; inside a longer string
# a byte-level BPE tokenizer may merge that space into the following word, so the
# in-context prefix length could differ from this standalone count — confirm.
tokenizer.encode("Pokemon: ")

[48034, 25, 220]

In [10]:
# Token count of the "Pokemon: " prompt prefix when encoded on its own
# (3 with the tokenizer loaded above, per the previous cell's output).
extra_length = len(tokenizer.encode("Pokemon: "))

In [11]:
class PokemonDataset(Dataset):
    """Fixed-length token sequences of the form ``init_token + name + EOS``.

    Every name is tokenized once in ``__init__`` and then padded with EOS ids
    or truncated, so that all stored tensors have the same length
    (``max_len`` plus the token length of ``init_token``) and can be stacked
    into batches.

    Args:
        tokenizer: object exposing ``encode(str) -> list[int]``, ``eos_token``
            and ``eos_token_id``.
        init_token: prompt prefix prepended to every name.
        pokemon_names: iterable of name strings.
        max_len: token budget for the name part, counting its trailing EOS.
    """

    def __init__(self, tokenizer, init_token, pokemon_names, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.names = pokemon_names
        self.init_token = init_token
        # Measure the prefix from the init_token actually used, instead of
        # relying on a notebook-level global computed in a different cell.
        self.extra_length = len(self.tokenizer.encode(init_token))
        self.result = []

        for name in self.names:
            # Encode prefix + name, with EOS appended at the end.
            tokenized = self.tokenizer.encode(init_token + name + self.eos)

            # Bring the encoded sequence to the common fixed length.
            padded = self.pad_truncate(tokenized)

            # Store as a tensor so __getitem__ is a plain lookup.
            self.result.append(torch.tensor(padded))

    def __len__(self):
        """Number of encoded names."""
        return len(self.result)

    def pad_truncate(self, name):
        """Pad with EOS ids or truncate so the result has a fixed length.

        Args:
            name: list of token ids for ``init_token + name + EOS``.

        Returns:
            A list of ``max_len + extra_length`` token ids. Short inputs are
            right-padded with EOS; long inputs are cut and closed with a
            single EOS; inputs already at the target length are returned
            unchanged.
        """
        name_length = len(name) - self.extra_length
        if name_length < self.max_len:
            return name + [self.eos_id] * (self.max_len - name_length)
        if name_length > self.max_len:
            # Keep one slot free for the closing EOS so the truncated length
            # equals the padded length for any prefix size (the previous
            # hard-coded "+ 2" was only correct for a 3-token prefix).
            keep = self.max_len + self.extra_length - 1
            return name[:keep] + [self.eos_id]
        return name

    def __getitem__(self, item):
        """Return the pre-encoded tensor at position ``item``."""
        return self.result[item]

    

In [12]:
# Encode every name behind the "Pokemon: " prompt with a 10-token name budget.
dataset = PokemonDataset(
    tokenizer=tokenizer,
    init_token="Pokemon: ",
    pokemon_names=pokemon_names,
    max_len=10,
)

In [13]:
# Shuffled mini-batches of 32 sequences; a final incomplete batch is dropped.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
)

In [14]:
def train(model, optimizer, dl, epochs):
    """Fine-tune ``model`` on the batches from ``dl`` for ``epochs`` passes.

    Each batch is used as both input and label, and the first element of the
    model's output is taken as the loss. Progress is printed every 10 batches.

    Args:
        model: module called as ``model(batch, labels=batch)`` whose output
            has the loss at index 0.
        optimizer: optimizer already bound to ``model``'s parameters.
        dl: iterable yielding batches as tensors.
        epochs: number of full passes over ``dl``.

    NOTE(review): labels are the inputs unchanged, so any padding tokens in a
    batch also contribute to the loss — confirm this is intended.
    """
    # Hugging Face ``from_pretrained`` returns models in evaluation mode
    # (dropout disabled), so switch to training mode explicitly.
    model.train()

    # Send batches wherever the model lives rather than depending on a
    # notebook-level ``device`` variable; fall back to CPU for a
    # parameter-less model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Guarantee gradients are tracked even if the caller disabled them.
    with torch.set_grad_enabled(True):
        for epoch in range(epochs):
            print(f"Epoch: {epoch}")
            for idx, batch in enumerate(dl):
                optimizer.zero_grad()
                batch = batch.to(target_device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()
                if idx % 10 == 0:
                    # .item() yields a plain float for %-formatting.
                    print("loss: %f, idx: %d, epoch: %d" % (loss.item(), idx, epoch))

In [15]:
# Two passes over the shuffled name batches.
train(model, optimizer, dataloader, 2)

Epoch: 0
loss: 11.002464, idx: 0, epoch: 0
loss: 1.080761, idx: 10, epoch: 0


In [1]:
# Write only the model's state_dict to "pokemon_gpt.pth" in the working directory.
# NOTE(review): the recorded execution count (1) and the NameError output indicate this
# cell was run in a fresh kernel; it needs the import cell and the trained `model`
# from the same session, so re-run the notebook top to bottom before saving.
torch.save(model.state_dict(), "pokemon_gpt.pth")

NameError: name 'torch' is not defined