In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, AutoTokenizer
import pandas as pd
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

In [None]:
# Load the processed training table into a DataFrame for quick inspection.
df = pd.read_csv("../data/processed/train.csv")
# Preview the "name" column in title case. The result is not assigned back,
# so `df` itself is left unchanged.
df["name"].str.title()

In [None]:
# Marker placed between a description and its name, both when building
# training pairs and when building generation prompts.
# Written as a raw string so the backslash is a literal character rather than
# the start of an (invalid) escape sequence; the resulting value is unchanged.
separator = r" = /@\ = "

In [None]:
# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that padded
# tokenization (padding="max_length") can be used later on.
tokenizer.pad_token = tokenizer.eos_token
# Load the pretrained "gpt2" language model, telling it which token id is padding.
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

In [None]:
# Load the processed training CSV through the `datasets` CSV loader.
dataset = load_dataset('csv', data_files='../data/processed/train.csv')
# Display the loaded dataset object.
dataset

In [None]:
# Largest number of space-separated pieces in any training description.
max(
    map(
        lambda text: len(text.split(" ")),
        dataset["train"]["description"],
    )
)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of (description, separator + name) pairs.

    Args:
        examples: Batch mapping that provides "description" and "name"
            columns as parallel lists of strings.

    Returns:
        The tokenizer output for the batch, padded to 65 tokens per row,
        with an added "labels" entry that mirrors "input_ids".
    """
    # Second segment of each pair: the separator marker followed by the name.
    targets = [separator + pkmn_name for pkmn_name in examples["name"]]

    results = tokenizer(
        examples["description"],
        targets,
        max_length=65,
        padding="max_length",
        # Without explicit truncation, max_length only pads: an over-long pair
        # would keep its full length and produce rows of unequal size.
        truncation=True,
    )
    # Labels are the input ids themselves. Each row is copied so that labels
    # and input_ids never share the same inner list object.
    # NOTE(review): padded positions are included in the labels as-is.
    results["labels"] = [list(ids) for ids in results["input_ids"]]
    return results

# Apply tokenize_function to the dataset in batches; the returned fields
# (including "labels") are added alongside the original columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Sanity check: turn the first tokenized training row back into text.
tokenizer.decode(tokenized_datasets["train"][0]["input_ids"])

In [None]:
# Small smoke-test subset: shuffle with a fixed seed, keep the first 10 rows,
# and drop the raw text columns so only the tokenizer outputs remain.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(10))
    .remove_columns(["name", "description", "entry_name"])
)

In [None]:
# Display the subset to check its remaining columns and row count.
small_train_dataset

In [None]:
# Training configuration: outputs go to "test_trainer"; everything else is
# left at its default (label_names is passed explicitly as None).
training_args = TrainingArguments(
    output_dir="test_trainer",
    label_names=None,
)

In [None]:
def compute_metrics(eval_pred):
    """Debug helper: print the evaluation prediction object.

    Args:
        eval_pred: Whatever object is handed in for evaluation; it is only
            printed, never inspected.

    Returns:
        None.
    """
    print(eval_pred)
    return None


# Build the Trainer for the smoke-test run on the small subset.
# NOTE(review): compute_metrics is defined above but not passed here, and no
# evaluation dataset is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
)

In [None]:
# Run fine-tuning on the small subset; the call's return value is displayed.
trainer.train()

In [None]:
"Its flames are strong enough to melt iron bars."+separator

In [None]:
# Encode a prompt (description + separator) as a PyTorch tensor of token ids.
# Truncation is requested explicitly so that max_length is actually enforced
# and the tokenizer does not have to fall back to an implicit strategy.
sample = tokenizer.encode(
    "Its can breathe under water." + separator,
    max_length=60,
    truncation=True,
    return_tensors="pt",
)
# Display the encoded prompt.
sample

In [None]:
# Switch to inference mode so dropout does not perturb generation after training.
model.eval()
# The model may have been moved to an accelerator during training, while the
# prompt tensor was created on the CPU; place the prompt on the model's device
# before generating to avoid a device-mismatch error.
result = model.generate(sample.to(model.device))
# Display the generated token ids.
result

In [None]:
# Convert the first generated sequence back to text, dropping special tokens.
tokenizer.decode(result[0], skip_special_tokens=True)

In [None]:
# Persist the model to ../models/.
# NOTE(review): only the model is saved here; the tokenizer is not.
model.save_pretrained("../models/")