In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, AutoTokenizer
import pandas as pd
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from src.models.architectures import SimpleGPT

In [2]:
# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Use the end-of-sequence token as the padding token as well; the
# tokenization further down pads every example to a fixed max_length.
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Build the project's PokemonModel wrapper from the artefacts under
# ../models/exp1, passing the tokenizer's EOS id through to it.
# NOTE(review): presumably ../models/exp1 is a fine-tuned checkpoint directory
# and eos_token_id is used to stop generation — confirm in SimpleGPT.
model = SimpleGPT.PokemonModel(
    transformers_model="../models/exp1", eos_token_id=tokenizer.eos_token_id
)

In [8]:
# Read the processed test CSV with the generic "csv" loader; the rows are
# exposed under the "test" key of the returned object.
dataset = load_dataset("csv", data_files={"test": "../data/processed/test.csv"})

Using custom data configuration default-198307a35bd09b0a


Downloading and preparing dataset csv/default to /home/pheithar/.cache/huggingface/datasets/csv/default-198307a35bd09b0a/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


100%|██████████| 1/1 [00:00<00:00, 5059.47it/s]
100%|██████████| 1/1 [00:00<00:00, 873.63it/s]


Dataset csv downloaded and prepared to /home/pheithar/.cache/huggingface/datasets/csv/default-198307a35bd09b0a/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 949.37it/s]


In [9]:
# Token length handed to the tokenizer as `max_length`; examples are padded
# up to this many tokens when the dataset is tokenized below.
max_length = 70

In [10]:
def get_tokenize_function(tokenizer: "GPT2Tokenizer", separator: str, max_length: int):
    """Build the batched tokenization callable passed to ``dataset.map``.

    Args:
        tokenizer (GPT2Tokenizer): The object used to tokenize. It is called
            with a list of strings and the ``max_length``, ``padding`` and
            ``truncation`` keyword arguments.
        separator (str): Special characters added so the model can
            learn that what comes after is the name/answer
        max_length (int): Target length, in tokens, of every example.
            Shorter inputs are padded up to it and longer inputs are
            truncated down to it.
    Returns:
        (Callable): Tokenizer function used in dataset.map
    """

    def tokenize_function(text: dict):
        """Tokenize a batch in the format
        [description] + [separator] + [name].

        Args:
            text (dict): Batch mapping with 'name' and 'description'
                columns, each a list of strings of equal length (this is
                what ``dataset.map(..., batched=True)`` passes in).

        Returns:
            The tokenizer output with an extra 'labels' entry that mirrors
            'input_ids'.
        """
        prompts = [
            pkmn_desc + separator + pkmn_name
            for pkmn_name, pkmn_desc in zip(text["name"], text["description"])
        ]

        # truncation=True is required alongside padding="max_length":
        # without it, inputs longer than max_length are returned at their
        # full length and the batch ends up with ragged rows.
        # NOTE(review): the name sits at the end of the string, so with the
        # tokenizer's default truncation side a very long description may
        # cut the name off — confirm max_length is large enough.
        results = tokenizer(
            prompts,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        # Copy each row so labels never alias the input_ids lists.
        results["labels"] = [list(ids) for ids in results["input_ids"]]
        return results

    return tokenize_function

In [12]:
# The backslash is written as "\\" because "\ " (backslash + space) is not a
# valid escape sequence and newer Python versions warn about it. The runtime
# value of the separator is unchanged: space, '=', space, '/', '@', '\',
# space, '=', space.
# NOTE(review): presumably this has to match the separator the model was
# trained with — confirm against the training script.
separator = " = /@\\ = "
token_function = get_tokenize_function(tokenizer, separator, max_length)
# batched=True hands the tokenize function whole column batches at a time.
tokenized_dataset = dataset.map(token_function, batched=True)

100%|██████████| 3/3 [00:00<00:00,  4.35ba/s]


In [None]:
# NOTE(review): batch_size, lr, epochs, max_steps and seed are not defined in
# any cell visible above, and this cell has no execution count — confirm they
# are set before running it on a fresh kernel.
train_args = TrainingArguments(
    output_dir="./tests",
    # The same batch size is used for training and for evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=lr,
    # NOTE(review): per the Hugging Face docs a positive max_steps overrides
    # num_train_epochs — confirm which of the two is meant to apply.
    num_train_epochs=epochs,
    max_steps=max_steps,
    seed=seed,
    # No checkpoints are written; evaluation and logging run once per epoch.
    save_strategy="no",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    # Metrics are sent to Weights & Biases.
    report_to="wandb",
)