In [7]:
import datasets
import evaluate
import numpy as np
import scipy
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (get_scheduler,
                          GPT2LMHeadModel,
                          GPT2TokenizerFast,
                          set_seed,
                          Trainer,
                          TrainingArguments)

# GPT2LMHeadModel has a final layer to the vocab outputs after the hidden layer.

set_seed(0)


In [8]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
NUM_PROC = 64
TRAIN_SPLIT = NUM_PROC * 16
TEST_SPLIT = NUM_PROC * 16
BATCH_SIZE = 64
TRAIN_EPOCHS = 3


In [3]:
# Tokens in the middle of a sentence are prefixed with a space and thus are not
# equal to the tokens at the beginning of a sentence.

prefix = "This is a sad story. "
model_naive = GPT2LMHeadModel.from_pretrained("gpt2").to(DEVICE)
model_tuned = GPT2LMHeadModel.from_pretrained("gpt2").to(DEVICE)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def tokenization(example):
    example["text"] = [prefix + text for text in example["text"]]
    return tokenizer(
        example["text"],
        max_length=64,  # Needs to be less than 74 or something.
        truncation=True
    )

ds_train = (datasets
      .load_dataset("roneneldan/TinyStories", split=f"train[0:{TRAIN_SPLIT}]")
      .map(tokenization, batched=True,
           batch_size=BATCH_SIZE, num_proc=NUM_PROC))
ds_test = (datasets
      .load_dataset("roneneldan/TinyStories",
                    split=f"train[{TRAIN_SPLIT}:{TRAIN_SPLIT + TEST_SPLIT}]")
      .map(tokenization, batched=True,
           batch_size=BATCH_SIZE, num_proc=NUM_PROC))

# Prefixes should be equal.
(ds_test[0]["text"],
 ds_test[0]["input_ids"][:10],
 ds_test[0]["attention_mask"][:10],
 ds_test[1]["input_ids"][:10],
 tokenizer(prefix, return_tensors="pt"),
 tokenizer.convert_ids_to_tokens(ds_test[0]["input_ids"])[:10])


Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


('This is a sad story. Once upon a time, there was a little girl named Lily. She loved to play outside and explore the world around her. One day, she found a shiny box on the ground. Lily picked it up and shook it. It made a noise like something was inside.\n\nExcitedly, Lily ran to her mom and said, "Mommy, look what I found! Can we open it?"\n\nHer mom smiled and said, "Of course, let\'s see what\'s inside." They carefully opened the box and inside was a brilliant necklace with sparkly jewels. Lily gasped and said, "Wow, it\'s beautiful! Can I wear it?"\n\nHer mom nodded and helped her put it on. Lily felt like a princess wearing the necklace. From then on, she wore it every day and felt happy knowing she found something so special.',
 [1212, 318, 257, 6507, 1621, 13, 4874, 2402, 257, 640],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1212, 318, 257, 6507, 1621, 13, 4874, 2402, 257, 640],
 {'input_ids': tensor([[1212,  318,  257, 6507, 1621,   13,  220]]), 'attention_mask': tensor([[1, 1, 1, 1

In [4]:
# 0 workers to protect memory.

optimizer = AdamW(model_tuned.parameters(), lr=5e-5)
dl_train = DataLoader(ds_train.with_format("torch"), batch_size=BATCH_SIZE, num_workers=0)
num_training_steps = TRAIN_EPOCHS * len(dl_train)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
progress_bar = tqdm(range(num_training_steps))

for epoch in range(TRAIN_EPOCHS):
    for dl_batch in dl_train:
        batch = dl_batch["input_ids"].to(DEVICE)
        outputs = model_tuned(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# 
# for batch in tqdm(dl_train):


training_args = TrainingArguments(output_dir=".data", evaluation_strategy="epoch")
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model_tuned,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    compute_metrics=compute_metrics,
)
trainer.train()


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [6]:
dl_test = DataLoader(ds_test.with_format("torch"),
                     batch_size=BATCH_SIZE,
                     num_workers=0)
for dl_batch in tqdm(dl_test):
    batch = dl_batch["input_ids"].to(DEVICE)
    with torch.no_grad():
        logits = model_naive.forward(batch)

        # Do something with logits.
        


  0%|          | 0/16 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacty of 10.75 GiB of which 21.62 MiB is free. Process 1255925 has 1.49 GiB memory in use. Process 1265280 has 904.00 MiB memory in use. Including non-PyTorch memory, this process has 8.35 GiB memory in use. Of the allocated memory 7.98 GiB is allocated by PyTorch, and 115.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF