In [1]:
import os

# Location of the data folder, relative to the repository root.
# Adjust this when the layout differs on another machine.
DATA_DIR = "data"

# This notebook lives in src/nb. When the data folder is not visible from the
# current working directory, step up two levels so paths resolve from the root.
if not os.path.exists(DATA_DIR):
    os.chdir("../..")
    # Still missing after the move: the setup is wrong, so stop right here.
    assert os.path.exists(DATA_DIR), f"ERROR: DATA_DIR={DATA_DIR} not found"

from tqdm.notebook import tqdm

import torch
# DataLoader batches the dataset for training
from torch.utils.data import DataLoader
from torch import nn
import pandas as pd

from transformers import GPT2LMHeadModel, AdamW, GPT2Tokenizer

from src.lib.decoder_dataset import DecoderDataset
from src.lib.decoder import Decoder

I'm pretty sure there is a memory leak: tensors stay on the GPU when the dataset is built with the constructor. The leak does not persist past saving the dataset to a state_dict and loading it back from that state_dict.

My recommendation: build the dataset once, save it to a state_dict, restart the runtime, and from then on always load the dataset from the saved state_dict.

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
save_path = os.path.join(DATA_DIR, "decoded_cds", "balanced", "dev_dataset.pth")

if os.path.exists(save_path):
    # A saved copy already exists at save_path: load the dataset from it.
    dataset = DecoderDataset.from_state_dict(save_path)
else:
    # No saved copy yet: build the dataset from the CSV, then save it to
    # save_path so later runs take the branch above.
    csv_path = os.path.join(DATA_DIR, "decoded_cds", "balanced", "dev.csv")
    df = pd.read_csv(csv_path, index_col=0)
    dataset = DecoderDataset(df)
    dataset.save_state_dict(save_path)

In [4]:
# Build the decoder, then move it onto the selected device.
decoder = Decoder()
decoder = decoder.to(device)

Some weights of the model checkpoint at models/gpt2_large were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.bias', 'transformer.extra_embedding_project.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Cross-entropy loss used as the training objective.
loss_fn = nn.CrossEntropyLoss()
# AdamW over all decoder parameters with a learning rate of 5e-5.
optimizer = torch.optim.AdamW(params=decoder.parameters(), lr=5e-5)

In [6]:
# Batches of 20 samples in dataset order (no shuffling), loaded by 10 workers.
data_loader = DataLoader(
    dataset,
    batch_size=20,
    num_workers=10,
    shuffle=False,
)

In [7]:
def to_device(tup, device):
    """Recursively move every item of a nested batch onto ``device``.

    Args:
        tup: A list or tuple whose items are either nested lists/tuples or
            objects exposing ``.to(device)`` (e.g. ``torch.Tensor``).
        device: Target passed unchanged to each item's ``.to`` call.

    Returns:
        A list mirroring the nesting of ``tup`` (every nested sequence comes
        back as a list) holding the results of the ``.to(device)`` calls.
    """
    on_device = []
    for item in tup:
        # isinstance also covers tuples and list subclasses; an exact
        # ``type(item) == list`` comparison would send those to ``.to`` and crash.
        if isinstance(item, (list, tuple)):
            on_device.append(to_device(item, device))
        else:
            on_device.append(item.to(device))
    return on_device

# Switch the decoder to training mode before running the optimisation loop.
decoder.train()
epochs = 1
for epoch in range(epochs):
    pbar = tqdm(data_loader)
    for batch in pbar:
        batch = to_device(batch, device)

        # Each batch unpacks into the model input and a (label, label_idx) pair.
        x, y = batch
        label, label_idx = y

        # For every sample i keep only the logits at position label_idx[i].
        # Pairing a row index with label_idx selects those rows directly and
        # yields the same values as `[:, label_idx].diagonal().t()` without
        # first building a (batch_size, batch_size, vocab_size) intermediate.
        # Assumes label_idx is a 1-D index tensor with one entry per sample.
        all_logits = decoder(x).logits
        rows = torch.arange(label_idx.size(0), device=label_idx.device)
        logits = all_logits[rows, label_idx]

        # logits (batch_size, vocab_size)
        # label (batch_size)

        # calculate loss and backprop
        loss = loss_fn(logits, label)

        # Drop every local reference except the loss.
        # NOTE(review): tensors that loss.backward() still needs stay allocated
        # through the autograd graph, so empty_cache() can only release cached
        # blocks that are already unused.
        del x, y, all_logits, rows, logits, label, label_idx, batch
        torch.cuda.empty_cache()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_description(f"Epoch {epoch} Loss: {loss.item():.4f}")

  0%|          | 0/723 [00:00<?, ?it/s]

KeyboardInterrupt: 