In [1]:
import os

DATA_DIR = "data"  # Relative path to the data directory; adjust per machine if needed

# This notebook lives in src/nb, while DATA_DIR is expected to resolve from the
# repository root. When the directory is not visible from the current working
# directory, climb two levels and check again, failing loudly if it is still missing.
if not os.path.exists(DATA_DIR):
    os.chdir("../..")
    assert os.path.exists(DATA_DIR), f"ERROR: DATA_DIR={DATA_DIR} not found"

import torch
# get Dataset class
from torch.utils.data import DataLoader
from torch import nn
import pandas as pd

from transformers import GPT2LMHeadModel, AdamW, GPT2Tokenizer

from src.lib.decoder_dataset import DecoderDataset

In [2]:
# Location of the serialized (state_dict) form of the dev-split dataset.
save_path = os.path.join(
    DATA_DIR,
    "decoded_cds",
    "balanced",
    "dev_dataset.pth",
)

I'm pretty sure there's a memory leak where tensors stay on the GPU when the dataset is built with the constructor, but the leak does not persist past saving to a state_dict and then loading from that state_dict.

My recommendation is to build the dataset once with the constructor, save it, reset the runtime, and from then on load the dataset from the saved state_dict.

In [3]:
# Prefer the cached state_dict when it is already on disk; otherwise build the
# dataset from the dev CSV once and cache it so later runs can skip the constructor.
if os.path.exists(save_path):
    dataset = DecoderDataset.from_state_dict(save_path)
else:
    df = pd.read_csv(
        os.path.join(DATA_DIR, "decoded_cds", "balanced", "dev.csv"),
        index_col=0,
    )
    dataset = DecoderDataset(df)
    dataset.save_state_dict(save_path)

Some weights of the model checkpoint at models/gpt2_large were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.bias', 'transformer.extra_embedding_project.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/226 [00:00<?, ?it/s]

In [4]:
class Decoder(nn.Module):
    """GPT-2 language model conditioned on a projected style encoding.

    The input sequence fed to GPT-2 is the concatenation (along the sequence
    axis) of: the projected style encoding, the paraphrase embeddings, the BOS
    token embedding, and the target embeddings.
    """

    def __init__(self, model_path="models/gpt2_large", style_dim=768):
        """Load the tokenizer and GPT-2 weights and create the style projection.

        Args:
            model_path: Directory or identifier passed to ``from_pretrained``
                for both the tokenizer and the LM-head model.
            style_dim: Width of the incoming style encoding. The default of
                768 matches the previously hard-coded value.
        """
        super().__init__()

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.gpt2 = GPT2LMHeadModel.from_pretrained(model_path)

        # Project the style encoding to the width of GPT-2's token embeddings
        # (read from the embedding matrix itself rather than hard-coded) so it
        # can be concatenated with the other embeddings in forward().
        embed_dim = self.gpt2.transformer.wte.weight.shape[1]
        self.style_projection = nn.Linear(style_dim, embed_dim)

    def forward(self, x):
        """Run GPT-2 over [style, paraphrase, BOS, target] embeddings.

        Args:
            x: A 5-element sequence
                ``(style_encoding, para, bos_pos, target, attn_mask)`` where
                ``para`` and ``target`` are each an ``(embeds, pos)`` pair.
                Assumes embeds/pos are (batch, seq, embed_dim), bos_pos is
                (batch, embed_dim) and style_encoding is (batch, style_dim)
                -- TODO confirm against DecoderDataset.

        Returns:
            Whatever ``self.gpt2`` returns for the assembled ``inputs_embeds``
            and ``attn_mask``.

        The input tensors are never modified: positional encodings are added
        out-of-place so the same batch can be fed through the model again
        without the positions being accumulated twice.
        """
        style_encoding, para, bos_pos, target, attn_mask = x

        para_embeds, para_pos = para
        target_embeds, target_pos = target

        # Embedding of the beginning-of-sequence token.
        bos_embed = self.gpt2.transformer.wte.weight[self.tokenizer.bos_token_id]

        # Build the pieces inline so the temporaries are released as soon as
        # the concatenation is done. The sums are out-of-place on purpose
        # (`+=` would write into the caller's batch tensors).
        inputs_embeds = torch.cat(
            [
                self.style_projection(style_encoding).unsqueeze(1),
                para_embeds + para_pos,
                (bos_embed + bos_pos).unsqueeze(1),
                target_embeds + target_pos,
            ],
            dim=1,
        )

        return self.gpt2(inputs_embeds=inputs_embeds, attention_mask=attn_mask)

In [5]:
# Serve the dataset one sample per batch, in its stored order (no shuffling).
data_loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
)

In [6]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the model with its default checkpoint path and move it to that device.
decoder = Decoder()
decoder = decoder.to(device)

Some weights of the model checkpoint at models/gpt2_large were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.bias', 'transformer.extra_embedding_project.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Sanity check: display which device GPT-2's token-embedding weights live on.
decoder.gpt2.transformer.wte.weight.device

device(type='cuda', index=0)

In [8]:
# AdamW over every trainable parameter of the decoder (GPT-2 and the style projection).
# NOTE(review): the recorded run of the training cell ends in a CUDA out-of-memory
# error on a ~12 GiB GPU; presumably optimizing all GPT-2 weights with AdamW does not
# fit on that card -- confirm, and consider optimizing only a subset of parameters.
optimizer = torch.optim.AdamW(params=decoder.parameters(), lr=5e-5)

# Cross-entropy between the selected logits and the target token ids.
loss_fn = nn.CrossEntropyLoss()

In [9]:
def to_device(tup, device):
    """Recursively move every tensor in a nested batch structure to ``device``.

    Args:
        tup: A list or tuple whose items are either objects exposing
            ``.to(device)`` (e.g. tensors) or further nested lists/tuples.
        device: Target device, passed straight through to ``.to``.

    Returns:
        A new list mirroring the nesting of ``tup`` (every nested list or
        tuple becomes a list) with each leaf replaced by ``leaf.to(device)``.
        The input container itself is not modified.
    """
    on_device = []
    for item in tup:
        # isinstance (rather than an exact type comparison) so that tuples and
        # list subclasses are recursed into instead of crashing on `.to`.
        if isinstance(item, (list, tuple)):
            on_device.append(to_device(item, device))
        else:
            on_device.append(item.to(device))
    return on_device

decoder.train()

for batch in data_loader:
    # Move every tensor of the nested batch onto the training device.
    x, y = to_device(batch, device)
    label, label_idx = y

    # Reset gradients left over from any previous step before computing new ones.
    optimizer.zero_grad()

    # For each sample b keep only the vocabulary logits at its own position
    # label_idx[b]. Indexing (row, position) pairs directly yields the same
    # values as `logits[:, label_idx].diagonal().t()` without materializing a
    # (batch, batch, vocab) intermediate. Rebinding `logits` drops the
    # reference to the full per-position logits.
    # assumes label_idx is a 1-D integer tensor of length batch_size -- TODO confirm
    logits = decoder(x).logits
    sample_rows = torch.arange(logits.size(0), device=logits.device)
    logits = logits[sample_rows, label_idx]

    # logits (batch_size, vocab_size)
    # label (batch_size)

    # calculate loss and backprop
    # No manual `del` / torch.cuda.empty_cache() before backward: tensors saved
    # for the backward pass stay referenced by the graph behind `loss`, and
    # empty_cache() only returns unused cached blocks to the driver -- it does
    # not make more memory available to PyTorch itself.
    loss = loss_fn(logits, label)
    loss.backward()
    optimizer.step()

    print(loss.item())
    # Smoke test: stop after a single optimization step.
    break

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 11.77 GiB total capacity; 9.64 GiB already allocated; 10.56 MiB free; 9.93 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF