In [1]:
import os

DATA_DIR = "data" # This may need to be changed on different machines

# Make sure we're in the correct directory and make sure the data directory exists
if not os.path.exists(DATA_DIR):
    os.chdir("../..") # Move up two directories because we're in src/nb and the data directory/path should be in/start at the root directory 
    assert os.path.exists(DATA_DIR), f"ERROR: DATA_DIR={DATA_DIR} not found"  # If we still can't see the data directory something is wrong

import torch
# get Dataset class
from torch.utils.data import DataLoader
from torch import nn
import pandas as pd

from transformers import GPT2LMHeadModel, AdamW

from src.lib.decoder_dataset import DecoderDataset

In [2]:
save_path = os.path.join(DATA_DIR, "decoded_cds", "balanced", "dev_dataset.pth")

I'm pretty sure there's a memory leak where things stay on the GPU when making the dataset with the constructor but that does not persist pass saving to state_dict and then loading from state_dict.

My recommendation is to load the dataset, save the dataset, reset the runtime, read dataset from state_dict from then on.

In [3]:
if not os.path.exists(save_path):
    df = pd.read_csv(os.path.join(DATA_DIR, "decoded_cds", "balanced", "dev.csv"), index_col=0)
    dataset = DecoderDataset(df)
    dataset.save_state_dict(save_path)
else:
    dataset = DecoderDataset.from_state_dict(save_path)

In [4]:
class Decoder(nn.Module):

    def __init__(self, model_path="models/gpt2_large"):
        super().__init__()

        self.gpt2 = GPT2LMHeadModel.from_pretrained(model_path)
        self.style_projection = nn.Linear(768, 1280)
    
    def forward(self, x):
        style_embed, para_token_embed, para_attn_mask = x
        # (batch_size, 768), (batch_size, num_tokens, 1280), (batch_size, num_tokens)

        proj_style_embed = self.style_projection(style_embed) # (batch_size, 1280)
        proj_style_embed = proj_style_embed.unsqueeze(1) # (batch_size, 1, 1280)
        # concat style embedding with token embeddings
        token_embeddings = torch.cat([para_token_embed, proj_style_embed], dim=1) # (batch_size, num_tokens + 1, 1280)
        # add a 1 to the end of the attention mask for the style embedding
        device = para_attn_mask.device
        para_attn_mask = torch.cat([para_attn_mask, torch.ones(para_attn_mask.shape[0], 1).to(device)], dim=1) # (batch_size, num_tokens + 1)

        # get the output of the model
        output = self.gpt2(inputs_embeds=token_embeddings, attention_mask=para_attn_mask) # (batch_size, num_tokens + 1, vocab_size)
        return output

In [5]:
data_loader = DataLoader(dataset, batch_size=5, shuffle=False)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
decoder = Decoder().to(device)

Some weights of the model checkpoint at models/gpt2_large were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.weight', 'transformer.extra_embedding_project.bias']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
optimizer = torch.optim.AdamW(decoder.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()



In [8]:
decoder.train()

for batch in data_loader:
    x, y = batch
    x = (tensor.to(device) for tensor in x)
    y = y.to(device)
    output = decoder(x)
    logits = output.logits
    # calulate loss
    loss = nn.CrossEntropyLoss()(logits, y)
    # backprop
    loss.backward()
    # update weights
    optimizer.step()
    # zero gradients
    optimizer.zero_grad()
    # print loss
    print(loss.item())

RuntimeError: Expected target size [5, 50266], got [5, 110]