In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from utils import read_qa_json, read_qa_json_generative, read_qa_json_generative_q3
from pprint import pprint
from transformers import AutoTokenizer, BertModel, GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast

In [3]:
# Load the pretrained "bert-base-uncased" encoder and its matching tokenizer.
# NOTE(review): `tokenizer` is rebound to the GPT-2 tokenizer in the next cell and
# `bert_model` is not referenced anywhere else in the visible notebook — confirm
# whether this cell is still needed before paying for the model load.
bert_model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [4]:
# Read the train / dev / test splits with the Question 2 reader.
# Only the train call passes verbose=False; the other two use the reader's default.
# NOTE(review): presumably each split is a list of prompt strings (the dataset class
# below annotates `data` as list[str]) — confirm against utils.read_qa_json_generative.
train = read_qa_json_generative(file_name='train_complete.jsonl', verbose=False)
valid = read_qa_json_generative(file_name='dev_complete.jsonl')
test = read_qa_json_generative(file_name='test_complete.jsonl')

# GPT-2 tokenizer; this rebinds the name `tokenizer` for the rest of the notebook.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Token id the dataset class appends as padding. In the GPT-2 vocabulary id 0 decodes
# to "!" (see the tokenizer.decode(0) output further down), which is why padded
# examples print as a long run of "!".
PAD_TOKEN_INDEX = 0

class TokenQADataset(Dataset):
    """Next-token-prediction dataset over tokenized QA strings.

    Each item is a pair ``(inputs, targets)`` of ``torch.long`` tensors, both of
    length exactly ``seq_len``. ``inputs`` is the token sequence without its last
    token and ``targets`` is the same sequence shifted left by one position.
    Longer examples are truncated on the right; shorter ones are right-padded
    with the module-level ``PAD_TOKEN_INDEX``.

    Args:
        data: Raw text examples, one string per item.
        tokenizer: Callable that, applied to a string, returns a mapping with an
            ``'input_ids'`` list of token ids.
        seq_len: Fixed length of every returned tensor.
    """

    def __init__(self, data: list[str], tokenizer: callable, seq_len: int = 512):
        self.data = data
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        # Not read anywhere in this class (padding uses PAD_TOKEN_INDEX); left
        # unchanged in case external code inspects the attribute.
        self.pad_token = -1

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return ``(inputs, targets)`` of length ``seq_len``."""
        token_ids = self.tokenizer(self.data[idx])['input_ids']

        # Shift by one for next-token prediction, then cap at seq_len. Capping
        # both views at the same length keeps inputs[i] -> targets[i] aligned.
        inputs = token_ids[:-1][:self.seq_len]
        targets = token_ids[1:][:self.seq_len]

        # Pad from the *actual* shifted length rather than the raw token count, so
        # every item has exactly seq_len entries (the previous arithmetic produced
        # seq_len-1 or seq_len+1 for some lengths, which cannot be batched together).
        pad_len = self.seq_len - len(inputs)
        if pad_len > 0:
            padding = [PAD_TOKEN_INDEX] * pad_len
            inputs = inputs + padding
            targets = targets + padding

        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )

# Wrap each split in the token-level dataset (default seq_len).
train_ds, val_ds, test_ds = (
    TokenQADataset(data=split, tokenizer=tokenizer)
    for split in (train, valid, test)
)

# Only the training loader shuffles; validation and test keep a fixed order.
batch_size = 8
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    for dataset, shuffle in ((train_ds, True), (val_ds, False), (test_ds, False))
)


In [15]:
# Sanity-check GPT-2 vocabulary ids.
# A bare expression is only rendered when it is the last line of a cell, so print it.
# NOTE(review): pad_token_type_id looks like the token-*type* id used for padding,
# not the padding token id — confirm against the tokenizer docs.
print(tokenizer.pad_token_type_id)

# Token ids for the answer letters (with a leading space) and for the literal
# string ' [START]'.
for text in (' A', ' B', ' C', ' D', ' [START]'):
    print(tokenizer.encode(text))

# Reverse lookup of a few individual token ids.
for token_id in (46275, 33339, 347, 0, 351, 2460, 1342):
    print(tokenizer.decode(token_id))


[317]
[347]
[327]
[360]
[685, 2257, 7227, 60]
 snowball
 responders
 B
!
 with
 friends
 less


In [7]:
# NOTE Question 2
# Fine-tune the WikiText-103-pretrained decoder on the QA prompts, then report
# test accuracy.
from decoderonly import Transformer, train_model, test_model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# get pretrained model
# TODO change this
model = Transformer(
    src_vocab=50257,
    trg_vocab=50257,
    d_model=512,
    N=6,
    heads=8,
    dropout=0.1,
    seqlen=512,
    device=device,
)
# map_location keeps the load working on the CPU fallback above even when the
# checkpoint was saved from a CUDA device.
model.load_state_dict(
    torch.load('saves/pretrainedwiki103.pth', map_location=device, weights_only=True)
)
model.to(device)

# baseline
# acc, val_loss = test_model(model=model, test=test_loader)
# print(f'Baseline test accuracy: {acc*100:.4f}')

# train
# NOTE(review): the loaders were built with batch_size=8 while batch_size=4 is passed
# here — confirm what train_model does with this argument.
train_model(
    model=model,
    train=train_loader,
    valid=val_loader,
    epochs=5,
    batch_size=4,
    only_last_token=True,
    savename='saves/q2fintune.pth',
)

# final accuracy (val_loss is returned but not reported here)
acc, val_loss = test_model(model=model, test=test_loader)
print(f'Final test accuracy: {acc*100:.4f}')



Loss: 1.798633: 100%|██████████| 620/620 [01:19<00:00,  7.83it/s]
Question 500: 25.2000 percent. Validation Loss: 14.8264: 100%|██████████| 63/63 [00:05<00:00, 11.78it/s]


Epoch 1 validation accuracy: 25.2000. Validation Loss: 14.5584


Loss: 1.325781: 100%|██████████| 620/620 [01:13<00:00,  8.39it/s]
Question 500: 27.8000 percent. Validation Loss: 13.7131: 100%|██████████| 63/63 [00:05<00:00, 12.58it/s]


Epoch 2 validation accuracy: 27.8000. Validation Loss: 13.3820


Loss: 1.113770: 100%|██████████| 620/620 [01:08<00:00,  9.00it/s]
Question 500: 38.4000 percent. Validation Loss: 15.1330: 100%|██████████| 63/63 [00:05<00:00, 11.40it/s]


Epoch 3 validation accuracy: 38.4000. Validation Loss: 14.7464


Loss: 0.156670: 100%|██████████| 620/620 [01:17<00:00,  8.02it/s]
Question 500: 36.2000 percent. Validation Loss: 14.4500: 100%|██████████| 63/63 [00:05<00:00, 10.86it/s]


Epoch 4 validation accuracy: 36.2000. Validation Loss: 13.8589


Loss: 0.108106: 100%|██████████| 620/620 [01:19<00:00,  7.79it/s]
Question 500: 38.6000 percent. Validation Loss: 13.7732: 100%|██████████| 63/63 [00:05<00:00, 10.94it/s]


Epoch 5 validation accuracy: 38.6000. Validation Loss: 12.7714
Saved model as saves/q2fintune.pth


Question 500: 40.0000 percent. Validation Loss: 12.6356: 100%|██████████| 63/63 [00:05<00:00, 10.76it/s]

Final test accuracy: 40.0000





In [8]:
# Fetch the first test example once: (input tokens, targets shifted by one).
raw_data, target_data = test_ds[0]

# Show the model input, then the shifted target sequence (padding decodes as "!").
exmp = raw_data.tolist()
detokenized_exmp = tokenizer.decode(exmp)
print(f'Example inference:\n\n{detokenized_exmp}\n\n')
exmp = target_data.tolist()
detokenized_exmp = tokenizer.decode(exmp)
print(f'Example inference:\n\n{detokenized_exmp}\n\n')

# Inference only: switch off dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    predictions = model(raw_data.to(model.device))

# NOTE(review): the input is right-padded, so index -1 is a padding position unless
# the example fills seq_len — confirm this is the position train_model optimised
# when only_last_token=True.
pred_token_idx = torch.argmax(predictions[:, -1], dim=-1).item()
print(f'Model prediction token index: {pred_token_idx}')
pred_next_token = tokenizer.decode(pred_token_idx)
print(f'Model prediction: {pred_next_token}')

Example inference:

 using less resources usually causes money to be saved A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to [A] make more phone calls [B] quit eating lunch out [C] buy less with monopoly money [D] have lunch with friends Answer:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


Example inference:

 less resources usually causes money to be saved A person wants to start saving money so that they can afford a nice vacation at the end of the year. After lo

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from utils import read_qa_json, read_qa_json_generative, read_qa_json_generative_q3
from pprint import pprint
from transformers import AutoTokenizer, BertModel, GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast

In [2]:
# Read the train / dev / test splits with the Question 3 reader.
# Only the train call passes verbose=False; the other two use the reader's default.
# NOTE(review): presumably each split is a list of prompt strings (the dataset class
# below annotates `data` as list[str]) — confirm against utils.read_qa_json_generative_q3.
train = read_qa_json_generative_q3(file_name='train_complete.jsonl', verbose=False)
valid = read_qa_json_generative_q3(file_name='dev_complete.jsonl')
test = read_qa_json_generative_q3(file_name='test_complete.jsonl')

# GPT-2 tokenizer used for both encoding the prompts and decoding predictions.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Token id the dataset class appends as padding. In the GPT-2 vocabulary id 0 decodes
# to "!" (see the tokenizer.decode(0) output at the end of the notebook).
PAD_TOKEN_INDEX = 0

class SentenceQADataset(Dataset):
    """Next-token-prediction dataset over tokenized QA strings.

    Each item is a pair ``(inputs, targets)`` of ``torch.long`` tensors, both of
    length exactly ``seq_len``. ``inputs`` is the token sequence without its last
    token and ``targets`` is the same sequence shifted left by one position.
    Longer examples are truncated on the right; shorter ones are right-padded
    with the module-level ``PAD_TOKEN_INDEX``.

    Args:
        data: Raw text examples, one string per item.
        tokenizer: Callable that, applied to a string, returns a mapping with an
            ``'input_ids'`` list of token ids.
        seq_len: Fixed length of every returned tensor.
    """

    def __init__(self, data: list[str], tokenizer: callable, seq_len: int = 512):
        self.data = data
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        # Not read anywhere in this class (padding uses PAD_TOKEN_INDEX); left
        # unchanged in case external code inspects the attribute.
        self.pad_token = -1

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return ``(inputs, targets)`` of length ``seq_len``."""
        token_ids = self.tokenizer(self.data[idx])['input_ids']

        # Shift by one for next-token prediction, then cap at seq_len. Capping
        # both views at the same length keeps inputs[i] -> targets[i] aligned.
        inputs = token_ids[:-1][:self.seq_len]
        targets = token_ids[1:][:self.seq_len]

        # Pad from the *actual* shifted length rather than the raw token count, so
        # every item has exactly seq_len entries (the previous arithmetic produced
        # seq_len-1 or seq_len+1 for some lengths, which cannot be batched together).
        pad_len = self.seq_len - len(inputs)
        if pad_len > 0:
            padding = [PAD_TOKEN_INDEX] * pad_len
            inputs = inputs + padding
            targets = targets + padding

        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )

# Wrap each split in the sentence-level dataset (default seq_len).
train_ds, val_ds, test_ds = (
    SentenceQADataset(data=split, tokenizer=tokenizer)
    for split in (train, valid, test)
)

# Only the training loader shuffles; validation and test keep a fixed order.
batch_size = 8
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    for dataset, shuffle in ((train_ds, True), (val_ds, False), (test_ds, False))
)

In [3]:
# NOTE Question 3
from decoderonly import Transformer, train_model, test_model

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# get pretrained model
# TODO change this
model = Transformer(
    src_vocab=50257, 
    trg_vocab=50257, 
    d_model=512, 
    N=6, 
    heads=8, 
    dropout=0.1, 
    seqlen=512, 
    device=device,
)
model.load_state_dict(torch.load(f'saves/pretrainedwiki103.pth', weights_only=True))
model.to(device)
print('Loaded pretrained model.')


Loaded pretrained model.


In [5]:
# baseline
acc, val_loss = test_model(model=model, test=test_loader, masked=True)
print(f'Baseline test accuracy: {acc*100:.4f}')

# train
train_model(
    model=model,
    train=train_loader,
    valid=val_loader,
    epochs=2,
    batch_size=4,
    only_last_token=False,
    savename=f'saves/q3fintune.pth',
)

# final accuracy
acc, val_loss = test_model(model=model, test=test_loader, masked=True)
print(f'Final test accuracy: {acc*100:.4f}')

Question 500: 0.8000 percent. Validation Loss: 7.6012: 100%|██████████| 63/63 [00:04<00:00, 12.69it/s]


Baseline test accuracy: 0.8000


Loss: 3.675878: 100%|██████████| 620/620 [01:22<00:00,  7.53it/s]
Question 500: 4.0000 percent. Validation Loss: 3.4608: 100%|██████████| 63/63 [00:05<00:00, 12.44it/s]


Epoch 1 validation accuracy: 4.0000. Validation Loss: 3.3246


Loss: 3.071467: 100%|██████████| 620/620 [01:18<00:00,  7.92it/s]
Question 500: 2.4000 percent. Validation Loss: 3.0484: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 2 validation accuracy: 2.4000. Validation Loss: 3.0138
Saved model as saves/q3fintune.pth


Question 500: 4.2000 percent. Validation Loss: 2.9651: 100%|██████████| 63/63 [00:05<00:00, 12.09it/s]

Final test accuracy: 4.2000





In [6]:
# get datapoint
trg = test_ds[0][0]
# print(test_ds[0][0])
# print(test_ds[0][1])

exmp_str = tokenizer.decode(trg).replace('!', '')
print(f'Sentence: {exmp_str}')


inference_tokens = model.decode(trg=trg)
pred_next_token = tokenizer.decode(inference_tokens)
print(f'Model prediction: {pred_next_token}')
inference_tokens

Sentence:  using less resources usually causes money to be saved A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to [A] make more phone calls [B] quit eating lunch out [C] buy less with monopoly money [D] have lunch with friends Answer B quit eating lunch
Model prediction: 's's's's's's's's's's's's's's's's's's's's


[338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338,
 338]

In [None]:
# Sanity-check GPT-2 vocabulary ids.
# NOTE(review): pad_token_type_id looks like the token-*type* id used for padding,
# not the padding token id — confirm against the tokenizer docs.
print(tokenizer.pad_token_type_id)

# Token ids for the answer letters (with a leading space) and for the literal
# string ' [START]'.
for text in (' A', ' B', ' C', ' D', ' [START]'):
    print(tokenizer.encode(text))

# Reverse lookup of a few individual token ids. The former trailing
# `tokenizer.decode()` call was dropped: decode requires a token_ids argument,
# so calling it with none raises TypeError and breaks Run All.
for token_id in (46275, 33339, 347, 0, 351, 2460, 1342):
    print(tokenizer.decode(token_id))

0
[317]
[347]
[327]
[360]
[685, 2257, 7227, 60]
 snowball
 responders
 B
!
 with
 friends
 less
