### Step 1: Install necessary packages

In [None]:
!pip install matplotlib
!pip install torch numpy transformers datasets tiktoken wandb tqdm
!pip install ipywidgets

:warning: To use a GPU, you need to install the `torch` build that matches your CUDA version.

### Step 2: Package imports and configuration

In [1]:
import torch
# Report the CUDA setup. The device name is only queried when a GPU is
# present, because torch.cuda.get_device_name(0) raises on CPU-only machines.
print(torch.cuda.is_available())  # True if CUDA is available
print(torch.cuda.device_count())  # Number of available GPUs
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # GPU name

True
1
NVIDIA GeForce GTX 1650


In [18]:
import sys
import os

# Make the parent directory importable (presumably where model.py lives).
sys.path.append(os.path.abspath(".."))
# Select the first GPU. The recorded run reports a single device (index 0), so
# the previous value "1" would hide it whenever this line takes effect.
# NOTE(review): this variable is only honoured if it is set before CUDA is
# initialised in the process; on a multi-GPU host adjust the index as needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from model import GPT, GPTConfig
import random
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt

# Configuration
# Fixed seed so that batch shuffling and PyTorch's random number generators
# start from the same state on every Restart & Run All.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

# Training settings
beta = 0.5  # divides the positive/negative log-prob gap inside the loss
device = "cuda" if torch.cuda.is_available() else "cpu"
base_lr = 1e-4
epochs = 5
batch_size = 64
max_length = 64  # every encoded sequence is padded/truncated to this length

# Generation settings
num_samples = 1
max_new_tokens = 200
temperature = 0.8
top_k = 200
# tokenizer: lookup tables used by encode() / decode() below.
# NOTE: pickle.load can execute arbitrary code; only open a meta.pkl you trust.
with open("../sft/meta.pkl", "rb") as f:
    meta = pickle.load(f)
stoi = meta["stoi"]
itos = meta["itos"]


def encode(s):
    """Convert a string into a list of token ids, one id per character.

    Each character is looked up in `stoi`; a character missing from the
    table is mapped to id 0.
    """
    ids = []
    for ch in s:
        ids.append(stoi.get(ch, 0))  # 0 = <unk> for unknown characters
    return ids



def decode(l):
    """Convert a sequence of token ids back into a string via `itos`."""
    pieces = []
    for token_id in l:
        pieces.append(itos[token_id])
    return "".join(pieces)

### Step 3: Define helper functions

In [3]:
def compute_logprob(input_ids):
    """Return the mean per-token log-probability of each sequence under `gpt`.

    Args:
        input_ids: 2-D tensor of integer token ids, one sequence per row.
            Token id 0 is treated as padding and excluded from the average.

    Returns:
        1-D tensor with one value per row: the negative mean cross-entropy of
        the next-token predictions over the non-padding targets. A row whose
        targets are all padding yields 0 rather than NaN.
    """
    # Next-token prediction: the logits at position t are scored against token t + 1.
    inputs = input_ids[:, :-1]
    targets = input_ids[:, 1:]
    logits, _ = gpt(inputs, full_seq=True)
    B, T, V = logits.size()
    token_loss = F.cross_entropy(
        logits.reshape(-1, V),
        targets.reshape(-1),
        ignore_index=0,
        reduction="none",
    ).reshape(B, T)
    attention_mask = (targets != 0).float()
    # Clamp the token count so an all-padding row divides by 1 instead of 0
    # (0 / 0 would produce NaN and poison the whole batch loss).
    token_count = attention_mask.sum(dim=1).clamp(min=1)
    mean_loss = (token_loss * attention_mask).sum(dim=1) / token_count
    return -mean_loss


def pad_or_truncate(seq, max_length):
    """Return a copy of `seq` that is exactly `max_length` items long.

    Args:
        seq: list of token ids.
        max_length: target length.

    Returns:
        The last `max_length` items when `seq` is too long; otherwise `seq`
        right-padded with 0 up to `max_length`.
    """
    overflow = len(seq) - max_length
    if overflow > 0:
        # Slice from an explicit start index: seq[-max_length:] would return
        # the whole list when max_length == 0.
        return seq[overflow:]
    return seq + [0] * (-overflow)


def get_batches(lines, batch_size):
    """Yield shuffled mini-batches of (negative, positive) token-id tensors.

    Args:
        lines: sequence of dicts, each holding a "negative" and a "positive"
            text.
        batch_size: number of pairs per batch. A trailing batch with fewer
            than `batch_size` pairs is dropped.

    Yields:
        (neg_tensor, pos_tensor): two long tensors on `device`, each of shape
        (batch_size, max_length).
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(lines)
    random.shuffle(shuffled)

    def to_tensor(batch, key):
        # "\n\n\n\n" is appended to every text before it is encoded and
        # padded/truncated to max_length.
        rows = [
            pad_or_truncate(encode(p[key] + "\n\n\n\n"), max_length)
            for p in batch
        ]
        return torch.tensor(rows, dtype=torch.long, device=device)

    for start in range(0, len(shuffled), batch_size):
        batch = shuffled[start : start + batch_size]
        if len(batch) < batch_size:
            continue  # drop the incomplete final batch
        neg_tensor = to_tensor(batch, "negative")
        pos_tensor = to_tensor(batch, "positive")
        yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model

In [4]:
# Report the PyTorch build together with its CUDA and cuDNN versions, one per line.
print(
    torch.__version__,
    torch.version.cuda,
    torch.backends.cudnn.version(),
    sep="\n",
)

2.8.0+cu126
12.6
91002


In [5]:
# Rebuild the model from the constructor arguments stored in the SFT checkpoint.
ckpt = torch.load("../sft/gpt.pt", map_location=device)
gptconf = GPTConfig(**ckpt["model_args"])
gpt = GPT(gptconf)
state_dict = ckpt["model"]
unwanted_prefix = "_orig_mod."
# Rename, in place, every key that carries the prefix to its bare name.
prefixed_keys = [key for key in state_dict if key.startswith(unwanted_prefix)]
for key in prefixed_keys:
    state_dict[key[len(unwanted_prefix) :]] = state_dict.pop(key)
gpt.load_state_dict(state_dict)
# Move to the configured device and switch to training mode.
gpt.to(device).train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(74, 348)
    (wpe): Embedding(256, 348)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=348, out_features=1044, bias=False)
          (c_proj): Linear(in_features=348, out_features=348, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=348, out_features=1392, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1392, out_features=348, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=348, out_features=74, bias=False)
)

In [7]:
# Confirm that the configured device and the model's actual placement agree.
first_param = next(gpt.parameters())
print(f"Device: {device}")
print(f"Model is on CUDA: {first_param.is_cuda}")

Device: cuda
Model is on CUDA: True


### Step 5: Load Data (**students are required to complete this part!**)

The data is generated by our own script, `utils/generate_training_data.py`. The design considerations are documented in the script itself.

#### Documentation

- [hugging face datasets documentation](https://huggingface.co/docs/datasets/v4.1.1/loading)

In [25]:
# Load the preference pairs from ./test2.json. Each record holds a "negative"
# and a "positive" text (see the printed sample below).

from datasets import load_dataset

dataset = load_dataset("json", data_files="./test2.json")

print(dataset)
print(dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['negative', 'positive'],
        num_rows: 100000
    })
})
{'negative': '4+1=? Sorry, I do not know!', 'positive': '4+1=? The answer is 5 because 4+1 equals 5.'}


### Step 6: Build the optimizer and scheduler (**students are required to complete this part!**)

- [AdamW optimiser documentation](https://docs.pytorch.org/docs/stable/generated/torch.optim.AdamW.html#torch.optim.AdamW)
- [PyTorch optimiser documentation](https://docs.pytorch.org/docs/stable/optim.html#module-torch.optim)


The parameters of the pre-trained model are shown below:

In [9]:
# Print each parameter tensor's name next to its shape.
for param_name, param in gpt.named_parameters():
    print(param_name, param.shape)

transformer.wte.weight torch.Size([74, 348])
transformer.wpe.weight torch.Size([256, 348])
transformer.h.0.ln_1.weight torch.Size([348])
transformer.h.0.attn.c_attn.weight torch.Size([1044, 348])
transformer.h.0.attn.c_proj.weight torch.Size([348, 348])
transformer.h.0.ln_2.weight torch.Size([348])
transformer.h.0.mlp.c_fc.weight torch.Size([1392, 348])
transformer.h.0.mlp.c_proj.weight torch.Size([348, 1392])
transformer.h.1.ln_1.weight torch.Size([348])
transformer.h.1.attn.c_attn.weight torch.Size([1044, 348])
transformer.h.1.attn.c_proj.weight torch.Size([348, 348])
transformer.h.1.ln_2.weight torch.Size([348])
transformer.h.1.mlp.c_fc.weight torch.Size([1392, 348])
transformer.h.1.mlp.c_proj.weight torch.Size([348, 1392])
transformer.h.2.ln_1.weight torch.Size([348])
transformer.h.2.attn.c_attn.weight torch.Size([1044, 348])
transformer.h.2.attn.c_proj.weight torch.Size([348, 348])
transformer.h.2.ln_2.weight torch.Size([348])
transformer.h.2.mlp.c_fc.weight torch.Size([1392, 348]

Construct the optimiser according to the official documentation:
- `lr` is kept at $1 \cdot 10^{-4}$
- `weight_decay` is kept at $10^{-2}$

The `AdamW` algorithm is chosen based on the instruction given in the assignment.

In [10]:
# AdamW over all model parameters. The learning rate comes from the config
# cell (base_lr = 1e-4) so there is a single place to tune it.
optimizer = torch.optim.AdamW(gpt.parameters(), lr=base_lr, weight_decay=1e-2)

Next, we initialise the scheduler. A scheduler in PyTorch adjusts the learning rate `lr` during training according to a chosen strategy.

We chose the Cosine Annealing Scheduler.

#### Documentation

- [CosineAnnealingLR](https://docs.pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingLR.html)
- [Fine-tune Llama 2 with DPO](https://huggingface.co/blog/dpo-trl)

In [11]:
from torch.optim.lr_scheduler import CosineAnnealingLR

# The training loop steps the scheduler once per batch, so the cosine cycle
# spans (full batches per epoch) * epochs steps in total.
steps_per_epoch = len(dataset["train"]) // batch_size
iteration = steps_per_epoch * epochs
scheduler = CosineAnnealingLR(optimizer, eta_min=1e-5, T_max=iteration)

### Step 7: Begin training (**students are required to complete this part!**)

In [None]:
# Show the path of the Python interpreter this kernel is running on.
print(sys.executable)

In [None]:
# Report the PyTorch version, its CUDA version and the GPU count, one per line.
print(
    torch.__version__,
    torch.version.cuda,
    torch.cuda.device_count(),
    sep="\n",
)

In [26]:
# Materialise the dataset rows as plain dicts for get_batches.
lines = [dict(x) for x in dataset["train"]]
# Full batches per epoch; get_batches drops a trailing partial batch.
total_steps = len(lines) // batch_size
# Make sure the model is in training mode even if the evaluation cell
# (which calls gpt.eval()) was executed earlier in this kernel session.
gpt.train()
for epoch in range(epochs):
    # get_batches is a generator without a length, so pass the known batch
    # count to tqdm to get a percentage and an ETA.
    pbar = tqdm(get_batches(lines, batch_size), total=total_steps)
    for step, (neg_tensor, pos_tensor) in enumerate(pbar):
        ###########################################################
        # Please complete the training code here!
        # Examples:
        # ...
        # neg_logprob
        # pos_logprob
        # loss = -F.logsigmoid((pos_logprob - neg_logprob) / beta).mean() - pos_logprob.mean() * 0.1
        # ...
        ###########################################################

        optimizer.zero_grad()
        neg_logprob = compute_logprob(neg_tensor)
        pos_logprob = compute_logprob(pos_tensor)
        # Preference term: raise the positive log-prob above the negative one.
        # The second term additionally rewards a high positive log-prob.
        loss = -F.logsigmoid((pos_logprob - neg_logprob) / beta).mean() - pos_logprob.mean() * 0.1
        loss.backward()
        optimizer.step()
        scheduler.step()  # one learning-rate update per batch
        pbar.set_description(f"epoch {epoch+1}, step {step}, loss {loss.item():.4f}")

    # Overwrite the same checkpoint file at the end of every epoch.
    ckpt_path = "./dpo.pt"
    torch.save({
        "model_state_dict": gpt.state_dict(),
        "model_args": ckpt['model_args'],
    }, ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

epoch 1, step 1561, loss 0.0280: : 1562it [08:04,  3.23it/s]


Saved checkpoint to ./dpo.pt


epoch 2, step 1561, loss 0.0280: : 1562it [08:09,  3.19it/s]


Saved checkpoint to ./dpo.pt


epoch 3, step 1561, loss 0.0278: : 1562it [08:08,  3.20it/s]


Saved checkpoint to ./dpo.pt


epoch 4, step 1561, loss 0.0279: : 1562it [08:08,  3.20it/s]


Saved checkpoint to ./dpo.pt


epoch 5, step 1561, loss 0.0277: : 1562it [08:08,  3.20it/s]

Saved checkpoint to ./dpo.pt





### Step 8: Begin testing (**students are required to complete this part!**)

In [57]:
# Load the fine-tuned model
ckpt_path = "../dpo/dpo.pt"
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint["model_args"])
# Follow the configured device instead of hard-coding CUDA, so the model and
# the input tensors below always live on the same device (and CPU works too).
gpt = GPT(gptconf).to(device)
# The SFT checkpoint stores its weights under "model", while the training cell
# in this notebook saves them under "model_state_dict"; accept either layout.
if "model" in checkpoint:
    state_dict = checkpoint["model"]
else:
    state_dict = checkpoint["model_state_dict"]
unwanted_prefix = "_orig_mod."
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
# Test
gpt.eval()
test_set = [
    "17+19=?",
    "3*17=?",
    "72/4=?",
    "72-x=34,x=?",
    "x*11=44,x=?",
    "3*17=?",
    "72/4=?",
    "72-x=34,x=?",
]
# A top_k larger than the vocabulary cannot filter anything; clamp it so the
# sampler is never asked for more candidates than there are tokens.
sample_top_k = min(top_k, gpt.config.vocab_size)
with torch.no_grad():
    for prompt in test_set:
        prompt_ids = encode(prompt)
        ###########################################################
        # Please complete the test code here!
        # ...
        # gpt.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
        # ...
        ###########################################################
        # Batch of one prompt, on the same device as the model.
        input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)
        output_ids = gpt.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=sample_top_k,
        )
        output_text = decode(output_ids[0].flatten().tolist())
        print(f"Prompt: {prompt}")
        print(f"Model output: {output_text}")
        print("-" * 40)
        

AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [40]:
        # Sanity check: the largest prompt token id must be below the vocabulary size.
        print(f"Max token id: {max(prompt_ids)}")
        print(f"Model vocab size: {gpt.config.vocab_size}")

Max token id: 21
Model vocab size: 74
