In [None]:
import os, json, time
from pathlib import Path
import numpy as np
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    set_seed,
)

from src.data.load_csqa import load_csqa
from torch.utils.data import Dataset

**Config**

In [6]:
SEED = 123
MODEL_NAME = "gpt2"

MAX_SEQ_LEN = 256      # prompt padding/truncation, could be seto to None
TRAIN_LIMIT = None     # None = full train
EVAL_LIMIT  = 256      # size of val set

BATCH_SIZE = 8
GRAD_ACCUM = 1
LR = 5e-5
EPOCHS = 1.0
WARMUP_RATIO = 0.03
WEIGHT_DECAY = 0.01

USE_FP16 = True        # True for CUDA
OUT_ROOT = Path("checkpoints")

set_seed(SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

**Load and examine Data**

In [7]:
df_tr = load_csqa(split="train", limit=TRAIN_LIMIT).reset_index(drop=True)
df_va = load_csqa(split="validation", limit=EVAL_LIMIT).reset_index(drop=True)

len(df_tr), len(df_va), df_tr.columns

(9741,
 256,
 Index(['example_id', 'text', 'answerKey', 'correct_idx', 'csqa_choices'], dtype='object'))

In [8]:
print(df_tr.loc[0, "text"])
print("answerKey:", df_tr.loc[0, "answerKey"])
print("choices:", df_tr.loc[0, "csqa_choices"])

Q: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?
Choices:
A: ignore
B: enforce
C: authoritarian
D: yell at
E: avoid
Answer:
answerKey: A
choices: [{'label': 'A', 'text': 'ignore'}, {'label': 'B', 'text': 'enforce'}, {'label': 'C', 'text': 'authoritarian'}, {'label': 'D', 'text': 'yell at'}, {'label': 'E', 'text': 'avoid'}]


In [None]:
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# GPT-2 has no PAD token -> alias EOS as PAD
if tok.pad_token_id is None and tok.eos_token_id is not None:
    tok.pad_token = tok.eos_token

tok.padding_side = "left"

print("pad_token_id:", tok.pad_token_id, "eos_token_id:", tok.eos_token_id, "padding_side:", tok.padding_side)

pad_token_id: 50256 eos_token_id: 50256 padding_side: left


In [10]:
answers = ["A","B","C","D","E"]
for a in answers:
    ids = tok(" " + a, add_special_tokens=False)["input_ids"]
    print(a, ids, "len=", len(ids))

A [317] len= 1
B [347] len= 1
C [327] len= 1
D [360] len= 1
E [412] len= 1


In [None]:
class CSQAGPT2Dataset(Dataset):
    def __init__(self, texts, answer_keys, tok, max_len):
        self.texts = texts
        self.answer_keys = answer_keys
        self.tok = tok
        self.max_len = max_len

        self.ans_token_id = {}
        for a in ["A","B","C","D","E"]:
            ids = tok(" " + a, add_special_tokens=False)["input_ids"]
            assert len(ids) == 1, f"Answer {a} not 1 token: {ids}"
            self.ans_token_id[a] = ids[0]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        ak = self.answer_keys[idx]

        enc = self.tok(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
            add_special_tokens=False,
        )

        input_ids = enc["input_ids"][0]
        attention_mask = enc["attention_mask"][0]

        last_pos = int(attention_mask.sum().item()) - 1
        if last_pos < 0:
            last_pos = 0

        labels = torch.full_like(input_ids, -100)
        labels[last_pos] = self.ans_token_id[ak]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

class SimpleCollator:
    def __call__(self, features):
        out = {}
        for k in features[0].keys():
            out[k] = torch.stack([f[k] for f in features], dim=0)
        return out

**Sanity check**

In [12]:
ds = CSQAGPT2Dataset(df_tr["text"].tolist(), df_tr["answerKey"].tolist(), tok, MAX_SEQ_LEN)
ex = ds[0]

print("input_ids len:", ex["input_ids"].shape, "mask sum:", int(ex["attention_mask"].sum()))
pos = (ex["labels"] != -100).nonzero().flatten().tolist()
print("supervised positions:", pos)

p = pos[0]
print("label token id:", int(ex["labels"][p]))
print("label token:", tok.decode([int(ex["labels"][p])]))

input_ids len: torch.Size([256]) mask sum: 54
supervised positions: [53]
label token id: 317
label token:  A


-----------------

**Model**

In [13]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tok))
model.to(device)

train_ds = CSQAGPT2Dataset(df_tr["text"].tolist(), df_tr["answerKey"].tolist(), tok, MAX_SEQ_LEN)
eval_ds  = CSQAGPT2Dataset(df_va["text"].tolist(), df_va["answerKey"].tolist(), tok, MAX_SEQ_LEN)

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    labels = labels.astype(np.int64)
    B, T, V = logits.shape
    acc = []
    for i in range(B):
        pos = np.where(labels[i] != -100)[0]
        if len(pos) != 1:
            continue
        p = int(pos[0])
        y = int(labels[i, p])
        pred = int(np.argmax(logits[i, p]))
        acc.append(1.0 if pred == y else 0.0)
    return {"acc": float(np.mean(acc)) if acc else 0.0}

In [15]:
run_id = f"{time.strftime('%Y%m%d-%H%M%S')}_{MODEL_NAME}_csqa_ft"
out_dir = OUT_ROOT / run_id
out_dir                                  # Output folder

WindowsPath('checkpoints/20260113-142203_gpt2_csqa_ft')

**Training**

In [18]:
use_fp16 = bool(USE_FP16 and torch.cuda.is_available())

args = TrainingArguments(
    output_dir=str(out_dir),
    overwrite_output_dir=True,

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,

    learning_rate=LR,
    num_train_epochs=EPOCHS,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,

    eval_strategy="steps",
    eval_steps=500,

    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,

    logging_steps=50,
    report_to="none",

    fp16=use_fp16,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=SimpleCollator(),
    compute_metrics=compute_metrics,
)


In [19]:
train_metrics = trainer.train().metrics
eval_metrics = trainer.evaluate()

trainer.save_model(str(out_dir))
tok.save_pretrained(str(out_dir))

meta = {
    "run_id": run_id,
    "base_model": MODEL_NAME,
    "dataset": "csqa",
    "train_n": len(df_tr),
    "eval_n": len(df_va),
    "max_seq_len": MAX_SEQ_LEN,
    "batch_size": BATCH_SIZE,
    "grad_accum": GRAD_ACCUM,
    "lr": LR,
    "epochs": EPOCHS,
    "fp16": use_fp16,
    "train_metrics": {k: float(v) for k, v in train_metrics.items() if isinstance(v, (int,float))},
    "eval_metrics": {k: float(v) for k, v in eval_metrics.items() if isinstance(v, (int,float))},
    "time": time.strftime("%Y-%m-%d %H:%M:%S"),
}
with open(out_dir / "finetune_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("[done] saved:", out_dir)
print("eval:", eval_metrics)

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

**Post training checkup**

In [None]:
model.eval()
sample = df_va.loc[0, "text"]
enc = tok(sample, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_SEQ_LEN, add_special_tokens=False)
enc = {k: v.to(device) for k,v in enc.items()}

with torch.no_grad():
    out = model(**enc)
logits = out.logits[0]  # (T,V)

last_pos = int(enc["attention_mask"][0].sum().item()) - 1
top = torch.topk(logits[last_pos], k=10)

print("Prompt:", sample)
print("True:", df_va.loc[0, "answerKey"])
print("Top preds:")
for score, tid in zip(top.values, top.indices):
    print(float(score), repr(tok.decode([int(tid)])))
