### MAP End-to-end training & inference (QLoRA + LoRA)
This notebook contains a cleaned, well-organized version of the MAPv3 workflow:
1. Installs and environment checks  
2. Configuration (paths, hyperparams, quant/LoRA settings)  
3. Utility functions and metric (MAP@3)  
4. Data loading and automatic label reduction (elbow selection)  
5. Prompt formatting (XML option)  
6. QLoRA loading + attach LoRA adapter  
7. Tokenization / SFT pair construction and dataset class  
8. Single-run training, small holdout evaluation, and test prediction  
9. Optional k-fold training loop (5-fold CV)  

Update the file paths and `CFG` settings to match your environment before running any cell.

In [None]:
# Cell: Installs + quick torch GPU check
!pip install -U pip
!pip install -U "transformers>=4.43.3" "datasets>=2.20.0" "evaluate" "accelerate" \
  "bitsandbytes>=0.43.1" "peft>=0.12.0" "trl>=0.9.6" \
  "scikit-learn" "pandas" "numpy" "tqdm" "jinja2" "pyyaml" "torch" "sentencepiece" \
  "vllm>=0.6.2" "jsonlines" "safetensors"

import torch

# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else 'None'
print(f"CUDA available: {cuda_ok}")
print(f"GPU: {gpu_name}")

In [None]:
# Cell: Configuration dataclass and RNG seeding
from dataclasses import dataclass, field
import os, random, numpy as np, pandas as pd


@dataclass
class CFG:
    """Run configuration: data paths, prompt format, training and QLoRA/LoRA settings.

    Every setting is a real dataclass field, so it can be overridden at
    construction time and shows up in ``repr`` / ``dataclasses.asdict``.
    """

    # --- data locations ---
    train_csv: str = "./train.csv"
    test_csv: str = "./test.csv"
    sample_sub_csv: str = "./sample_submission.csv"

    # --- prompt / sequence lengths (in tokens) ---
    use_xml: bool = True
    max_source_len: int = 1024
    max_label_len: int = 32

    # --- optimisation ---
    seed: int = 552
    epochs: int = 1
    train_bs: int = 2
    grad_accum: int = 8
    lr: float = 2e-4
    warmup_ratio: float = 0.03
    weight_decay: float = 0.05

    # --- LoRA adapter ---
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    # default_factory gives each instance its own list; a bare (unannotated)
    # list here would be a shared class attribute and not a dataclass field.
    target_modules: list = field(
        default_factory=lambda: [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ]
    )

    # --- 4-bit quantisation (bitsandbytes) ---
    load_4bit: bool = True
    bnb_4bit_compute_dtype: str = "float16"  # looked up as an attribute of `torch`
    bnb_4bit_use_double_quant: bool = True
    bnb_4bit_quant_type: str = "nf4"

    # --- model / output ---
    base_model: str = "Qwen/Qwen2.5-1.5B-Instruct"
    out_dir: str = "./out"
    run_name: str = "map_qlora_xml_auto_reduce"

    # --- automatic label reduction ---
    min_labels_keep: int = 20
    min_coverage: float = 0.90


# The rest of the notebook reads settings as `CFG.<name>`, so the name is
# rebound to a configured instance. Use `type(CFG)(...)` to build a variant.
CFG = CFG()

def set_seed(seed=CFG.seed):
    """Seed the Python, NumPy and (when installed) PyTorch random generators.

    Args:
        seed: Integer seed. The default is read from ``CFG.seed`` once, when
            this function is defined; pass a value explicitly to use another.

    Only a missing ``torch`` installation is tolerated. Any other failure
    while seeding is allowed to propagate instead of being silently ignored.
    """
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch
    except ImportError:
        # torch is optional for the pure-pandas parts of the notebook.
        return
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed()

In [None]:
# Cell: Utility functions (map@3 and simple helpers)
import re
from collections import Counter, defaultdict

def map_at_3(y_true_top1, y_pred_top3):
    """Mean average precision at 3 for single-label targets.

    Args:
        y_true_top1: Iterable with one gold label per example.
        y_pred_top3: Iterable with one ranked prediction sequence per example.
            Only the first three entries of each sequence are scored.

    Returns:
        The mean over examples of ``1 / rank`` of the gold label within the
        top three predictions (0 when it is absent), or ``0.0`` when there
        are no examples.
    """
    scores = []
    for gold, preds in zip(y_true_top1, y_pred_top3):
        score = 0.0
        # Pairing with (1, 2, 3) both numbers the ranks and caps them at 3.
        for rank, pred in zip((1, 2, 3), preds):
            if pred == gold:
                score = 1.0 / rank
                break
        scores.append(score)
    if not scores:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0
    return float(np.mean(scores))

def _normalize(text: str) -> set:
    """Return the set of lower-cased word tokens found in *text*.

    Characters outside ``a-z 0-9 _ : +`` and space are replaced by a space,
    then the text is split on whitespace runs and on ``:``; empty pieces are
    dropped.
    """
    lowered = text.lower()
    cleaned = re.sub(r"[^a-z0-9_:+ ]+", " ", lowered)
    pieces = re.split(r"\s+|:", cleaned)
    return {piece for piece in pieces if piece}

def _split_cat(lbl: str):
    """Split a label at its first ``:`` into a ``(category, misconception)`` pair.

    A label without a colon is returned unchanged as the category, paired
    with the placeholder ``"NA"``.
    """
    head, sep, tail = lbl.partition(":")
    if not sep:
        return lbl, "NA"
    return head, tail

In [None]:
# Cell: Load CSVs and construct canonical label column
train = pd.read_csv(CFG.train_csv)
test  = pd.read_csv(CFG.test_csv)

# Build the canonical "Category:Misconception" label text if not present.
if "label" not in train.columns:
    # read_csv parses empty cells and the literal string "NA" as NaN by
    # default; astype(str) would then yield "nan". Restore the "NA"
    # placeholder so labels look like "True_Correct:NA".
    train["label"] = (
        train["Category"].fillna("NA").astype(str)
        + ":"
        + train["Misconception"].fillna("NA").astype(str)
    )

In [None]:
# Cell: Auto-select most frequent labels by elbow on cumulative coverage (auto-reduce)
import numpy as np

def _auto_select_k_from_counts(counts_desc: np.ndarray, min_labels: int = 20, min_coverage: float = 0.88):
    """Pick how many of the most frequent labels to keep.

    The cut-off starts at the elbow of the cumulative-coverage curve (the
    point furthest above the diagonal), is raised to at least ``min_labels``
    and is then extended until ``min_coverage`` of all samples is covered or
    every label is kept.

    Args:
        counts_desc: Label counts sorted in descending order.
        min_labels: Minimum number of labels to keep; capped at the number of
            labels available.
        min_coverage: Minimum fraction of samples the kept labels must cover.

    Returns:
        ``(K, coverage, threshold)``: the number of labels kept, the fraction
        of samples they cover, and the count of the least frequent kept label.

    Raises:
        ValueError: If ``counts_desc`` is empty or its total is not positive.
    """
    counts_desc = np.asarray(counts_desc)
    n = len(counts_desc)
    if n == 0:
        raise ValueError("counts_desc must contain at least one label count")
    total = counts_desc.sum()
    if total <= 0:
        raise ValueError("counts_desc must have a positive total")

    coverage = counts_desc.cumsum() / total
    # Normalised rank in [0, 1]; a single label has no curve to bend.
    x = np.arange(n) / (n - 1) if n > 1 else np.array([1.0])
    k_idx = int(np.argmax(coverage - x))
    # Clamp to the last valid index: min_labels may exceed the label count.
    k_idx = min(max(k_idx, min_labels - 1), n - 1)
    while coverage[k_idx] < min_coverage and k_idx < n - 1:
        k_idx += 1
    return k_idx + 1, float(coverage[k_idx]), int(counts_desc[k_idx])

def build_auto_label_map_auto(labels_series: pd.Series, min_labels: int, min_coverage: float, anchors=("True_Correct:NA","False_Neither:NA")):
    """Reduce the label space to its most frequent labels and map the rest onto them.

    Args:
        labels_series: One "Category:Misconception" label string per row.
        min_labels: Forwarded to ``_auto_select_k_from_counts``.
        min_coverage: Forwarded to ``_auto_select_k_from_counts``.
        anchors: Labels that are always kept, even with zero occurrences.

    Returns:
        ``(to_reduced, kept_labels, auto_info)`` where ``to_reduced`` maps
        every observed (and anchor) label to a kept label, ``kept_labels`` is
        ordered by descending count then label, and ``auto_info`` holds
        ``K`` (number kept), ``coverage`` (fraction of rows whose original
        label is kept) and ``N_threshold`` (count at the selected cut-off).

    Raises:
        ValueError: If ``labels_series`` is empty.
    """
    n_rows = len(labels_series)
    if n_rows == 0:
        raise ValueError("labels_series is empty; cannot build a label map")

    counts = Counter(labels_series.tolist())
    for a in anchors:
        counts.setdefault(a, 0)

    dfc = pd.DataFrame({"label": list(counts.keys()), "count": list(counts.values())})
    # The label is a secondary key so ties in count are ordered reproducibly.
    dfc = dfc.sort_values(["count", "label"], ascending=[False, True]).reset_index(drop=True)
    counts_desc = dfc["count"].values.astype(np.int64)
    K, _cov_at_k, N_thr = _auto_select_k_from_counts(counts_desc, min_labels=min_labels, min_coverage=min_coverage)

    kept_raw = set(dfc["label"].iloc[:K].tolist()) | set(anchors)
    # dfc is already sorted, so filtering preserves (count desc, label asc).
    kept_labels = dfc.loc[dfc["label"].isin(kept_raw), "label"].tolist()
    kept_set = set(kept_labels)

    cat2kept = defaultdict(list)
    for lab in kept_labels:
        cat, _ = _split_cat(lab)
        cat2kept[cat].append(lab)
    kept_norm = {lab: _normalize(lab) for lab in kept_labels}
    most_freq_kept = kept_labels[0]

    def nearest_within_category(orig_lab: str):
        """Return the kept label of the same category sharing the most tokens."""
        cat, _ = _split_cat(orig_lab)
        cands = cat2kept.get(cat)
        if not cands:
            return None
        src = _normalize(orig_lab)
        # max() returns the first best candidate, i.e. the most frequent one.
        return max(cands, key=lambda k: len(src & kept_norm[k]))

    to_reduced = {}
    for lab in counts:
        if lab in kept_set:
            to_reduced[lab] = lab
            continue
        mapped = nearest_within_category(lab)
        # No kept label in this category: fall back to the overall top label.
        to_reduced[lab] = most_freq_kept if mapped is None else mapped

    auto_info = {
        "K": int(len(kept_labels)),
        # Reuse the counts computed above instead of recounting per label.
        "coverage": float(sum(counts[l] for l in kept_labels) / n_rows),
        "N_threshold": N_thr
    }
    return to_reduced, kept_labels, auto_info

# Reduce the label space, then derive the label <-> integer-id lookups and the
# integer target column `y` from the kept labels (id order = kept_labels order).
to_reduced, kept_labels, auto_info = build_auto_label_map_auto(
    train["label"],
    CFG.min_labels_keep,
    CFG.min_coverage,
)
label2idx = {}
for position, kept_name in enumerate(kept_labels):
    label2idx[kept_name] = position
idx2label = {position: kept_name for kept_name, position in label2idx.items()}

train["label_reduced"] = train["label"].map(to_reduced)
train["y"] = train["label_reduced"].map(label2idx)

print(f"[AUTO-REDUCE] kept={len(kept_labels)}, coverage={auto_info['coverage']:.3f}, implicit N={auto_info['N_threshold']}")
print("Sample kept labels:", kept_labels[:10])

In [None]:
# Cell: Build XML or plain text prompts for model input
from jinja2 import Template

# Jinja2 template for the XML-style prompt. It is rendered with three
# variables: `question`, `mc_answer` and `explanation`.
# NOTE(review): no autoescape option is passed, so values appear to be
# interpolated verbatim ("<" / "&" in the data are not XML-escaped) — confirm
# this is intended.
XML_PROMPT = Template("""<xml>
  <question>{{ question }}</question>
  <answer_anchor>{{ mc_answer }}</answer_anchor>
  <explanation>{{ explanation }}</explanation>
  <task>
    You are a math misconceptions classifier.
    Output ONLY one label as "Category:Misconception".
  </task>
</xml>
""")

def build_xml_prompt_from_row(row):
    """Render ``XML_PROMPT`` for one row.

    ``row`` must support ``.get(key, default)``; the ``QuestionText``,
    ``MC_Answer`` and ``StudentExplanation`` entries are used, each falling
    back to an empty string when missing.
    """
    template_vars = {
        "question": row.get("QuestionText", ""),
        "mc_answer": row.get("MC_Answer", ""),
        "explanation": row.get("StudentExplanation", ""),
    }
    return XML_PROMPT.render(**template_vars)

def build_row_input(df):
    """Return one prompt string per row of *df*, in row order.

    Rows are rendered with the XML template when ``CFG.use_xml`` is true and
    with the plain-text "Question / Answer anchor / Student explanation /
    Label:" layout otherwise.
    """
    def _plain_prompt(r):
        return (
            f"Question:\n{r.get('QuestionText','')}\n"
            f"Answer anchor:\n{r.get('MC_Answer','')}\n"
            f"Student explanation:\n{r.get('StudentExplanation','')}\n"
            f"Label:"
        )

    return [
        build_xml_prompt_from_row(r) if CFG.use_xml else _plain_prompt(r)
        for _, r in df.iterrows()
    ]


# Attach the rendered prompt to both frames.
train["input_text"] = build_row_input(train)
test["input_text"]  = build_row_input(test)

In [None]:
# Cell: QLoRA base loader and LoRA attach functions
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def load_qlora_base(model_name: str):
    """Load the tokenizer and the causal-LM base model for *model_name*.

    When ``CFG.load_4bit`` is set the model is loaded with a 4-bit
    bitsandbytes quantization config built from the ``CFG.bnb_4bit_*``
    settings; otherwise no quantization config is passed. If the tokenizer
    has no pad token, its EOS token is used as the pad token.

    Returns:
        ``(tokenizer, model)``.
    """
    quant_cfg = (
        BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=CFG.bnb_4bit_use_double_quant,
            bnb_4bit_quant_type=CFG.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=getattr(torch, CFG.bnb_4bit_compute_dtype),
        )
        if CFG.load_4bit
        else None
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model_kwargs = {
        "quantization_config": quant_cfg,
        # The dtype name from CFG is resolved to the matching torch dtype.
        "torch_dtype": getattr(torch, CFG.bnb_4bit_compute_dtype),
        "device_map": "auto",
        "trust_remote_code": True,
    }
    base_model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
    return tokenizer, base_model

def attach_lora(model):
    """Prepare *model* for k-bit training and wrap it with a LoRA adapter.

    The adapter rank, alpha, dropout and target modules come from ``CFG``;
    bias terms are not adapted and the task type is causal language modelling.

    Returns:
        The PEFT-wrapped model.
    """
    lora_settings = {
        "r": CFG.lora_r,
        "lora_alpha": CFG.lora_alpha,
        "lora_dropout": CFG.lora_dropout,
        "target_modules": CFG.target_modules,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    adapter_cfg = LoraConfig(**lora_settings)
    prepared = prepare_model_for_kbit_training(model)
    return get_peft_model(prepared, adapter_cfg)


# Load the base model once and attach the adapter used by the single-run cells.
tok, base = load_qlora_base(CFG.base_model)
model = attach_lora(base)

In [None]:
# Cell: Convert labels to text and prepare SFT (source, target) pairs
# Label value placed on prompt positions so they are excluded from the loss.
IGNORE_INDEX = -100


def label_to_text(idx: int) -> str:
    """Return the label string registered under integer id *idx* in ``idx2label``."""
    label_id = int(idx)
    return idx2label[label_id]

def make_sft_pairs(df):
    """Build parallel lists of prompts and target label strings from *df*.

    Args:
        df: Frame with an ``input_text`` column (prompt) and a ``y`` column
            (integer label id).

    Returns:
        ``(sources, targets)``: two lists of equal length, in row order.
    """
    # Read the two columns directly; iterrows() would build a Series per row.
    sources = df["input_text"].tolist()
    targets = [label_to_text(int(y)) for y in df["y"].tolist()]
    return sources, targets

def tokenize_sft(sources, targets, tok, max_source_len=1024, max_label_len=32, pad_to_longest=True):
    """Tokenize (prompt, label) pairs into causal-LM training examples.

    Each example is ``prompt_ids + label_ids + [eos]``. Prompt positions get
    ``IGNORE_INDEX`` as their label so only the label tokens and the EOS
    contribute to the loss.

    Prompt and label are tokenized *without* padding and concatenated first;
    padding, when requested, is appended after the EOS. Padding each side
    before concatenation would put pad tokens between the prompt and its
    label and inside the label itself.

    Args:
        sources: Prompt strings.
        targets: Label strings, one per prompt.
        tok: Tokenizer; must be callable on a list of strings and expose
            ``eos_token_id`` and ``pad_token_id``.
        max_source_len: Maximum length of a whole example. The prompt is
            truncated from the right to leave room for the label and EOS.
        max_label_len: Maximum number of label tokens (before the EOS).
        pad_to_longest: When true (default) every example is right-padded to
            the longest one so all examples share one length. Pass ``False``
            to keep ragged examples for a collator that pads per batch.

    Returns:
        Dict with ``input_ids``, ``labels`` and ``attention_mask``, each a
        list of integer lists.

    Raises:
        ValueError: If ``sources`` and ``targets`` differ in length.
    """
    if len(sources) != len(targets):
        raise ValueError(
            f"sources and targets must have the same length, got {len(sources)} and {len(targets)}"
        )
    if len(sources) == 0:
        return {"input_ids": [], "labels": [], "attention_mask": []}

    src_batch = tok(sources, padding=False, truncation=True, max_length=max_source_len)["input_ids"]
    tgt_batch = tok(targets, padding=False, truncation=True, max_length=max_label_len, add_special_tokens=False)["input_ids"]

    input_ids, labels, attention_mask = [], [], []
    for src_ids, tgt_ids in zip(src_batch, tgt_batch):
        tgt_ids = list(tgt_ids) + [tok.eos_token_id]
        # Keep the label intact; trim the prompt so the total fits the limit.
        room = max(0, max_source_len - len(tgt_ids))
        src_ids = list(src_ids)[:room]
        input_ids.append(src_ids + tgt_ids)
        labels.append([IGNORE_INDEX] * len(src_ids) + tgt_ids)
        attention_mask.append([1] * (len(src_ids) + len(tgt_ids)))

    if pad_to_longest:
        pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
        width = max(len(ids) for ids in input_ids)
        for ids, lbls, mask in zip(input_ids, labels, attention_mask):
            gap = width - len(ids)
            ids.extend([pad_id] * gap)
            lbls.extend([IGNORE_INDEX] * gap)  # padding never contributes to the loss
            mask.extend([0] * gap)

    return {"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask}

class LMData(torch.utils.data.Dataset):
    """Map-style dataset over a dict of equally long per-example lists."""

    def __init__(self, enc):
        # enc maps a field name to a list holding one entry per example.
        self.enc = enc

    def __len__(self):
        return len(self.enc["input_ids"])

    def __getitem__(self, i):
        # Convert the i-th entry of every field to a tensor.
        example = {}
        for field_name, column in self.enc.items():
            example[field_name] = torch.tensor(column[i])
        return example

# Build (prompt, label-text) pairs from the full training frame, tokenize them
# with the length limits from CFG, and wrap the encodings in a dataset.
tr_src, tr_tgt = make_sft_pairs(train)
tr_enc = tokenize_sft(tr_src, tr_tgt, tok, max_source_len=CFG.max_source_len, max_label_len=CFG.max_label_len)
tr_ds  = LMData(tr_enc)

In [None]:
# Cell: Single-run training (Trainer) and save LoRA adapter
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
os.makedirs(CFG.out_dir, exist_ok=True)
out_dir = os.path.join(CFG.out_dir, f"{CFG.run_name}_{CFG.base_model.split('/')[-1]}")

# Choose mixed precision from the device capability instead of its name:
# bf16 needs compute capability >= 8.0, older GPUs use fp16, and a CPU-only
# run uses neither (fp16=True without CUDA is rejected by TrainingArguments).
_has_cuda = torch.cuda.is_available()
_use_bf16 = _has_cuda and torch.cuda.get_device_capability(0)[0] >= 8
_use_fp16 = _has_cuda and not _use_bf16

args = TrainingArguments(
    output_dir=out_dir,
    num_train_epochs=CFG.epochs,
    per_device_train_batch_size=CFG.train_bs,
    gradient_accumulation_steps=CFG.grad_accum,
    learning_rate=CFG.lr,
    warmup_ratio=CFG.warmup_ratio,
    weight_decay=CFG.weight_decay,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    eval_strategy="no",
    bf16=_use_bf16,
    fp16=_use_fp16,
    gradient_checkpointing=True,
    report_to="none"
)

# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
# `input_ids`, which throws away the IGNORE_INDEX masking of the prompt.
# DataCollatorForSeq2Seq keeps the provided labels and pads them with
# label_pad_token_id, so the loss covers only the label tokens.
collator = DataCollatorForSeq2Seq(tok, padding=True, label_pad_token_id=IGNORE_INDEX)
trainer = Trainer(model=model, args=args, train_dataset=tr_ds, data_collator=collator)
trainer.train()
trainer.save_model(out_dir)
print("Saved LoRA adapter to:", out_dir)

In [None]:
# Cell: Decode predictions to label id and HF generate top-3 helper
def decode_to_label_id(text):
    """Map generated text to a label id from ``label2idx``.

    An exact match (after stripping whitespace) wins. Otherwise the longest
    known label that the text starts with is used, so a label that is a
    prefix of another label cannot shadow the longer one.

    Returns:
        The integer label id, or ``None`` when no known label matches.
    """
    s = text.strip()
    if s in label2idx:
        return label2idx[s]
    prefix_matches = [lbl for lbl in label2idx if s.startswith(lbl)]
    if not prefix_matches:
        return None
    return label2idx[max(prefix_matches, key=len)]

def predict_top3_hf(prompts, model, tok, max_new_tokens=16):
    """Generate three ranked label ids for every prompt with beam search.

    Each prompt is decoded on its own with 3 beams / 3 returned sequences;
    every sequence is mapped to a label id with ``decode_to_label_id`` and
    duplicates are dropped while keeping beam order.

    When fewer than three distinct labels are decoded, the remaining slots
    are filled with the lowest label ids not already present rather than
    with copies of the first prediction, since a duplicate can never score
    in MAP@3. (In this notebook ids follow the order of ``kept_labels``.)

    Args:
        prompts: Iterable of prompt strings.
        model: Causal LM exposing ``eval``, ``parameters`` and ``generate``.
        tok: Tokenizer matching ``model``.
        max_new_tokens: Generation budget per prompt.

    Returns:
        A list with one ``[id1, id2, id3]`` list per prompt.
    """
    preds = []
    model.eval()
    device = next(model.parameters()).device
    n_labels = len(label2idx)
    for p in prompts:
        ins = tok(p, return_tensors="pt").to(device)
        prompt_len = ins["input_ids"].shape[1]
        with torch.no_grad():
            gen = model.generate(
                **ins,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                num_beams=3,
                num_return_sequences=3,
                early_stopping=True
            )
        # Strip the echoed prompt; only the continuation carries the label.
        outs = [tok.decode(seq[prompt_len:], skip_special_tokens=True) for seq in gen]
        uniq = []
        for t in outs:
            lid = decode_to_label_id(t)
            if lid is not None and lid not in uniq:
                uniq.append(lid)
        # Back-fill with distinct labels so every slot can still score.
        for cand in range(n_labels):
            if len(uniq) >= 3:
                break
            if cand not in uniq:
                uniq.append(cand)
        # Fewer than three labels exist in total: repeating is unavoidable.
        while len(uniq) < 3:
            uniq.append(uniq[0] if uniq else 0)
        preds.append(uniq[:3])
    return preds

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Holdout MAP@3: 0.4317


In [None]:
# Cell: Small hold-out evaluation and write submission
from sklearn.model_selection import train_test_split

# NOTE(review): the model used here was trained on `tr_ds`, which is built
# from the whole `train` frame, so the rows in `va` appear to have been seen
# during training — treat this score as optimistic and confirm.
tr, va = train_test_split(
    train,
    test_size=0.1,
    stratify=train["y"],
    random_state=CFG.seed,
)

va_top3 = predict_top3_hf(va["input_text"].tolist(), model, tok)
val_map = map_at_3(va["y"].tolist(), va_top3)
print(f"Holdout MAP@3: {val_map:.4f}")

# Predict on the test set and write submission.csv
test_top3 = predict_top3_hf(test["input_text"].tolist(), model, tok)


def id_to_label_text(idx):
    """Return the label string for integer label id *idx*."""
    return idx2label[int(idx)]


# One row per test example: its id and the three labels joined by spaces.
rows = [
    {"row_id": row_id, "Category:Misconception": " ".join([id_to_label_text(p) for p in preds])}
    for row_id, preds in zip(test["row_id"].tolist(), test_top3)
]

sub = pd.DataFrame(rows, columns=["row_id","Category:Misconception"])
sub.to_csv("submission.csv", index=False)
print("Wrote submission.csv")
sub.head()

In [None]:
# Cell: Optional - 5-fold cross-validation loop (runs full train per fold; heavy)
import gc
from sklearn.model_selection import StratifiedKFold
from transformers import DataCollatorForSeq2Seq

cv_scores = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=CFG.seed)

# Choose mixed precision from the device capability instead of its name:
# bf16 needs compute capability >= 8.0, older GPUs use fp16, CPU uses neither.
_cv_has_cuda = torch.cuda.is_available()
_cv_use_bf16 = _cv_has_cuda and torch.cuda.get_device_capability(0)[0] >= 8
_cv_use_fp16 = _cv_has_cuda and not _cv_use_bf16

for fold, (tr_idx, va_idx) in enumerate(skf.split(train, train["y"])):
    print(f"\n=== Fold {fold+1}/{skf.get_n_splits()} ===")
    tr_df = train.iloc[tr_idx].reset_index(drop=True)
    va_df = train.iloc[va_idx].reset_index(drop=True)

    # Every fold starts from a freshly loaded base model and a new adapter.
    tok_f, base_f = load_qlora_base(CFG.base_model)
    model_f = attach_lora(base_f)

    tr_src, tr_tgt = make_sft_pairs(tr_df)
    va_src, va_tgt = make_sft_pairs(va_df)
    tr_enc = tokenize_sft(tr_src, tr_tgt, tok_f, CFG.max_source_len, CFG.max_label_len)
    va_enc = tokenize_sft(va_src, va_tgt, tok_f, CFG.max_source_len, CFG.max_label_len)

    tr_ds, va_ds = LMData(tr_enc), LMData(va_enc)
    out_dir_f = os.path.join(CFG.out_dir, f"{CFG.run_name}_{CFG.base_model.split('/')[-1]}_fold{fold}")

    args_f = TrainingArguments(
        output_dir=out_dir_f,
        num_train_epochs=CFG.epochs,
        per_device_train_batch_size=CFG.train_bs,
        gradient_accumulation_steps=CFG.grad_accum,
        learning_rate=CFG.lr,
        warmup_ratio=CFG.warmup_ratio,
        weight_decay=CFG.weight_decay,
        logging_steps=20,
        eval_strategy="steps",
        eval_steps=200,
        save_steps=200,
        save_total_limit=1,
        bf16=_cv_use_bf16,
        fp16=_cv_use_fp16,
        gradient_checkpointing=True,
        report_to="none"
    )
    # Keep the prompt-masked labels produced by tokenize_sft; the plain LM
    # collator would overwrite them with a copy of input_ids.
    collator_f = DataCollatorForSeq2Seq(tok_f, padding=True, label_pad_token_id=IGNORE_INDEX)
    trainer_f = Trainer(model=model_f, args=args_f, train_dataset=tr_ds, eval_dataset=va_ds, data_collator=collator_f)
    trainer_f.train()
    trainer_f.save_model(out_dir_f)

    model_f.eval()
    va_top3 = predict_top3_hf(va_df["input_text"].tolist(), model_f, tok_f)
    fold_map = map_at_3(va_df["y"].tolist(), va_top3)
    cv_scores.append(fold_map)
    # Report the fold with the same 1-based number as the header above.
    print(f"Fold {fold+1} MAP@3: {fold_map:.4f}")

    # Release this fold's model before the next one is loaded; otherwise
    # every fold's weights stay referenced and GPU memory keeps growing.
    del trainer_f, model_f, base_f
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("CV MAP@3:", [f"{s:.4f}" for s in cv_scores], "avg:", f"{np.mean(cv_scores):.4f}")