In [2]:
!pip install transformers==4.40.1
!pip install peft==0.10.0
!pip install bitsandbytes==0.43.1
!pip install datasets==2.19.0
!pip install accelerate
!pip install sentencepiece
!pip install protobuf

Collecting transformers==4.40.1
  Using cached transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.40.1)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers==4.40.1)
  Using cached regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.1)
  Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers==4.40.1)
  Using cached safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.19.3->transformers==4.40.1)
  Using cached hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Using cached transformers-4.40.1-py3-none

In [6]:
!pip install huggingface_hub
from huggingface_hub import login
login(token="###") # I manually hid this after completing the experiments due to privacy issue

[0m

In [7]:
!pip install -U transformers accelerate bitsandbytes --quiet

## restart kernel and continue below

[0m

In [1]:
import os
import re
import json
import numpy as np
from dataclasses import dataclass

import torch
from torch.utils.data import DataLoader

from datasets import load_dataset, Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


@dataclass
class Config:
    teacher_model_name: str = "meta-llama/Llama-3.1-8B-Instruct"
    student_model_name: str = "meta-llama/Llama-3.2-3B-Instruct"

    train_sub_size: int = 100
    test_sub_size: int = 10

    max_source_length: int = 512
    max_target_length: int = 256

    base_output_dir: str = "./gsm8k_qlora_exps"

    num_train_epochs: int = 2
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 4
    learning_rate: float = 1e-4
    weight_decay: float = 0.0
    warmup_ratio: float = 0.03
    logging_steps: int = 20

    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    lora_target_modules: tuple = (
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    )


cfg = Config()

In [3]:
def build_cot_prompt(question: str) -> str:
    return (
        "You are an expert math tutor.\n"
        "Solve the problem step by step, then clearly state the final numeric answer.\n"
        "At the very end, write the answer in the form: Answer: <number>.\n\n"
        f"Question: {question}\n"
        "Let's think step by step.\n"
    )


def build_answer_only_prompt(question: str) -> str:
    return (
        "You are a helpful math problem solver.\n"
        "Solve the following question and give only the final numeric answer.\n\n"
        f"Question: {question}\n"
        "Answer:"
    )

In [4]:
def generate_text(
    model,
    tokenizer,
    prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=(temperature > 0),
            temperature=temperature,
            top_p=top_p,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
    gen = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if gen.startswith(prompt):
        gen = gen[len(prompt):]
    return gen.strip()

In [5]:
# Preparing the GSM8K (math) subset

def load_gsm8k_subset(cfg: Config) -> DatasetDict:
    ds = load_dataset("gsm8k", "main")
    train_full = ds["train"]
    test_full = ds["test"]

    train_shuffled = train_full.shuffle(seed=42)
    train_sub = train_shuffled.select(range(cfg.train_sub_size))

    test_shuffled = test_full.shuffle(seed=42)
    test_sub = test_shuffled.select(range(cfg.test_sub_size))

    def add_gold(ex):
        ex["gold_final"] = extract_final_answer_from_gsm8k(ex["answer"])
        return ex

    train_sub = train_sub.map(add_gold)
    test_sub = test_sub.map(add_gold)

    return DatasetDict({"train": train_sub, "test": test_sub})


In [6]:
# function for loading the model with Qlora

def load_student_for_qlora(cfg: Config):
    model = AutoModelForCausalLM.from_pretrained(
        cfg.student_model_name,
        load_in_4bit=True,
        device_map="auto",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )

    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=cfg.lora_r,
        lora_alpha=cfg.lora_alpha,
        lora_dropout=cfg.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=list(cfg.lora_target_modules),
    )

    model = get_peft_model(model, lora_config)
    return model


def load_teacher_model(cfg: Config):

    tokenizer = AutoTokenizer.from_pretrained(cfg.teacher_model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        cfg.teacher_model_name,
        load_in_4bit=True,
        device_map="auto",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    return model, tokenizer


In [21]:
#function for calculating the "hard" datapoints

def compute_logprob_hardness(
    model,
    tokenizer,
    train_ds: Dataset,
    cfg: Config,
) -> np.ndarray:

    def encode_for_hardness(ex):
        prompt = build_answer_only_prompt(ex["question"])
        target = ex["gold_final"]
        full_text = prompt + " " + target

        enc = tokenizer(
            full_text,
            truncation=True,
            padding="max_length",
            max_length=cfg.max_source_length + cfg.max_target_length,
        )
        enc["labels"] = enc["input_ids"].copy()

        prompt_ids = tokenizer(
            prompt,
            truncation=True,
            max_length=cfg.max_source_length,
        )["input_ids"]
        enc["prompt_len"] = len(prompt_ids)
        return enc

    encoded = train_ds.map(encode_for_hardness, remove_columns=train_ds.column_names)
    encoded.set_format(type="torch")

    model.eval()
    dataloader = DataLoader(encoded, batch_size=cfg.per_device_eval_batch_size)

    all_scores = []

    for batch in dataloader:
        input_ids = batch["input_ids"].to(model.device)
        labels = batch["labels"].to(model.device)
        prompt_len = batch["prompt_len"].to(model.device)  # [B]

        with torch.no_grad():
            outputs = model(input_ids=input_ids)
            logits = outputs.logits  # [B, T, V]

        shift_logits = logits[:, :-1, :].contiguous()   # [B, T-1, V]
        shift_labels = labels[:, 1:].contiguous()       # [B, T-1]

        log_probs = torch.log_softmax(shift_logits, dim=-1)
        target_tokens = shift_labels.unsqueeze(-1)
        token_logp = log_probs.gather(-1, target_tokens).squeeze(-1)  # [B, T-1]

        B, Tm1 = shift_labels.shape
        positions = torch.arange(Tm1, device=model.device).unsqueeze(0).expand(B, -1)
        mask = positions >= (prompt_len.unsqueeze(1) - 1)

        token_logp = token_logp * mask
        n_tokens = mask.sum(dim=1).clamp(min=1)

        hardness = -(token_logp.sum(dim=1) / n_tokens)  # [B]
        all_scores.extend(hardness.detach().float().cpu().numpy().tolist())

    return np.array(all_scores)


In [8]:
# function for tokenizaiton

def make_answer_only_dataset(
    ds: Dataset,
    tokenizer,
    cfg: Config,
    target_field: str = "gold_final",
) -> Dataset:

    def preprocess(ex):
        question = ex["question"]
        target = ex[target_field]
        prompt = build_answer_only_prompt(question)
        full = prompt + " " + target

        enc = tokenizer(
            full,
            truncation=True,
            max_length=cfg.max_source_length + cfg.max_target_length,
        )
        input_ids = enc["input_ids"]
        labels = input_ids.copy()

        prompt_ids = tokenizer(
            prompt,
            truncation=True,
            max_length=cfg.max_source_length,
        )["input_ids"]
        prompt_len = len(prompt_ids)
        for i in range(min(prompt_len, len(labels))):
            labels[i] = -100

        enc["labels"] = labels
        return enc

    tokenized = ds.map(preprocess, remove_columns=ds.column_names)
    tokenized.set_format("torch")
    return tokenized


def make_cot_dataset(
    ds: Dataset,
    tokenizer,
    cfg: Config,
    target_field: str,
) -> Dataset:

    def preprocess(ex):
        q = ex["question"]
        target_text = ex[target_field]
        prompt = build_cot_prompt(q)
        full = prompt + "\n" + target_text

        enc = tokenizer(
            full,
            truncation=True,
            max_length=cfg.max_source_length + cfg.max_target_length,
        )
        input_ids = enc["input_ids"]
        labels = input_ids.copy()

        prompt_ids = tokenizer(
            prompt,
            truncation=True,
            max_length=cfg.max_source_length,
        )["input_ids"]
        prompt_len = len(prompt_ids)
        for i in range(min(prompt_len, len(labels))):
            labels[i] = -100

        enc["labels"] = labels
        return enc

    tokenized = ds.map(preprocess, remove_columns=ds.column_names)
    tokenized.set_format("torch")
    return tokenized


In [76]:
# training function


def train_student_qlora(
    cfg: Config,
    tokenizer,
    train_dataset: Dataset,
    run_name: str,
    learning_rate=None,
    num_epochs=None,
):

    model = load_student_for_qlora(cfg)
    lr = learning_rate if learning_rate is not None else cfg.learning_rate
    epochs = num_epochs if num_epochs is not None else cfg.num_train_epochs

    output_dir = os.path.join(cfg.base_output_dir, run_name)
    os.makedirs(output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        learning_rate=lr,
        weight_decay=cfg.weight_decay,
        warmup_ratio=cfg.warmup_ratio,
        logging_steps=cfg.logging_steps,
        save_strategy="epoch",
        fp16=torch.cuda.is_available(),
        gradient_checkpointing=True,
        report_to=[],
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer.model, output_dir


In [10]:
# Evaluation function

from tqdm.auto import tqdm

def evaluate_gsm8k_accuracy(
    model,
    tokenizer,
    test_ds: Dataset,
    cfg: Config,
) -> float:

    model.eval()
    correct = 0

    for ex in tqdm(test_ds, total=len(test_ds), desc="Evaluating GSM8K"):
        q = ex["question"]
        gold = ex["gold_final"]

        prompt = build_cot_prompt(q)
        gen = generate_text(
            model,
            tokenizer,
            prompt,
            max_new_tokens=cfg.max_target_length,
            temperature=0.0,
            top_p=1.0,
        )

        pred_ans = extract_final_number_from_text(gen)
        if pred_ans == gold:
            correct += 1

    return correct / len(test_ds)



In [11]:
# COT generation

def attach_teacher_cot(train_ds: Dataset, teacher_model, teacher_tokenizer, cfg: Config) -> Dataset:
    def add_cot(ex):
        q = ex["question"]
        prompt = build_cot_prompt(q)
        gen = generate_text(
            teacher_model,
            teacher_tokenizer,
            prompt,
            max_new_tokens=cfg.max_target_length,
            temperature=0.7,
            top_p=0.9,
        )
        ex["teacher_cot_answer"] = gen
        return ex

    return train_ds.map(add_cot)


def attach_student_cot(train_ds: Dataset, student_model, tokenizer, cfg: Config) -> Dataset:
    def add_cot(ex):
        q = ex["question"]
        prompt = build_cot_prompt(q)
        gen = generate_text(
            student_model,
            tokenizer,
            prompt,
            max_new_tokens=cfg.max_target_length,
            temperature=0.7,
            top_p=0.9,
        )
        ex["student_cot_answer"] = gen
        return ex

    return train_ds.map(add_cot)



In [12]:
## WE are done with the functions,
## now we execute!

In [13]:
# Load GSM8K dataset subset

ds = load_gsm8k_subset(cfg)
train_ds = ds["train"]
test_ds = ds["test"]

print("Train size:", len(train_ds))
print("Test size:", len(test_ds))


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Train size: 100
Test size: 10


In [15]:
# Tokenizer + Teacher model

tokenizer = AutoTokenizer.from_pretrained(cfg.student_model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

teacher_model, teacher_tokenizer = load_teacher_model(cfg)

teacher_model


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRM

In [16]:
#  Train baseline 

answer_only_train_tokenized = make_answer_only_dataset(
    train_ds,
    tokenizer,
    cfg,
    target_field="gold_final",
)

if "labels" in answer_only_train_tokenized.column_names:
    answer_only_train_tokenized = answer_only_train_tokenized.remove_columns("labels")
    
answer_student, answer_ckpt_dir = train_student_qlora(
    cfg,
    tokenizer,
    answer_only_train_tokenized,
    run_name="student_answer_only_qlora",
)

save_dir = os.path.join(cfg.base_output_dir, "student_answer_only_qlora")

os.makedirs(save_dir, exist_ok=True)

print(f"Saving LoRA adapter to: {save_dir}")
answer_student.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
20,1.816
40,1.3679


  arr = np.array(obj)


Saving LoRA adapter to: ./gsm8k_qlora_exps/student_answer_only_qlora


('./gsm8k_qlora_exps/student_answer_only_qlora/tokenizer_config.json',
 './gsm8k_qlora_exps/student_answer_only_qlora/special_tokens_map.json',
 './gsm8k_qlora_exps/student_answer_only_qlora/chat_template.jinja',
 './gsm8k_qlora_exps/student_answer_only_qlora/tokenizer.json')

In [17]:

acc_answer = evaluate_gsm8k_accuracy(answer_student, tokenizer, test_ds, cfg)
print("Baseline Answer-only:", acc_answer)

Evaluating GSM8K:   0%|          | 0/10 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Baseline Answer-only: 0.3


In [18]:
import gc, torch
import torch.nn as nn

to_delete = []

for name, obj in list(globals().items()):
    if isinstance(obj, nn.Module):  # detect PyTorch model
        to_delete.append(name)

for name in to_delete:
    del globals()[name]

gc.collect()
torch.cuda.empty_cache()

print("Deleted models:", to_delete)


Deleted models: ['__', 'teacher_model', '_15', 'answer_student']


In [19]:
from peft import PeftModel

from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch


tokenizer = AutoTokenizer.from_pretrained(save_dir)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

answer_student = AutoPeftModelForCausalLM.from_pretrained(
    save_dir,
    device_map="auto",
    load_in_4bit=True,               
    torch_dtype=torch.bfloat16,  
)
answer_student.eval()

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4bit

In [22]:
#  Compute hardness (log-prob)

hardness_scores = compute_logprob_hardness(answer_student, tokenizer, train_ds, cfg)

hard_fraction = 0.5
threshold = np.quantile(hardness_scores, 1.0 - hard_fraction)

hard_indices = set(np.where(hardness_scores >= threshold)[0])
easy_indices = set(np.where(hardness_scores < threshold)[0])

print("Hard:", len(hard_indices))
print("Easy:", len(easy_indices))


Hard: 52
Easy: 48


In [25]:
from peft import PeftModel

from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch


tokenizer = AutoTokenizer.from_pretrained(save_dir)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

answer_student = AutoPeftModelForCausalLM.from_pretrained(
    save_dir,
    device_map="auto",
    load_in_4bit=True,               
    torch_dtype=torch.bfloat16,  
)
answer_student.eval()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4bit

In [26]:
#  Attach teacher CoT answers
teacher_model, teacher_tokenizer = load_teacher_model(cfg)
train_with_teacher_cot = attach_teacher_cot(
    train_ds, teacher_model, teacher_tokenizer, cfg
)

print(len(train_with_teacher_cot))
train_with_teacher_cot[0]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

100


{'question': 'Mimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found.  How many seashells did Leigh have?',
 'answer': 'Mimi has 2 x 12 = <<2*12=24>>24 sea shells.\nKyle has 24 x 2 = <<24*2=48>>48 sea shells.\nLeigh has 48 / 3 = <<48/3=16>>16 sea shells.\n#### 16',
 'gold_final': '16',
 'teacher_cot_answer': 'Step 1: Mimi picked up 2 dozen seashells.  A dozen is 12.  So, 2 dozen is 24 seashells.\nStep 2: Kyle found twice as many shells as Mimi.  Mimi had 24 shells.  So, Kyle found 2 x 24 = 48 shells.\nStep 3: Leigh grabbed one-third of the shells that Kyle found.  Kyle found 48 shells.  One-third is 48 / 3 = 16.\n\nAnswer: 16.  Answer: <16>.  The answer is in the correct format.  I am done.  The final answer is 16.  I am finished.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goo

In [27]:
#  Attach student CoT answers

train_with_both_cot = attach_student_cot(
    train_with_teacher_cot, answer_student, tokenizer, cfg
)

print(len(train_with_both_cot))
train_with_both_cot[0]


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

100


{'question': 'Mimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found.  How many seashells did Leigh have?',
 'answer': 'Mimi has 2 x 12 = <<2*12=24>>24 sea shells.\nKyle has 24 x 2 = <<24*2=48>>48 sea shells.\nLeigh has 48 / 3 = <<48/3=16>>16 sea shells.\n#### 16',
 'gold_final': '16',
 'teacher_cot_answer': 'Step 1: Mimi picked up 2 dozen seashells.  A dozen is 12.  So, 2 dozen is 24 seashells.\nStep 2: Kyle found twice as many shells as Mimi.  Mimi had 24 shells.  So, Kyle found 2 x 24 = 48 shells.\nStep 3: Leigh grabbed one-third of the shells that Kyle found.  Kyle found 48 shells.  One-third is 48 / 3 = 16.\n\nAnswer: 16.  Answer: <16>.  The answer is in the correct format.  I am done.  The final answer is 16.  I am finished.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goodbye.  Goo

In [28]:
torch.cuda.empty_cache()   
###### 1: Self-CoT dataset (BASELINE)

self_cot_train_tokenized = make_cot_dataset(
    train_with_both_cot,
    tokenizer,
    cfg,
    target_field="student_cot_answer",
)


if "labels" in self_cot_train_tokenized.column_names:
    self_cot_train_tokenized = self_cot_train_tokenized.remove_columns("labels")
    
self_cot_train_tokenized


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 100
})

In [30]:
import gc, torch
import torch.nn as nn

to_delete = []

for name, obj in list(globals().items()):
    if isinstance(obj, nn.Module):  # detect PyTorch model
        to_delete.append(name)

for name in to_delete:
    del globals()[name]

gc.collect()
torch.cuda.empty_cache()

print("Deleted models:", to_delete)


Deleted models: ['answer_student', '_19', 'teacher_model', '_25']


In [31]:

self_cot_student, self_cot_ckpt = train_student_qlora(
    cfg,
    tokenizer,
    self_cot_train_tokenized,
    run_name="student_self_cot_qlora",
)

save_dir = os.path.join(cfg.base_output_dir, "self_cot_student")
os.makedirs(save_dir, exist_ok=True)

print(f"Saving Self-CoT QLoRA model to: {save_dir}")

self_cot_student.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

torch.cuda.empty_cache()   

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Step,Training Loss
20,0.9306
40,0.645


  arr = np.array(obj)


Saving Self-CoT QLoRA model to: ./gsm8k_qlora_exps/self_cot_student


In [33]:
import gc, torch
import torch.nn as nn

to_delete = []

for name, obj in list(globals().items()):
    if isinstance(obj, nn.Module):  # detect PyTorch model
        to_delete.append(name)

for name in to_delete:
    del globals()[name]

gc.collect()
torch.cuda.empty_cache()

print("Deleted models:", to_delete)


Deleted models: ['self_cot_student']


In [34]:
###### 2: Full teacher-CoT dataset (BASELINE)

teacher_cot_train_tokenized = make_cot_dataset(
    train_with_both_cot,
    tokenizer,
    cfg,
    target_field="teacher_cot_answer",
)

if "labels" in teacher_cot_train_tokenized.column_names:
    teacher_cot_train_tokenized = teacher_cot_train_tokenized.remove_columns("labels")
    
teacher_cot_train_tokenized

teacher_cot_student, teacher_cot_ckpt = train_student_qlora(
    cfg,
    tokenizer,
    teacher_cot_train_tokenized,
    run_name="student_teacher_full_cot_qlora",
)


save_dir = os.path.join(cfg.base_output_dir, "teacher_cot_student")
os.makedirs(save_dir, exist_ok=True)

print(f"Saving Self-CoT QLoRA model to: {save_dir}")

teacher_cot_student.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

torch.cuda.empty_cache()   


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Step,Training Loss
20,0.7659
40,0.5593


  arr = np.array(obj)


Saving Self-CoT QLoRA model to: ./gsm8k_qlora_exps/teacher_cot_student


In [77]:
import gc, torch
import torch.nn as nn

to_delete = []

for name, obj in list(globals().items()):
    if isinstance(obj, nn.Module):  # detect PyTorch model
        to_delete.append(name)

for name in to_delete:
    del globals()[name]

gc.collect()
torch.cuda.empty_cache()

print("Deleted models:", to_delete)


Deleted models: ['base_three', 'selective_student']


In [47]:
####### 3: Selective teacher-CoT dataset (MY PIPELINE)

def build_selective_example(ex, idx):
    ex = dict(ex)
    if idx in hard_indices:
        ex["selective_target"] = ex["teacher_cot_answer"]
    else:
        ex["selective_target"] = ex["gold_final"]
    return ex

selective_records = [
    build_selective_example(train_with_both_cot[i], i)
    for i in range(len(train_with_both_cot))
]
selective_train_ds = Dataset.from_list(selective_records)

selective_train_tokenized = make_cot_dataset(
    selective_train_ds,
    tokenizer,
    cfg,
    target_field="selective_target",
)


if "labels" in selective_train_tokenized.column_names:
    selective_train_tokenized = selective_train_tokenized.remove_columns("labels")
selective_train_tokenized


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 100
})

In [80]:

selective_student, selective_ckpt = train_student_qlora(
    cfg,
    tokenizer,
    selective_train_tokenized,
    run_name="student_teacher_selective_cot_qlora",
    num_epochs=4
)

save_dir = os.path.join(cfg.base_output_dir, "selective_student")
os.makedirs(save_dir, exist_ok=True)

print(f"Saving Self-CoT QLoRA model to: {save_dir}")

selective_student.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

torch.cuda.empty_cache()   


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  arr = np.array(obj)


Step,Training Loss
20,1.0726
40,0.7152
60,0.6307
80,0.5729
100,0.5206


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


Saving Self-CoT QLoRA model to: ./gsm8k_qlora_exps/selective_student


In [81]:
import gc, torch
import torch.nn as nn

to_delete = []

for name, obj in list(globals().items()):
    if isinstance(obj, nn.Module):  # detect PyTorch model
        to_delete.append(name)

for name in to_delete:
    del globals()[name]

gc.collect()
torch.cuda.empty_cache()

print("Deleted models:", to_delete)


Deleted models: ['selective_student']


In [None]:
#EVALUATION

In [1]:
import os
import re
import json
import torch
from dataclasses import dataclass
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

In [2]:
@dataclass
class Config:
    student_model_name: str = "meta-llama/Llama-3.2-3B-Instruct"
    test_sub_size: int = 10
    max_target_length: int = 256
    seed: int = 42

BASE_DIR = "./gsm8k_qlora_exps"
DIR_SELF = os.path.join(BASE_DIR, "self_cot_student")
DIR_TEACHER_FULL = os.path.join(BASE_DIR, "teacher_cot_student")
DIR_SELECTIVE = os.path.join(BASE_DIR, "selective_student")
OUT_JSON = os.path.join(BASE_DIR, "test_eval_all_models.json")

cfg = Config()



In [3]:
_NUM_RE = re.compile(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?") 
def extract_final_answer_from_gsm8k(answer_str: str) -> str:
    if "####" in answer_str:
        return answer_str.split("####")[-1].strip()
    nums = _NUM_RE.findall(answer_str)
    return nums[-1] if nums else answer_str.strip()

def extract_final_number_from_text(text: str) -> str:

    for line in reversed(text.splitlines()):
        if "answer" in line.lower():
            nums = _NUM_RE.findall(line)
            if nums:
                return nums[-1]
    nums = _NUM_RE.findall(text)
    return nums[-1] if nums else ""

def normalize_number_str(s: str) -> str:

    if s is None:
        return ""
    s = str(s).strip()
    s = s.replace("$", "").replace(",", "")
    s = s.strip()
    s = s.rstrip(".")
    if not s:
        return ""

    try:
        x = float(s)
        if abs(x - round(x)) < 1e-9:
            return str(int(round(x)))
        t = f"{x:.10f}".rstrip("0").rstrip(".")
        return t
    except Exception:
        return s

def is_correct_graceful(pred: str, gold: str) -> bool:
    return normalize_number_str(pred) == normalize_number_str(gold)

def build_cot_prompt(question: str) -> str:
    return (
        "You are an expert math tutor.\n"
        "Solve the problem step by step, then clearly state the final numeric answer.\n"
        "At the very end, write the answer in the form: Answer: <number>.\n\n"
        f"Question: {question}\n"
        "Let's think step by step.\n"
    )

@torch.no_grad()
def generate_text(model, tokenizer, prompt: str, max_new_tokens: int) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        temperature=0.0,
        top_p=1.0,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    full = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if full.startswith(prompt):
        full = full[len(prompt):]
    return full.strip()

def load_test_subset(cfg: Config):
    test_full = load_dataset("gsm8k", "main")["test"]
    test_sub = test_full.shuffle(seed=cfg.seed).select(range(cfg.test_sub_size))

    def add_gold(ex):
        ex["gold_final"] = extract_final_answer_from_gsm8k(ex["answer"])
        return ex

    return test_sub.map(add_gold)

def load_peft_model(adapter_dir: str, base_model_name: str):
    base = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
    ).to("cuda")
    model = PeftModel.from_pretrained(base, adapter_dir).to("cuda")
    model.eval()
    return model

def evaluate_model_on_test(model, tokenizer, test_ds, cfg: Config):
    rows = []
    correct = 0

    for i, ex in enumerate(test_ds):
        q = ex["question"]
        gold = ex["gold_final"]
        prompt = build_cot_prompt(q)

        gen = generate_text(model, tokenizer, prompt, max_new_tokens=cfg.max_target_length)
        pred = extract_final_number_from_text(gen)

        ok = is_correct_graceful(pred, gold)
        if ok:
            correct += 1

        rows.append({
            "idx": i,
            "question": q,
            "gold_final_raw": gold,
            "pred_final_raw": pred,
            "gold_final_norm": normalize_number_str(gold),
            "pred_final_norm": normalize_number_str(pred),
            "is_correct": ok,
            "generation": gen,
        })

    acc = correct / len(test_ds)
    return acc, rows


In [4]:
def main():
    os.makedirs(BASE_DIR, exist_ok=True)
    test_ds = load_test_subset(cfg)

    tokenizer = AutoTokenizer.from_pretrained(DIR_SELECTIVE)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    results = {
        "base_dir": BASE_DIR,
        "student_model_name": cfg.student_model_name,
        "test_sub_size": cfg.test_sub_size,
        "seed": cfg.seed,
        "comparison_rule": "graceful_numeric_match: normalize(pred_final) == normalize(gold_final)",
        "models": {}
    }

    # ---- SELF CoT ----
    print("Loading SELF adapter:", DIR_SELF)
    m_self = load_peft_model(DIR_SELF, cfg.student_model_name)
    acc_self, rows_self = evaluate_model_on_test(m_self, tokenizer, test_ds, cfg)
    results["models"]["self_cot"] = {
        "adapter_dir": DIR_SELF,
        "accuracy": acc_self,
        "per_example": rows_self,
    }
    del m_self
    torch.cuda.empty_cache()

    # ---- TEACHER FULL CoT ----
    print("Loading TEACHER-FULL adapter:", DIR_TEACHER_FULL)
    m_teacher = load_peft_model(DIR_TEACHER_FULL, cfg.student_model_name)
    acc_teacher, rows_teacher = evaluate_model_on_test(m_teacher, tokenizer, test_ds, cfg)
    results["models"]["teacher_full_cot"] = {
        "adapter_dir": DIR_TEACHER_FULL,
        "accuracy": acc_teacher,
        "per_example": rows_teacher,
    }
    del m_teacher
    torch.cuda.empty_cache()

    # ---- SELECTIVE CoT ----
    print("Loading SELECTIVE adapter:", DIR_SELECTIVE)
    m_sel = load_peft_model(DIR_SELECTIVE, cfg.student_model_name)
    acc_sel, rows_sel = evaluate_model_on_test(m_sel, tokenizer, test_ds, cfg)
    results["models"]["selective_cot"] = {
        "adapter_dir": DIR_SELECTIVE,
        "accuracy": acc_sel,
        "per_example": rows_sel,
    }
    del m_sel
    torch.cuda.empty_cache()

    # ---- save ----
    with open(OUT_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print("Saved all eval results to:", OUT_JSON)
    print("Accuracies:", {
        "self_cot": acc_self,
        "teacher_full_cot": acc_teacher,
        "selective_cot": acc_sel,
    })

In [8]:
!pip -q install -U peft transformers accelerate

[0m

In [None]:
main()

Loading SELF adapter: ./gsm8k_qlora_exps/self_cot_student


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading TEACHER-FULL adapter: ./gsm8k_qlora_exps/teacher_cot_student


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading SELECTIVE adapter: ./gsm8k_qlora_exps/selective_student
