In [16]:
!pip install -U transformers 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [1]:
import os
import pandas as pd
import torch
from datasets import Dataset
from dotenv import load_dotenv
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

load_dotenv()
hf_token = os.getenv("HUGGINGFACE_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ====== Load dataset ======
def load_partition(path: str) -> Dataset:
    df = pd.read_csv(path).head(10)
    return Dataset.from_pandas(df)

dataset = load_partition("../Student_Training_Data/GPT.csv") ## should be GPT.csv
print(f"Loaded {len(dataset)} samples from dataset.") 

Loaded 10 samples from dataset.


In [4]:
# ====== Tokenizer & Model Setup ======
model_id = "google/gemma-3-1b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config) # TODO Why getting PEFT model? Paper and Reference notebook did not use


In [None]:
# ====== Format data ======
def format_for_distillation(examples):
    prompts, reasonings = [], []
    for text, reasoning, classification in zip(examples["string"], examples["reasoning"], examples["model_classification"]): # TODO Rows are: sectionName,string,unique_id,model_classification,reasoning
        prompt = (f"Classify the following scientific text as one of [background, method, result].\n\n"
                  f"Text: {text}\n"
                  f"Provide your classification and reasoning in JSON format.")
        response = f'{{"classification": "{classification}", "reasoning": "{reasoning}"}}'
        full_text = prompt + " " + response # TODO The reasoning and response should not be tokenized with the inputs. Rationale should be tokenized separately and the label (classification) does not even need to be tokenized, as we will just CE loss on it.
        tokenized = tokenizer(full_text, padding="max_length", truncation=True, max_length=512, return_tensors="pt") 

        input_ids = tokenized["input_ids"][0]
        labels = input_ids.clone() # TODO The labels should be the classification, not the entire input_ids, much less the entire tokenized input (prompt + responses) 

        start_index = len(tokenizer(prompt)["input_ids"])
        labels[:start_index] = -100

        prompts.append({
            "input_ids": input_ids, # indices of tokens in the tokenizer's vocabulary
            "attention_mask": tokenized["attention_mask"][0],
            "labels": labels, # TODO labels need to be converted to numerical classification for CrossEntropy loss later.
            "student_reasoning": reasoning
            # TODO Requires adding of the reasoning (tokenized) as well as it's input_ids. 
        })

    return {
        "input_ids": torch.stack([p["input_ids"] for p in prompts]),
        "attention_mask": torch.stack([p["attention_mask"] for p in prompts]),
        "labels": torch.stack([p["labels"] for p in prompts]),
        "student_reasoning": [p["student_reasoning"] for p in prompts]
    }

tokenized_dataset = dataset.map(format_for_distillation, batched=True, remove_columns=["unique_id"]) 

Map: 100%|██████████| 10/10 [00:00<00:00, 310.73 examples/s]


In [None]:
# ====== Training Args ======
training_args = TrainingArguments(
    output_dir="gemma3-phase1",
    num_train_epochs=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    max_steps=10,  
    logging_steps=1,
    save_strategy="no",
    remove_unused_columns=False,
    max_grad_norm=1.0,
    report_to="none"
)

trainer = Trainer( # TODO Need to specify the loss function for the trainer. 
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
) 

trainer.train() # TODO Why is training here when the loss function is defined below? 
model.save_pretrained("gemma3-phase1")
tokenizer.save_pretrained("gemma3-phase1")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,12.6432
2,14.2011
3,12.4591
4,13.9257
5,13.1866
6,12.7412
7,11.1731
8,13.4426
9,10.3986
10,13.4266


('gemma3-phase1/tokenizer_config.json',
 'gemma3-phase1/special_tokens_map.json',
 'gemma3-phase1/tokenizer.json')

In [7]:
import torch.nn.functional as F

class ReasoningDistiller(Trainer): 
    def __init__(self, *args, reasoning_weight=0.5, use_reasoning_loss=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.reasoning_weight = reasoning_weight
        self.use_reasoning_loss = use_reasoning_loss

        self.reasoning_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.reasoning_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model( # TODO Forward pass needs to be on prompt and citation without the teacher response and classification
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=inputs["labels"]
        )
        
        ce_loss = outputs.loss

        if self.use_reasoning_loss and "student_reasoning" in inputs:
            try:
                generated = model.generate(inputs["input_ids"], max_length=512)
                decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
                student_reasonings = [self.extract_reasoning(txt) for txt in decoded]
                teacher_reasonings = inputs["student_reasoning"]

                student_embeds = self.get_embeddings(student_reasonings)
                teacher_embeds = self.get_embeddings(teacher_reasonings)
                cosine_loss = 1 - F.cosine_similarity(student_embeds, teacher_embeds).mean()
                total_loss = ce_loss + self.reasoning_weight * cosine_loss
            except Exception as e:
                print(f"Skipping cosine loss due to error: {e}")
                total_loss = ce_loss
        else:
            total_loss = ce_loss

        return (total_loss, outputs) if return_outputs else total_loss

    def extract_reasoning(self, text):
        match = re.search(r'"reasoning"\s*:\s*"(.+?)"\s*}', text)
        return match.group(1).strip() if match else ""

    def get_embeddings(self, texts):
        inputs = self.reasoning_tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        with torch.no_grad():
            return self.reasoning_model(**inputs).last_hidden_state[:, 0, :]

In [None]:
# querying the trained model


In [8]:
from peft import PeftModel

model = AutoModelForCausalLM.from_pretrained("llama-student-phase1")
model = PeftModel.from_pretrained(model, "llama-student-phase1")

trainer = ReasoningDistiller(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    reasoning_weight=0.5,
    use_reasoning_loss=True
)

trainer.train()
model.save_pretrained("llama-student-phase2")
tokenizer.save_pretrained("llama-student-phase2")

OSError: llama-student-phase1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`