<a href="https://colab.research.google.com/github/Maximi652/efficient-slm-architectures/blob/main/Qwen3-4B_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning LoRA

In [None]:
# libs
!pip install -q transformers datasets peft accelerate

import json
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig
import torch

# Model: path to the base Qwen3-4B checkpoint (presumably on a mounted Google Drive)
model_name = "/content/drive/MyDrive/Colab Notebooks/Qwen3-4B"

# Training data: JSON file; the loader further below reads its top-level "questions" key
json_path = "/content/drive/MyDrive/Colab Notebooks/12B_trainingdata.json"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Use left padding to avoid decoder-only right-padding issues
tokenizer.padding_side = 'left'
# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model: device placement and dtype are left to the library ("auto")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)

# chat Template, /no_think
def format_entry_chat(entry):
    """Convert one question entry into chat-formatted training examples.

    Up to two examples are produced per entry: one whose target is the
    long-form "ideal" answer and one whose target is the short "exact"
    answer. Each example is rendered with the module-level tokenizer's chat
    template, using a "/no_think" system message and ``enable_thinking=False``.

    Args:
        entry: Question dict. Reads ``body`` (required), ``type`` (defaults
            to "factoid"), ``snippets`` (list of dicts with a ``text`` key),
            ``ideal_answer`` (a string or a list of strings; the first list
            element is used) and ``exact_answer`` (a string, a list of
            strings, or a list of synonym lists; the first synonym is used).

    Returns:
        list[dict]: Zero, one or two ``{"text": rendered_chat}`` dicts.
    """
    qtext = entry["body"].strip()
    qtype = entry.get("type", "factoid").lower()
    ctx = "\n".join(s["text"].strip() for s in entry.get("snippets", []))
    prompt_head = f"Question: {qtext}\nContext:\n{ctx}\n"

    def render(instruction, answer):
        """Render one system/user/assistant exchange as a training example."""
        messages = [
            {"role": "system", "content": "/no_think"},
            {"role": "user", "content": prompt_head + instruction},
            {"role": "assistant", "content": answer}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            enable_thinking=False
        )
        return {"text": text}

    results = []

    # Long-form answer. The field may be missing, an empty list, a list of
    # alternative answers, or a plain string; indexing blindly with [0] would
    # crash on the first two and keep only one character of the last.
    ideal = entry.get("ideal_answer") or ""
    if not isinstance(ideal, str):
        ideal = ideal[0] if ideal else ""
    ideal_ans = ideal.strip()
    if ideal_ans:
        if qtype == "yesno":
            instruction = "Provide one-sentence ideal answer in English starting with 'Yes,' or 'No,'."
        else:
            instruction = "Provide an ideal answer in English (one paragraph, max 200 words, full sentences)."
        results.append(render(instruction, ideal_ans))

    # Short answer. A plain string (e.g. a yes/no answer) must be wrapped
    # before flattening, otherwise it is iterated character by character and
    # "yes" becomes "y, e, s".
    exact = entry.get("exact_answer") or []
    if isinstance(exact, str):
        exact = [exact]
    flat_exact = []
    for item in exact:
        if isinstance(item, list):
            # A nested list holds synonyms of one answer: keep the first.
            item = item[0] if item else ""
        if item:
            flat_exact.append(item)
    exact_ans = ", ".join(flat_exact).strip()

    if exact_ans:
        exact_instructions = {
            "yesno": "Answer only 'yes' or 'no', in English, no extras.",
            "factoid": "Provide up to 5 keywords, comma-separated, in English, no commentary.",
            "list": "Provide a comma-separated list of relevant items, in English, no filler words.",
        }
        instruction = exact_instructions.get(qtype, "Provide a brief answer in English.")
        results.append(render(instruction, exact_ans))

    return results

# Load the training questions from disk
with open(json_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)["questions"]

# Each question may yield several chat examples; flatten them into one list
formatted = [
    example
    for question in raw_data
    for example in format_entry_chat(question)
]

# Single-split dataset holding the rendered chat texts
train_split = Dataset.from_list(formatted)
dataset = DatasetDict({"train": train_split})

# tokenisieren
def tokenize(example):
    """Tokenize the ``text`` field with the module-level tokenizer.

    Sequences are truncated to, and padded up to, a fixed length of 1024
    tokens. Returns whatever the tokenizer call returns.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 1024,  # possibly 2048 if GPU memory allows
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize the training split in batches
tokenized = dataset.map(tokenize, batched=True)

# Start LoRA
# NOTE(review): the base model is loaded above without a quantization config;
# prepare_model_for_kbit_training targets k-bit models - confirm it is wanted here.
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()  # save memory

# Low-rank adapters on the attention and MLP projection layers
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, lora_config)

# Training
training_args = TrainingArguments(
    output_dir="./qwen3-4b-lora-nothink",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=3,
    num_train_epochs=3,
    learning_rate=2e-4,
    eval_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=25,
    fp16=True,
    report_to="none"
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

trainer.train()

# Save the adapter and tokenizer; the path is defined once so the final
# message reports the directory that was actually written.
save_dir = "/content/drive/MyDrive/Colab Notebooks/qwen3-4b-lora-nothink"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Training abgeschlossen – Modell unter {save_dir} gespeichert.")

# Inferenz

In [None]:
import torch
import json
import re
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# Model + data
# Load from the folder the training cell writes with save_pretrained. The
# Trainer output_dir ("./qwen3-4b-lora-nothink") is a different location and
# is not where the final adapter and tokenizer are saved.
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/qwen3-4b-lora-nothink"
TEST_PATH  = "/content/drive/MyDrive/Colab Notebooks/12B_golden_testdata.json"
OUTPUT_PATH = "/content/drive/MyDrive/Colab Notebooks/qwen3-4b_lora_predictions.json"

# Tokenizer + model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps every prompt flush against the generated continuation
tokenizer.padding_side = "left"

# NOTE(review): MODEL_PATH holds a LoRA adapter; loading it through
# AutoModelForCausalLM relies on the transformers PEFT integration (peft must
# be installed and the base model path stored in the adapter config must be
# reachable) - confirm in the target environment.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
model.eval()

# Generation config: greedy decoding, at most 200 new tokens
gen_conf = GenerationConfig(
    max_new_tokens=200,
    do_sample=False
)

# Prompt-Builder (wie beim Training)
def build_messages(qtext, snippets, qtype, mode="ideal", max_snippets=2):
    """Build the system/user chat messages for one inference prompt.

    The instruction texts are the same ones used by ``format_entry_chat``
    when building the training examples.

    NOTE(review): by default only the first two snippets are used as context,
    whereas ``format_entry_chat`` joins all snippets - confirm this
    difference is intended, or pass ``max_snippets=None``.

    Args:
        qtext: Question text.
        snippets: List of dicts with a ``text`` key.
        qtype: Question type; compared case-insensitively against "yesno",
            "factoid" and "list".
        mode: "exact" for the short answer prompt; any other value selects
            the long-form ("ideal") prompt.
        max_snippets: Number of leading snippets to include in the context;
            ``None`` includes all of them.

    Returns:
        list[dict]: A "/no_think" system message followed by the user message.
    """
    selected = snippets if max_snippets is None else snippets[:max_snippets]
    ctx = "\n".join(s["text"].strip() for s in selected)
    # Training lowercases the type before choosing an instruction; do the same.
    qtype = qtype.lower()

    if mode == "exact":
        instructions = {
            "yesno": "Answer only 'yes' or 'no', in English, no extras.",
            "factoid": "Provide up to 5 keywords, comma-separated, in English, no commentary.",
            "list": "Provide a comma-separated list of relevant items, in English, no filler words.",
        }
        instruction = instructions.get(qtype, "Provide a brief answer in English.")
    elif qtype == "yesno":
        instruction = "Provide one-sentence ideal answer in English starting with 'Yes,' or 'No,'."
    else:
        instruction = "Provide an ideal answer in English (one paragraph, max 200 words, full sentences)."

    content = f"Question: {qtext}\nContext:\n{ctx}\n{instruction}"
    return [
        {"role": "system", "content": "/no_think"},
        {"role": "user", "content": content}
    ]

# Antworten bereinigen
def clean_output(text, qtype, mode):
    """Post-process a decoded generation into the final prediction.

    Args:
        text: Decoded model output.
        qtype: Question type ("yesno", "factoid", "list", or anything else).
        mode: "exact" for short answers; any other value is treated as the
            long-form ("ideal") mode.

    Returns:
        * exact / yesno: ``"yes"`` if the text starts with "yes"
          (case-insensitive), otherwise ``"no"``.
        * exact / factoid or list: list of non-empty comma-separated items.
        * exact / other types: the cleaned text.
        * ideal / yesno: the first sentence.
        * ideal / other types: leading sentences totalling at most 200
          words; if even the first sentence is longer, its first 200 words.
    """
    # Remove complete think blocks, then any stray opening/closing tag.
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    text = re.sub(r"</?think>", "", text)
    # Strip only after tag removal: whitespace left behind by a removed tag
    # would otherwise defeat the startswith("yes") check below.
    text = text.strip()
    # Drop a trailing filler token such as "Okay", "etc.", "usw." or "...".
    text = re.sub(r"\s*(Okay\.?|etc\.?|usw\.?|\.\.\.)$", "", text, flags=re.IGNORECASE)

    if mode == "exact":
        if qtype == "yesno":
            return "yes" if text.lower().startswith("yes") else "no"
        if qtype in ("factoid", "list"):
            return [part.strip() for part in text.split(",") if part.strip()]
        return text.strip()

    sentences = re.split(r"(?<=[.!?])\s+", text)
    if qtype == "yesno":
        return sentences[0].strip()

    max_words = 200
    kept = []
    total = 0
    for sent in sentences:
        length = len(sent.split())
        if total + length > max_words:
            break
        kept.append(sent)
        total += length
    if not kept:
        # The first sentence alone exceeds the limit: truncate it rather
        # than returning an empty answer.
        return " ".join(text.split()[:max_words])
    return " ".join(kept).strip()

# Load the test questions
with open(TEST_PATH, "r", encoding="utf-8") as f:
    test_data = json.load(f)["questions"]

submission = []
# id -> record in `submission`, so the second pass can attach its prediction
# without rescanning the whole list for every question.
records_by_id = {}
batch_size = 4

# Run inference once per answer mode; both predictions end up on one record
for mode in ["exact", "ideal"]:
    for i in tqdm(range(0, len(test_data), batch_size), desc=f"Inferenz ({mode})"):
        batch = test_data[i:i+batch_size]
        messages = [build_messages(q["body"], q.get("snippets", []), q["type"], mode=mode) for q in batch]
        prompts = [
            tokenizer.apply_chat_template(m, tokenize=False, add_generation_prompt=True, enable_thinking=False)
            for m in messages
        ]
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

        with torch.no_grad():
            outputs = model.generate(**inputs, generation_config=gen_conf)

        # All prompts in the batch are padded to the same length, so
        # everything after that length is newly generated text.
        prompt_len = inputs["input_ids"].shape[1]
        decoded_batch = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)

        for q, decoded in zip(batch, decoded_batch):
            cleaned = clean_output(decoded, q["type"], mode)

            # Add a new record or update the one created by the earlier mode
            qid = q["id"]
            existing = records_by_id.get(qid)
            if existing:
                existing[f"{mode}_prediction"] = cleaned
            else:
                record = {
                    "id": qid,
                    "type": q["type"],
                    "exact_answer": q.get("exact_answer"),
                    "ideal_answer": q.get("ideal_answer"),
                    f"{mode}_prediction": cleaned
                }
                records_by_id[qid] = record
                submission.append(record)

# Write the results
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(submission, f, indent=2, ensure_ascii=False)

print(f"Inferenz abgeschlossen. Ergebnisse gespeichert unter: {OUTPUT_PATH}")