<a href="https://colab.research.google.com/github/Maximi652/efficient-slm-architectures/blob/main/Qwen3-4B_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning LoRA

In [None]:
# libs
!pip install -q transformers datasets peft accelerate

import json
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig
import torch

# Base model checkpoint (local copy under the mounted Drive path)
model_name = "/content/drive/MyDrive/Colab Notebooks/Qwen3-4B"

# Training data (JSON file; its "questions" list is read further below)
json_path = "/content/drive/MyDrive/Colab Notebooks/12B_trainingdata.json"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Use left padding to avoid decoder-only right-padding issues
tokenizer.padding_side = 'left'
# Fall back to the EOS token when the checkpoint defines no dedicated pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)

# Chat template with thinking disabled (/no_think)
def _first_answer_text(value):
    """Return the first answer in *value* (a string or a list), stripped.

    A missing/empty value yields ``""`` instead of raising.
    """
    if isinstance(value, str):
        # Indexing a string with [0] would return only its first character.
        return value.strip()
    return str(value[0]).strip() if value else ""


def _join_exact_answers(value):
    """Flatten an ``exact_answer`` field into one comma-separated string."""
    if isinstance(value, str):
        # A bare string must stay whole; iterating it would yield single characters.
        value = [value]
    flat = []
    for item in value or []:
        if isinstance(item, list):
            # Nested lists: keep only the first entry of each inner list.
            item = item[0] if item else ""
        if item:
            flat.append(str(item))
    return ", ".join(flat).strip()


def format_entry_chat(entry):
    """Turn one question record into chat-formatted training samples.

    Args:
        entry: dict with a ``"body"`` string and optional ``"type"``
            (defaults to ``"factoid"``), ``"snippets"`` (list of dicts with a
            ``"text"`` key), ``"ideal_answer"`` and ``"exact_answer"``.

    Returns:
        A list of ``{"text": ...}`` dicts: one sample for the ideal (long)
        answer and one for the exact (short) answer, each only if that answer
        is present and non-empty. The text is the full conversation (system,
        user, assistant) rendered by the module-level ``tokenizer``'s chat
        template.

    Raises:
        KeyError: if ``entry`` has no ``"body"`` or a snippet has no ``"text"``.
    """
    qtext = entry["body"].strip()
    qtype = entry.get("type", "factoid").lower()
    ctx = "\n".join(s["text"].strip() for s in entry.get("snippets", []))
    prompt_head = f"Question: {qtext}\nContext:\n{ctx}\n"

    def build_sample(instruction, answer):
        # Render the complete conversation, including the assistant reply.
        messages = [
            {"role": "system", "content": "/no_think"},
            {"role": "user", "content": prompt_head + instruction},
            {"role": "assistant", "content": answer},
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            enable_thinking=False,
        )
        return {"text": text}

    results = []

    # Fully worded answer
    ideal_ans = _first_answer_text(entry.get("ideal_answer"))
    if ideal_ans:
        if qtype == "yesno":
            instruction = "Provide one-sentence ideal answer in English starting with 'Yes,' or 'No,'."
        else:
            instruction = "Provide an ideal answer in English (one paragraph, max 200 words, full sentences)."
        results.append(build_sample(instruction, ideal_ans))

    # Short, concise answer
    exact_ans = _join_exact_answers(entry.get("exact_answer"))
    if exact_ans:
        exact_instructions = {
            "yesno": "Answer only 'yes' or 'no', in English, no extras.",
            "factoid": "Provide up to 5 keywords, comma-separated, in English, no commentary.",
            "list": "Provide a comma-separated list of relevant items, in English, no filler words.",
        }
        instruction = exact_instructions.get(qtype, "Provide a brief answer in English.")
        results.append(build_sample(instruction, exact_ans))

    return results

# Load the training data
with open(json_path, encoding="utf-8") as f:
    raw_data = json.load(f)["questions"]

# format_entry_chat returns a list of samples per question; flatten them into one list.
formatted = [sample for entry in raw_data for sample in format_entry_chat(entry)]

dataset = DatasetDict({"train": Dataset.from_list(formatted)})

# Tokenize
def tokenize(example):
    """Tokenize the ``"text"`` field, truncating and padding to 1024 tokens."""
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 1024,  # possibly 2048 if GPU memory allows
    }
    return tokenizer(example["text"], **tokenizer_options)


tokenized = dataset.map(tokenize, batched=True)

# Start LoRA
# NOTE(review): prepare_model_for_kbit_training is named for k-bit (quantized) models, but the
# model is loaded here without a quantization config — confirm this call is intended.
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable() # save memory

# LoRA adapters (rank 8, alpha 16) on the attention and MLP projection layers listed below
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, lora_config)

# Training
# Checkpoints are written to output_dir once per epoch (at most 2 kept); no evaluation is run.
training_args = TrainingArguments(
    output_dir="./qwen3-4b-lora-nothink",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=3,
    num_train_epochs=3,
    learning_rate=2e-4,
    eval_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=25,
    fp16=True,
    report_to="none"
)

# NOTE(review): newer transformers releases presumably prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized["train"],
    # mlm=False selects the causal-LM (next-token) collator rather than masked-LM
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

trainer.train()

# Save the PEFT model (LoRA adapter) and the tokenizer to the same directory
adapter_dir = "/content/drive/MyDrive/Colab Notebooks/qwen3-4b-lora-nothink"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

# Report the directory that was actually written; the Trainer's output_dir
# ("./qwen3-4b-lora-nothink") only holds the per-epoch checkpoints.
print(f"Training abgeschlossen – Modell unter {adapter_dir} gespeichert.")

# Inference

In [None]:
# Install und Imports
!pip install -q transformers datasets peft accelerate

import json
from datasets import Dataset
import torch
from torch import inference_mode
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from peft import PeftModel

# Paths and device
model_name = "/content/drive/MyDrive/Colab Notebooks/Qwen3-4B"
lora_adapter_path = "/content/drive/MyDrive/Colab Notebooks/qwen3-4b-lora-nothink"
test_json_path = "/content/drive/MyDrive/Colab Notebooks/12b_golden_testdata.json"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer (left padding; fall back to EOS as pad token if none is defined)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base model and LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, lora_adapter_path, torch_dtype="auto")
model.gradient_checkpointing_disable()
model.eval()
# NOTE(review): generate() is later called on the compiled wrapper — confirm that
# torch.compile actually speeds up generation in this setup.
model = torch.compile(model)
# NOTE(review): the base model was already placed via device_map="auto"; this extra
# .to(device) is presumably redundant — confirm it is safe if layers get offloaded.
model.to(device)

# Default generation settings.
# NOTE(review): generate_answer also passes its own max_new_tokens (default 256) together
# with this config, which presumably takes precedence over the 200 set here — confirm.
gen_conf = GenerationConfig(
    max_new_tokens=200,   # Limit the number of generated tokens
    do_sample=False,      # Greedy decoding instead of sampling
    use_cache=True,       # Enable KV caching
    early_stopping=True   # Intended to stop at EOS; NOTE(review): presumably only affects beam search — confirm
)

# Parse and format the test data
def _first_answer_text(value):
    """Return the first answer in *value* (a string or a list), stripped.

    A missing/empty value yields ``""`` instead of raising.
    """
    if isinstance(value, str):
        # Indexing a string with [0] would return only its first character.
        return value.strip()
    return str(value[0]).strip() if value else ""


def _join_exact_answers(value):
    """Flatten an ``exact_answer`` field into one comma-separated string."""
    if isinstance(value, str):
        # A bare string must stay whole; iterating it would yield single characters.
        value = [value]
    flat = []
    for item in value or []:
        if isinstance(item, list):
            # Nested lists: keep only the first entry of each inner list.
            item = item[0] if item else ""
        if item:
            flat.append(str(item))
    return ", ".join(flat).strip()


def format_entry_chat(entry):
    """Turn one test question record into chat-formatted inference prompts.

    Args:
        entry: dict with a ``"body"`` string and optional ``"type"``
            (defaults to ``"factoid"``), ``"snippets"`` (list of dicts with a
            ``"text"`` key), ``"ideal_answer"`` and ``"exact_answer"``.

    Returns:
        A list of ``{"text": ...}`` dicts: one prompt for the ideal (long)
        answer and one for the exact (short) answer. A prompt is only emitted
        when the corresponding reference answer is present and non-empty; the
        reference answer itself is never part of the prompt.

    Raises:
        KeyError: if ``entry`` has no ``"body"`` or a snippet has no ``"text"``.
    """
    qtext = entry["body"].strip()
    qtype = entry.get("type", "factoid").lower()
    ctx = "\n".join(s["text"].strip() for s in entry.get("snippets", []))
    prompt_head = f"Question: {qtext}\nContext:\n{ctx}\n"

    def build_prompt(instruction):
        messages = [
            {"role": "system", "content": "/no_think"},
            {"role": "user", "content": prompt_head + instruction},
        ]
        # The conversation ends with the user turn, so the template must append
        # the assistant header; otherwise the model is not cued to answer.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        return {"text": text}

    results = []

    # Fully worded answer
    if _first_answer_text(entry.get("ideal_answer")):
        if qtype == "yesno":
            instruction = "Provide one-sentence ideal answer in English starting with 'Yes,' or 'No,'."
        else:
            instruction = "Provide an ideal answer in English (one paragraph, max 200 words, full sentences)."
        results.append(build_prompt(instruction))

    # Short answer
    if _join_exact_answers(entry.get("exact_answer")):
        exact_instructions = {
            "yesno": "Answer only 'yes' or 'no', in English, no extras.",
            "factoid": "Provide up to 5 keywords, comma-separated, in English, no commentary.",
            "list": "Provide a comma-separated list of relevant items, in English, no filler words.",
        }
        instruction = exact_instructions.get(qtype, "Provide a brief answer in English.")
        results.append(build_prompt(instruction))

    return results

with open(test_json_path, encoding="utf-8") as f:
    raw_test = json.load(f)["questions"]

# Flatten: each question contributes a list of prompts
formatted_test = [prompt for entry in raw_test for prompt in format_entry_chat(entry)]

# Inference
def generate_answer(prompt_text, max_new_tokens=256, **gen_kwargs):
    """Generate the model's completion for an already chat-formatted prompt.

    Args:
        prompt_text: prompt string (as produced by the chat template).
        max_new_tokens: upper bound on generated tokens, passed to ``generate``.
        **gen_kwargs: extra keyword arguments forwarded to ``model.generate``
            (e.g. ``do_sample=True``, ``temperature=...``).

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    # Sampling knobs only apply when sampling is requested; passing them next
    # to a greedy config is contradictory. Using setdefault also lets callers
    # override them without a duplicate-keyword error.
    if gen_kwargs.get("do_sample", gen_conf.do_sample):
        gen_kwargs.setdefault("temperature", 0.7)
        gen_kwargs.setdefault("top_p", 0.9)
    gen_kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)

    with inference_mode():
        # Pad only to the longest sequence in the call: for a single prompt this
        # adds no padding at all, instead of always filling up to 1024 tokens.
        # NOTE(review): truncation at 1024 tokens can cut off the end of a very
        # long prompt — confirm that prompts stay below this limit.
        inputs = tokenizer(
            prompt_text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=1024,
        ).to(device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            generation_config=gen_conf,
            **gen_kwargs,
        )
        # Decode only what was generated after the prompt tokens.
        prompt_len = inputs["input_ids"].shape[-1]
        return tokenizer.decode(output_ids[0, prompt_len:], skip_special_tokens=True).strip()

# Run inference over every formatted test prompt
results = []
total = len(formatted_test)
for idx, ex in enumerate(formatted_test):
    prompt = ex["text"]
    results.append({"index": idx, "prompt": prompt, "prediction": generate_answer(prompt)})
    # Progress report every 20 prompts
    if not idx % 20:
        print(f"Processed {idx}/{total}")

# Persist the predictions as JSON
import json

predictions_path = "/content/drive/MyDrive/Colab Notebooks/test_predictions_LoRA.json"
with open(predictions_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(results, ensure_ascii=False, indent=2))

print("Inference abgeschlossen – Ergebnisse in test_predictions_LoRA.json gespeichert.")
