# 4.0 - Finetune Qwen3 0.6B on Orange QA train data

In [1]:
import os
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# --- CONFIGURATION ---
MODEL_ID = "Qwen/Qwen3-0.6B"
OUTPUT_DIR = os.path.join(os.getcwd(), '..', 'models', 'orange_qa_finetuned_Qwen3-0.6B_DoRA_qkvogdu')
DATA_FILE = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_train.jsonl')

# 1. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Fix: Qwen has no default pad token
tokenizer_standard_tokens = len(tokenizer)

dataset = load_dataset("json", data_files=DATA_FILE, split="train")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    trust_remote_code=True,
    dtype=torch.float16,
)

peft_config = LoraConfig(
    r=4,        # Rank (Higher = more parameters to train, smarter but slower)
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],# "gate_proj", "down_proj", "up_proj"],
    use_dora=True, # <--- This enables DoRA (Better learning than standard LoRA)
)

model_dora = get_peft_model(model, peft_config)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
tokenizer(['Select Rows widget'])

{'input_ids': [[3379, 61706, 9086]], 'attention_mask': [[1, 1, 1]]}

In [4]:
new_tokens = ['Select Rows']
tokenizer.add_tokens(new_tokens)

1

In [5]:
tokenizer

Qwen2TokenizerFast(name_or_path='Qwen/Qwen3-0.6B', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=Fa

In [6]:
model_dora.base_model.model.model.embed_tokens, tokenizer_standard_tokens

(Embedding(151936, 1024), 151669)

In [7]:
model_dora.base_model.model.model.embed_tokens.weight

Parameter containing:
tensor([[-0.0093,  0.0337, -0.0747,  ...,  0.0120, -0.0106,  0.0160],
        [ 0.0320,  0.0238, -0.0593,  ..., -0.0023, -0.0349,  0.0090],
        [ 0.0267,  0.0339, -0.0198,  ..., -0.0099,  0.0063,  0.0226],
        ...,
        [ 0.0060,  0.0131,  0.0190,  ...,  0.0020,  0.0075,  0.0057],
        [ 0.0060,  0.0131,  0.0190,  ...,  0.0020,  0.0075,  0.0057],
        [ 0.0060,  0.0131,  0.0190,  ...,  0.0020,  0.0075,  0.0057]],
       device='mps:0', dtype=torch.float16)

In [8]:
model.resize_token_embeddings(len(tokenizer))

Embedding(151670, 1024)

In [9]:
model_dora.base_model.model.model.embed_tokens = model_dora.base_model.model.model.embed_tokens.float()

In [10]:
model_dora.base_model.model.model.embed_tokens, tokenizer_standard_tokens

(Embedding(151670, 1024), 151669)

In [11]:
model_dora.base_model.model.lm_head.weight[-293:-288], model_dora.base_model.model.lm_head.weight.shape[0] - 293

(tensor([[-0.0046, -0.0576,  0.0605,  ..., -0.0118, -0.0267,  0.0042],
         [ 0.0276, -0.0500,  0.0479,  ..., -0.0098, -0.0055, -0.0187],
         [ 0.0052, -0.0593,  0.0151,  ...,  0.0080,  0.0105,  0.0166],
         [ 0.0063, -0.0505,  0.0297,  ..., -0.0197, -0.0007,  0.0067],
         [ 0.0192, -0.0581,  0.0393,  ..., -0.0009, -0.0107,  0.0258]],
        device='mps:0'),
 151377)

In [12]:
for name, params in model_dora.named_parameters():
    print(f"{name}: {params.requires_grad}")

base_model.model.model.embed_tokens.weight: False
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight: False
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: True
base_model.model.model.layers.0.self_attn.q_proj.lora_magnitude_vector.default.weight: True
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight: False
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: True
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: True
base_model.model.model.layers.0.self_attn.k_proj.lora_magnitude_vector.default.weight: True
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight: False
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: True
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: True
base_model.model.model.layers.0.self_attn.v_proj.lora_magnitude_vector.default.wei

In [16]:
model_dora.base_model.model.lm_head.weight.requires_grad = True

In [17]:
def zero_out_old_token_grads(grad):
    new_grad = grad.clone()
    new_grad[:tokenizer_standard_tokens, :] = 0.0
    return new_grad

model_dora.base_model.model.model.embed_tokens.weight.requires_grad = True
model_dora.base_model.model.model.embed_tokens.weight.register_hook(zero_out_old_token_grads)

<torch.utils.hooks.RemovableHandle at 0x321010e90>

In [22]:
model_dora.base_model.model.model.embed_tokens.weight[-5:]

tensor([[ 0.0034, -0.1084,  0.0204,  ..., -0.0077,  0.0016, -0.0035],
        [ 0.0050, -0.1035,  0.0209,  ...,  0.0030, -0.0085, -0.0027],
        [ 0.0022, -0.1118,  0.0204,  ...,  0.0018, -0.0019, -0.0084],
        [ 0.0013, -0.1099,  0.0286,  ..., -0.0013, -0.0171,  0.0019],
        [ 0.0061, -0.0928,  0.0251,  ...,  0.0051, -0.0082, -0.0054]],
       device='mps:0', grad_fn=<SliceBackward0>)

In [24]:
model_dora.base_model.model.model.embed_tokens.weight[-5:]

tensor([[ 0.0034, -0.1084,  0.0204,  ..., -0.0077,  0.0016, -0.0035],
        [ 0.0050, -0.1035,  0.0209,  ...,  0.0030, -0.0085, -0.0027],
        [ 0.0022, -0.1118,  0.0204,  ...,  0.0018, -0.0019, -0.0084],
        [ 0.0013, -0.1099,  0.0286,  ..., -0.0013, -0.0171,  0.0019],
        [-0.0043, -0.0822,  0.0172,  ..., -0.0091, -0.0062, -0.0012]],
       device='mps:0', grad_fn=<SliceBackward0>)

In [23]:
# 6. Training Arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,          # How many times to read the docs
    per_device_train_batch_size=4, 
    gradient_accumulation_steps=2,
    learning_rate=1e-3,
    fp16=True,                   # Use mixed precision
    logging_steps=2,
    optim="adamw_torch",   # Saves memory
    save_strategy="epoch",       # Save a checkpoint every epoch
)

# 7. Initialize Trainer
trainer = SFTTrainer(
    model=model_dora,
    train_dataset=dataset,
    args=training_args,
    processing_class=tokenizer,
)

# 8. Train & Save
print("Starting training...")
trainer.train()

print(f"Saving model to {OUTPUT_DIR}...")
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Done!")


The model is already on multiple devices. Skipping the move to device specified in `args`.


Starting training...




Step,Training Loss
2,1.1807
4,1.0232
6,1.0726
8,1.0029
10,1.0408
12,0.8805
14,0.8574
16,0.9508
18,0.9858
20,1.1046




Saving model to /Users/martin/Documents/FRI/Workshops/LoRA-tutorial/notebooks/../models/orange_qa_finetuned_Qwen3-0.6B_DoRA_qkvogdu...




Done!
