# Fine-Tuning an LLM

## Load and preprocess the dataset

In [1]:
from datasets import load_dataset

from lib.preprocess import get_as_messages


# Load the "Biddls/Onion_News" dataset and run `get_as_messages` over every
# split, dropping the original 'text' column from the mapped result.
# (Presumably `get_as_messages` emits chat-style messages — confirm in lib.preprocess.)
ds = load_dataset("Biddls/Onion_News").map(
    get_as_messages,
    remove_columns=['text'],
)

# Preview the first five converted training records.
ds['train'][:5]

Map:   0%|          | 0/33880 [00:00<?, ? examples/s]

## Load the baseline model

In [2]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Seed torch's global RNG before anything stochastic runs.
torch.manual_seed(0)

# Base checkpoint shared by the model and tokenizer loads in this cell.
model_path = "Qwen/Qwen3-4B-Instruct-2507"

# Request 8-bit weight loading for the base model.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the causal-LM checkpoint with the quantization config applied;
# device placement and dtype are both left on "auto".
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    quantization_config=bnb_config,
)

# Jinja chat template assembled from its fragments. Each message is rendered as
#   <|im_start|>{role}\n{content}<|im_end|>\n
# and, when `add_generation_prompt` is truthy, an open assistant header is appended.
# The doubled backslashes keep `\n` as an escape inside the template text itself
# rather than placing a literal newline in the Python string.
new_chat_template = "".join([
    # Loop over the conversation, wrapping every message in its role markers.
    "{% for message in messages %}",
    "{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}",
    "{% endfor %}",
    # Optionally open the assistant turn so generation continues from it.
    "{% if add_generation_prompt %}",
    "{{ '<|im_start|>assistant\\n' }}",
    "{% endif %}",
])
# Load the tokenizer from the same checkpoint path as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Replace the tokenizer's chat template with the custom one built above.
tokenizer.chat_template = new_chat_template

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training


# LoRA rank; lora_alpha below is derived from it, so alpha is always 2x the rank.
lora_rank = 16

# Adapter configuration (consumed later by the trainer via `peft_config`).
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=lora_rank,
    lora_alpha=lora_rank * 2,
    bias="none",
    target_modules=[
        # attention projections (going by the module names)
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections (going by the module names)
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Run PEFT's k-bit training preparation on the quantized base model.
prepared_model = prepare_model_for_kbit_training(model)

In [4]:
from transformers import TrainingArguments
from trl import SFTTrainer

# Hyperparameters for the fine-tuning run.
# NOTE(review): SFTTrainer is documented around `trl.SFTConfig`; plain
# TrainingArguments is used here — confirm this is intentional for the installed TRL version.
training_args = TrainingArguments(
    # --- run bookkeeping ---
    output_dir="./finetunes/01-transformers-finetune",
    report_to="none",
    seed=42,
    logging_steps=10,
    save_steps=10,
    # --- batching and run length ---
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
)

# Build the supervised fine-tuning trainer; the LoRA settings are supplied through
# `peft_config` and the tokenizer through `processing_class`.
# NOTE(review): the recorded outputs show 33880 rows mapped during preprocessing but
# only 800 rows tokenized here — confirm whether a subsetting step is missing from
# the notebook or the saved outputs are stale.
trainer = SFTTrainer(
    model=prepared_model,
    processing_class=tokenizer,
    peft_config=peft_config,
    train_dataset=ds['train'],
    args=training_args,
)

Tokenizing train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

In [5]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,4.2222
20,1.9747
30,1.7155
40,1.6674
50,1.7152
60,1.6277
70,1.6778
80,1.6829
90,1.6471
100,1.5985


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=100, training_loss=1.9528952598571778, metrics={'train_runtime': 85.1752, 'train_samples_per_second': 9.392, 'train_steps_per_second': 1.174, 'total_flos': 768565884641280.0, 'train_loss': 1.9528952598571778})