# Fine-tune the Gemma-3-270M-it model using the Axolotl framework

In [1]:
# Sanity check: report CUDA availability and bfloat16 support as seen by PyTorch
import torch
print(f"GPU available? {torch.cuda.is_available()}")
print(f"BF16 is supported? {torch.cuda.is_bf16_supported()}")

GPU available? True
BF16 is supported? True


In [2]:
# Print the CUDA_HOME environment variable as seen by shell commands started from this notebook
!printenv CUDA_HOME

/appl/easybuild/opt/CUDA/12.6.0


In [3]:
# Model selection and naming used by the cells of this notebook

# Model identifier in "<owner>/<model>" form
MODEL_NAME = "google/gemma-3-270m-it"
# Part after the last slash; used for naming the config, output and results files
MODEL_SHORT_NAME = MODEL_NAME.rpartition('/')[2]
SUFFIX = "FinGreyLit"
# NOTE(review): disabled setting, not referenced in the visible cells
#SLICE = 1

In [4]:
# Load and prepare fine-tuning dataset

import json
import glob
import random

random.seed(42)  # for deterministic shuffling of the train set and sampling of the eval set

# glob.glob() returns matches in arbitrary (filesystem-dependent) order.
# Sort the file lists so that the seeded shuffle/sample in this cell operate
# on the same record sequence on every machine and every run.
train_files = sorted(glob.glob("../../llm-dataset/*-train.jsonl"))
test_files = sorted(glob.glob("../../llm-dataset/*-test.jsonl"))

EVAL_SIZE = 32  # number of documents used for calculating evaluation loss during fine-tuning
SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous cataloguing of digital documents."
INSTRUCTION = "Extract metadata from this document. Return as JSON."

def preprocess_sample(sample):
    """Convert one dataset sample into a ShareGPT-style conversation record.

    Args:
        sample: mapping with the keys "ground_truth" and "content"; both
            values are serialized to JSON text.

    Returns:
        dict with a single key "conversations" holding three turns
        (system, user, assistant), each a dict with "from" and "value".
    """
    answer_json = json.dumps(sample["ground_truth"])
    document_json = json.dumps(sample["content"])
    # (role, text) pairs in conversation order
    turns = (
        ('system', SYSTEM_PROMPT),
        ('user', f"{INSTRUCTION}\n\n{document_json}"),
        ('assistant', answer_json),
    )
    return {"conversations": [{'from': role, 'value': text} for role, text in turns]}

def dataset_to_records(files):
    """Read JSON Lines files and convert every sample into a conversation record.

    Args:
        files: iterable of paths to JSONL files (one JSON object per line).

    Returns:
        list of records as returned by ``preprocess_sample``, in the order of
        ``files`` and, within each file, in line order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    for filename in files:
        # JSON text is UTF-8; do not depend on the platform's locale encoding
        with open(filename, encoding="utf-8") as infile:
            for line in infile:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. an extra newline at end of file
                records.append(preprocess_sample(json.loads(line)))
    return records

def write_jsonl(records, filename):
    """Write records to a file in JSON Lines format.

    Args:
        records: iterable of JSON-serializable objects.
        filename: path of the file to create (overwritten if it exists).

    Each record becomes one line of JSON text terminated by a newline.
    """
    with open(filename, "w") as sink:
        sink.writelines(json.dumps(rec) + "\n" for rec in records)

# Training set: convert, shuffle (using the seeded global RNG) and write out
train_recs = dataset_to_records(train_files)
random.shuffle(train_recs)
write_jsonl(train_recs, "axolotl-train.jsonl")
print(f"Wrote {len(train_recs)} train records")

# Full test set, written in the order it was read
test_recs = dataset_to_records(test_files)
write_jsonl(test_recs, "axolotl-test.jsonl")
print(f"Wrote {len(test_recs)} test records")

# Evaluation subset: a random sample of the test records.
# random.sample() raises ValueError when asked for more items than exist,
# so cap the sample size at the number of available test records.
eval_recs = random.sample(test_recs, min(EVAL_SIZE, len(test_recs)))
write_jsonl(eval_recs, "axolotl-eval.jsonl")
print(f"Wrote {len(eval_recs)} eval records")

Wrote 640 train records
Wrote 182 test records
Wrote 32 eval records


In [5]:
# Create Axolotl configuration file
#
# CONFIG is an f-string: {MODEL_NAME} and {MODEL_SHORT_NAME} are substituted
# when this cell runs, so the cell defining them must be executed first.
# The YAML refers to the axolotl-train.jsonl and axolotl-eval.jsonl files and
# maps the "from"/"value" keys of each conversation turn to role/content.

# Name of the YAML file, derived from the short model name
CONFIG_FILE = f"config-{MODEL_SHORT_NAME}.yml"


# The .strip() at the end removes the leading newline and the trailing blank
# lines of the literal, so the text starts at "base_model:".
CONFIG = f"""
base_model: {MODEL_NAME}
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: axolotl-train.jsonl
    type: chat_template
    ds_type: json
    split: train
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

test_datasets:
  - path: axolotl-eval.jsonl
    type: chat_template
    ds_type: json
    split: train
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

output_dir: ./out-{MODEL_SHORT_NAME}

chat_template: gemma3
eot_tokens:
  - <end_of_turn>

peft_use_dora: true
adapter: lora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

sequence_len: 4096
sample_packing: true
eval_sample_packing: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
eval_batch_size: 2
num_epochs: 8
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false

gradient_checkpointing: true  # true: saves VRAM but is slower to train
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 2
eval_table_size:
eval_table_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
fsdp_config:


""".strip()

# Save the rendered configuration text, terminated by a single newline
with open(CONFIG_FILE, 'w') as config_out:
    config_out.write(CONFIG + "\n")

In [6]:
%%time

# Run fine-tuning: start Axolotl's training module through `accelerate launch`
# (both from the local venv), passing the YAML config file named by CONFIG_FILE.
!venv/bin/accelerate launch -m axolotl.cli.train {CONFIG_FILE}

The following values were not passed to `accelerate launch` and had defaults used instead:
	`--num_processes` was set to a value of `1`
	`--num_machines` was set to a value of `1`
	`--mixed_precision` was set to a value of `'no'`
	`--dynamo_backend` was set to a value of `'no'`
[2025-10-07 09:16:25,295] [INFO] [axolotl.utils.schemas.config.check_eval_packing:756] [PID:3056496] [RANK:0] setting `remove_unused_columns: false` for when sample_packing and eval_sample_packing don't match[39m
[2025-10-07 09:16:25,296] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:539] [PID:3056496] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m
[2025-10-07 09:16:25,547] [INFO] [axolotl.utils.config.log_gpu_memory_usage:107] [PID:3056496] [RANK:0] cuda memory usage baseline: 0.000GB (+0.818GB misc)[39m

     #@@ #@@      @@# @@#
    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.
    @@    #@

# Merge the LoRA/DoRA adapter into the base model (for inference & quantization)

In [7]:
%%time

# Merge the trained adapter into the base model with `axolotl merge-lora`,
# using the same config file as for training.
# NOTE(review): presumably writes the merged model to out-{MODEL_SHORT_NAME}/merged,
# the path the evaluation command reads from — confirm.
!venv/bin/axolotl merge-lora {CONFIG_FILE}

[2025-10-07 09:39:04,303] [INFO] [axolotl.utils.schemas.config.check_eval_packing:756] [PID:3059674] [RANK:0] setting `remove_unused_columns: false` for when sample_packing and eval_sample_packing don't match[39m
[2025-10-07 09:39:04,303] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:539] [PID:3059674] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m
[2025-10-07 09:39:04,525] [INFO] [axolotl.utils.config.log_gpu_memory_usage:107] [PID:3059674] [RANK:0] cuda memory usage baseline: 0.000GB (+0.818GB misc)[39m

     #@@ #@@      @@# @@#
    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.
    @@    #@@@@@@@@@    @@           #@#@=                              @@                 #@     .=@@
      #@@@@@@@@@@@@@@@@@            =@# @#     ##=     ##    =####=+    @@      =#####+  =#@@###.   @@
    @@@@@@@@@@/  +@@/  +@@          #@  =@=     #@=   @@   =@#+  +#@#   @@    =@#+  +

# Evaluate the merged model

In [8]:
%%time

# Evaluate the merged model with the evaluate-model.py script. It is run with the
# Python interpreter of a separate virtual environment (../dspy/venv), which needs
# to have vLLM installed.
# NOTE(review): the arguments look like <model dir> <test set> <results file> —
# confirm against evaluate-model.py.
!../dspy/venv/bin/python evaluate-model.py out-{MODEL_SHORT_NAME}/merged axolotl-test.jsonl results-{MODEL_SHORT_NAME}.md
# Show the contents of the results file
!cat results-{MODEL_SHORT_NAME}.md

INFO 10-07 09:39:44 [__init__.py:216] Automatically detected platform cuda.
INFO 10-07 09:39:48 [utils.py:328] non-default args: {'max_model_len': 8192, 'disable_log_stats': True, 'model': 'out-gemma-3-270m-it/merged'}
INFO 10-07 09:40:00 [__init__.py:742] Resolved architecture: Gemma3ForCausalLM
`torch_dtype` is deprecated! Use `dtype` instead!
INFO 10-07 09:40:00 [__init__.py:1815] Using max model len 8192
INFO 10-07 09:40:01 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
[1;36m(EngineCore_DP0 pid=3059841)[0;0m INFO 10-07 09:40:02 [core.py:654] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=3059841)[0;0m INFO 10-07 09:40:02 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='out-gemma-3-270m-it/merged', speculative_config=None, tokenizer='out-gemma-3-270m-it/merged', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_