In [None]:
# Unsloth is imported first so its patches are in place before transformers/trl load.
from unsloth import FastVisionModel, is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from transformers import AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
import torch

# Pick the compute dtype once: bfloat16 when is_bf16_supported() says so, float16 otherwise.
if is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

# Load the vision checkpoint in 4-bit and bind the two returned objects
# to `model` and `tokenizer` for the cells below.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name="meta-llama/Llama-3.2-11B-Vision",
    max_seq_length=2048,
    dtype=compute_dtype,
    load_in_4bit=True,
    device_map="auto",
)

# Pull the "train" split of the Radiology Mini dataset.
dataset = load_dataset("unsloth/Radiology_mini", split="train")

In [None]:

# Prompt paired with every radiograph in the user turn.
INSTRUCTION = "You are an expert radiographer. Describe accurately what you see in this image."


def format_instruction(example):
    """Turn one dataset row into a multimodal chat conversation.

    The image object itself is embedded in the user turn so that the vision
    data collator can hand pixel inputs to the model; rendering the chat
    template to a plain string would leave only an image placeholder and the
    picture would never be seen during training.

    Args:
        example: A dataset row. It must provide ``"caption"`` (the reference
            description) and ``"image"`` (the radiograph; assumed to be the
            image column of ``unsloth/Radiology_mini`` -- verify against the
            dataset card if the dataset is swapped).

    Returns:
        dict: ``{"messages": [user_turn, assistant_turn]}`` where the user
        turn holds the image followed by the instruction text and the
        assistant turn holds the caption.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": example["image"]},
            {"type": "text", "text": INSTRUCTION},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": example["caption"]}],
    }
    return {"messages": [user_turn, assistant_turn]}


# Build the conversations as a plain Python list under a NEW name:
#  * `dataset` is left untouched, so this cell can be re-run safely;
#  * a list keeps the image objects as-is for the vision collator.
train_conversations = [format_instruction(row) for row in dataset]

# Attach LoRA adapters to the attention projections.
model = FastVisionModel.get_peft_model(
    model,
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
)

# Training arguments
training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    max_steps=300,
    learning_rate=2e-4,
    fp16=not is_bf16_supported(),
    bf16=is_bf16_supported(),
    logging_steps=10,
    optim="adamw_8bit",
    seed=42,
    output_dir="./radiology-finetuned",
    # Vision fine-tuning: batches are produced by the data collator from the
    # raw "messages" entries, so the trainer must neither drop columns nor
    # run its own text tokenization pass over the dataset.
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    max_seq_length=2048,
)

# Put the model into Unsloth's training mode before the trainer is built.
FastVisionModel.for_training(model)

# Initialize trainer; the vision collator turns each conversation (text +
# image) into model inputs at batch time.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer),
    train_dataset=train_conversations,
    args=training_args,
)

# Start training
trainer.train()
