### Fine-tune Llama-3.2-11B-Vision-Instruct using Unsloth

Improve the results from `simple_img_to_markdown.ipynb`.

Taken from: https://colab.research.google.com/drive/1j0N4XTY1zXXy7mPAhOC1_gMYZ2F2EBlk?usp=sharing#scrollTo=2eSvM9zX_2d3  
(Custom dataset generated)

In [1]:
%%capture
from unsloth import FastVisionModel, is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator

from transformers import TextStreamer

from trl import SFTTrainer, SFTConfig

In [2]:
import torch
# Sanity check before loading anything heavy: report whether PyTorch can see
# a CUDA device and the CUDA version string exposed by torch.version.cuda.
print(f"Cuda support: {torch.cuda.is_available()}, Version: {torch.version.cuda}")

Cuda support: True, Version: 12.1


In [3]:
%%capture

# Load the base vision-instruct checkpoint. The call returns the model and
# the object this notebook refers to as `tokenizer`.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    # "unsloth" (or True) turns on gradient checkpointing for long contexts.
    use_gradient_checkpointing="unsloth",
    # 4-bit loading lowers memory use; set to False for 16-bit LoRA.
    load_in_4bit=True,
)

In [4]:
# Configure LoRA fine-tuning on the loaded base model. Note that `model` is
# rebound to whatever get_peft_model returns.
model = FastVisionModel.get_peft_model(
    model,

    # --- Which parts of the network are fine-tuned (set False to exclude) ---
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,

    # --- LoRA hyper-parameters ---
    r=16,               # higher rank: potentially more accurate, but may overfit
    lora_alpha=16,      # recommended to be at least equal to r
    lora_dropout=0,
    bias="none",
    use_rslora=False,   # rank-stabilized LoRA is supported but switched off
    loftq_config=None,  # LoftQ is supported but not used
    random_state=3407,
    # target_modules="all-linear",  # optional; an explicit list can be given if needed
)

In [11]:
from notebooks.training_data.llama_vision_finetune.data_gen import create_training_data, instruction

# Unpack the three collections returned by create_training_data() and report
# how many entries `texts` holds.
# NOTE(review): presumably texts / images / outputs are aligned on the same
# keys or positions - confirm against data_gen.
texts, images, outputs = create_training_data()
print(f"Dataset size: {len(texts)}")

Dataset size: 10


In [12]:
def convert_to_conversation(idx: int) -> dict:
    """Build one chat-format training sample for the page identified by ``idx``.

    Reads the notebook-level ``texts``, ``images`` and ``outputs`` collections
    (unpacked from ``create_training_data()``) and the ``instruction``
    template imported alongside it.

    Args:
        idx: Key/index of the sample. It must be valid for ``texts``,
            ``images`` and ``outputs`` alike.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}``. The user turn carries
        the formatted instruction followed by the page image; the assistant
        turn carries the expected output text.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction.format(full_text=texts[idx])},
            # Attach the image that create_training_data() returned for this
            # sample. Previously a bare relative filename ("page_<idx>.png")
            # was stored here and the loaded `images` were never used.
            # NOTE(review): assumes `images` is indexed like `texts` and
            # `outputs` - confirm against data_gen.
            {"type": "image", "image": images[idx]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": outputs[idx]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}

# Build one conversation per sample *index*. Iterating `texts` directly only
# yields valid indexes when it is a mapping (iteration over keys); for a
# plain sequence it would pass each element itself as `idx`. Mappings keep
# their original behaviour, sequences are walked by position.
dataset = [
    convert_to_conversation(idx)
    for idx in (texts.keys() if hasattr(texts, "keys") else range(len(texts)))
]

In [14]:
# Switch the model over with Unsloth's for_training hook before building the trainer.
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer),
    train_dataset=dataset,
    args=SFTConfig(
        # --- Schedule ---
        # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=20,
        # num_train_epochs=1,  # use this instead of max_steps for full training runs
        warmup_steps=5,

        # --- Optimizer ---
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        weight_decay=0.01,

        # --- Precision: bf16 when supported, otherwise fp16 ---
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),

        # --- Bookkeeping ---
        seed=3407,
        logging_steps=1,
        output_dir="outputs",
        report_to="none",  # set this to report to e.g. Weights and Biases

        # --- Required for vision fine-tuning: do not remove ---
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
    ),
)

In [None]:
# Run the fine-tuning loop configured above; whatever train() returns is kept
# in `trainer_stats`.
trainer_stats = trainer.train()

In [None]:
from notebooks.training_data.llama_vision_finetune.data_gen import get_test_data

# Fetch one sample for the inference check: the image, the instruction text
# to prompt with, and the sample's index (printed for reference).
# NOTE(review): confirm in data_gen whether this sample is excluded from the
# training set.
image, instruction_text, test_idx = get_test_data()
print(f"Test Index: {test_idx}")

In [None]:
# Switch the model over with Unsloth's for_inference hook before generating.
FastVisionModel.for_inference(model)

# A single user turn: an image slot followed by the text instruction.
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": instruction_text},
    ],
}]

# Render the conversation through the chat template, with the generation
# prompt appended so the model continues as the assistant.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

# Encode image + rendered prompt as PyTorch tensors, then move them to the GPU.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Stream the generated text to the cell output as it is produced;
# skip_prompt=True leaves the prompt itself out of the stream.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=1024,
    use_cache=True,
    temperature=0.5,
    min_p=0.1,
)