In [None]:
!pip install git+https://github.com/mozilla-ai/osm-ai-helper.git

In [None]:
from osm_ai_helper.download_osm import download_osm

In [None]:
# Name of the area to process; later cells reuse it to build the
# `dataset/{AREA}` paths.
AREA = "Galicia"

# Element filter handed to `download_osm`: select swimming pools and
# (presumably) drop the ones tagged `location=indoor` — confirm against the
# osm-ai-helper documentation.
osm_filter = {
    "selector": "leisure=swimming_pool",
    "discard": {"location": "indoor"},
}

download_osm(area=AREA, output_dir="dataset", **osm_filter)

In [None]:
from osm_ai_helper.group_elements_and_download_tiles import (
    group_elements_and_download_tiles,
)

In [None]:
import os
from getpass import getpass

# Never hardcode an access token in the notebook. Reuse MAPBOX_TOKEN when the
# environment already provides it; otherwise ask for it interactively (the
# typed value is not echoed back).
if not os.environ.get("MAPBOX_TOKEN"):
    os.environ["MAPBOX_TOKEN"] = getpass("Mapbox access token: ")

In [None]:
# Both paths are derived from AREA: the JSON file and the directory under
# `dataset/`. Presumably the first is the file written by `download_osm` and
# the second is where the tiles end up — confirm against osm-ai-helper.
elements_file = f"dataset/{AREA}.json"
tiles_dir = f"dataset/{AREA}"

group_elements_and_download_tiles(elements_file, tiles_dir)

In [None]:
from osm_ai_helper.convert_to_vlm_dataset import convert_to_vlm_dataset

In [None]:
# Text instruction passed to the converter alongside the `dataset/{AREA}`
# directory (presumably used as the user prompt of every sample — verify).
pointing_prompt = "Point to the swimming pools in the image."

dataset = convert_to_vlm_dataset(f"dataset/{AREA}", pointing_prompt)

In [None]:
import random
from osm_ai_helper.utils.plots import show_vlm_entry

In [None]:
# Show the messages of one randomly chosen sample.
# `random.randrange(n)` draws from 0..n-1. `random.randint(0, n)` would also
# return n itself, which is one past the last valid index and raises IndexError.
sample_index = random.randrange(len(dataset))
show_vlm_entry(dataset[sample_index]["messages"])

In [None]:
%%capture
# Normally using pip install unsloth is enough

# Temporarily as of Jan 31st 2025, Colab has some issues with Pytorch
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
from unsloth import FastVisionModel
import torch

# Checkpoint that gets loaded and later fine-tuned.
base_checkpoint = "unsloth/Qwen2-VL-7B-Instruct"

model, tokenizer = FastVisionModel.from_pretrained(
    base_checkpoint,
    # Use 4bit to reduce memory use. False for 16bit LoRA.
    load_in_4bit=True,
    # True or "unsloth" for long context.
    use_gradient_checkpointing="unsloth",
)

In [None]:
# Which parts of the network are fine-tuned; set an entry to False to leave
# that part untouched.
finetune_targets = dict(
    finetune_vision_layers=True,      # vision layers
    finetune_language_layers=True,    # language layers
    finetune_attention_modules=True,  # attention layers
    finetune_mlp_modules=True,        # MLP layers
)

# LoRA hyper-parameters.
lora_settings = dict(
    r=16,               # The larger, the higher the accuracy, but might overfit
    lora_alpha=16,      # Recommended alpha == r at least
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_rslora=False,   # Rank stabilized LoRA is supported
    loftq_config=None,  # And LoftQ
    # target_modules="all-linear",  # Optional now! Can specify a list if needed
)

# Rebinds `model` to the value returned by `get_peft_model`.
model = FastVisionModel.get_peft_model(model, **finetune_targets, **lora_settings)

In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model)  # Enable for training!

# The vision data collator is required for this setup.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Train in bf16 when it is supported, otherwise fall back to fp16.
use_bf16 = is_bf16_supported()

training_args = SFTConfig(
    # Batching and schedule.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    # Precision.
    fp16=not use_bf16,
    bf16=use_bf16,
    # Optimizer.
    optim="adamw_8bit",
    weight_decay=0.01,
    # Reproducibility, logging and outputs.
    seed=3407,
    logging_steps=1,
    output_dir="outputs",
    report_to="none",  # For Weights and Biases
    # You MUST put the below items for vision finetuning:
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    dataset_num_proc=4,
    max_seq_length=2048,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=vision_collator,
    train_dataset=dataset,
    args=training_args,
)

In [None]:
# Run the fine-tuning; whatever `trainer.train()` returns is kept in
# `trainer_stats`.
trainer_stats = trainer.train()

In [None]:
# Peek at the first message of the sample at index 4 to check how an entry is
# structured.
dataset[4]["messages"][0]

In [None]:
# Display the "image" field of the second content item in the first message of
# the sample at index 9.
dataset[9]["messages"][0]["content"][1]["image"]

In [None]:
from PIL import Image
from transformers import TextStreamer

FastVisionModel.for_inference(model)  # Enable for inference!

# NOTE(review): absolute Colab path into a "Mondariz" dataset, whereas the
# download cell uses AREA = "Galicia" — confirm this tile exists after a fresh
# Restart & Run All.
image = Image.open("/content/dataset/Mondariz/Mondariz/18_124907_97086.jpg")
instruction = "Point to the swimming pools in the image."

# Single user turn: an image placeholder followed by the text instruction.
user_turn = {
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": instruction},
    ],
}
messages = [user_turn]

# Render the chat prompt, then encode the image together with the prompt and
# move the resulting tensors to the GPU.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

# Stream the generated text as it is produced, without repeating the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
generation_options = dict(
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)
_ = model.generate(**inputs, streamer=text_streamer, **generation_options)

In [None]:
# Hand-written conversation: the user turn carries the instruction text and the
# `image` opened in the inference cell; the assistant turn carries a literal
# list of two (x, y) pairs.
user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Point to the swimming pools in the image."},
        {"type": "image", "image": image},
    ],
}
assistant_message = {
    "role": "assistant",
    "content": [
        {"type": "text", "text": "[(0.41, 0.43), (0.67, 0.57)]"},
    ],
}

show_vlm_entry([user_message, assistant_message])