In [1]:
# Import dependencies

import torch
import os
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
from transformers.image_utils import load_image

In [2]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [3]:
# Checkpoint identifier shared by the model and its processor.
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

# Load the weights in bfloat16, then move the model onto the selected device.
model = AutoModelForImageTextToText.from_pretrained(model_name, dtype=torch.bfloat16)
model = model.to(device)

# Processor for the same checkpoint; later cells use it for chat templating,
# building model inputs, and decoding generated ids.
processor = AutoProcessor.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from datasets import load_dataset
import matplotlib.pyplot as plt

# Request the first 10% of the ChartQA train and validation splits in one call;
# the result is unpacked in the same order as the `split` list.
splits = load_dataset("HuggingFaceM4/ChartQA", split=["train[:10%]", "val[:10%]"])
train_dataset, eval_dataset = splits

# Take one training sample for the sanity-check inference below.
example = train_dataset[1]
image = load_image(example["image"])

# Show the question followed by the first reference answer, one per line.
print(example["query"], example["label"][0], sep="\n")

How many values are below 40 in Unfavorable graph?
6


In [5]:
# Single-turn conversation: one user message holding the chart image and the question.
user_turn = {
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": example["query"]},
    ],
}
messages = [user_turn]

# Render the conversation through the processor's chat template, ending with
# the generation prompt so the model continues as the assistant.
chat_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
print(chat_prompt)

<|im_start|>User:<image>How many values are below 40 in Unfavorable graph?<end_of_utterance>
Assistant:


In [6]:
# Build the model inputs from the image and the rendered prompt, on the model's device.
inputs = processor(images=[image], text=chat_prompt, return_tensors="pt")
inputs = inputs.to(device)

# Generate up to 20 new tokens without tracking gradients.
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=20)

# Drop the leading prompt ids from every generated sequence so only the
# newly generated continuation remains.
prompt_lengths = [len(prompt_ids) for prompt_ids in inputs.input_ids]
trimmed_generated_ids = [
    generated_ids[prompt_len:]
    for prompt_len, generated_ids in zip(prompt_lengths, output)
]

# Decode the continuation back to text and show the first (only) result.
output_text = processor.batch_decode(
    trimmed_generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text[0])

 6.


In [7]:
# System prompt text; `format_data` places it in the "system" turn of every
# formatted sample.
system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.
The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""

In [8]:
def format_data(sample):
    """Convert one dataset sample into a chat-style training record.

    Args:
        sample: Mapping that provides ``"image"``, ``"query"`` and ``"label"``;
            only the first element of ``sample["label"]`` is used.

    Returns:
        A dict with two keys:
        ``"images"`` - a one-element list holding the sample image, and
        ``"messages"`` - a three-turn conversation (system, user, assistant)
        where the system turn carries the module-level ``system_message``,
        the user turn carries the image plus the query text, and the
        assistant turn carries the first label.
    """
    image = sample["image"]

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": sample["query"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["label"][0]}],
    }

    return {
        "images": [image],
        "messages": [system_turn, user_turn, assistant_turn],
    }

In [9]:
# Materialise both splits as plain lists of chat-style records.
# NOTE: the names are rebound to the formatted lists, so this cell must run
# exactly once per kernel session; running it again would feed already
# formatted records (which have no "image" key) back into format_data.
train_dataset = list(map(format_data, train_dataset))
eval_dataset = list(map(format_data, eval_dataset))

In [10]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters: rank-8 adapters with alpha 16 and 5% dropout,
# attached to the modules named "q_proj" and "v_proj".
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "target_modules": ["q_proj", "v_proj"],
    "task_type": "CAUSAL_LM",
}
peft_config = LoraConfig(**lora_settings)

# Wrap the loaded model with the adapters described by the config.
peft_model = get_peft_model(model, peft_config)

# Report how many parameters are trainable versus the total.
peft_model.print_trainable_parameters()

trainable params: 2,568,192 || all params: 2,249,353,072 || trainable%: 0.1142


In [None]:
from trl import SFTConfig, SFTTrainer

# Configure training arguments using SFTConfig
training_args = SFTConfig(
    output_dir="smol-course-smolvlm2-2.2b-instruct-trl-sft-ChartQA",  # also the save location used after training
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    logging_steps=25,  # the training log shows one loss row every 25 steps
    # save_strategy="steps",
    # save_steps=25,
    optim="adamw_torch_fused",
    bf16=True,  # matches the bfloat16 dtype the model was loaded with
    # push_to_hub=True,
    report_to="trackio",
    max_length=None,  # NOTE(review): presumably disables sequence truncation so image tokens are kept - confirm
)

# Initialize the Trainer.
# `get_peft_model(model, peft_config)` was already called in the LoRA cell, and
# that call injects the adapter layers into `model` in place. Passing `model`
# together with `peft_config` here would make the trainer apply PEFT a second
# time to an already-adapted model (PEFT warns about this). Hand the existing
# `peft_model` to the trainer instead and do not pass `peft_config` again.
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,  # formatted records produced by format_data
    eval_dataset=eval_dataset,
)



In [None]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably required for Hub-backed features such as the trackio
# metric sync seen in the training output (and `push_to_hub` if enabled) - confirm.
from huggingface_hub import login
login()

In [12]:
# Run fine-tuning with the trainer configured above.
trainer.train()

# Persist the trained model to the output directory set in the training arguments.
# This line only runs if training finishes; an interrupted run saves nothing here.
output_dir = training_args.output_dir
trainer.save_model(output_dir)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 49279, 'bos_token_id': 1, 'pad_token_id': 2}.


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: Kaori1707/trackio-dataset
* Found existing space: https://huggingface.co/spaces/Kaori1707/trackio
* View dashboard by going to: https://Kaori1707-trackio.hf.space/


* Created new run: Kaori1707-1766988420


Step,Training Loss
25,18.1177
50,13.1142
75,3.1833
100,0.4256


KeyboardInterrupt: 