In [None]:
!pip install -q num2words

In [None]:
from datasets import load_dataset

drive_lm = load_dataset("MehdiJmlkh/DriveLM")
nuscenes = load_dataset("MehdiJmlkh/nuscenes")

In [None]:
import json
from datasets import Dataset


class DriveLM:
    def __init__(self, drive_lm, nuscenes):
        self.drive_lm = drive_lm
        self.nuscenes = nuscenes

    def __getitem__(self, idx):
      sample = self.drive_lm[idx]
      nuscenes_index = sample["nuscenes_index"]
      sample['images'] = self.nuscenes[nuscenes_index]

      return sample

    def __len__(self):
        return len(self.drive_lm)

dataset = {
    "train": DriveLM(drive_lm["train"].select(range(1000)), nuscenes["train"]),
    "test": DriveLM(drive_lm["test"], nuscenes["test"])
}

In [2]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import num2words
import torch
import torch.nn as nn
from transformers import ViTModel, AutoImageProcessor
from transformers.modeling_outputs import BaseModelOutput
import matplotlib.pyplot as plt
import numpy as np
from functools import partial
from torchvision.transforms import functional as F_transforms
from huggingface_hub import PyTorchModelHubMixin

# Load Dataset and Model

In [None]:
model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

processor.image_processor.max_image_size["longest_edge"]= 384
processor.image_processor.do_image_splitting=False
processor.image_processor.do_resize=True

for param in model.parameters():
    param.requires_grad = False

# Fine-tune

In [9]:
def display_output(inputs, outputs, loss, global_step):
    logits = outputs.logits  # (batch, seq_len, vocab_size)
    pred_ids = torch.argmax(logits, dim=-1)  # (batch, seq_len)

    labels = inputs["labels"].clone()

    mask = labels != -100
    pred_ids_filtered = [p[m].tolist() for p, m in zip(pred_ids, mask)]
    label_ids_filtered = [l[m].tolist() for l, m in zip(labels, mask)]

    pred_text = processor.tokenizer.batch_decode(pred_ids_filtered, skip_special_tokens=True)
    label_text = processor.tokenizer.batch_decode(label_ids_filtered, skip_special_tokens=True)

    print(f"Step: {global_step}, Loss: {loss: .4}")
    print("Prediction:", pred_text[0])
    print("Target:", label_text[0], end='', sep='')
    print("-" * 50)

In [10]:
epochs = 1
batch_size = 1
learning_rate = 2e-5
save_steps = 10
logging_steps = 10

In [11]:
from transformers import Trainer
from transformers.trainer import _is_peft_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

class OutputDisplayTrainer(Trainer):
    global_step = 1
    logging_steps = logging_steps
    def compute_loss(
        self,
        model: nn.Module,
        inputs,
        return_outputs: bool = False,
        num_items_in_batch= None,
    ):

        if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        if self.model_accepts_loss_kwargs:
            kwargs = {}
            if num_items_in_batch is not None:
                kwargs["num_items_in_batch"] = num_items_in_batch
            inputs = {**inputs, **kwargs}
        outputs = model(**inputs)

        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            unwrapped_model = self.accelerator.unwrap_model(model)
            if _is_peft_model(unwrapped_model):
                model_name = unwrapped_model.base_model.model._get_name()
            else:
                model_name = unwrapped_model._get_name()

            if self.compute_loss_func is not None:
                loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch)
            elif model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )

            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        if (
            self.args.average_tokens_across_devices
            and (self.model_accepts_loss_kwargs or self.compute_loss_func)
            and num_items_in_batch is not None
        ):
            loss *= self.accelerator.num_processes

        if OutputDisplayTrainer.global_step % OutputDisplayTrainer.logging_steps == 0:
            display_output(inputs, outputs, loss, OutputDisplayTrainer.global_step)
        OutputDisplayTrainer.global_step += 1

        return (loss, outputs) if return_outputs else loss


In [12]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

0

In [13]:
from peft import LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments
from peft import PeftModel

peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules="all-linear",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(model, peft_config)

# peft_model = PeftModel.from_pretrained(model, "MehdiJmlkh/SmolDriver-Peft", is_trainable=True)

for param in peft_model.model.model.vision_model.parameters():
    param.requires_grad = True

In [None]:
from torch.utils.data import Dataset

class VLMQADataset(Dataset):
    def __init__(self, dataset: DriveLM, is_train=True):
        self.dataset = dataset
        self.is_train = is_train

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]

        user_msg = []
        cameras = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT']
        for camera in cameras:
            user_msg.extend([
                {"type": "text", "text": camera},
                {"type": "image", "image": item["images"][camera]}
            ])

        user_msg.append({
            "type": "text",
            "text":f"Scene description:{item['scene_description']} Question: {item['question']}"
        })

        assistant_msg = [{"type": "text", "text": item["answer"]}]

        if not self.is_train:
            conversation = [
                {
                    "role": "user",
                    "content": user_msg
                }
            ]
            return self.__apply_chat_template(conversation, True)


        conversation = [
            {
                "role": "user",
                "content": user_msg
            },
            {
                "role": "assistant",
                "content": assistant_msg
            }
        ]
        inputs = self.__apply_chat_template(conversation)

        label_start_idx = self.__get_label_start_idx(inputs)

        labels = inputs['input_ids'].clone()
        labels[:, :label_start_idx] = -100
        inputs['labels'] = labels

        return inputs

    def __apply_chat_template(self, conversation, add_generation_prompt=False):
        return processor.apply_chat_template(
            conversation,
            add_generation_prompt=add_generation_prompt,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        )

    def __get_label_start_idx(self, inputs):
        utterance_id = processor.tokenizer.convert_tokens_to_ids('<end_of_utterance>')
        utterance_idx = inputs['input_ids'][0].tolist().index(utterance_id)
        num_assistant_ids = 4
        label_idx = utterance_idx + num_assistant_ids + 1

        return label_idx

In [15]:
from transformers import Trainer, TrainingArguments

train_dataset = VLMQADataset(dataset["train"])

training_args = TrainingArguments(
    output_dir="./fine-tune-checkpoints",
    per_device_train_batch_size=batch_size,
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    logging_steps=logging_steps,
    bf16=True,
    report_to="none",
    label_names=["labels"],
    gradient_checkpointing=True,
    save_total_limit=2,
    # save_strategy="steps",
    # save_steps=save_steps,
    seed=42,
    data_seed=42,
)


def pad_labels(labels):
    pad_token_label_id = -100
    max_len = max(len(label) for label in labels)

    padded_labels = torch.stack([
        torch.cat([torch.full((max_len - len(label),), pad_token_label_id), label])
        for label in labels
    ])

    return padded_labels


def data_collator(features):
    # return features[0]

    features_flatten = []
    labels = []
    pixel_values = []
    for feature in features:
        labels.append(feature['labels'].flatten())
        pixel_values.append(feature['pixel_values'])
        del feature['labels']
        del feature['pixel_values']

        for key, item in feature.items():
            feature[key] = item[0]
        features_flatten.append(feature)

    batch = processor.tokenizer.pad(features_flatten, padding=True, return_tensors="pt")
    batch['labels'] = pad_labels(labels)
    batch['pixel_values'] = torch.concatenate(pixel_values)

    return batch

trainer = OutputDisplayTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,1.6927
20,1.0944
30,1.4921
40,1.0413
50,0.5031
60,1.5931
70,0.8181
80,1.0549
90,0.9519
100,1.3879


Step: 10, Loss:  2.063
Prediction:  cars are stopped forward
R
Target: Three cars are moving.
--------------------------------------------------
Step: 20, Loss:  0.8093
Prediction: ,
R
Target: No.
--------------------------------------------------
Step: 30, Loss:  2.251
Prediction:  are a vehicle vehicle, the of the ego vehicle, There
Ass
Target: There is one moving car in front of the ego car.
--------------------------------------------------
Step: 40, Loss:  0.2881
Prediction: .
Ass
Target: Yes.
--------------------------------------------------
Step: 50, Loss:  0.6013
Prediction:  pedestrian is the front right of the ego car is standing.
Ass
Target: The pedestrian to the front right of the ego car is moving.
--------------------------------------------------
Step: 60, Loss:  2.096
Prediction:  ego vehicle will likely to. answer vehicle is going going.
Ass
Target: The ego vehicle is going straight. The ego vehicle is not moving.
--------------------------------------------------
Ste

Step,Training Loss
10,1.6927
20,1.0944
30,1.4921
40,1.0413
50,0.5031
60,1.5931
70,0.8181
80,1.0549
90,0.9519
100,1.3879


TrainOutput(global_step=1000, training_loss=0.6481156706809997, metrics={'train_runtime': 11973.8437, 'train_samples_per_second': 0.084, 'train_steps_per_second': 0.084, 'total_flos': 7762815968537760.0, 'train_loss': 0.6481156706809997, 'epoch': 1.0})

# save and load

In [30]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
peft_model.push_to_hub("ArianFiroozi/SmolVLM-Peft")

adapter_model.safetensors:   0%|          | 0.00/108M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ArianFiroozi/SmolVLM-Peft/commit/ffb92286c93cd79ac5b5e37183be278f99fbfbb1', commit_message='Upload model', commit_description='', oid='ffb92286c93cd79ac5b5e37183be278f99fbfbb1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ArianFiroozi/SmolVLM-Peft', endpoint='https://huggingface.co', repo_type='model', repo_id='ArianFiroozi/SmolVLM-Peft'), pr_revision=None, pr_num=None)

In [None]:
from peft import PeftModel


peft_model = PeftModel.from_pretrained(model, "ArianFiroozi/SmolVLM-Peft", is_trainable=False)

# for param in peft_model.model.model.vision_model.parameters():
#     param.requires_grad = True

# Test

In [None]:
test_dataset = VLMQADataset(dataset["test"], is_train=False)

In [None]:
from tqdm import tqdm

test_results = []

for index in tqdm(range(0, len(dataset["test"]), 10), desc="Generating predictions"):
    sample = dataset["test"][index]

    output = peft_model.generate(
        **test_dataset[index].to("cuda", dtype=torch.bfloat16),
        max_new_tokens=32
    )
    generated_texts = processor.batch_decode(output, skip_special_tokens=True)
    pred = generated_texts[0].split("Assistant: ")[-1]

    test_results.append({
        "scene_description": sample["scene_description"],
        "nuscenes_index": sample["nuscenes_index"],
        "task": sample["task"],
        "question": sample["question"],
        "answer": sample["answer"],
        "prediction": pred
    })

In [None]:
from huggingface_hub import login

login(token="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")

In [None]:
from datasets import Dataset

hf_dataset = Dataset.from_list(test_results)
hf_dataset.push_to_hub(
    "MehdiJmlkh/SmolVLM-FT-Results",
    private=False,
    commit_message="Save results"
)