In [None]:
!pip install -q -U transformers==4.53.3 num2words

In [None]:
!pip install nuscenes-devkit

# Load the Model

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import num2words

# Hub id of the checkpoint; the processor and the model are both loaded from it.
model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

# Load the model weights as bfloat16 and place them on the CUDA device.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)

# The matching processor (its image processor is configured in the next cell).
processor = AutoProcessor.from_pretrained(model_path)

# Freeze every weight: the model is only run forward in this notebook, so no
# parameter should track gradients.
for weight in model.parameters():
    weight.requires_grad_(False)

In [None]:
# Image-preprocessing settings used for every image passed to the processor:
# resizing is enabled, image splitting is disabled, and the "longest_edge"
# entry of `max_image_size` is set to 384.
# NOTE(review): presumably this yields a single 384-pixel image per input
# instead of several sub-image tiles — confirm against the image processor docs.
processor.image_processor.do_resize = True
processor.image_processor.do_image_splitting = False
processor.image_processor.max_image_size["longest_edge"] = 384

# Load the dataset

In [None]:
from datasets import load_dataset

# Hub id of the source dataset.
dataset_path = "MehdiJmlkh/nuscenes"

# No `split` argument is passed, so (per the `datasets` documentation) the
# result is a mapping of split name -> Dataset rather than a single Dataset.
dataset = load_dataset(dataset_path)

# Compute the labels (vision-encoder embeddings)

In [None]:
import torch
from datasets import Dataset
from PIL import Image

# Camera views read from each sample's "images" mapping; one record is
# produced per (sample, camera) pair.
CAM_KEYS = [
    "CAM_BACK", "CAM_BACK_LEFT", "CAM_BACK_RIGHT",
    "CAM_FRONT", "CAM_FRONT_LEFT", "CAM_FRONT_RIGHT"
]

# The vision encoder whose last hidden state is stored as the label.
vision_model = model.model.vision_model
# Inputs must be on the same device and in the same dtype as the encoder
# weights (the model is loaded as bfloat16 on CUDA); a CPU float32 tensor
# would make the forward pass raise.
vision_weight = next(vision_model.parameters())

# `load_dataset()` without `split` returns a dict of splits, which cannot be
# indexed by integer row number. Walk every split in that case; a plain
# Dataset is handled as a single split.
splits = list(dataset.values()) if isinstance(dataset, dict) else [dataset]

processed_samples = []

for split in splits:
    for sample in split:
        # Read the row once and reuse it for all cameras instead of
        # re-fetching (and re-decoding) it for every camera key.
        images = sample["images"]

        for cam in CAM_KEYS:
            image = images[cam]
            # Ask the image processor for a tensor directly rather than
            # wrapping its nested Python output in `torch.tensor(...)`.
            pixel_values = processor.image_processor(
                image, return_tensors="pt"
            ).pixel_values

            # The encoder takes a 4-D (N, C, H, W) batch. The processor output
            # may carry extra leading dims (presumably batch and images-per-
            # sample — TODO confirm), so collapse everything before the last
            # three dims; this is a no-op when the tensor is already 4-D.
            model_input = pixel_values.reshape(-1, *pixel_values.shape[-3:]).to(
                device=vision_weight.device, dtype=vision_weight.dtype
            )

            with torch.no_grad():
                embedding = vision_model(model_input).last_hidden_state

            processed_samples.append({
                # Stored exactly as returned by the processor (CPU tensor).
                "data": pixel_values,
                # Move off the GPU so memory is not exhausted across the whole
                # dataset, and widen to float32 (exactly represents bfloat16)
                # so the values can be serialised.
                "label": embedding.float().cpu(),
                "image": image,
                "position": cam
            })

In [None]:
# Destination repository on the Hugging Face Hub.
hub_repo_id = "ArianFiroozi/DriveLM-Knowledge-Transfer"

# Build a Dataset from the collected per-camera records and upload it.
hf_dataset = Dataset.from_list(processed_samples)
hf_dataset.push_to_hub(hub_repo_id)