In [1]:
!pip install -q -U transformers==4.53.3 num2words #nuscenes-devkit

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.5/163.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for docopt (setup.py) ... [?25l[?25hdone


# Load the Model

In [1]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import num2words

model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

for param in model.parameters():
    param.requires_grad = False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
processor.image_processor.max_image_size["longest_edge"]= 384
processor.image_processor.do_image_splitting=False
processor.image_processor.do_resize=True

In [3]:
teacher = model.model.vision_model

# dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("MehdiJmlkh/nuscenes")

# get the labels

In [7]:
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": dataset['train'][0]['CAM_BACK_RIGHT']},
            {"type": "image", "url": dataset['train'][0]['CAM_BACK']},
            {"type": "image", "url": dataset['train'][0]['CAM_BACK']},
            {"type": "image", "url": dataset['train'][0]['CAM_BACK']},
            {"type": "image", "url": dataset['train'][0]['CAM_BACK']},
            {"type": "image", "url": dataset['train'][0]['CAM_BACK']},
            {"type": "text", "text": "describe this image."}
        ]
    }
]

inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device, dtype=torch.bfloat16)

output_ids = model.generate(**inputs, max_new_tokens=32)
generated_texts = processor.batch_decode(output_ids, skip_special_tokens=True)
generated_texts

['User:describe this image.\nAssistant: The image depicts a city street scene during a rainy day. The perspective is from inside a vehicle, likely a car, as evidenced by the windshield and the']

In [None]:
dataset['train'][0]

{'CAM_FRONT_LEFT': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1600x900>,
 'CAM_FRONT': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1600x900>,
 'CAM_FRONT_RIGHT': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1600x900>,
 'CAM_BACK_LEFT': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1600x900>,
 'CAM_BACK': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1600x900>,
 'CAM_BACK_RIGHT': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1600x900>}

In [10]:
import torch
from huggingface_hub import upload_file

repo_id = "ArianFiroozi/Nuscenes-Knowledge-Transfer"

def process_and_upload(start_idx, end_idx, batch_num):
    processed_samples = []

    for i in range(start_idx, end_idx):
        if i % 100 == 0:
            print(f"Processing sample {i}")
        image = dataset['train'][i]
        images = [image[cam] for cam in CAM_KEYS]
        pixel_values = torch.tensor(
            processor.image_processor(images).pixel_values
        ).view(6,3,384,384).to("cuda", dtype=torch.bfloat16)

        with torch.no_grad():
            embedding = teacher(pixel_values).last_hidden_state

        processed_samples.append({
            "nusc_idx": i,
            "data": pixel_values.cpu(),
            "label": embedding.cpu(),
        })

    local_path = f"batch_{batch_num}.pt"
    torch.save(processed_samples, local_path)

    upload_file(
        path_or_fileobj=local_path,
        path_in_repo=f"batches/{local_path}",
        repo_id=repo_id,
        repo_type="dataset"
    )


In [8]:
import huggingface_hub
huggingface_hub.login()

In [12]:
process_and_upload(1, 1, 1)

batch_1.pt:   0%|          | 0.00/864 [00:00<?, ?B/s]

In [16]:
from huggingface_hub import hf_hub_download
import torch

all_samples = []
for batch_num in range(2):
    path = hf_hub_download(
        repo_id=repo_id,
        filename=f"batches/batch_{batch_num}.pt",
        repo_type="dataset"
    )
    all_samples.extend(torch.load(path))


batch_0.pt:   0%|          | 0.00/15.4M [00:00<?, ?B/s]

batch_1.pt:   0%|          | 0.00/864 [00:00<?, ?B/s]