# Setup
Make a fresh environment:
- `conda create -n VLM python=3.12.3`
- `conda activate VLM`

Install CUDA:
- `conda install -c conda-forge cudatoolkit-dev -y`

Note: I'm running this notebook on an RTX 5060 Ti, with the goal of hooking it up to ngrok for easy API access.

In [1]:
!pip install python-dotenv # For keys
!pip install huggingface_hub # For logging in
!pip install torch transformers # For deep learning
!pip install Pillow # For image processing
!pip install bitsandbytes accelerate # For loading models
!pip install flash-attn --no-build-isolation # For flash attention



In [2]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if there is one) into the environment,
# then read the Hugging Face access token from HF_KEY.
load_dotenv()
hf_key = os.getenv("HF_KEY")

# Fail fast when the token is missing or empty. Without a token, login() falls
# back to its interactive flow and the success message below would still print.
if not hf_key:
    raise RuntimeError(
        "HF_KEY is not set. Add HF_KEY=<your Hugging Face token> to a .env file "
        "next to this notebook (or export it) and re-run this cell."
    )

# NOTE(review): kept after load_dotenv() on purpose, so any Hugging Face
# settings defined in .env are already in the environment when the library is
# first imported - confirm before hoisting this import.
from huggingface_hub import login
login(token=hf_key)
print("Successfully logged in")

  from .autonotebook import tqdm as notebook_tqdm


Successfully logged in


Here's an example of what the model might see (`example.png`):
![example image](example.png)

In [3]:
# Location of the example image that is fed to the model in the prompting cell.
image_path = "example.png"

# Prompting

Download the model

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch

# Hugging Face Hub id of the checkpoint; shared by the processor and the model
# so the two can never drift apart.
MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using", DEVICE)

# FlashAttention-2 is only requested on CUDA; on CPU use the plain "eager" path.
ATTN_IMPLEMENTATION = "flash_attention_2" if DEVICE == "cuda" else "eager"

# Takes about 2 mins to download the model on Colab, but 6.5 seconds on PC
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,
    # `attn_implementation` is the documented from_pretrained keyword for
    # choosing the attention backend (rather than the private, underscore-
    # prefixed config attribute).
    attn_implementation=ATTN_IMPLEMENTATION,
).to(DEVICE)


Using cuda




Prompting the model

In [8]:
# Takes about 4 seconds on Colab's A100 GPU and 1-12 seconds on my PC depending on how much it generates
from PIL import Image
from transformers.image_utils import load_image

# Read the example image referenced by `image_path`.
image1 = load_image(image_path)

# System turn: tells the model what role it plays.
system_turn = {
    "role": "system",
    "content": [
        {"type": "text", "text": "You are a VLM that is designed to detect obstacles to ensure a safe route for wheelchair users."},
    ],
}

# User turn: one image placeholder followed by the question about it.
# To send a second image, add another {"type": "image"} entry before the text
# entry and pass the extra image in `images=[...]` below.
user_turn = {
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Going forward, are there any obstacles to be aware of?"},
    ],
}

messages = [system_turn, user_turn]

# Render the conversation into a prompt string, then tokenize it together with
# the image and move the resulting tensors onto the model's device.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image1], return_tensors="pt").to(DEVICE)

# Generate up to 500 new tokens and decode the sequences back to text.
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts[0])

System: You are a VLM that is designed to detect obstacles to ensure a safe route for wheelchair users.
User:<row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>
<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>
<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>

<global-img>Going forward, are there any obstacles to be aware of?
Assistant: Yes, there are chairs in the room.


# Same prompt, try different models

In [None]:
# TODO: run the same prompt against other vision-language models.

# Same model, try different prompts

In [None]:
# TODO: keep the same model and try different prompts.