# First Implementation of Gemma on PathVQA


In [1]:
from huggingface_hub import login
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from pathlib import Path
from datasets import load_from_disk
from PIL import Image
import requests
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report whether a CUDA device is visible to PyTorch and, if so, its name.
has_cuda = torch.cuda.is_available()
print("CUDA verfügbar:", has_cuda)
print("GPU Name:", torch.cuda.get_device_name(0) if has_cuda else "Keine GPU")

CUDA verfügbar: True
GPU Name: Tesla T4


In [None]:
# Hugging Face Hub checkpoint used for both the model and its processor.
model_id = "google/gemma-3-4b-it"

# Load the weights and let `device_map="auto"` decide device placement.
# NOTE(review): no dtype is requested here, so the load precision is whatever
# the installed transformers version defaults to — confirm it fits the GPU.
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
model.eval()  # inference only: switch off training-mode behaviour

# Matching processor; used later to turn chat messages into model inputs
# and to decode generated token ids back into text.
processor = AutoProcessor.from_pretrained(model_id)

Loading checkpoint shards:  50%|███████████████████████████████████████████████                                               | 1/2 [00:08<00:08,  8.73s/it]

In [None]:
# The dataset is expected under <parent of the working directory>/data/train.
project_root = Path.cwd().parent
data_path = project_root.joinpath("data", "train")

# Load the split saved on disk and pick the record at index 1 as the example.
dataset = load_from_disk(str(data_path))
sample = dataset[1]
image, question, answer = sample["image"], sample["question"], sample["answer"]

In [None]:
# Chat-style prompt: a system instruction followed by a single user turn
# that pairs the sample image with its question.
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}],
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": question},
        ],
    },
]

# Apply the chat template and tokenize into a dict of PyTorch tensors.
# The inputs are moved to the model's device and cast to the model's own
# dtype instead of a hard-coded bfloat16, so they match however the model
# was actually loaded.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=model.dtype)

# Number of prompt tokens; used below to drop the prompt from the output ids.
input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    # Deterministic decoding (do_sample=False), capped at 100 new tokens.
    generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    # Keep only the ids after the first `input_len` positions of sequence 0,
    # i.e. the newly generated part.
    generation = generation[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)

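# NOTE(review): the sample output below describes a garden photo, not a
# pathology image — it looks carried over from an upstream example; confirm
# and replace it with the actual output for this PathVQA sample.
# **Overall Impression:** The image is a close-up shot of a vibrant garden scene, 
# focusing on a cluster of pink cosmos flowers and a busy bumblebee. 
# It has a slightly soft, natural feel, likely captured in daylight.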
