In [1]:
import os
from huggingface_hub import login

# Read the Hugging Face access token from the environment (raises KeyError
# if HF_TOKEN is unset) rather than embedding it in the notebook.
hf_token = os.environ["HF_TOKEN"]
# Authenticate this session with the Hub using that token.
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
import torch
import transformers

# Hub-style identifier of a Gemma 3 checkpoint. It is not referenced by the
# loading cell below, which reads from `model_path` instead.
model_id = "google/gemma-3-4b-it"
# Local checkpoint directory passed to `from_pretrained` for both the model
# and the processor.
model_path = "./agricsense.model"

In [3]:
from datasets import load_dataset

# Directory template for one dataset split; `{}` is replaced by the split index.
DATASET_BASE = "/home/junhee0110/dataset/images_split_{}"
# File-name template of the JSON file that sits inside each split directory.
JSON_BASE = "NewQA_split_{}.json"
# Split used by the rest of this notebook.
SPLIT_NUMBER = 1


def get_dataset_dir(index):
    """Return the root directory of dataset split ``index``.

    Args:
        index: Split identifier substituted into ``DATASET_BASE``.

    Returns:
        The formatted directory path as a string.
    """
    return DATASET_BASE.format(index)


def get_json_dir(index):
    """Return the path of the JSON file belonging to dataset split ``index``.

    Despite the ``_dir`` suffix, the result is a *file* path: the split
    directory joined with the formatted ``JSON_BASE`` file name.

    Args:
        index: Split identifier substituted into both path templates.

    Returns:
        The JSON file path as a string.
    """
    return os.path.join(get_dataset_dir(index), JSON_BASE.format(index))

# Load the JSON records of the selected split as a single "train" split.
dataset = load_dataset("json", data_files=get_json_dir(SPLIT_NUMBER), split="train")


def get_image_path(index, is_depth):
    """Return the absolute path of one image referenced by a dataset record.

    Args:
        index: Position of the record inside ``dataset``.
        is_depth: Index into the record's ``"image"`` list. 0 selects the
            first entry and 1 the second (presumably the RGB image and its
            depth map, respectively -- confirm against the dataset schema).

    Returns:
        The split directory for ``SPLIT_NUMBER`` joined with the selected
        relative image path.
    """
    relative_path = dataset[index]["image"][is_depth]
    return os.path.join(get_dataset_dir(SPLIT_NUMBER), relative_path)

In [None]:
from PIL import Image

# Preview record 7779 using the first entry (slot 0) of its "image" list;
# as the cell's last expression, the opened image is rendered as the output.
Image.open(get_image_path(7779, 0))

In [5]:
from transformers import AutoProcessor, AutoModelForImageTextToText

# Both the processor and the weights are read from the local checkpoint
# directory `model_path`.
processor = AutoProcessor.from_pretrained(model_path)

# Weights are loaded in bfloat16 with automatic device placement and the
# FlashAttention-2 attention implementation.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [6]:
from pipeline import generate_prompt, collate_data, generate

# Ask several questions about the same record (index 0), passing both of its
# image paths (slots 0 and 1) with every question.
prompt = [
    generate_prompt(question, get_image_path(0, 0), get_image_path(0, 1))
    for question in (
        "Which tree is the closest one?",
        "Describe the image",
        "How many trees do you see?",
    )
]

# Run the prompts through the model, capping each answer at 400 new tokens.
generated = generate(prompt, model, processor, max_new_tokens=400)

[{'messages': [{'role': 'user', 'content': [{'type': 'image', 'path': '/home/junhee0110/dataset/images_split_1/visual_genome/3638.jpg'}, {'type': 'image', 'path': '/home/junhee0110/dataset/images_split_1/visual_genome_d/3638.png'}, {'type': 'text', 'text': 'Which tree is the closest one?'}]}]}, {'messages': [{'role': 'user', 'content': [{'type': 'image', 'path': '/home/junhee0110/dataset/images_split_1/visual_genome/3638.jpg'}, {'type': 'image', 'path': '/home/junhee0110/dataset/images_split_1/visual_genome_d/3638.png'}, {'type': 'text', 'text': 'Describe the image'}]}]}, {'messages': [{'role': 'user', 'content': [{'type': 'image', 'path': '/home/junhee0110/dataset/images_split_1/visual_genome/3638.jpg'}, {'type': 'image', 'path': '/home/junhee0110/dataset/images_split_1/visual_genome_d/3638.png'}, {'type': 'text', 'text': 'How many trees do you see?'}]}]}]


In [8]:
# Show the generated answers; the captured output is a list of strings, one
# per prompt submitted above.
print(generated)

['The tree closest to the camera is the one on the right side of the image.', 'The image shows a body of water with trees on both sides. The trees on the left side are closer to the camera, while those on the right are further away. There is a fence running parallel to the water on both sides, and a building is visible in the background on the right side.', 'There are three trees in the image.']
