In [None]:
import os
import torch
from PIL import Image
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Load environment variables (python-dotenv) before anything reads them.
load_dotenv()

# Log in to the Hugging Face Hub using the token stored in HF_TOKEN.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Hugging Face Hub identifier of the checkpoint loaded by the notebook.
HF_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision"

# Directory whose files are fed to the model one by one.
# Can be overridden with the EVALUATION_IMAGES_DIR environment variable
# (or a .env entry) so the notebook is not tied to one machine's layout;
# the original absolute path remains the default.
EVALUATION_IMAGES_DIR = os.getenv(
    "EVALUATION_IMAGES_DIR",
    "/home/ubuntu/meta_hackathon_oslo/evaluation/handwritten",
)

# Upper bound on the number of tokens generated per image.
MAX_NEW_TOKENS = 100

In [None]:
# Instantiate the model from HF_MODEL_ID with bfloat16 weights and
# device_map="auto" for device placement.
model = MllamaForConditionalGeneration.from_pretrained(
    HF_MODEL_ID, device_map="auto", torch_dtype=torch.bfloat16
)

# Matching processor; used below to build model inputs and decode outputs.
processor = AutoProcessor.from_pretrained(HF_MODEL_ID)

In [None]:
def inference(image_url):
    """Generate a short text summary of a single image.

    Args:
        image_url: Location of the image; passed straight to ``Image.open``,
            so a filesystem path (``str`` or ``Path``) or a file object works.

    Returns:
        str: Only the newly generated text. The prompt and special tokens
        are not included.
    """
    prompt = "<|image|><|begin_of_text|>Please summarise the contents of this image."

    # The context manager closes the underlying file once the processor has
    # consumed the image, so iterating over many files does not leak handles.
    with Image.open(image_url) as image:
        inputs = processor(image, prompt, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # generate() returns the prompt tokens followed by the continuation.
    # Slicing by prompt length is more reliable than string-replacing the
    # prompt in decoded text, which breaks if decoding does not round-trip
    # the prompt exactly.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = output[0][prompt_length:]

    # skip_special_tokens drops markers such as begin/end-of-text.
    return processor.decode(generated_tokens, skip_special_tokens=True)

In [None]:
directory = Path(EVALUATION_IMAGES_DIR)

# iterdir() yields entries in arbitrary order and includes sub-directories
# (e.g. Jupyter's .ipynb_checkpoints). Sort for reproducible output and skip
# anything that is not a regular file so Image.open is never handed a directory.
for image_path in sorted(directory.iterdir()):
    if not image_path.is_file():
        continue
    inference_result = inference(image_path)
    # Prefix with the file name so each summary can be matched to its image.
    print(f"{image_path.name}: {inference_result}")