In [5]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image


# Checkpoint used for both the model and its processor.
# trust_remote_code=True is passed to both loaders (presumably because the
# checkpoint ships its own modelling/processing code — confirm on the model card).
model_id = 'microsoft/Florence-2-base-ft'

model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model = model.eval().cuda()  # evaluation mode, then move the weights to the GPU

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

def caption(task_prompt, image, text_input=None, max_new_tokens=1500, num_beams=3):
    """Run one task prompt against an image and return the parsed result.

    Relies on the module-level ``model`` and ``processor`` objects.

    Args:
        task_prompt: Task token string (e.g. ``"<MORE_DETAILED_CAPTION>"``).
            It is sent to the model and also passed to the post-processor
            as ``task``.
        image: Image object exposing ``width`` and ``height`` (a PIL image
            in this notebook).
        text_input: Optional extra text appended directly to ``task_prompt``
            (no separator is inserted).
        max_new_tokens: Upper bound on generated tokens (default 1500).
        num_beams: Beam width for the deterministic beam search (default 3).

    Returns:
        Whatever ``processor.post_process_generation`` returns; for the
        caption task in this notebook that is a dict keyed by the task prompt.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Place the inputs on whatever device the model lives on. The previous
    # hard-coded "cuda:0" plus per-tensor .cuda() calls were redundant and could
    # disagree with the model's device when the current CUDA device is not 0.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=max_new_tokens,
        early_stopping=False,
        do_sample=False,
        num_beams=num_beams,
    )

    # Special tokens are kept in the decoded text; presumably the
    # post-processor needs them to parse task output — TODO confirm.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

    return processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )



In [6]:
# Request a more detailed caption for a local test image.
task_prompt = "<MORE_DETAILED_CAPTION>"
image = Image.open("./images.png").convert("RGB")  # convert to RGB mode before captioning

answer = caption(task_prompt=task_prompt, image=image)
print(answer)

{'<MORE_DETAILED_CAPTION>': 'A woman is sitting on a bed. She is wearing a green bra. She has long brown hair. There is a red blanket on the bed behind her. '}


In [5]:
import torch

# Environment check, one value per line:
#   1. the CUDA version reported by torch.version.cuda
#   2. whether CUDA is available
print(torch.version.cuda, torch.cuda.is_available(), sep="\n")


11.8
True
