In [None]:
!python -m pip install --upgrade pip wheel setuptools

In [None]:
!pip install torch

In [None]:
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation

In [None]:
!pip install transformers timm

In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import requests
import copy

model_id = 'microsoft/Florence-2-large'
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).eval().cuda()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

def run_example(task_prompt, text_input=None):
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input

    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"].cuda(),
        pixel_values=inputs["pixel_values"].cuda(),
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )

    return parsed_answer

In [8]:
image = Image.open("img16.jpg").convert("RGB")

task_prompt = "<MORE_DETAILED_CAPTION>"
answer = run_example(task_prompt=task_prompt)

print(answer)

{'<MORE_DETAILED_CAPTION>': 'The image shows a young man standing on a sandy beach with a lake and mountains in the background. He is wearing a grey t-shirt, black shorts, and sunglasses, and has a backpack slung over his shoulder. He has a red hat in his left hand and is holding a pair of sunglasses in his right hand. The man is looking up at the sky with a slight smile on his face. The lake is calm and the water is a light blue color. There are trees and mountains visible in the distance. The sky is clear and blue.'}


In [9]:
import os

In [None]:
os.listdir('./images')

In [20]:
folder = './images'

list_of_img = os.listdir(folder)

for img in list_of_img:
    if img.endswith('.jpg'):
        file_path = (folder+'/'+img).split('.')[1]
        print(f'Captioning image: {file_path}')
        image_path = '.'+file_path+'.jpg'
        image = Image.open(image_path).convert("RGB")
        task_prompt = "<MORE_DETAILED_CAPTION>"
        answer = run_example(task_prompt=task_prompt)
        text_path = '.'+file_path+'.txt'
        with open(text_path, 'w') as f:
            f.write(answer['<MORE_DETAILED_CAPTION>'])
        

Captioning image: /images/img9
Captioning image: /images/img8
Captioning image: /images/img5
Captioning image: /images/img4
Captioning image: /images/img6
Captioning image: /images/img7
Captioning image: /images/img3
Captioning image: /images/img2
Captioning image: /images/img1
Captioning image: /images/img16
Captioning image: /images/img14
Captioning image: /images/img15
Captioning image: /images/img11
Captioning image: /images/img10
Captioning image: /images/img12
Captioning image: /images/img13
