## Florence-2 Model

In [1]:
import re
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
import torch
import gc
import os
import time

In [3]:
def clear_gpu_memory():
    """Drop unreachable Python objects, then release cached CUDA memory.

    Takes no arguments and returns ``None``.
    """
    # Collect first: tensors kept alive only by reference cycles are freed
    # here, so their blocks are back in the caching allocator by the time
    # empty_cache() runs and can actually be released.
    gc.collect()
    torch.cuda.empty_cache()

def natural_sort_key(s):
    """Return a sort key that orders digit runs in *s* by numeric value.

    The string is split on runs of decimal digits; digit runs become ``int``
    and every other piece is lower-cased, so ``"page2"`` sorts before
    ``"page10"`` and letter case is ignored.

    Args:
        s: The string (e.g. a file name) to build a key for.

    Returns:
        A list alternating lower-cased text pieces and integers.
    """
    # isdecimal() accepts exactly the characters that \d captures, whereas
    # isdigit() is also true for characters such as superscript digits that
    # int() cannot parse.
    return [int(text) if text.isdecimal() else text.lower() for text in re.split(r'(\d+)', s)]

def split_text(text):
    """Place each sentence of *text* on its own line.

    Every run of spaces that directly follows ``.``, ``!`` or ``?`` is
    replaced by a single newline; all other text is left untouched.

    Args:
        text: The text to reformat.

    Returns:
        The reformatted string.
    """
    sentence_gap = re.compile(r'(?<=[.!?]) +')
    return sentence_gap.sub("\n", text)

def run_florence2(image_dir, output_file, model_path='/data/models/Florence-2-base'):
    """Run the Florence-2 ``<OCR>`` task on every image in a directory.

    Image files (.png, .jpg, .jpeg, .bmp, .gif) found directly in
    ``image_dir`` are processed in natural filename order. The recognised
    text of each image is reformatted with ``split_text`` and written to
    ``output_file``, one image after another.

    Args:
        image_dir: Directory containing the images to transcribe.
        output_file: Path of the text file to create or overwrite.
        model_path: Location of the Florence-2 checkpoint to load.

    Raises:
        RuntimeError: Any runtime error other than a CUDA out-of-memory
            error. An image that runs out of memory is reported and skipped.
    """
    if torch.cuda.is_available():
        # Keep using the second GPU when the host has one, but fall back to
        # the first so single-GPU machines do not fail on an invalid ordinal.
        gpu_index = 1 if torch.cuda.device_count() > 1 else 0
        device = torch.device(f"cuda:{gpu_index}")
        torch_dtype = torch.float16
    else:
        device = torch.device("cpu")
        torch_dtype = torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch_dtype, trust_remote_code=True
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    image_files = sorted(
        (name for name in os.listdir(image_dir) if name.lower().endswith(image_extensions)),
        key=natural_sort_key,
    )

    prompt = "<OCR>"

    # Explicit encoding so OCR output containing non-ASCII characters does
    # not depend on the locale's default codec.
    with open(output_file, 'w', encoding='utf-8') as f:
        for image_file in image_files:
            image_path = os.path.join(image_dir, image_file)
            generated_text = None

            try:
                # Close the file handle as soon as the pixels are loaded.
                # NOTE(review): RGB conversion assumes the processor expects
                # 3-channel input (palette GIF/PNG, RGBA and grayscale files
                # would otherwise reach it as-is) - confirm against the
                # Florence-2 processor.
                with Image.open(image_path) as raw_image:
                    image = raw_image.convert("RGB")

                inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

                generated_ids = model.generate(
                    input_ids=inputs["input_ids"],
                    pixel_values=inputs["pixel_values"],
                    max_new_tokens=1024,
                    do_sample=False,
                    num_beams=3,
                )

                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            except RuntimeError as e:
                if 'CUDA out of memory' not in str(e):
                    raise
                print(f"RuntimeError: {e}")
                print(f"Skipping {image_path}")

            if generated_text is None:
                # Free memory only after leaving the except block, once the
                # exception (and the frames its traceback references) is gone.
                clear_gpu_memory()
                continue

            f.write(split_text(generated_text))
            # Separate images so the last sentence of one does not run into
            # the first sentence of the next.
            f.write("\n")

In [4]:
# Transcribe every sub-directory of the image root into one text file per
# sub-directory, and report the total wall-clock time in seconds.
start_time = time.time()

images_root = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/LowResolutionMobyDickImages/'
transcriptions_root = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestFlorence2Transcriptions/'

# run_florence2 opens its output file directly, so the folder must exist.
os.makedirs(transcriptions_root, exist_ok=True)

for entry in sorted(os.listdir(images_root)):
    image_directory = os.path.join(images_root, entry)
    # Stray files in the root are not image folders; listing them would fail.
    if not os.path.isdir(image_directory):
        continue
    output_file = os.path.join(transcriptions_root, entry + '.txt')
    run_florence2(image_directory, output_file)

end_time = time.time()
duration = end_time - start_time
print(duration)



4193.674163341522
