## Phi-3-Vision Model

##### For vision-language model serving and inference, I used vLLM locally. To start the server, I ran this command in a shell environment: `vllm serve microsoft/Phi-3.5-vision-instruct --tensor-parallel-size=2 --disable-log-stats --disable-log-requests --trust-remote-code --max-model-len 12000`

In [None]:
import torch
import gc
from openai import OpenAI
import base64
import re
import os
import time

In [None]:
def clear_gpu_memory():
    """Free unreferenced Python objects, then release PyTorch's cached CUDA memory.

    Garbage collection runs first so that any tensors it frees are already
    back in PyTorch's allocator cache when ``empty_cache`` hands the unused
    cached blocks back to the driver.

    NOTE(review): this only affects the current Python process; a model
    served by a separate process is presumably unaffected -- confirm.
    """
    gc.collect()
    torch.cuda.empty_cache()

def natural_sort_key(s):
    """Return a sort key that orders embedded numbers by value ("p2" before "p10").

    The string is split into alternating text and digit runs. Text runs are
    lower-cased (case-insensitive ordering) and digit runs become ``int``.

    Args:
        s: The string (e.g. a file name) to build a key for.

    Returns:
        A list alternating ``str`` and ``int`` items, always starting with a
        (possibly empty) ``str``.
    """
    # With a capturing group, re.split returns text, digits, text, ... so the
    # odd positions are exactly the runs matched by \d+. Relying on position
    # instead of str.isdigit() avoids int() failing on characters such as
    # "²" that isdigit() accepts but int() cannot parse.
    parts = re.split(r'(\d+)', s)
    return [int(part) if index % 2 else part.lower() for index, part in enumerate(parts)]

def encode_image_base64(image_path: str) -> str:
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file's contents, decoded as a UTF-8 string.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

def run_phi3v(image_dir, output_file, *, model="microsoft/Phi-3-vision-128k-instruct",
              api_base="http://localhost:8000/v1"):
    """Transcribe every image in ``image_dir`` and write the text to ``output_file``.

    Images are processed in natural (numeric-aware) file-name order. Each one
    is sent as a base64 data URL to an OpenAI-compatible chat-completions
    endpoint, and the returned message text is appended to ``output_file``
    in that same order.

    Args:
        image_dir: Directory containing the images (.png, .jpg, .jpeg, .bmp,
            .gif; matched case-insensitively). A trailing separator is
            optional.
        output_file: Path of the text file to create or overwrite.
        model: Model name sent with each request.
            NOTE(review): the server is presumably expected to serve a model
            under this exact name -- confirm it matches the served model.
        api_base: Base URL of the OpenAI-compatible server.

    NOTE(review): the transcriptions of consecutive images are written back
    to back with no separator, as in the original implementation.
    """
    import mimetypes  # function-scope import keeps this block self-contained

    clear_gpu_memory()

    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    image_files = sorted(
        (name for name in os.listdir(image_dir) if name.lower().endswith(image_extensions)),
        key=natural_sort_key,
    )

    # NOTE(review): the prompt embeds chat-template markers (<|system|>,
    # <|user|>, <|image_1|>, <|assistant|>) inside a chat message. The server
    # may apply its own chat template on top -- confirm whether these markers
    # are wanted. The text is kept byte-for-byte so results stay comparable.
    prompt = """<|system|>ROLE: Expert image analyst, capable of extracting text from images
                                                    TASK: Please extract all the text from the following image. The image contains various types of printed text. 
                                                    You must follow these Steps to extract and format the text: 
                                                    Step 1: Extracting
                                                    Extract the text from the image in the order in which it is designed to be read. 
                                                    Step 2: Formatting
                                                    Arrange the text into clean, easily readable blocks of text.<|end|>
                                                    <|user|>Can you please extract all the text from the following image?<|image_1|><|end|>\n<|assistant|>\n"""

    # One client is enough for the whole run; it does not depend on the image.
    client = OpenAI(api_key="EMPTY", base_url=api_base)

    # Explicit UTF-8 so non-ASCII characters in the model output cannot fail
    # on a platform whose default encoding is narrower.
    with open(output_file, 'w', encoding='utf-8') as f:
        for image_file in image_files:
            # os.path.join works whether or not image_dir ends with a separator.
            image_path = os.path.join(image_dir, image_file)
            image = encode_image_base64(image_path)
            # Label the data URL with the file's real media type, falling back
            # to the previously hard-coded JPEG when it cannot be guessed.
            mime_type = mimetypes.guess_type(image_file)[0] or "image/jpeg"
            chat_response = client.chat.completions.create(
                model=model,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image}"
                            }
                        }
                    ]
                }],
            )

            # The message content may be None; write nothing in that case
            # instead of raising TypeError.
            f.write(chat_response.choices[0].message.content or "")

In [None]:
# Root folder holding one sub-folder of page images per document, and the
# folder that receives one transcription text file per sub-folder.
IMAGE_ROOT = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/LowResolutionMobyDickImages/'
OUTPUT_ROOT = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestPhi3VTranscriptions/'

# Make sure the output folder exists so opening the first output file cannot fail.
os.makedirs(OUTPUT_ROOT, exist_ok=True)

start_time = time.time()

# Sorted for a deterministic processing order across runs.
for f in sorted(os.listdir(IMAGE_ROOT)):
    # The trailing '' keeps the trailing separator on the directory path.
    image_directory = os.path.join(IMAGE_ROOT, f, '')
    # Skip stray files in the root; only sub-folders contain images.
    if not os.path.isdir(image_directory):
        continue
    output_file = os.path.join(OUTPUT_ROOT, f + '.txt')
    run_phi3v(image_directory, output_file)

end_time = time.time()
duration = end_time - start_time
# Total wall-clock time for the whole run, in seconds.
print(duration)