## Phi-3.5-Vision Model

##### This model was released during the final week of this project, so this code is not nearly as refined and is more narrowly scoped, but it follows the same structure as the Phi-3-Vision model code without vLLM inference.

In [1]:
from PIL import Image 
from transformers import AutoModelForCausalLM 
from transformers import AutoProcessor 
import torch
import gc
import base64
import re
import os
import time

In [7]:
def clear_gpu_memory():
    """Release unused GPU memory held by this Python process.

    Garbage collection runs first so that tensors which are only kept alive
    by reference cycles are destroyed and their blocks handed back to
    PyTorch's caching allocator; ``torch.cuda.empty_cache()`` can then return
    those cached blocks to the CUDA driver. Doing it the other way round
    leaves the freshly collected blocks sitting in the cache.
    """
    gc.collect()
    torch.cuda.empty_cache()

def natural_sort_key(s):
    """Return a sort key that orders embedded numbers by value.

    The string is split into alternating text and digit runs, so that
    ``"page2.png"`` sorts before ``"page10.png"``.

    Args:
        s: The string (typically a file name) to build a key for.

    Returns:
        A list in which text runs are lower-cased strings and digit runs are
        ints.
    """
    # With a capturing group, re.split() returns the text runs at even
    # indices and the captured digit runs at odd indices. Converting by
    # position avoids int() failing on characters that str.isdigit() accepts
    # but the \d pattern (and int()) do not, such as superscript digits.
    pieces = re.split(r'(\d+)', s)
    return [
        int(piece) if index % 2 else piece.lower()
        for index, piece in enumerate(pieces)
    ]

def encode_image_base64(image_path: str) -> str:
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to encode.

    Returns:
        The base64 encoding of the file's raw bytes, as a UTF-8 decoded str.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

def run_phi35v(image_directory, output_file):
    """Transcribe all images in a directory with Phi-3.5-Vision.

    Every file in ``image_directory`` with a recognised image extension is
    opened (in natural file-name order) and sent to the model together in one
    multi-image prompt. The decoded response is written to ``output_file``.

    The output file is only created once generation has succeeded, so a
    failed run does not leave an empty file behind that a "skip if the file
    already exists" loop would mistake for a finished transcription.

    Args:
        image_directory: Directory holding the images. A trailing path
            separator is optional.
        output_file: Path of the text file that receives the transcription.

    Returns:
        None. If the directory contains no matching images a message is
        printed and nothing is written.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    image_files = sorted(
        (name for name in os.listdir(image_directory) if name.lower().endswith(image_extensions)),
        key=natural_sort_key,
    )

    # Bail out before the (slow) model load when there is nothing to process.
    if not image_files:
        print(f"No images found in {image_directory}; skipping.")
        return

    clear_gpu_memory()

    model_id = "microsoft/Phi-3.5-vision-instruct"

    # Note: set _attn_implementation='eager' if you don't have flash_attn installed
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="cuda",
        trust_remote_code=True,
        torch_dtype="auto",
        _attn_implementation='flash_attention_2'
    )

    # For best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
    processor = AutoProcessor.from_pretrained(
        model_id,
        trust_remote_code=True,
        num_crops=4
    )

    images = []
    try:
        for image_file in image_files:
            # os.path.join works whether or not image_directory ends with a separator.
            images.append(Image.open(os.path.join(image_directory, image_file)))

        # One numbered placeholder tag per image, 1-based, each on its own line.
        placeholder = "".join(f"<|image_{index}|>\n" for index in range(1, len(images) + 1))

        messages = [
            {"role": "user", "content": f"""<|system|>{placeholder}ROLE: Expert image analyst, capable of extracting text from images
                                                            TASK: Please extract all the text from the following image. The image contains various types of printed text. 
                                                            You must follow these Steps to extract and format the text: 
                                                            Step 1: Extracting
                                                            Extract the text from the image in the order in which it is designed to be read. 
                                                            Step 2: Formatting
                                                            Arrange the text into clean, easily readable blocks of text.<|end|>
                                                            <|user|>Can you please extract all the text from the following image?<|end|>\n<|assistant|>\n"""},
        ]

        prompt = processor.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")

        generation_args = {
            "max_new_tokens": 1000,
            "temperature": 0.0,
            "do_sample": False,
        }

        generate_ids = model.generate(
            **inputs,
            eos_token_id=processor.tokenizer.eos_token_id,
            **generation_args
        )

        # Remove input tokens
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False)[0]
    finally:
        # Release the image file handles whether or not generation succeeded.
        for opened_image in images:
            opened_image.close()

    # Written last, and as UTF-8 so non-ASCII characters in the transcription
    # do not depend on the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as out_file:
        out_file.write(response)

In [8]:
# Stand-alone demo cell: loads Phi-3.5-Vision at notebook (module) level, feeds
# it every image from one chapter directory in a single multi-image prompt, and
# prints the placeholder tags and the decoded response. The names defined here
# (model, processor, images, ...) remain in the notebook namespace afterwards.
model_id = "microsoft/Phi-3.5-vision-instruct"

# Note: set _attn_implementation='eager' if you don't have flash_attn installed
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    device_map="cuda", 
    trust_remote_code=True, 
    torch_dtype="auto", 
    _attn_implementation='flash_attention_2'    
)

# For best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(
    model_id, 
    trust_remote_code=True, 
    num_crops=4
)

# Opened PIL images, the accumulated "<|image_N|>" tags, and the 1-based index
# of the next tag.
images = []
placeholder = ""
count = 1

# Path to the directory containing your local images
# (must end with "/" because file names are appended by plain concatenation below)
image_dir = "/home/darshewskijadmin@consilio.com/ExperimentalLLMs/LowResolutionMobyDickImages/Chapter1/"

# Keep only files with an image extension, ordered so embedded numbers sort by value.
image_files = sorted((f for f in os.listdir(image_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))), key = natural_sort_key)

# Open each image and add one numbered placeholder tag per image.
for image_file in image_files:
    image = image_dir + image_file
    images.append(Image.open(image))
    placeholder += f"<|image_{count}|>\n"
    count += 1

# Single user message: the placeholder tags followed by the extraction instructions.
messages = [
    {"role": "user", "content": f"""<|system|>{placeholder}ROLE: Expert image analyst, capable of extracting text from images
                                                    TASK: Please extract all the text from the following image. The image contains various types of printed text. 
                                                    You must follow these Steps to extract and format the text: 
                                                    Step 1: Extracting
                                                    Extract the text from the image in the order in which it is designed to be read. 
                                                    Step 2: Formatting
                                                    Arrange the text into clean, easily readable blocks of text.<|end|>
                                                    <|user|>Can you please extract all the text from the following image?<|end|>\n<|assistant|>\n"""},
]

# Render the messages to a prompt string (not token ids) via the tokenizer's chat template.
prompt = processor.tokenizer.apply_chat_template(
    messages, 
    tokenize=False, 
    add_generation_prompt=True
)

# Build the model inputs from the prompt and images and move them to the first GPU.
inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")

# Sampling is disabled; at most 1000 new tokens are generated.
generation_args = { 
    "max_new_tokens": 1000, 
    "temperature": 0.0, 
    "do_sample": False, 
}

generate_ids = model.generate(
    **inputs, 
    eos_token_id=processor.tokenizer.eos_token_id, 
    **generation_args
)

# Remove input tokens 
# (drop the first input_ids-length positions so only newly generated ids are decoded)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids, 
    skip_special_tokens=True, 
    clean_up_tokenization_spaces=False)[0]

print(placeholder)
print(response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


<|image_1|>
<|image_2|>
<|image_3|>
<|image_4|>
<|image_5|>
<|image_6|>
<|image_7|>






In [None]:
# Transcribe a single chapter (Chapter25) with run_phi35v and write the result to
# the path stored in `output`.
output = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestPhi3.5VTranscriptions/Chapter25.txt'
run_phi35v("/home/darshewskijadmin@consilio.com/ExperimentalLLMs/LowResolutionMobyDickImages/Chapter25/", output)

In [5]:
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
import torch
import torch.nn as nn
import gc
import os
from PIL import Image
import re

def clear_gpu_memory():
    """Release unused GPU memory held by this Python process.

    Garbage collection runs first so that tensors which are only kept alive
    by reference cycles are destroyed and their blocks handed back to
    PyTorch's caching allocator; ``torch.cuda.empty_cache()`` can then return
    those cached blocks to the CUDA driver. Doing it the other way round
    leaves the freshly collected blocks sitting in the cache.
    """
    gc.collect()
    torch.cuda.empty_cache()

def natural_sort_key(s):
    """Return a sort key that orders embedded numbers by value.

    The string is split into alternating text and digit runs, so that
    ``"page2.png"`` sorts before ``"page10.png"``.

    Args:
        s: The string (typically a file name) to build a key for.

    Returns:
        A list in which text runs are lower-cased strings and digit runs are
        ints.
    """
    # With a capturing group, re.split() returns the text runs at even
    # indices and the captured digit runs at odd indices. Converting by
    # position avoids int() failing on characters that str.isdigit() accepts
    # but the \d pattern (and int()) do not, such as superscript digits.
    pieces = re.split(r'(\d+)', s)
    return [
        int(piece) if index % 2 else piece.lower()
        for index, piece in enumerate(pieces)
    ]

def phi3_5v(image_dir, output_file):
    """Transcribe all images in a directory with Phi-3.5-Vision through vLLM.

    Every file in ``image_dir`` with a recognised image extension is opened
    (in natural file-name order) and sent to a vLLM engine together in one
    multi-image prompt. The generated text is written to ``output_file``.

    Unlike the previous version, generation goes through the vLLM engine that
    this function creates rather than through notebook-level ``model`` /
    ``processor`` objects, and the engine is no longer wrapped in
    ``nn.DataParallel`` or moved with ``.to()`` (vLLM's ``LLM`` is not a
    ``torch.nn.Module``; it places the model on the GPU itself).

    Args:
        image_dir: Directory holding the images. A trailing path separator is
            optional.
        output_file: Path of the text file that receives the transcription.

    Returns:
        None. Nothing is written when the directory has no matching images or
        when generation fails with a CUDA out-of-memory error (the error is
        printed and GPU memory is cleared).

    Raises:
        RuntimeError: Any runtime error from generation other than CUDA
            out-of-memory is re-raised.
    """
    model_id = "/data/models/Phi-3.5-vision-instruct"

    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    image_files = sorted(
        (name for name in os.listdir(image_dir) if name.lower().endswith(image_extensions)),
        key=natural_sort_key,
    )

    # Bail out before the (slow) engine start-up when there is nothing to process.
    if not image_files:
        print(f"No images found in {image_dir}; skipping.")
        return

    # Clear GPU memory before initializing vLLM
    clear_gpu_memory()

    # Configure the model
    # NOTE(review): limit_mm_per_prompt raises vLLM's per-prompt image limit so
    # that all images fit in one request -- confirm the keyword against the
    # installed vLLM version. For multi-GPU use, vLLM's own mechanism is the
    # tensor_parallel_size argument rather than nn.DataParallel.
    llm = LLM(
        model=model_id,
        trust_remote_code=True,
        max_num_seqs=1,
        max_model_len=32 * 1024,
        limit_mm_per_prompt={"image": len(image_files)},
    )

    # Greedy decoding with at most 1000 new tokens, mirroring the settings the
    # Hugging Face code path in this notebook uses.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=1000)

    images = []
    try:
        for image_file in image_files:
            # os.path.join works whether or not image_dir ends with a separator.
            images.append(Image.open(os.path.join(image_dir, image_file)))

        # One numbered placeholder tag per image, 1-based, each on its own line.
        placeholder = "".join(f"<|image_{index}|>\n" for index in range(1, len(images) + 1))

        messages = [
            {"role": "user", "content": f"""<|system|>{placeholder}ROLE: Expert image analyst, capable of extracting text from images
                                                            TASK: Please extract all the text from the following image. The image contains various types of printed text. 
                                                            You must follow these Steps to extract and format the text: 
                                                            Step 1: Extracting
                                                            Extract the text from the image in the order in which it is designed to be read. 
                                                            Step 2: Formatting
                                                            Arrange the text into clean, easily readable blocks of text.<|end|>
                                                            <|user|>Can you please extract all the text from the following image?<|end|>\n<|assistant|>\n"""},
        ]

        # Render the prompt with the engine's own tokenizer so the function no
        # longer depends on a notebook-level `processor`.
        prompt = llm.get_tokenizer().apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        outputs = llm.generate(
            {"prompt": prompt, "multi_modal_data": {"image": images}},
            sampling_params=sampling_params,
        )
        response = outputs[0].outputs[0].text

    except RuntimeError as e:
        if 'CUDA out of memory' in str(e):
            print(f"RuntimeError: {e}")
            clear_gpu_memory()
            return
        raise
    finally:
        # Release the image file handles whether or not generation succeeded.
        for opened_image in images:
            opened_image.close()

    # Written only after successful generation, so a failed run leaves no
    # empty transcription file behind.
    with open(output_file, 'w', encoding='utf-8') as out_file:
        out_file.write(response)

In [9]:
# Batch driver: run run_phi35v on every chapter folder under images_root,
# skipping chapters whose transcription file already exists, then print the
# elapsed wall-clock time in seconds.
images_root = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/LowResolutionMobyDickImages/'
transcriptions_root = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestPhi3.5VTranscriptions/'

# Make sure the destination exists so the first write does not fail.
os.makedirs(transcriptions_root, exist_ok=True)

start_time = time.time()

# Natural ordering gives a deterministic Chapter1, Chapter2, ..., Chapter10 sequence.
for f in sorted(os.listdir(images_root), key=natural_sort_key):
    # The trailing '/' is kept on purpose: run_phi35v may build image paths by
    # plain string concatenation.
    image_directory = images_root + f + '/'
    output_file = transcriptions_root + f + '.txt'

    # Stray non-directory entries would make the directory listing inside
    # run_phi35v raise and abort the whole batch.
    if not os.path.isdir(image_directory):
        continue
    # Already transcribed on an earlier run.
    if os.path.isfile(output_file):
        continue

    run_phi35v(image_directory, output_file)

end_time = time.time()
duration = end_time - start_time
print(duration)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

1461.0784368515015
