Kernel: huggingface1

Ref: https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb

#### Imports

In [None]:
import torch

In [None]:
# Ask PyTorch whether CUDA can be used and report the answer.
cuda_available = torch.cuda.is_available()
print("Is CUDA available? ", cuda_available)

# Report the CUDA version string exposed by torch.version.
print("CUDA version:", torch.version.cuda)

# When CUDA is usable, list how many GPUs are visible and the name of each one.
if cuda_available:
    gpu_count = torch.cuda.device_count()
    print("Number of GPUs available: ", gpu_count)
    for gpu_index in range(gpu_count):
        print("GPU ", gpu_index, ": ", torch.cuda.get_device_name(gpu_index))

In [None]:
# MK
# Use the GPU when the CUDA check above succeeded, otherwise stay on the CPU.
# The chosen device is echoed only in the CUDA case.
device = 'cuda' if cuda_available else 'cpu'
if cuda_available:
    print(f'device:{device}')

In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM  
from PIL import Image
import requests
import copy
%matplotlib inline

In [None]:
# Checkpoint to load; the other Florence-2 variants are kept as ready-to-use alternatives.
# model_id = 'microsoft/Florence-2-base'
# model_id = 'microsoft/Florence-2-large'
# model_id = 'microsoft/Florence-2-large-ft'
model_id = 'microsoft/Florence-2-base-ft'
print(f'model_id:{model_id}')

# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally - confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
# Put the model in evaluation mode before it is used for generation.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

In [None]:
# Place the model on the device chosen earlier in the notebook ('cpu' or 'cuda').
model = model.to(device)

#### The prediction function

In [None]:
def run_example(task_prompt, text_input=None):
    """Run one Florence-2 task on the notebook's current global ``image``.

    Parameters:
    - task_prompt: Task string handed to the processor, e.g. '<CAPTION>' or '<OD>'.
                   It is also passed as ``task`` to the post-processing step.
    - text_input: Optional extra text appended directly to ``task_prompt``
                  (no separator is inserted).

    Returns:
    - The value returned by ``processor.post_process_generation`` for the decoded
      text; the cells in this notebook index it by the task prompt.

    Relies on the module-level ``processor``, ``model``, ``device`` and ``image``.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # MK
    # Move the input data to the device the model was moved to. This is done for
    # every device value rather than only the exact string 'cuda': it is a no-op
    # on the CPU and keeps working if `device` is set to e.g. 'cuda:0'.
    inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}

    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text; presumably the post-processing
    # step needs them to parse the output - TODO confirm.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )

    return parsed_answer

#### Plot Functions

In [None]:
import matplotlib.pyplot as plt  
import matplotlib.patches as patches  

def plot_bbox(image, data):
    """Show ``image`` with labelled bounding boxes drawn on top of it.

    Parameters:
    - image: Image to display; it is passed straight to ``ax.imshow``.
    - data: Dictionary with 'bboxes' and 'labels' keys. Each bbox is unpacked as
            (x1, y1, x2, y2) and paired with the label at the same position.
    """
    # Create a figure and axes
    fig, ax = plt.subplots()

    # Display the image
    ax.imshow(image)

    # Plot each bounding box
    for bbox, label in zip(data['bboxes'], data['labels']):
        # Unpack the bounding box coordinates
        x1, y1, x2, y2 = bbox
        # Rectangle takes the top-left corner plus width and height
        rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=1, edgecolor='r', facecolor='none')
        ax.add_patch(rect)
        # Annotate on this figure's own axes instead of relying on pyplot's
        # "current axes" state, consistent with the other ax.* calls here.
        ax.text(x1, y1, label, color='white', fontsize=8, bbox=dict(facecolor='red', alpha=0.5))

    # Remove the axis ticks and labels
    ax.axis('off')

    # Show the plot
    plt.show()

In [None]:
# MK
from PIL import Image  
import matplotlib.pyplot as plt  
import matplotlib.patches as patches  
import re  
  
def plot_normalized_bbox(image, bbox_data):
    """Show ``image`` with the region(s) described by a '<loc_*>' string outlined.

    Parameters:
    - image: Image to display; its ``size`` attribute supplies (width, height).
    - bbox_data: String containing one or more
                 '<loc_x1><loc_y1><loc_x2><loc_y2>' groups. Every group found is
                 drawn; each value is divided by 1000 and scaled by the image
                 width (x) or height (y).
    """
    # Create a figure and axes
    fig, ax = plt.subplots()

    # Display the image
    ax.imshow(image)

    # Get image dimensions
    img_width, img_height = image.size

    # Parse the normalized bounding box coordinates
    bboxes = re.findall(r"<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>", bbox_data)

    # Convert normalized coordinates to absolute coordinates and plot the rectangles
    for bbox in bboxes:
        # Turn the captured digit strings into fractions of the image size
        x1, y1, x2, y2 = [float(coord) / 1000 for coord in bbox]

        # Convert to absolute pixel coordinates
        abs_x1, abs_y1 = x1 * img_width, y1 * img_height
        abs_x2, abs_y2 = x2 * img_width, y2 * img_height

        # Rectangle takes the top-left corner plus width and height
        rect = patches.Rectangle((abs_x1, abs_y1), abs_x2 - abs_x1, abs_y2 - abs_y1, linewidth=1, edgecolor='r', facecolor='none')

        # Add the rectangle to the Axes
        ax.add_patch(rect)

    # Remove the axis ticks and labels
    ax.axis('off')

    # Show the plot (the original referenced plt.show without calling it)
    plt.show()

In [None]:
from PIL import Image, ImageDraw, ImageFont 
import random
import numpy as np

# Colour names that the drawing helpers pick from at random.
colormap = [
    'blue', 'orange', 'green', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan', 'red',
    'lime', 'indigo', 'violet', 'aqua', 'magenta', 'coral', 'gold', 'tan', 'skyblue',
]


def draw_polygons(image, prediction, fill_mask=False):
    """
    Draw labelled segmentation polygons onto an image and display it.

    The drawing happens directly on ``image`` (it is modified in place), so
    pass a copy if the original must stay untouched.

    Parameters:
    - image: Image object to draw on (handed to ``ImageDraw.Draw``).
    - prediction: Dictionary containing 'polygons' and 'labels' keys.
                  'polygons' holds, per object, a list of polygons, each a flat
                  sequence of coordinates [x1, y1, ..., xn, yn].
                  'labels' holds the label for each object.
    - fill_mask: When true, polygons are also filled with a randomly chosen colour.
    """
    canvas = ImageDraw.Draw(image)

    # Scale factor applied to every coordinate (1 means no scaling).
    scale = 1

    for object_polygons, label in zip(prediction['polygons'], prediction['labels']):
        # One outline colour per object; a separate random fill colour only when filling.
        outline_color = random.choice(colormap)
        fill_color = random.choice(colormap) if fill_mask else None

        for raw_points in object_polygons:
            vertices = np.array(raw_points).reshape(-1, 2)
            # Fewer than three vertices cannot form a polygon: report and skip.
            if len(vertices) < 3:
                print('Invalid polygon:', vertices)
                continue

            flat_points = (vertices * scale).reshape(-1).tolist()

            # fill_color is None when fill_mask is false, which leaves the polygon unfilled.
            canvas.polygon(flat_points, outline=outline_color, fill=fill_color)

            # Put the label slightly right of and below the first vertex.
            label_anchor = (flat_points[0] + 8, flat_points[1] + 2)
            canvas.text(label_anchor, label, fill=outline_color)

    # Render the annotated image as the cell output.
    display(image)

In [None]:
def convert_to_od_format(data):
    """
    Re-key an open-vocabulary detection result into the object-detection layout.

    Parameters:
    - data: Dictionary that may contain 'bboxes' and 'bboxes_labels'
            (any other keys, such as 'polygons', are ignored).

    Returns:
    - A dictionary with 'bboxes' (taken from data['bboxes']) and 'labels'
      (taken from data['bboxes_labels']); a missing key yields an empty list.
    """
    return {
        'bboxes': data.get('bboxes', []),
        'labels': data.get('bboxes_labels', []),
    }

In [None]:
def draw_ocr_bboxes(image, prediction):
    """
    Outline OCR regions on an image, label them, and display the result.

    The drawing happens directly on ``image`` (it is modified in place).

    Parameters:
    - image: Image object to draw on (handed to ``ImageDraw.Draw``).
    - prediction: Dictionary with 'quad_boxes' (one flat coordinate list per
                  region) and 'labels' (the text for each region).
    """
    # Scale factor applied to every coordinate (1 means no scaling).
    scale = 1
    canvas = ImageDraw.Draw(image)
    quad_boxes = prediction['quad_boxes']
    texts = prediction['labels']

    for quad, text in zip(quad_boxes, texts):
        outline_color = random.choice(colormap)
        scaled_quad = (np.array(quad) * scale).tolist()
        canvas.polygon(scaled_quad, width=3, outline=outline_color)
        # Put the text slightly right of and below the first corner.
        text_anchor = (scaled_quad[0] + 8, scaled_quad[1] + 2)
        canvas.text(text_anchor, f"{text}", align="right", fill=outline_color)

    display(image)

#### Initialise the input image

In [None]:
# from PIL import Image, ImageFile  
  
# # Allow loading of truncated images  
# ImageFile.LOAD_TRUNCATED_IMAGES = True  

In [None]:
# Choose between a local test image and the sample image hosted on Hugging Face.
use_own_image = True
image_path = '../test_images/maksssksksss0.png'
# image_path = '../test_images/test29.png'
print(f'use_own_image: {use_own_image}')

if not use_own_image:
    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
    # Fail fast on a bad HTTP status or a hung connection instead of handing
    # an error page to PIL.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    # Convert to RGB so both branches produce the same image mode.
    image = Image.open(response.raw).convert('RGB')
else:
    image = Image.open(image_path).convert('RGB')

In [None]:
# Show the image loaded above as this cell's output.
image

#### Run pre-defined tasks without additional inputs

Caption

In [None]:
%%time
# Run the '<CAPTION>' task on the current image.
task_prompt = '<CAPTION>'
run_example(task_prompt)

In [None]:
%%time
# Run the '<DETAILED_CAPTION>' task on the current image.
task_prompt = '<DETAILED_CAPTION>'
run_example(task_prompt)

In [None]:
%%time

# Earlier single-question experiments, kept for reference:
# task_prompt = 'what is person the right doing'
# task_prompt = 'how many people are wearing warm clothes'
# task_prompt = 'what are people in the background doing'
# task_prompt = 'how many people can be seen in the background'
# task_prompt = 'describe what is this place shown in the image'

# Free-form questions: each one is passed to run_example as the task prompt
# itself (no '<...>' task token and no text_input).
q_list = ['What are people doing?',
          'Does this look like a photo taken indoors?',
          'Is this photo taken during the day or night?',
          'Are people carrying any items?']

# Ask the questions one after another and print each parsed answer.
for task_prompt in q_list:  

    results = run_example(task_prompt)
    print(results)

In [None]:
%%time
# Run the '<MORE_DETAILED_CAPTION>' task on the current image.
task_prompt = '<MORE_DETAILED_CAPTION>'
run_example(task_prompt)

Object Detection

OD results format: {'<OD>': { 'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['label1', 'label2', ...] } }

In [None]:
%%time
# Run object detection; the output is kept in `results` for the plotting cell that follows.
task_prompt = '<OD>'
results = run_example(task_prompt)
print(results)

In [None]:
# Draw the boxes and labels stored under the '<OD>' key on the image.
plot_bbox(image, results['<OD>'])

Dense region caption

Dense region caption results format: {'<DENSE_REGION_CAPTION>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['label1', 'label2', ...]}}

In [None]:
%%time
# Run dense region captioning; the output is kept in `results` for the plotting cell that follows.
task_prompt = '<DENSE_REGION_CAPTION>'
results = run_example(task_prompt)
print(results)

In [None]:
# Draw the boxes and labels stored under the '<DENSE_REGION_CAPTION>' key on the image.
plot_bbox(image, results['<DENSE_REGION_CAPTION>'])

Region proposal

Region proposal results format: {'<REGION_PROPOSAL>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}

In [None]:
%%time
# Run region proposal; the output is kept in `results` for the plotting cell that follows.
task_prompt = '<REGION_PROPOSAL>'
results = run_example(task_prompt)
print(results)

In [None]:
# Draw the boxes stored under the '<REGION_PROPOSAL>' key on the image.
plot_bbox(image, results['<REGION_PROPOSAL>'])

#### Run pre-defined tasks that require additional inputs

Phrase Grounding

Phrase grounding results format: {'<CAPTION_TO_PHRASE_GROUNDING>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}

In [None]:
%%time
# Phrase grounding: the caption is appended to the task prompt as text_input.
# NOTE(review): this caption appears to describe the sample car image (the URL
# branch of the image-loading cell); with use_own_image = True it probably
# needs to be adapted to the local image - confirm.
task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
results = run_example(task_prompt, text_input="A green car parked in front of a yellow building.")
print(results)

In [None]:
# Draw the boxes and labels stored under the '<CAPTION_TO_PHRASE_GROUNDING>' key on the image.
plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])

Referring expression segmentation

Referring expression segmentation results format: {'<REFERRING_EXPRESSION_SEGMENTATION>': {'polygons': [[[polygon]], ...], 'labels': ['', '', ...]}}. One object is represented by a list of polygons; each polygon is [x1, y1, x2, y2, ..., xn, yn].

In [None]:
%%time

# Segment the object described by the text_input expression.
# NOTE(review): "a green car" appears to target the sample car image; adapt it
# when a local image is used - confirm.
task_prompt = '<REFERRING_EXPRESSION_SEGMENTATION>'
results = run_example(task_prompt, text_input="a green car")
print(results)

In [None]:
# draw_polygons draws directly on the image it is given, so work on a copy to
# keep the original `image` clean for the following cells.
output_image = copy.deepcopy(image)
draw_polygons(output_image, results['<REFERRING_EXPRESSION_SEGMENTATION>'], fill_mask=True)  

region to segmentation

With an additional region as input, the format is '<loc_x1><loc_y1><loc_x2><loc_y2>', where [x1, y1, x2, y2] are the quantized coordinates in [0, 999].

In [None]:
%%time
# Segment the content of the region given as '<loc_x1><loc_y1><loc_x2><loc_y2>'.
# NOTE(review): these coordinates are hard-coded; they presumably target the
# sample car image and may need adjusting for a local image - confirm.
task_prompt = '<REGION_TO_SEGMENTATION>'
results = run_example(task_prompt, text_input="<loc_702><loc_575><loc_866><loc_772>")
print(results)

In [None]:
# draw_polygons draws directly on the image it is given, so work on a copy to
# keep the original `image` clean for the following cells.
output_image = copy.deepcopy(image)
draw_polygons(output_image, results['<REGION_TO_SEGMENTATION>'], fill_mask=True)  

Open vocabulary detection

open vocabulary detection can detect both objects and ocr texts.

results format:

{'<OPEN_VOCABULARY_DETECTION>': {'bboxes': [[x1, y1, x2, y2], [x1, y1, x2, y2], ...], 'bboxes_labels': ['label_1', 'label_2', ...], 'polygons': [[[x1, y1, x2, y2, ..., xn, yn], [x1, y1, ..., xn, yn]], ...], 'polygons_labels': ['label_1', 'label_2', ...]}}

In [None]:
%%time

# Detect whatever the text_input phrase describes; the commented-out line keeps
# an alternative query phrase.
task_prompt = '<OPEN_VOCABULARY_DETECTION>'
# results = run_example(task_prompt, text_input="a green car")
results = run_example(task_prompt, text_input="yellow wall")
print(results)

In [None]:
# Re-key 'bboxes' / 'bboxes_labels' into the 'bboxes' / 'labels' layout that plot_bbox reads.
bbox_results  = convert_to_od_format(results['<OPEN_VOCABULARY_DETECTION>'])

In [None]:
# Draw the converted open-vocabulary detection boxes on the image.
plot_bbox(image, bbox_results)

region to texts

In [None]:
# MK
# Added to adjust the region coordinates as per your preference in the given image
# The region is written as '<loc_x1><loc_y1><loc_x2><loc_y2>'; it is previewed
# here and reused by the region-to-category / region-to-description cells.
custom_region = "<loc_320><loc_200><loc_450><loc_400>"
plot_normalized_bbox(image, custom_region)

In [None]:
%%time
# Classify the content of a region: the fixed sample region is used for the
# downloaded image, custom_region for a local image.
task_prompt = '<REGION_TO_CATEGORY>'
if not use_own_image:
    results = run_example(task_prompt, text_input="<loc_52><loc_332><loc_932><loc_774>")
else:
    results = run_example(task_prompt, text_input=custom_region)
print(results)

In [None]:
%%time
# Describe the content of a region: the fixed sample region is used for the
# downloaded image, custom_region for a local image.
task_prompt = '<REGION_TO_DESCRIPTION>'
if not use_own_image:
    results = run_example(task_prompt, text_input="<loc_52><loc_332><loc_932><loc_774>")
else:
    results = run_example(task_prompt, text_input=custom_region)
print(results)

ocr related tasks

In [None]:
# Replace the global `image` with the sample image used by the OCR cells below.
url = "http://ecx.images-amazon.com/images/I/51UUzBDAMsL.jpg?download=true"
# Fail fast on a bad HTTP status or a hung connection instead of handing an
# error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw).convert('RGB')

In [None]:
# Show the OCR sample image as this cell's output.
image

In [None]:
%%time
# Run the '<OCR>' task on the current image.
task_prompt = '<OCR>'
run_example(task_prompt)

In [None]:
%%time
# Run OCR with region output; the result is kept in `results` for the drawing cell that follows.
task_prompt = '<OCR_WITH_REGION>'
results = run_example(task_prompt)
print(results)
# ocr results format
# {'<OCR_WITH_REGION>': {'quad_boxes': [[x1, y1, x2, y2, x3, y3, x4, y4], ...], 'labels': ['text1', ...]}}

In [None]:
# draw_ocr_bboxes draws directly on the image it is given, so work on a copy to
# keep the original `image` untouched.
output_image = copy.deepcopy(image)
draw_ocr_bboxes(output_image, results['<OCR_WITH_REGION>'])  