In [1]:
from typing import Any, Dict, List, Optional

import requests
import torch
from lm_polygraph import estimate_uncertainty
from lm_polygraph.estimators import (
    EigValLaplacian,
    LexicalSimilarity,
    MaximumTokenProbability,
    PointwiseMutualInformation,
    SemanticEntropy,
)
from lm_polygraph.utils.model import BlackboxModel, WhiteboxModel
from PIL import Image

%load_ext autoreload
%autoreload 2
from transformers import (
    AutoModelForCausalLM,
    AutoModelForVision2Seq,
    AutoProcessor,
    AutoTokenizer,
)
from typing import List, Union
from dataclasses import dataclass

  from .autonotebook import tqdm as notebook_tqdm


## VLM Starter code

In [2]:
# Kosmos-2 quick-start: caption one image with grounding tokens, then show both
# the raw generation and the cleaned-up caption.
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

prompt = "<grounding>An image of"

url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png"
# Bound the wait and surface HTTP errors here; without them a stalled server
# hangs the cell and an error page reaches PIL as an undecodable "image".
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# The original Kosmos-2 demo saves the image first and then reloads it. For some
# images this gives a slightly different image input and changes the generation outputs.
image.save("new_image.jpg")
image = Image.open("new_image.jpg")

inputs = processor(text=prompt, images=image, return_tensors="pt")

generated_ids = model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    image_embeds=None,
    image_embeds_position_mask=inputs["image_embeds_position_mask"],
    use_cache=True,
    max_new_tokens=20,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Specify `cleanup_and_extract=False` in order to see the raw model generation.
processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)

print(processed_text)
# `<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.`

# By default, the generated text is cleaned up and the entities are extracted.
processed_text, entities = processor.post_process_generation(generated_text)

print(processed_text)
# `An image of a snowman warming himself by a fire.`

<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.
An image of a snowman warming himself by a fire.


## Helper functions

In [3]:
from dataclasses import dataclass
from typing import Optional, Dict

@dataclass
class GenerationParameters:
    """
    Bundle of generation settings used to override model generation.

    Every field has a default, so ``GenerationParameters()`` is a valid
    instance. The field names look like Hugging Face ``generate()`` keyword
    arguments -- NOTE(review): confirm against whatever consumes this object.
    """

    # Sampling controls.
    temperature: float = 1.0
    top_k: int = 50
    top_p: float = 1.0
    do_sample: bool = False

    # Search width.
    num_beams: int = 1

    # Repetition controls.
    presence_penalty: float = 0.0
    repetition_penalty: float = 1.0

    # Output shaping; presumably stop sequences and a newline filter -- TODO confirm.
    generate_until: tuple = ()
    allow_newlines: bool = True
    
class VLMWhiteboxModel(WhiteboxModel):
    """
    Whitebox-model wrapper that pairs a vision-language model with one image.

    The constructor stores the base model, its processor and the processor's
    tokenizer, then loads a single image (from a URL or a local path) and
    round-trips it through a JPEG file on disk.

    NOTE(review): ``WhiteboxModel.__init__`` is not called here; the attributes
    are set by hand instead. No method visible in this class reads
    ``self.image`` -- confirm where the image is fed into generation.
    """

    # Seconds to wait on the image server before giving up.
    _REQUEST_TIMEOUT_S = 30
    # Modes written to JPEG as-is; anything else (e.g. RGBA, P) is converted to RGB first.
    _JPEG_MODES = ("L", "RGB", "CMYK")

    def __init__(self, base_model: AutoModelForVision2Seq, processor: AutoProcessor, image_path: Optional[str] = None, image_url: Optional[str] = None, generation_parameters: Optional[GenerationParameters] = None):
        """
        Args:
            base_model: The vision-to-sequence model to wrap.
            processor: Processor matching ``base_model``; its ``tokenizer``
                attribute is exposed as ``self.tokenizer``.
            image_path: Local path of the image. Used only when ``image_url``
                is empty or None.
            image_url: URL to download the image from. Takes precedence over
                ``image_path``.
            generation_parameters: Generation overrides. Any falsy value
                (including None) is replaced by ``GenerationParameters()``;
                other values are stored as passed.

        Raises:
            ValueError: If neither ``image_path`` nor ``image_url`` is given.
            requests.HTTPError: If the image download returns an error status.
        """
        self.model = base_model
        self.processor = processor
        self.tokenizer = processor.tokenizer
        self.model_type = "CausalLM"
        # NOTE(review): fixed checkpoint name, regardless of the base_model passed in.
        self.model_path = "microsoft/kosmos-2-patch14-224"

        # Load and store the image
        if image_url:
            # Bound the wait and fail loudly on HTTP errors, so an error page
            # is never handed to PIL as if it were image data.
            response = requests.get(image_url, stream=True, timeout=self._REQUEST_TIMEOUT_S)
            response.raise_for_status()
            self.image = Image.open(response.raw)
        elif image_path:
            self.image = Image.open(image_path)
        else:
            raise ValueError("Either image_path or image_url must be provided")

        # JPEG cannot store alpha or palette images; saving one raises OSError.
        if self.image.mode not in self._JPEG_MODES:
            self.image = self.image.convert("RGB")

        # Save and reload image for consistency
        self.image.save("temp_image.jpg")
        self.image = Image.open("temp_image.jpg")

        # Default is None rather than a shared mutable {}; the result is the same.
        self.generation_parameters = generation_parameters or GenerationParameters()
        

In [4]:
# Fetch the Kosmos-2 weights and the matching processor from a single checkpoint id.
kosmos_checkpoint = "microsoft/kosmos-2-patch14-224"
base_model = AutoModelForVision2Seq.from_pretrained(kosmos_checkpoint)
processor = AutoProcessor.from_pretrained(kosmos_checkpoint)

# Wrap the model together with the snowman demo image.
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png"
model = VLMWhiteboxModel(base_model=base_model, processor=processor, image_url=url)

# Prompt to score, and the estimator to score it with.
input_text = "<grounding>What is the main object in this image?"
estimator = MaximumTokenProbability()

uncertainty = estimate_uncertainty(model, estimator, input_text=input_text)
print("Uncertainty estimation:", uncertainty)

Uncertainty estimation: UncertaintyOutput(uncertainty=array([-0.52349114, -0.98595804, -0.99339235, -0.9995575 , -0.9055974 ,
       -0.9999995 , -0.28288335, -0.89071596, -0.99985605, -0.80320007,
       -0.89930665, -0.55406886, -0.9966098 , -0.9489591 , -0.7710566 ,
       -0.2859758 , -0.35774505, -0.9161929 , -0.507055  , -0.31908163],
      dtype=float32), input_text='<grounding>What is the main object in this image?', generation_text='<phrase> The main object</phrase><object><patch_index_0044><patch_index_0863></object> in this scene is a snowman sitting by a campfire.', generation_tokens=[64007, 24, 815, 3812, 64008, 64009, 64057, 64876, 64010, 12, 38, 1101, 17, 10, 43867, 1280, 32, 10, 30879, 4], model_path='microsoft/kosmos-2-patch14-224', estimator='MaximumTokenProbability')
