In [None]:
from typing import Any, Dict, List, Optional, Union, Tuple

import requests
import torch
from lm_polygraph import estimate_uncertainty
from lm_polygraph.estimators import (
    EigValLaplacian,
    LexicalSimilarity,
    MaximumTokenProbability,
    PointwiseMutualInformation,
    SemanticEntropy,
    Focus
)
from lm_polygraph.model_adapters.whitebox_visual import VisualWhiteboxModel
from PIL import Image

%load_ext autoreload
%autoreload 2
from transformers import (
    AutoModelForCausalLM,
    AutoModelForVision2Seq,
    AutoProcessor,
    AutoTokenizer,
)
from dataclasses import dataclass
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# # Load model and processor
# from transformers import AutoConfig

# config = AutoConfig.from_pretrained("microsoft/kosmos-2-patch14-224")

# # Manually set missing attributes
# config.num_attention_heads = 16  # Set an appropriate default
# config.num_hidden_layers = 24   # Set based on the model architecture

# # Assign it back to the model
# model.config = config


base_model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

# Create whitebox model with image
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png"
model = VisualWhiteboxModel(base_model, processor, image_urls=[url])

# Test with input text
input_text = ["<grounding>An image of"]

estimator = MaximumTokenProbability()
uncertainty = estimate_uncertainty(model, estimator, input_text=input_text)
print("Uncertainty estimation:", uncertainty)

Keyword argument `return_dict` is not a valid argument for this processor and will be ignored.


Uncertainty estimation: UncertaintyOutput(uncertainty=array([-0.43950686, -0.55753523, -0.7364028 , -0.63372934, -1.        ,
       -0.96391106, -0.89820856, -0.9997953 , -0.34002206, -0.40260765,
       -0.31840694, -0.872435  , -0.85644513, -0.47168002, -0.9281716 ,
       -1.        , -0.08342137, -0.12618384, -0.99650866], dtype=float32), input_text=['<grounding>An image of'], generation_text='<phrase> A snowman</phrase><object><patch_index_0044><patch_index_0863></object> is sitting by<phrase> a campfire</phrase><object><patch_index_0005><patch_index_1007></object> in', generation_tokens=[64007, 95, 43867, 64008, 64009, 64057, 64876, 64010, 17, 1280, 32, 64007, 10, 30879, 64008, 64009, 64018, 65020, 64010], model_path=None, estimator='MaximumTokenProbability')


In [None]:
estimator = LexicalSimilarity('rougeL')
estimate_uncertainty(model, estimator, input_text=input_text)

In [None]:
estimator = SemanticEntropy()
estimate_uncertainty(model, estimator, input_text=input_text)

In [17]:
estimator = PointwiseMutualInformation()
estimate_uncertainty(model, estimator, input_text=input_text)

Exception: Cant find stat calculator for: greedy_lm_log_likelihoods. Maybe you forgot to register it in lm_polygraph.utils.register_stat_calculators.register_stat_calculators()?

In [18]:
estimator = Focus()
estimate_uncertainty(model, estimator, input_text=input_text)

NameError: name 'Focus' is not defined

In [22]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("microsoft/kosmos-2-patch14-224")
print(config)  # Check what attributes exist

Kosmos2Config {
  "_name_or_path": "microsoft/kosmos-2-patch14-224",
  "architectures": [
    "Kosmos2ForConditionalGeneration"
  ],
  "latent_query_num": 64,
  "model_type": "kosmos-2",
  "text_config": {
    "model_type": "kosmos_2_text_model",
    "no_repeat_ngram_size": 3
  },
  "torch_dtype": "float32",
  "transformers_version": "4.48.0",
  "vision_config": {
    "model_type": "kosmos_2_vision_model"
  }
}



In [23]:
config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf",)
print(config)  # Che

LlavaNextConfig {
  "_name_or_path": "llava-hf/llava-v1.6-mistral-7b-hf",
  "architectures": [
    "LlavaNextForConditionalGeneration"
  ],
  "ignore_index": -100,
  "image_grid_pinpoints": [
    [
      336,
      672
    ],
    [
      672,
      336
    ],
    [
      672,
      672
    ],
    [
      1008,
      336
    ],
    [
      336,
      1008
    ]
  ],
  "image_seq_length": 576,
  "image_token_index": 32000,
  "model_type": "llava_next",
  "multimodal_projector_bias": true,
  "projector_hidden_act": "gelu",
  "text_config": {
    "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
    "architectures": [
      "MistralForCausalLM"
    ],
    "intermediate_size": 14336,
    "max_position_embeddings": 32768,
    "model_type": "mistral",
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-05,
    "rope_theta": 1000000.0,
    "sliding_window": null,
    "torch_dtype": "bfloat16",
    "vocab_size": 32064
  },
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "tra

In [24]:
from transformers import AutoModel

model = AutoModel.from_pretrained("microsoft/kosmos-2-patch14-224")
print(model.config)

Kosmos2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "microsoft/kosmos-2-patch14-224",
  "architectures": [
    "Kosmos2ForConditionalGeneration"
  ],
  "latent_query_num": 64,
  "model_type": "kosmos-2",
  "text_config": {
    "_attn_implementation_autoset": true,
    "model_type": "kosmos_2_text_model",
    "no_repeat_ngram_size": 3
  },
  "torch_dtype": "float32",
  "transformers_version": "4.48.0",
  "vision_config": {
    "_attn_implementation_autoset": true,
    "model_type": "kosmos_2_vision_model"
  }
}

