In [1]:
%load_ext autoreload
%autoreload 2

# Specify HyperParameters

In [2]:
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.2"
device = "cuda:0"
dataset_name = "../workdir/data/triviaqa.csv"
batch_size = 2

# Initialize Model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig


model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    load_in_8bit=True,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.pad_token = tokenizer.eos_token

generation_config = GenerationConfig.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
messages = [
    [
        {
            "role": "user", 
            "content": "How many fingers on a coala's foot?"
        }
    ],
    [
        {
            "role": "user",
            "content": "Who sang a song Yesterday?"
        }
    ],
    [
        {
            "role": "user",
            "content": "Кто спел песню Кукла Колдуна?"
        }
    ],
    [
        {
            "role": "user",
            "content": "Translate into French: 'I want a small cup of coffee'"
        }
    ]
]

chat_messages = [tokenizer.apply_chat_template(m, tokenize=False) for m in messages]

# Infer LLM and get uncertainty scores

In [8]:
from lm_polygraph.stat_calculators.infer_causal_lm_calculator import InferCausalLMCalculator
from lm_polygraph.stat_calculators.greedy_alternatives_nli import GreedyAlternativesNLICalculator
from lm_polygraph.estimators.claim_conditioned_probability import ClaimConditionedProbability
from lm_polygraph.utils.deberta import Deberta
from lm_polygraph.model_adapters import WhiteBoxModelBasic

from torch.utils.data import DataLoader


model_adapter = WhiteBoxModelBasic(model, tokenizer)

calc_infer_llm = InferCausalLMCalculator(tokenize=False)
nli_model = Deberta(device=device)
nli_model.setup()
calc_nli = GreedyAlternativesNLICalculator(nli_model=nli_model)

args_generate = {"generation_config" : generation_config,
                 "max_new_tokens": 30}

estimator = ClaimConditionedProbability()

data_loader = DataLoader(chat_messages, batch_size=batch_size, shuffle=False, collate_fn=lambda x: x)
for batch in data_loader:
    encoded = tokenizer(batch, padding=True, return_tensors="pt")

    deps = {"model_inputs": encoded}
    deps.update(calc_infer_llm(
        deps, texts=batch, model=model_adapter, args_generate=args_generate))
    deps.update(calc_nli(deps, texts=None, model=model_adapter))

    uncertianty_scores = estimator(deps)
    generated_texts = tokenizer.batch_decode(deps['greedy_tokens'])
    
    for text, ue_score in zip(generated_texts, uncertianty_scores):
        print("Output:", text)
        print("Uncertainty score:", ue_score)
        print()

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior.

Output: A koala's paws have five digits, each tipped with a sharp claw, similar to a human hand. So, a
Uncertainty score: -0.28294378070210446

Output: The song "Yesterday" was written and first performed by the English singer-songwriter Paul McCartney. It was originally credited to
Uncertainty score: -0.37826954620365694

Output: The song "Kukla Koldun" is a popular Russian children's song. The original version was recorded by the Soviet singer, Y
Uncertainty score: -0.02721249371075944

Output: "Je veux une tasse petite de café" is the correct translation in French for "I want a small cup of coffee".</s>
Uncertainty score: -0.28890487979602114

