In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are re-imported before each cell runs and local edits to library code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Specify Hyperparameters

In [None]:
# Model identifier handed to the vLLM engine (`LLM(model=...)`) below.
model_name_or_path = "meta-llama/Llama-3.1-8B-Instruct"
# Device string passed to the lm_polygraph model adapter and to the DeBERTa NLI model.
# NOTE(review): the model cell restricts CUDA_VISIBLE_DEVICES to a single GPU, so
# "cuda:0" presumably refers to that one visible device - confirm on your machine.
device = "cuda:0"
# NOTE(review): not referenced by any cell visible in this notebook (the prompts
# are hard-coded further down); kept in case other code relies on it.
dataset_name = "../workdir/data/triviaqa.csv"
# Number of prompts per batch produced by the DataLoader in the evaluation loop.
batch_size = 2

# Initialize Model

In [None]:
import os

# Pin this process to a single physical GPU. CUDA_VISIBLE_DEVICES is only
# honoured if it is set before CUDA is initialised, so it is assigned *before*
# vllm (and, through it, torch) is imported; that way the setting takes effect
# no matter what the import itself does.
# NOTE: the index "2" is machine-specific - adjust it to a GPU that exists on
# your host.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

from vllm import LLM, SamplingParams

# Let vLLM reserve only half of the GPU memory.
# NOTE(review): presumably this leaves room for the auxiliary models loaded on
# the same device later in the notebook - confirm.
llm = LLM(model=model_name_or_path, gpu_memory_utilization=0.5)

# Generate at most 30 new tokens per prompt and request the 20 most likely
# token log-probabilities at every generated position.
sampling_params = SamplingParams(max_tokens=30, logprobs=20)

In [None]:
# Demo conversations: each entry is a single-turn chat (one user message).
messages = [
    [
        {
            "role": "user",
            "content": "How many fingers on a coala's foot?"
        }
    ],
    [
        {
            "role": "user",
            "content": "Who sang a song Yesterday?"
        }
    ],
    [
        {
            "role": "user",
            "content": "Кто спел песню Кукла Колдуна?"
        }
    ],
    [
        {
            "role": "user",
            "content": "Translate into French: 'I want a small cup of coffee'"
        }
    ]
]

tokenizer = llm.get_tokenizer()

# Render every conversation to a prompt string with the model's chat template.
# `add_generation_prompt=True` appends the header that opens the assistant
# turn; without it the prompt stops right after the user message and the model
# is not cued to answer as the assistant.
# NOTE(review): the rendered string may already contain the BOS token - confirm
# that the downstream tokenization does not prepend a second one.
chat_messages = [
    tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    for conversation in messages
]

# Run LLM inference and compute uncertainty scores

In [None]:
from lm_polygraph.model_adapters import WhiteboxModelvLLM
from lm_polygraph.stat_calculators.greedy_alternatives_nli import GreedyAlternativesNLICalculator
from lm_polygraph.stat_calculators.cross_encoder_similarity import CrossEncoderSimilarityMatrixCalculator
from lm_polygraph.stat_calculators.semantic_matrix import SemanticMatrixCalculator
from lm_polygraph.stat_calculators.semantic_classes import SemanticClassesCalculator
from lm_polygraph.stat_calculators.greedy_probs import GreedyProbsCalculator
from lm_polygraph.stat_calculators.sample import SamplingGenerationCalculator

from lm_polygraph.estimators import MaximumSequenceProbability, ClaimConditionedProbability, DegMat, SemanticEntropy, SAR

from lm_polygraph.utils.deberta import Deberta

from torch.utils.data import DataLoader

# Wrap the vLLM engine and its sampling settings in the lm_polygraph adapter.
model_adapter = WhiteboxModelvLLM(llm, sampling_params, device=device)

# Calculator that is invoked first in the evaluation loop.
calc_infer_llm = GreedyProbsCalculator()

# A single DeBERTa NLI model instance, set up once and shared by the two
# NLI-based calculators (`calc_nli` and `calc_semantic_matrix`).
nli_model = Deberta(device=device)
nli_model.setup()
calc_nli = GreedyAlternativesNLICalculator(nli_model=nli_model)

# Remaining calculators; the evaluation loop calls them in this order.
calc_samples = SamplingGenerationCalculator()
calc_cross_encoder = CrossEncoderSimilarityMatrixCalculator()
calc_semantic_matrix = SemanticMatrixCalculator(nli_model=nli_model)
calc_semantic_classes = SemanticClassesCalculator()

# Uncertainty estimators, instantiated with their default arguments. Each one
# is later called on the collected statistics and labelled by its str() form.
estimators = [
    estimator_cls()
    for estimator_cls in (
        MaximumSequenceProbability,
        ClaimConditionedProbability,
        DegMat,
        SemanticEntropy,
        SAR,
    )
]

In [None]:
# Statistic calculators in the order they must run: each one receives the
# statistics gathered so far and contributes new entries to them.
calculators = (
    calc_infer_llm,
    calc_nli,
    calc_samples,
    calc_cross_encoder,
    calc_semantic_matrix,
    calc_semantic_classes,
)

# The identity collate function keeps every batch a plain list of prompt strings.
data_loader = DataLoader(
    chat_messages,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=lambda x: x,
)

for batch in data_loader:
    # Accumulate all statistics for this batch in one dictionary.
    deps = {"input_texts": batch}
    for calculator in calculators:
        deps.update(calculator(deps, texts=batch, model=model_adapter))

    # Turn the greedy token ids back into text for display.
    generated_texts = tokenizer.batch_decode(deps['greedy_tokens'])

    # One (estimator label, per-sample scores) pair per estimator.
    ues = []
    for estimator in estimators:
        batch_scores = estimator(deps)
        ues.append((str(estimator), batch_scores))

    # Print each generation followed by its score under every estimator.
    for sample_idx, output_text in enumerate(generated_texts):
        print("Output:", output_text)
        for estimator_label, batch_scores in ues:
            print(f"Uncertainty score by {estimator_label}: {batch_scores[sample_idx]}")
        print()