In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so edits made to
# imported modules are picked up on the next cell run without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Expose only GPU 0 to this process. This runs before the cells below import
# transformers and move the model to "cuda".
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Standard LLM inference (without uncertainty estimation)

In [3]:
# Baseline LLM inference without uncertainty estimation.
# Loads the model and tokenizer, renders a single user prompt through the chat
# template, samples up to 200 new tokens, and prints the decoded sequence.

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"

model_name = "meta-llama/Llama-3.1-8B-Instruct"
llm = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.to(device)

# Example prompt
prompt = "Write a short story about a robot learning to paint.\n"

chat = [{"role": "user", "content": prompt}]
# Render the conversation to a plain string (tokenize=False) that ends with the
# assistant generation header.
prompt = tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
# The rendered template already contains the special tokens the model expects,
# so the tokenizer must not add them again (otherwise the input starts with a
# duplicated beginning-of-text token).
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)

output = llm.generate(
    **inputs,  # Unpack the tokenized inputs (input_ids, attention_mask)
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True,
    return_dict_in_generate=True,  # Needed to access `output.sequences` below
)

# `output.sequences[0]` is decoded in full; special tokens are stripped from
# the printed text.
print("LLM output:")
print(tokenizer.decode(output.sequences[0], skip_special_tokens=True))

[2025-10-25 20:32:52,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/artemshelmanov/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/artemshelmanov/conda/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


LLM output:
system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Write a short story about a robot learning to paint.assistant

**The Birth of Art**

In a small, dimly lit workshop, a brilliant inventor, Dr. Emma Taylor, had been working on a top-secret project for months. Her latest creation, a sleek and advanced robot named Nova, stood before her, its metallic body gleaming under the soft glow of the workshop lights. Nova was designed to learn and adapt at an exponential rate, and Dr. Taylor had a specific goal in mind for her creation.

"Today, Nova," she said, her voice filled with excitement, "we're going to learn something new. Something beautiful."

Dr. Taylor led Nova to a large, blank canvas, set up a easel, and handed the robot a paintbrush. Nova's advanced sensors and algorithms quickly analyzed the brush, the canvas, and the colors available. It was a blank slate, ready to absorb knowledge.

"Begin by painting a simple line," Dr. Taylor instructed. "

# LLM inference with uncertainty estimation

In [4]:
# LLM inference with uncertainty estimation.
# Same pipeline as the baseline cell, but the model is wrapped so that
# `generate` also returns an uncertainty score for the produced sequence.

from transformers import AutoModelForCausalLM, AutoTokenizer

# ============== Additional imports ===============
from lm_polygraph.estimators import MeanTokenEntropy
from lm_polygraph.stat_calculators import InferCausalLMCalculator, EntropyCalculator
from lm_polygraph.utils.causal_lm_with_uncertainty import CausalLMWithUncertainty
# =================================================


device = "cuda"

# Loading standard LLM
model_name = "meta-llama/Llama-3.1-8B-Instruct"
llm = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.to(device)

# ======= Wrapping LLM with uncertainty estimator =========
# tokenize=False: the inputs handed to `generate` below are already tokenized.
# NOTE(review): presumably EntropyCalculator consumes the statistics produced by
# InferCausalLMCalculator, so the list order matters; confirm against lm_polygraph.
stat_calculators = [InferCausalLMCalculator(tokenize=False),
                    EntropyCalculator()]
estimator = MeanTokenEntropy()
llm_with_uncertainty = CausalLMWithUncertainty(llm, tokenizer, stat_calculators, estimator)
# =========================================================

# Example prompt
prompts = ["Write a short story about a robot learning to paint.\n"]

chats = [[{"role": "user", "content": prompt}] for prompt in prompts]
# Render each conversation to a plain string (tokenize=False) that ends with the
# assistant generation header.
chat_prompts = tokenizer.apply_chat_template(chats, add_generation_prompt=True, tokenize=False)
# The rendered templates already contain the special tokens the model expects,
# so the tokenizer must not add them again (otherwise each input starts with a
# duplicated beginning-of-text token).
# NOTE: no padding is requested, so this call only builds a tensor when all
# prompts tokenize to the same length (e.g. a single prompt, as here).
inputs = tokenizer(chat_prompts, return_tensors="pt", add_special_tokens=False).to(device)

output = llm_with_uncertainty.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True
)

print("LLM output:")
print(tokenizer.decode(output.sequences[0], skip_special_tokens=True))

# ================ Printing uncertainty score ================
print("Uncertainty score: ", output.uncertainty_score)
# =============================================================

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


LLM output:
system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Write a short story about a robot learning to paint.assistant

In a small, dimly lit studio, a robot named Zeta sat in front of a canvas, its metal body swaying slightly as it adjusted its position. The studio was quiet, except for the soft hum of Zeta's systems and the faint sound of a clock ticking in the background.

Zeta's creator, a brilliant scientist named Dr. Rachel, stood beside the robot, watching with a mixture of excitement and trepidation. She had designed Zeta to learn and adapt, and painting was the next step in its development.

"Okay, Zeta," Dr. Rachel said, her voice calm and soothing. "Today, we're going to try painting. I want you to create a beautiful landscape, with rolling hills and a bright blue sky."

Zeta's advanced eyes scanned the canvas, taking in the blank white space. It whirred softly as it processed the task, its neural networks analyzing the concept of painting and