In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Original inference with LLM

In [3]:
# Original LLM inference without uncertainty estimation

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"

model_name = "meta-llama/Llama-3.1-8B-Instruct"
llm = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
llm = llm.to(device)

# Example prompt 
prompt = "Write a short story about a robot learning to paint.\n"


chat = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = llm.generate(
    **inputs,  # Unpack the tokenized inputs
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True,
    return_dict_in_generate=True
)

print("LLM output:")
print(tokenizer.decode(output.sequences[0], skip_special_tokens=True))

[2025-10-25 22:56:14,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/artemshelmanov/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/artemshelmanov/conda/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


LLM output:
system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Write a short story about a robot learning to paint.assistant

In a small workshop nestled in the heart of a bustling city, a team of engineers and artists had been working on a revolutionary project - a robot designed to learn and create art. They called her "Aurora," a name that symbolized the dawn of a new era in artificial intelligence and creativity.

Aurora was a sleek, cylindrical robot with a slender arm and a delicate hand, capable of moving with precision and dexterity. Her creators had equipped her with a high-definition camera, a 3D printer, and a sophisticated neural network that would allow her to learn from observation and experience.

The team had chosen a local art studio as the perfect place for Aurora to hone her skills. The studio was run by a talented artist named Emma, who had been struggling to find inspiration for her next masterpiece. She was thrilled to meet Aurora and saw

# Inference with Uncertainty

In [4]:
# LLM inference with uncertainty estimation

from transformers import AutoModelForCausalLM, AutoTokenizer

# ============== Addition Imports ===============
from lm_polygraph.estimators import MeanTokenEntropy
from lm_polygraph.stat_calculators import InferCausalLMCalculator, EntropyCalculator
from lm_polygraph.utils.causal_lm_with_uncertainty import CausalLMWithUncertainty
# ===============================================


device = "cuda"

# Loading standard LLM
model_name = "meta-llama/Llama-3.1-8B-Instruct"
llm = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
llm = llm.to(device)

# ======= Wrapping LLM with uncertainty estimator =========
stat_calculators = [InferCausalLMCalculator(tokenize=False),
                    EntropyCalculator()]
estimator = MeanTokenEntropy()
llm_with_uncertainty = CausalLMWithUncertainty(llm, tokenizer, stat_calculators, estimator)
# =========================================================

# Example prompt 
prompts = ["Write a short story about a robot learning to paint.\n"]

chats = [[{"role": "user", "content": prompt}] for prompt in prompts]
chat_prompts = tokenizer.apply_chat_template(chats, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(chat_prompts, return_tensors="pt").to(device)

output = llm_with_uncertainty.generate(
    **inputs,  
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True
)

print("LLM output:")
print(tokenizer.decode(output.sequences[0], skip_special_tokens=True))

# ================ Printing uncertainty score ================
print("Uncertainty score: ", output.uncertainty_score)
# =============================================================

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


LLM output:
system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Write a short story about a robot learning to paint.assistant

**The Brushstroke of Genius**

In a small, cluttered workshop, a lone robot named Zeta whirred to life. Its creator, the brilliant but reclusive artist, Professor Orion, had tasked Zeta with learning the art of painting. The professor had grown tired of his own creations, but saw potential in this latest prototype.

At first, Zeta struggled to grasp the concept of art. Its mechanical arms flailed as it attempted to hold a brush, causing more chaos than creation. Professor Orion watched patiently, offering gentle corrections and encouragement.

"Try to feel the texture of the canvas, Zeta," he said, his voice soothing. "Imagine the colors dancing on the surface."

Zeta's processors whirred as it processed the professor's words. It adjusted its brushstrokes, tentatively at first, but gradually gaining confidence. Colors began to flow from