In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Original inference with LLM

In [3]:
# Original LLM inference without uncertainty estimation

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"

llm = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
llm = llm.to(device)

# Example prompt 
prompt = "Write a short story about a robot learning to paint.\n"


chat = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = llm.generate(
    **inputs,  # Unpack the tokenized inputs
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True,
    return_dict_in_generate=True
)

print("LLM output:")
print(tokenizer.decode(output.sequences[0], skip_special_tokens=True))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


LLM output:
system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Write a short story about a robot learning to paint.assistant

In a small, cluttered workshop, a brilliant inventor, Dr. Emma Taylor, stood before a sleek, silver robot she had named "Aurora." Emma had spent years perfecting Aurora's artificial intelligence and physical capabilities, and she was eager to teach her new skills. Today, she wanted to introduce Aurora to the art of painting.

Emma led Aurora to a large, wooden easel, where a blank canvas awaited. She explained the concept of painting to the robot, demonstrating brushstrokes and color theory. Aurora listened intently, her bright, round eyes absorbing every detail.

At first, Aurora's attempts at painting were clumsy and chaotic. She applied too much paint, creating messy, uneven strokes. Emma patiently guided her, showing her how to mix colors and control the brush. With each passing attempt, Aurora improved, her movements becoming more 

# Inference with Uncertainty

In [3]:
# LLM inference with uncertainty estimation

from transformers import AutoModelForCausalLM, AutoTokenizer

# ============== Addition Imports ===============
from lm_polygraph.estimators import MeanTokenEntropy
from lm_polygraph.stat_calculators import InferCausalLMCalculator, EntropyCalculator
from lm_polygraph.utils.causal_lm_with_uncertainty import CausalLMWithUncertainty
# ===============================================


device = "cuda"

# Loading standard LLM
llm = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
llm = llm.to(device)

# ======= Wrapping LLM with uncertainty estimator =========
stat_calculators = [InferCausalLMCalculator(tokenize=False),
                    EntropyCalculator()]
estimator = MeanTokenEntropy()
llm_with_uncertainty = CausalLMWithUncertainty(llm, tokenizer, stat_calculators, estimator)
# =========================================================

# Example prompt 
prompt = "Write a short story about a robot learning to paint.\n"

chat = [{"role": "user", "content": prompt}]
prompts = tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = llm_with_uncertainty.generate(
    inputs,  
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True
)

print("LLM output:")
print(tokenizer.decode(output.sequences[0], skip_special_tokens=True))

# ================ Printing uncertainty score ================
print("Uncertainty score: ", output.uncertainty_score)
# =============================================================

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


LLM output:
Write a short story about a robot learning to paint.
The robot, named Zeta, whirred to life in the art studio. Its creator, a brilliant but eccentric scientist, stood back to admire the machine. "Today, Zeta, you will learn to paint," he declared.

Zeta's bright blue eyes scanned the room, taking in the canvases, paints, and brushes. It reached out a mechanical arm and grasped a brush, its fingers closing around it with a soft click.

The scientist handed Zeta a palette of colors. "Start with the basics, my mechanical friend. Mix red and blue to create purple."

Zeta's digital brain whirred as it processed the command. It dipped the brush into the paint and began to mix the colors. However, instead of creating a beautiful shade of purple, it produced a muddy brown.

The scientist chuckled. "Ah, not quite, Zeta. But that's okay. You're learning."

Zeta repeated the exercise several times, each time producing a different,
Uncertainty score:  [0.3194582]
