## Install Dependencies

In [3]:
# Assume that you have installed lm-polygraph:
# pip install git+https://github.com/artemshelmanov/lm-polygraph.git

# Download the small English spaCy pipeline (en_core_web_sm) using the `python` found on the shell PATH.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m115.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Basic Imports

In [3]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [11]:
%load_ext autoreload
%autoreload 2

from transformers import AutoModelForCausalLM, AutoTokenizer, Gemma3ForCausalLM
from lm_polygraph.utils.model import WhiteboxModel, BlackboxModel
from lm_polygraph import estimate_uncertainty
from lm_polygraph.estimators import MaximumTokenProbability, MaximumSequenceProbability, SemanticEntropy, EigValLaplacian

  from .autonotebook import tqdm as notebook_tqdm


## UQ for Whitebox LLMs

### Initialize model

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import HfApi

# Read key/value pairs from a .env file into the process environment.
load_dotenv()

# Hugging Face access tokens; each one is None when its variable is not set.
mytoken, mytoken2 = (os.getenv(var_name) for var_name in ("HF_TOKEN", "HF_TOKEN2"))

# Example: Use the token with Hugging Face Hub API
# api = HfApi(token=mytoken)

  from .autonotebook import tqdm as notebook_tqdm


# Final Formatting

In [86]:
import pandas as pd

# Location of the GSM8K parquet shards on the Hugging Face Hub.
base_path = "hf://datasets/openai/gsm8k/"
splits = {'train': 'main/train-00000-of-00001.parquet', 'test': 'main/test-00000-of-00001.parquet'}

# Load each split and tag every row with the split it came from.
train_df = pd.read_parquet(base_path + splits["train"]).assign(split="train")
test_df = pd.read_parquet(base_path + splits["test"]).assign(split="test")

# Keep only the text that follows the '####' marker in each reference solution.
for frame in (train_df, test_df):
    frame['answer'] = frame['answer'].str.split('####').str.get(1).str.strip()

# Preview the result
first_test_answer = test_df['answer'].iloc[0]
print('answer:', first_test_answer)

answer: 18


In [None]:
# Direct-answer template: the question is wrapped in instructions asking for a bare integer.
short = "Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\n{question}\nI repeat, the answer should only consist of the final numerical value (int), no other symbols, only one integer."

# This cell rewrites train_df/test_df in place and removes their 'split' column, so a
# missing 'split' column means a formatting cell already ran on these frames. Fail here,
# before the questions get wrapped a second time, instead of after (as drop() used to).
for frame_name, frame in (("train_df", train_df), ("test_df", test_df)):
    if 'split' not in frame.columns:
        raise KeyError(
            f"{frame_name} has no 'split' column - it looks already formatted; "
            "re-run the loading cell before this one"
        )

train_df['question'] = train_df['question'].apply(lambda q: short.replace("{question}", q))
test_df['question'] = test_df['question'].apply(lambda q: short.replace("{question}", q))

test_df = test_df.drop('split', axis=1)
train_df = train_df.drop('split', axis=1)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs('gsm8k-direct', exist_ok=True)
train_df.to_csv('gsm8k-direct/train.csv', index=False)
test_df.to_csv('gsm8k-direct/test.csv', index=False)

In [None]:
# Chain-of-thought template: asks for step-by-step reasoning followed by a '### Answer:' line.
cot = "Answer the following question, by explaining your reasoning step-by-step in a single paragraph to determine the correct answer for the following question. After your reasoning, state the single correct numerical value (int), no other symbols, only one integer for the answer on a new line, prefixed with '### Answer:'.\n{question}"

# This cell rewrites train_df/test_df in place and removes their 'split' column, so a
# missing 'split' column means a formatting cell already ran on these frames and the
# questions are no longer raw. Fail here, before nesting one prompt inside another,
# instead of after the questions were modified (as drop() used to).
for frame_name, frame in (("train_df", train_df), ("test_df", test_df)):
    if 'split' not in frame.columns:
        raise KeyError(
            f"{frame_name} has no 'split' column - it looks already formatted; "
            "re-run the loading cell before this one"
        )

train_df['question'] = train_df['question'].apply(lambda q: cot.replace("{question}", q))
test_df['question'] = test_df['question'].apply(lambda q: cot.replace("{question}", q))

test_df = test_df.drop('split', axis=1)
train_df = train_df.drop('split', axis=1)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs('gsm8k-reasoning', exist_ok=True)
train_df.to_csv('gsm8k-reasoning/train.csv', index=False)
test_df.to_csv('gsm8k-reasoning/test.csv', index=False)

In [72]:
# Display the first entry of the training 'question' column as a spot check.
train_df['question'].iloc[0]

'Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nI repeat, the answer should only consist of the final numerical value (int), no other symbols, only one integer.'

In [None]:
api = HfApi(token=mytoken2)

# (CSV path, target dataset repo) pairs in upload order. Each file keeps the same
# relative path inside the repository as it has locally.
uploads = [
    ("gsm8k-direct/test.csv", "UGRIP-LM-Polygraph/gsm8k-direct"),
    ("gsm8k-direct/train.csv", "UGRIP-LM-Polygraph/gsm8k-direct"),
    ("gsm8k-reasoning/test.csv", "UGRIP-LM-Polygraph/gsm8k-reasoning"),
    ("gsm8k-reasoning/train.csv", "UGRIP-LM-Polygraph/gsm8k-reasoning"),
]

commit_infos = [
    api.upload_file(
        path_or_fileobj=csv_path,
        path_in_repo=csv_path,
        repo_id=repo_id,
        repo_type="dataset",
    )
    for csv_path, repo_id in uploads
]

# Leave the result of the final upload as the cell's displayed value.
commit_infos[-1]

CommitInfo(commit_url='https://huggingface.co/datasets/UGRIP-LM-Polygraph/gsm8k-direct/commit/b464547c2f26ff12a64618f16bb641b5f30319dc', commit_message='Upload gsm8k-direct/test.csv with huggingface_hub', commit_description='', oid='b464547c2f26ff12a64618f16bb641b5f30319dc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/UGRIP-LM-Polygraph/gsm8k-direct', endpoint='https://huggingface.co', repo_type='dataset', repo_id='UGRIP-LM-Polygraph/gsm8k-direct'), pr_revision=None, pr_num=None)

In [27]:
# Print the first five entries of the 'question' column, each followed by a blank line.
for row in range(5):
    print(train_df['question'].iloc[row], end="\n\n")

Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
I repeat, the answer should only consist of the final numerical value (int), no other symbols, only one integer.

Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
I repeat, the answer should only consist of the final numerical value (int), no other symbols, only one integer.

Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpo

### openai/gsm8k dataset

In [None]:
import pandas as pd

base_path = "hf://datasets/openai/gsm8k/"
splits = {'train': 'main/train-00000-of-00001.parquet', 'test': 'main/test-00000-of-00001.parquet'}

# Read every split (train first, then test) and record its name in a 'split' column.
frames_by_split = {
    split_name: pd.read_parquet(base_path + shard).assign(split=split_name)
    for split_name, shard in splits.items()
}
train_df, test_df = frames_by_split["train"], frames_by_split["test"]

# Stack train on top of test with a fresh 0..n-1 index.
gsm8k_df = pd.concat([train_df, test_df], ignore_index=True)

In [4]:
# Display the combined train + test frame.
gsm8k_df

Unnamed: 0,question,answer,split
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...,train
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...,train
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<...",train
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....,train
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...,train
...,...,...,...
8787,John had a son James when he was 19. James is...,Dora is 12-3=<<12-3=9>>9\nSo James is 9*2=<<9*...,test
8788,There are some oranges in a basket. Ana spends...,There are 60 minutes in an hour. Ana peels an ...,test
8789,Mark's car breaks down and he needs to get a n...,The discount on the radiator was 400*.8=$<<400...,test
8790,"Farmer Brown has 20 animals on his farm, all e...",Let C be the number of chickens.\nThere are 20...,test


#### Create column 'short_answer' with answers without reasoning

In [5]:
# The short answer is whatever follows the '####' marker in the reference solution.
answer_parts = gsm8k_df['answer'].str.split('####')
gsm8k_df['short_answer'] = answer_parts.str.get(1).str.strip()

# Preview the result
first_row = gsm8k_df.iloc[0]
print('answer:', first_row['answer'])
print('short_answer:', first_row['short_answer'])

answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72
short_answer: 72


#### Create column 'cot_question' for reasoning

In [None]:

# Checkpoint to evaluate; the commented names are the alternatives used in this study.
model_name = 'google/gemma-3-12b-it'
# 'google/gemma-3-12b-it'
# 'deepseek-ai/deepseek-llm-7b-chat'
# 'meta-llama/Llama-3.1-8B-Instruct'
# 'microsoft/phi-4'
# 'Qwen/Qwen2.5-7B-Instruct'

base_model = AutoModelForCausalLM.from_pretrained( # Gemma3ForCausalLM.from_pretrained(
    model_name,
    token=mytoken,
    device_map='auto' # 'cpu',
)
# Authenticate the tokenizer download with the same token as the model download, so an
# access-restricted repository does not depend on ambient credentials for this call.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=mytoken)
tokenizer.pad_token = tokenizer.eos_token  # Use EOS as PAD, whichever checkpoint is selected above

# Wrap model + tokenizer in lm-polygraph's whitebox interface used by estimate_uncertainty.
model = WhiteboxModel(base_model, tokenizer)

Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.18it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


#### Create column 'short_question' without reasoning

In [13]:
# Candidate direct-answer templates; "{question}" is substituted per question.
prompts = [
    "Answer this question strictly with the final numerical value. No words, steps, or additional text allowed.\n{question}",
    "Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\n{question}",
    "Answer the following question. Provide only the final numerical answer. Do not show any steps, explanations, or reasoning.\n{question}\nStricktly follow the below answer format:\nvalue",
    "Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\n{question}\nI repeat, the answer should only consist of the final numerical value. Do not include any calculations or reasoning.",
    "Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\n{question}\nI repeat, the answer should only consist of the final numerical value (int), no other symbols, only one integer.",
]

estimator = MaximumSequenceProbability()
results = []

# Run every template on the first 20 questions of gsm8k_df.
for i, template in enumerate(prompts):
    for j in range(20):
        prompt_text = template.replace("{question}", gsm8k_df['question'].iloc[j])

        # Generate an answer and score it with the chosen estimator.
        ans = estimate_uncertainty(model, estimator, input_text=prompt_text)

        results.append(dict(
            prompt_index=i,
            question_index=j,
            input_text=ans.input_text,
            generation_text=ans.generation_text,
            ground_truth=gsm8k_df['short_answer'].iloc[j],
            all=ans,
        ))

        # Progress log: header, prompt, generation, then a blank line.
        print(f"Prompt {i}, Question {j}", ans.input_text, ans.generation_text, "", sep="\n")


# Convert new results to DataFrame
results_df = pd.DataFrame(results)

# One results file per model, named after the part of model_name that follows the "/".
results_path = f'gsm8k_{model_name.split("/")[1]}.csv'

try:
    # Load existing results
    dftemp = pd.read_csv(results_path)
except FileNotFoundError:
    # First run for this model: nothing to append to.
    combined_df = results_df
else:
    # Append new results
    combined_df = pd.concat([dftemp, results_df], ignore_index=True)

# Only a missing file is treated as "start fresh". Any other failure now propagates
# instead of silently replacing the accumulated results with just the new rows.
combined_df.to_csv(results_path, index=False)


Unsupported: Logger not supported for non-export cases. To avoid graph breaks caused by logger in compile-mode, it is recommended to disable logging by adding logging methods to config.ignore_logger_methods

from user code:
   File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/transformers/models/gemma3/modeling_gemma3.py", line 864, in forward
    outputs: BaseModelOutputWithPast = self.model(
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/transformers/models/gemma3/modeling_gemma3.py", line 654, in forward
    layer_outputs = decoder_layer(
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
    return func(*args, **kwargs)
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/transformers/models/gemma3/modeling_gemma3.py", line 463, in forward
    hidden_states, self_attn_weights = self.self_attn(
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/arina.kostina/miniconda3/envs/reasoning_uq/lib/python3.10/site-packages/transformers/models/gemma3/modeling_gemma3.py", line 370, in forward
    logger.warning_once(

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Model name (also used further down in this cell to build the results file name)
model_name = "google/gemma-3-12b-it"

# Load tokenizer and model
# (left disabled: the pipeline below reuses the `model` and `tokenizer` already in the kernel)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",       # Automatically put model on available GPUs
#     torch_dtype="auto"       # Optional: use bf16/fp16 if supported
# )

# Create inference pipeline
# NOTE(review): the only active assignment to `model` earlier in this notebook wraps the
# Hugging Face model in WhiteboxModel; confirm that is what should be passed here, or
# whether the raw `base_model` was intended.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# # Prompt (use instruction-style prompting)
# prompt = "Write a short story about a robot who learns to paint."

# # Generate output
# outputs = generator(prompt, max_new_tokens=500, do_sample=True, temperature=0.7)

# # Print result
# print(outputs[0]["generated_text"])




# prompts = [
#     "Answer the following question. Explain your reasoning step by step. After your reasoning, state the single correct numerical value (int), no other symbols, only one integer for the answer on a new line, prefixed with '### Answer:'.\n{question}",
#     "Answer the following question, by explaining your reasoning step-by-step in a single paragraph to determine the correct answer for the following question. After your reasoning, state the single correct numerical value (int), no other symbols, only one integer for the answer on a new line, prefixed with '### Answer:'.\n{question}",
# ]

# Candidate direct-answer templates; "{question}" is substituted per question.
prompts = [
    "Answer this question strictly with the final numerical value. No words, steps, or additional text allowed.\n{question}",
    "Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\n{question}",
    "Answer the following question. Provide only the final numerical answer. Do not show any steps, explanations, or reasoning.\n{question}\nStricktly follow the below answer format:\nvalue",
    "Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\n{question}\nI repeat, the answer should only consist of the final numerical value. Do not include any calculations or reasoning.",
    "Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\n{question}\nI repeat, the answer should only consist of the final numerical value (int), no other symbols, only one integer.",
]

# estimator = MaximumSequenceProbability()
results = []

# Only templates 2..4 are run here, each on the first 20 questions of gsm8k_df.
for i, template in enumerate(prompts[2:], start=2):
    for j in range(20):
        prompt_text = template.replace("{question}", gsm8k_df['question'].iloc[j])

        # Sample a completion from the text-generation pipeline.
        outputs = generator(prompt_text, max_new_tokens=500, do_sample=True, temperature=0.7)
        generated_text = outputs[0]["generated_text"]

        results.append(dict(
            prompt_index=i,
            question_index=j,
            input_text=prompt_text,
            generation_text=generated_text,
            ground_truth=gsm8k_df['short_answer'].iloc[j],
            all=outputs,
        ))

        # Progress log: header, prompt, generated text, raw pipeline output, blank line.
        print(f"Prompt {i}, Question {j}", prompt_text, generated_text, outputs, "", sep="\n")


# Convert new results to DataFrame
results_df = pd.DataFrame(results)

# Pipeline runs are stored separately from the estimate_uncertainty runs ("_2" suffix).
results_path = f'gsm8k_{model_name.split("/")[1]}_2.csv'

try:
    # Load existing results
    dftemp = pd.read_csv(results_path)
except FileNotFoundError:
    # First run for this model: nothing to append to.
    combined_df = results_df
else:
    # Append new results
    combined_df = pd.concat([dftemp, results_df], ignore_index=True)

# Only a missing file is treated as "start fresh". Any other failure now propagates
# instead of silently replacing the accumulated results with just the new rows.
combined_df.to_csv(results_path, index=False)


Device set to use cuda:0


Prompt 2, Question 0
Answer the following question. Provide only the final numerical answer. Do not show any steps, explanations, or reasoning.
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Stricktly follow the below answer format:
value
Answer the following question. Provide only the final numerical answer. Do not show any steps, explanations, or reasoning.
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Stricktly follow the below answer format:
value
```
120
```
[{'generated_text': 'Answer the following question. Provide only the final numerical answer. Do not show any steps, explanations, or reasoning.\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nStricktly f

In [16]:
# Load the chain-of-thought results for the current model and capture the text that
# follows the '### Answer:' marker in each generation (NaN when the marker is absent).
dftemp = pd.read_csv(f'gsm8k_{model_name.split("/")[1]}_cot.csv')
dftemp['extracted_answer'] = dftemp['generation_text'].str.extract(r'### Answer:\s*(.*)', expand=False)

print(f'gsm8k_{model_name.split("/")[1]}_cot.csv\n')
# count() skips NaN, so this is the number of generations per prompt in which the
# marker was found - a count of rows, not a length in characters.
non_null_counts = dftemp.groupby("prompt_index")['extracted_answer'].count()

print("Generations with an extracted '### Answer:' per Prompt:")
for i, n_extracted in non_null_counts.items():
    print(f"Prompt {i}: {n_extracted} answers")

gsm8k_gemma-3-12b-it_cot.csv

Average Output Length (Characters) per Prompt:
Prompt 0: 20.00 characters
Prompt 1: 20.00 characters


In [9]:
import pandas as pd
# Choose exactly one results file to analyse. Previously the Llama file was also read and
# then immediately overwritten, which added a useless read and a failure when it is absent.
# dftemp = pd.read_csv(f'gsm8k_{model_name.split("/")[1]}.csv')
# dftemp = pd.read_csv('gsm8k_Llama-3.1-8B-Instruct.csv')
dftemp = pd.read_csv('gsm8k_gemma-3-12b-it_2.csv')


def extract_output(generation, input_text):
    """Return the part of ``generation`` that follows the echoed prompt.

    Args:
        generation: Full generated text, possibly starting with (or containing) the prompt.
            Non-string values, such as the NaN pandas produces for an empty CSV cell,
            are treated as an empty generation.
        input_text: The prompt to strip. When it is empty, not a string, or does not
            occur in ``generation``, nothing is stripped.

    Returns:
        Everything after the first occurrence of ``input_text`` in ``generation``;
        ``generation`` unchanged when the prompt cannot be stripped; ``""`` when
        ``generation`` is not a string.
    """
    if not isinstance(generation, str):
        return ""
    # An empty separator cannot be split on, so there is nothing to strip.
    if not isinstance(input_text, str) or not input_text:
        return generation
    _, found, completion = generation.partition(input_text)
    return completion if found else generation

# Strip the echoed prompt from every generation, then measure what remains in characters.
dftemp['extracted_output'] = [
    extract_output(generated, prompt)
    for generated, prompt in zip(dftemp['generation_text'], dftemp['input_text'])
]
dftemp['output_length_chars'] = dftemp['extracted_output'].map(len)



# print(f'gsm8k_{model_name.split("/")[1]}.csv\n')
# Per-prompt mean and median of the stripped output length.
lengths_by_prompt = dftemp.groupby("prompt_index")['output_length_chars']
avg_lengths = lengths_by_prompt.mean()
median_lengths_chars = lengths_by_prompt.median()

print("Average Output Length (Characters) per Prompt:")
for prompt_idx, mean_chars in avg_lengths.items():
    print(f"Prompt {prompt_idx}: {mean_chars:.2f} characters")

print()
print("Median Output Length (Characters) per Prompt:")
for prompt_idx, median_chars in median_lengths_chars.items():
    print(f"Prompt {prompt_idx}: {median_chars} characters")

Average Output Length (Characters) per Prompt:
Prompt 2: 12.75 characters
Prompt 3: 35.90 characters
Prompt 4: 4.45 characters

Median Output Length (Characters) per Prompt:
Prompt 2: 12.0 characters
Prompt 3: 5.0 characters
Prompt 4: 4.0 characters


In [10]:
# Display the results frame, including the two columns added in the previous cell.
dftemp

Unnamed: 0,prompt_index,question_index,input_text,generation_text,ground_truth,all,extracted_output,output_length_chars
0,2,0,Answer the following question. Provide only th...,Answer the following question. Provide only th...,72,[{'generated_text': 'Answer the following ques...,\n```\n120\n```,12
1,2,1,Answer the following question. Provide only th...,Answer the following question. Provide only th...,10,[{'generated_text': 'Answer the following ques...,\n```\n10\n```\n10\n```\n10\n```,25
2,2,2,Answer the following question. Provide only th...,Answer the following question. Provide only th...,5,[{'generated_text': 'Answer the following ques...,\n100\n,5
3,2,3,Answer the following question. Provide only th...,Answer the following question. Provide only th...,42,[{'generated_text': 'Answer the following ques...,\n```\n30\n```,11
4,2,4,Answer the following question. Provide only th...,Answer the following question. Provide only th...,624,[{'generated_text': 'Answer the following ques...,\n```\n1056\n```,13
5,2,5,Answer the following question. Provide only th...,Answer the following question. Provide only th...,35,[{'generated_text': 'Answer the following ques...,\n```\n189\n```,12
6,2,6,Answer the following question. Provide only th...,Answer the following question. Provide only th...,48,[{'generated_text': 'Answer the following ques...,\n```\n56\n```,11
7,2,7,Answer the following question. Provide only th...,Answer the following question. Provide only th...,16,[{'generated_text': 'Answer the following ques...,\n10\n```\n16\n```,14
8,2,8,Answer the following question. Provide only th...,Answer the following question. Provide only th...,41,[{'generated_text': 'Answer the following ques...,\n```\n67\n```,11
9,2,9,Answer the following question. Provide only th...,Answer the following question. Provide only th...,990,[{'generated_text': 'Answer the following ques...,\n```\n1170\n```,13


In [16]:
# Choose exactly one chain-of-thought results file. Previously the first two files were
# also read and then immediately overwritten, so only the phi-4 file was ever inspected.
# dft = pd.read_csv("gsm8k_deepseek-llm-7b-chat_cot.csv")
# dft = pd.read_csv("gsm8k_Llama-3.1-8B-Instruct_cot.csv")
dft = pd.read_csv("gsm8k_phi-4_cot.csv")
# dft = pd.read_csv("gsm8k_Qwen2.5-7B-Instruct_cot.csv")

# Show which prompt index each stored row belongs to.
dft['prompt_index']
# dft = dft[:-20]  # Keeps all rows except the last 20

# # Optionally, save it back
# dft.to_csv('gsm8k_phi-4_cot.csv', index=False)

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
Name: prompt_index, dtype: int64

In [8]:

# model_name = 'deepseek-ai/deepseek-llm-7b-chat'
models = ['google/gemma-3-12b-it'] # 'google/gemma-3-12b-it',
# 'google/gemma-3-12b-it'
# 'deepseek-ai/deepseek-llm-7b-chat'
# 'meta-llama/Llama-3.1-8B-Instruct'
# 'microsoft/phi-4'
# 'Qwen/Qwen2.5-7B-Instruct'

for model_name in models:
    # base_model = model
    # NOTE(review): every entry of `models` is loaded through Gemma3ForCausalLM; confirm
    # the loader class before adding any of the other checkpoints listed above.
    base_model = Gemma3ForCausalLM.from_pretrained(
        model_name,
        token=mytoken,
        device_map='auto' # 'cpu',
    )
    # Authenticate the tokenizer download with the same token as the model download.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=mytoken)
    tokenizer.pad_token = tokenizer.eos_token

    model = WhiteboxModel(base_model, tokenizer)

    # Chain-of-thought templates; "{question}" is substituted per question.
    prompts = [
        "Answer the following question. Explain your reasoning step by step. After your reasoning, state the single correct numerical value (int), no other symbols, only one integer for the answer on a new line, prefixed with '### Answer:'.\n{question}",
        "Answer the following question, by explaining your reasoning step-by-step in a single paragraph to determine the correct answer for the following question. After your reasoning, state the single correct numerical value (int), no other symbols, only one integer for the answer on a new line, prefixed with '### Answer:'.\n{question}",
    ]

    estimator = MaximumSequenceProbability()
    results = []

    # Only the first template is run (range(0, 1)); use len(prompts) to run them all.
    for i in range(0, 1): #len(prompts)):
        # Questions 2..19 only; the first two are skipped by this range.
        for j in range(2, 20):
            question = gsm8k_df['question'].iloc[j]
            prompt_text = prompts[i].replace("{question}", question)

            # Estimate uncertainty
            ans = estimate_uncertainty(model, estimator, input_text=prompt_text)

            # Collect result
            results.append({
                "prompt_index": i,
                "question_index": j,
                "input_text": ans.input_text,
                "generation_text": ans.generation_text,
                "ground_truth": gsm8k_df['short_answer'].iloc[j],
                "all": ans
            })

            # Optionally print
            print(f"Prompt {i}, Question {j}")
            print(ans.input_text)
            print(ans.generation_text)
            print(ans)
            print(len(ans.generation_tokens))
            print()


    # Convert new results to DataFrame
    results_df = pd.DataFrame(results)

    # One chain-of-thought results file per model.
    results_path = f'gsm8k_{model_name.split("/")[1]}_cot.csv'

    try:
        # Load existing results
        dftemp = pd.read_csv(results_path)
    except FileNotFoundError:
        # First run for this model: nothing to append to.
        combined_df = results_df
    else:
        # Append new results
        combined_df = pd.concat([dftemp, results_df], ignore_index=True)

    # Only a missing file is treated as "start fresh". Any other failure now propagates
    # instead of silently replacing the accumulated results with just the new rows.
    combined_df.to_csv(results_path, index=False)

Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.58it/s]


KeyboardInterrupt: 

In [16]:
# Re-read the chain-of-thought results stored for the current model and display them.
cot_results_path = f'gsm8k_{model_name.split("/")[1]}_cot.csv'
dftemp = pd.read_csv(cot_results_path)
dftemp

Unnamed: 0,prompt_index,question_index,input_text,generation_text,ground_truth,all
0,0,0,Answer the following question. Explain your re...,Here's how to solve the problem step-by-step:\...,72,UncertaintyOutput(uncertainty=4.67648744583129...
1,0,1,Answer the following question. Explain your re...,Here's how to solve the problem step-by-step:\...,10,UncertaintyOutput(uncertainty=5.50194978713989...


In [21]:
# Load the chain-of-thought results for the current model and capture the text that
# follows the '### Answer:' marker in each generation (NaN when the marker is absent).
dftemp = pd.read_csv(f'gsm8k_{model_name.split("/")[1]}_cot.csv')
dftemp['extracted_answer'] = dftemp['generation_text'].str.extract(r'### Answer:\s*(.*)', expand=False)

print(f'gsm8k_{model_name.split("/")[1]}_cot.csv\n')
# count() skips NaN, so this is the number of generations per prompt in which the
# marker was found - a count of rows, not a length in characters.
non_null_counts = dftemp.groupby("prompt_index")['extracted_answer'].count()

print("Generations with an extracted '### Answer:' per Prompt:")
for i, n_extracted in non_null_counts.items():
    print(f"Prompt {i}: {n_extracted} answers")

gsm8k_deepseek-llm-7b-chat_cot.csv

Average Output Length (Characters) per Prompt:
Prompt 0: 19.00 characters
Prompt 1: 20.00 characters


### Sequence-level UQ for a Whitebox LLM

In [8]:
# Load the stored results for gemma-3-12b-it from the CSV without a "_cot" or "_2" suffix and display them.
df = pd.read_csv("gsm8k_gemma-3-12b-it.csv")
df

Unnamed: 0,prompt_index,question_index,input_text,generation_text,ground_truth,all
0,0,0,Answer this question strictly with the final n...,96<end_of_turn>,72,UncertaintyOutput(uncertainty=0.62811964750289...
1,0,1,Answer this question strictly with the final n...,10\n<end_of_turn>,10,UncertaintyOutput(uncertainty=0.00912463292479...
2,0,2,Answer this question strictly with the final n...,20\n<end_of_turn>,5,UncertaintyOutput(uncertainty=0.41650879383087...
3,0,3,Answer this question strictly with the final n...,30<end_of_turn>,42,UncertaintyOutput(uncertainty=0.67018395662307...
4,0,4,Answer this question strictly with the final n...,1056<end_of_turn>,624,UncertaintyOutput(uncertainty=0.71765488386154...
...,...,...,...,...,...,...
95,4,15,"Answer the following question, by giving only ...",145000<end_of_turn>,448000,UncertaintyOutput(uncertainty=1.03194749355316...
96,4,16,"Answer the following question, by giving only ...",1800<end_of_turn>,800,UncertaintyOutput(uncertainty=0.96552979946136...
97,4,17,"Answer the following question, by giving only ...",65<end_of_turn>,43,UncertaintyOutput(uncertainty=0.36664539575576...
98,4,18,"Answer the following question, by giving only ...",20<end_of_turn>,16,UncertaintyOutput(uncertainty=0.38927680253982...


In [None]:
estimator = MaximumSequenceProbability()

# NOTE(review): no cell visible in this notebook creates gsm8k_df['short_question'];
# confirm where that column is built before relying on a fresh-kernel run.
first_prompt = gsm8k_df['short_question'].iloc[0]
ans = estimate_uncertainty(model, estimator, input_text=first_prompt)

# Result type, full result object, and the prompt that was sent - one per line.
print(type(ans), ans, ans.input_text, sep="\n")
print('answer:', ans.generation_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<class 'lm_polygraph.utils.estimate_uncertainty.UncertaintyOutput'>
UncertaintyOutput(uncertainty=22.52058219909668, input_text='Answer the following question, by giving only the final answer, without any calculations, reasoning, or explanation.\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', generation_text='\n\nA. 48\nB. 72\nC. 96\nD. 144\n\nSolution: The correct option is C 96\n\nTo find the total number of clips Natalia sold in April and May, we need to add the number of clips she sold in each month.\n\nIn April, Natalia sold 48 clips to her friends.\n\nIn May, she sold half as many clips as in April, which means she sold 48 / 2 = 24 clips.\n\n', generation_tokens=[198, 198, 32, 13, 4764, 198, 33, 13, 7724, 198, 34, 13, 9907, 198, 35, 13, 20224, 198, 198, 46344, 25, 383, 3376, 3038, 318, 327, 9907, 198, 198, 2514, 1064, 262, 2472, 1271, 286, 19166, 14393, 9752, 2702, 287, 

In [5]:
# Score a single free-form question with the sequence-probability estimator.
estimator = MaximumSequenceProbability()
riddle = 'What has a head and a tail but no body?'
estimate_uncertainty(model, estimator, input_text=riddle)

UncertaintyOutput(uncertainty=61.402503967285156, input_text='What has a head and a tail but no body?', generation_text="The answer to this question is a virus. Viruses are small, non-living entities that can only replicate within living cells. They do not have a head or a tail, but they can cause harm to living organisms by attaching to and hijacking the host cell's machinery. Viruses are a significant threat to public health and can cause a wide range of diseases, including but not limited to, influenza, HIV, and cancer.", generation_tokens=[785, 4226, 311, 419, 3405, 374, 264, 16770, 13, 9542, 4776, 525, 2613, 11, 2477, 2852, 2249, 14744, 429, 646, 1172, 45013, 2878, 5382, 7761, 13, 2379, 653, 537, 614, 264, 1968, 476, 264, 9787, 11, 714, 807, 646, 5240, 11428, 311, 5382, 43204, 553, 71808, 311, 323, 21415, 8985, 279, 3468, 2779, 594, 25868, 13, 9542, 4776, 525, 264, 5089, 5899, 311, 584, 2820, 323, 646, 5240, 264, 6884, 2088, 315, 18808, 11, 2670, 714, 537, 7199, 311, 11, 61837, 11

In [6]:
# Heads-up: this example needs roughly 2 minutes to finish.
# SemanticEntropy is applied to one factual question; the result is shown
# as the cell output.
empire_state_prompt = 'How many floors are in the Empire State Building?'
estimator = SemanticEntropy()
estimate_uncertainty(model, estimator, input_text=empire_state_prompt)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


UncertaintyOutput(uncertainty=99.17972927011878, input_text='How many floors are in the Empire State Building?', generation_text='The Empire State Building has 105 floors.', generation_tokens=[785, 20448, 3234, 16858, 702, 220, 16, 15, 20, 25945, 13], model_path=None, estimator='SemanticEntropy')

In [None]:
# Heads-up: this example needs roughly 2 minutes to finish.
# Same riddle prompt as the sequence-probability cell, now scored with
# SemanticEntropy so the two estimators can be compared on one input.
riddle_prompt = 'What has a head and a tail but no body?'
estimator = SemanticEntropy()
estimate_uncertainty(model, estimator, input_text=riddle_prompt)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


UncertaintyOutput(uncertainty=115.85714697341697, input_text='What has a head and a tail but no body?', generation_text="The answer to this question is a virus. Viruses are small, non-living entities that can only replicate within living cells. They do not have a head or a tail, but they can cause harm to living organisms by attaching to and hijacking the host cell's machinery. Viruses are a significant threat to public health and can cause a wide range of diseases, including but not limited to, influenza, HIV, and cancer.", generation_tokens=[785, 4226, 311, 419, 3405, 374, 264, 16770, 13, 9542, 4776, 525, 2613, 11, 2477, 2852, 2249, 14744, 429, 646, 1172, 45013, 2878, 5382, 7761, 13, 2379, 653, 537, 614, 264, 1968, 476, 264, 9787, 11, 714, 807, 646, 5240, 11428, 311, 5382, 43204, 553, 71808, 311, 323, 21415, 8985, 279, 3468, 2779, 594, 25868, 13, 9542, 4776, 525, 264, 5089, 5899, 311, 584, 2820, 323, 646, 5240, 264, 6884, 2088, 315, 18808, 11, 2670, 714, 537, 7199, 311, 11, 61837, 11

### Token-level UQ for Whitebox LLM

In [8]:
# Token-level scoring of the riddle prompt with MaximumTokenProbability.
# In the displayed result, `uncertainty` is an array of values rather than a
# single number (presumably one entry per generated token — confirm).
riddle_prompt = 'What has a head and a tail but no body?'
estimator = MaximumTokenProbability()
estimate_uncertainty(model, estimator, input_text=riddle_prompt)

UncertaintyOutput(uncertainty=array([-0.5199645 , -0.7773075 , -0.9111757 , -0.4642391 , -0.7312121 ,
       -0.96189463, -0.27488053, -0.11001918, -0.72922415, -0.43932858,
       -0.99928606, -0.5605216 , -0.23990016, -0.7537944 , -0.24525657,
       -0.6493522 , -0.99975866, -0.51964307, -0.8791548 , -0.21423468,
       -0.35266286, -0.47724998, -0.45849365, -0.38198572, -0.8100885 ,
       -0.6418665 , -0.7694248 , -0.30586314, -0.99927765, -0.9042026 ,
       -0.87831134, -0.9398997 , -0.52133787, -0.5053506 , -0.9973978 ,
       -0.67501503, -0.49679896, -0.7429459 , -0.35794568, -0.10462207,
       -0.16433331, -0.5641733 , -0.80949354, -0.7477126 , -0.21802613,
       -0.12109879, -0.6510552 , -0.2837354 , -0.2947866 , -0.98860633,
       -0.5185005 , -0.1949296 , -0.51526326, -0.9765778 , -0.7364911 ,
       -0.442843  , -0.2552529 , -0.9931718 , -0.49535578, -0.17133856,
       -0.2405345 , -0.5826117 , -0.89956623, -0.22676547, -0.9783974 ,
       -0.7804185 , -0.30011195, -