# Toy example: LLM output evaluation with logprobs

In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import numpy as np


In [2]:
# Load environment variables first: this must run before the client is built
# if the client picks its credentials up from the environment.
# NOTE(review): presumably this reads OPENAI_API_KEY from a local .env file — confirm.
load_dotenv()

# Module-level client shared by the cells below (get_logprobs calls it directly).
client = OpenAI()

In [3]:
# Instruction message sent with the "developer" role on every request.
system_prompt = "You are a helpful assistant."
# Default example prompt; later cells rebind this name before calling get_logprobs.
input_text = "Are LLMs good at math?"


def get_logprobs(input_text, temperature=0.5, n_logprobs=5, model="gpt-4o"):
    """Print the top candidates for the first token of the model's reply.

    Sends ``system_prompt`` (developer role) and ``input_text`` (user role) to
    the chat completions endpoint with log-probabilities enabled, then prints,
    for each candidate of the first returned token position, the token text,
    its log-probability and the corresponding probability in percent.

    Only the first token position (``content[0]``) is inspected; the rest of
    the completion is ignored.

    Args:
        input_text: The user message to send.
        temperature: Sampling temperature forwarded to the API.
        n_logprobs: Number of alternatives requested via ``top_logprobs``.
        model: Name of the chat model to query. Defaults to ``"gpt-4o"``,
            the value that was previously hard-coded.

    Returns:
        None. The results are written to stdout only, so a cell that ends
        with a bare call does not show an extra ``Out[]`` value.

    Raises:
        ValueError: If the response carries no token log-probabilities
            (``logprobs`` is missing, or its ``content`` is missing or empty).
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "developer", "content": system_prompt},
            {"role": "user", "content": input_text},
        ],
        logprobs=True,
        top_logprobs=n_logprobs,
        temperature=temperature,
    )

    # Fail with a readable message instead of an AttributeError/TypeError/
    # IndexError from blindly indexing into a response without token data.
    choice_logprobs = response.choices[0].logprobs
    token_entries = getattr(choice_logprobs, "content", None)
    if not token_entries:
        raise ValueError(
            "The response contains no token log-probabilities; "
            "cannot inspect the first generated token."
        )

    first_token_candidates = token_entries[0].top_logprobs

    for rank, candidate in enumerate(first_token_candidates, start=1):
        print(f"Token {rank}: \t{candidate.token}")
        print(f"Logprob: \t{candidate.logprob}")
        # exp(logprob) turns the log-probability back into a probability.
        print(f"Probability: \t{np.round(np.exp(candidate.logprob)*100, 2)}%")



In [4]:
# Open-ended prompt: see which first tokens the model weighs for a free-form answer.
# NOTE(review): this looks like a deliberate false-premise question — confirm intent.
input_text = "Who is the president of the Netherlands?"

# Uses the defaults (temperature=0.5, five candidates); prints to stdout.
get_logprobs(input_text)







Token 1: 	The
Logprob: 	-1.4259644558478612e-05
Probability: 	100.0%
Token 2: 	As
Logprob: 	-11.250014305114746
Probability: 	0.0%
Token 3: 	Unlike
Logprob: 	-14.250014305114746
Probability: 	0.0%
Token 4: 	Actually
Logprob: 	-15.500014305114746
Probability: 	0.0%
Token 5: 	 The
Logprob: 	-16.00001335144043
Probability: 	0.0%


In [5]:
# Constrained prompt: the answer is limited to 'True' or 'False', so the
# probability of the first token can be read as the model's confidence.
# Named `input_text` (not `input`) so the builtin input() is not shadowed
# for the rest of the kernel session.
input_text = "Do you know what your objective is? Answer only with 'True' or 'False'."

get_logprobs(input_text)

Token 1: 	True
Logprob: 	-0.038042064756155014
Probability: 	96.27%
Token 2: 	False
Logprob: 	-3.2880420684814453
Probability: 	3.73%
Token 3: 	I
Logprob: 	-15.788042068481445
Probability: 	0.0%
Token 4: 	 True
Logprob: 	-15.913042068481445
Probability: 	0.0%
Token 5: 	The
Logprob: 	-16.288042068481445
Probability: 	0.0%


In [6]:
# Same single-token setup, repeated at several sampling temperatures.
# Named `input_text` (not `input`) so the builtin input() is not shadowed.
input_text = "How many 'r's are in the word strawberry? Return only the number, and nothing else. Like this: [number]."
temperatures = [0.0, 0.5, 1.0]

# NOTE(review): in the recorded output the logprobs are identical for all three
# temperatures, which suggests temperature does not change the reported
# logprobs — confirm against the API documentation.
for temperature in temperatures:
    print(f"\nTemperature: {temperature}")
    get_logprobs(input_text, temperature=temperature, n_logprobs=3)




Temperature: 0.0
Token 1: 	3
Logprob: 	-0.001961582340300083
Probability: 	99.8%
Token 2: 	There
Logprob: 	-6.251961708068848
Probability: 	0.19%
Token 3: 	Three
Logprob: 	-10.876961708068848
Probability: 	0.0%

Temperature: 0.5
Token 1: 	3
Logprob: 	-0.001961582340300083
Probability: 	99.8%
Token 2: 	There
Logprob: 	-6.251961708068848
Probability: 	0.19%
Token 3: 	Three
Logprob: 	-10.876961708068848
Probability: 	0.0%

Temperature: 1.0
Token 1: 	3
Logprob: 	-0.001961582340300083
Probability: 	99.8%
Token 2: 	There
Logprob: 	-6.251961708068848
Probability: 	0.19%
Token 3: 	Three
Logprob: 	-10.876961708068848
Probability: 	0.0%
