In [None]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

API_KEY  = os.getenv("OPENAI_API_KEY", "cse476")
API_BASE = os.getenv("API_BASE", "http://10.4.58.53:41701/v1")  
MODEL    = os.getenv("MODEL_NAME", "bens_model")              

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer‚Äîno explanation.",
                                model: str = MODEL,
                                temperature: float = 0.3,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Calls an OpenAI-style /v1/chat/completions endpoint and returns:
    { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int, 'error': str or None, 'headers': dict }
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        #{'id': 'chatcmpl-88b6d7e18a5542b5bed5bf2828f0661e', 'object': 'chat.completion', 'created': 1763204718, 'model': 'bens_model', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'US Highway 281', 'refusal': None, 'annotations': None, 'audio': None, 'function_call': None, 'tool_calls': [], 'reasoning_content': None}, 'logprobs': None, 'finish_reason': 'stop', 'stop_reason': None, 'token_ids': None}], 'service_tier': None, 'system_fingerprint': None, 'usage': {'prompt_tokens': 50, 'total_tokens': 57, 'completion_tokens': 7, 'prompt_tokens_details': None}, 'prompt_logprobs': None, 'prompt_token_ids': None, 'kv_transfer_params': None}
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = resp.status_code
        hdrs   = dict(resp.headers)
        if status == 200:
            data = resp.json()
            #print(data)
            text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
            tokens_used = data.get("usage",[{}]).get("completion_tokens", {})
            #print('used tokens:', tokens_used)
            
            return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs, "tokens_used":tokens_used}
        else:
            # try best-effort to surface error text
            err_text = None
            try:
                err_text = resp.json()
            except Exception:
                err_text = resp.text
            return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}


In [107]:
from IPython.display import Markdown, display

In [None]:
# %% Direct call example
def direct_call(prompt="What is 17 + 28? Answer with just the number.", temperature=0.2, max_tokens=128):
    demo_prompt = prompt
    result = call_model_chat_completions(demo_prompt, temperature=temperature, max_tokens=max_tokens)
    print("OK:", result["ok"], "HTTP:", result["status"])
    print("MODEL SAYS:", (result["text"] or "").strip())

    # Optional: Inspect rate-limit headers if your provider exposes them
    for k in ["x-ratelimit-remaining-requests", "x-ratelimit-limit-requests", "x-request-id"]:
        if k in result["headers"]:
            print(f"{k}: {result['headers'][k]}")


In [90]:
# %% Define three tests: input + expected
my_tests = [
    {
        "id": "math_inequality",
        "type": "numeric",  # grader will prefer numeric extraction
        "prompt": "Solve for the smallest integer n such that 3n + 5 > 26. Answer with just the integer.",
        "expected": "8",    # Because 3n > 21 => n > 7, smallest integer is 8
    },
    {
        "id": "commonsense_ice",
        "type": "text",
        "prompt": (
            "You place an ice cube in a glass of water and mark the water level. "
            "After the ice melts, does the water level rise, fall, or stay the same? "
            "Answer with exactly one of: 'rise', 'fall', 'stay the same'."
        ),
        "expected": "stay the same",
    },
    {
        "id": "logic_race",
        "type": "text",
        "prompt": (
            "In a race, you pass the person in second place. What position are you now in? "
            "Answer with a single word like 'first', 'second', 'third'."
        ),
        "expected": "second",
    },
]


In [142]:
import json
from pprint import pprint
import random
from collections import Counter

POSSIBLE_TYPES = ['math', 'common_sense', 'planning', 'coding', 'future_prediction']

all_tests = json.load(open("parsed_dev_data.json", "r", encoding="utf-8"))

type_counts = Counter(t['domain'] for t in all_tests)
print(type_counts)

formatted_tests = []
for i, t in enumerate(all_tests, start=1):
    
    formatted_tests.append({
        "id": t['id'], # domain_domainIndex_domainTestIndex_testIndex
        "type": t['domain'],
        "prompt": t['input'],
        "expected": t['output'],
        "char_count": t['input_char_count'],
        "exp_word_count": t['exp_word_count']
    })
    
all_tests = formatted_tests

Counter({'common_sense': 400, 'math': 300, 'coding': 100, 'future_prediction': 100, 'planning': 100})


In [None]:
def print_test(test):
    print(json.dumps(test, indent=2, ensure_ascii=False))

#pass test_type as a list of types
#generalized get test function
def get_tests(n=0, test_type=POSSIBLE_TYPES, start=0, end=None, lower_char=0, upper_char=float('inf'), lower_exp=0, upper_exp=float('inf')):
    filtered_tests = [t for t in all_tests if t['type'] in test_type and lower_char <= t['char_count'] <= upper_char and lower_exp <= t['exp_word_count'] <= upper_exp]
    print('filtered size:', len(filtered_tests))
    sample_size = min(n, len(filtered_tests))
    
    if n == 0:
        return filtered_tests[start:end]
    elif n == -1:
        filtered_type_counts = Counter(t['type'] for t in filtered_tests)
        each_test = []
        count = 0
        
        for val in filtered_type_counts.values():
            rand = random.randint(count, count + val)
            count = count + val
            each_test.append(filtered_tests[rand])
            
        print("sampled size:", len(each_test))    
        return each_test
    else:
        return random.sample(filtered_tests, sample_size)
    
""" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):
    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]
    return tests[start:end]

def get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):
    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]
    sample_size = min(n, len(filtered_tests)) #prevent error
    return random.sample(filtered_tests, sample_size) """

" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):\n    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]\n    return tests[start:end]\n\ndef get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):\n    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]\n    sample_size = min(n, len(filtered_tests)) #prevent error\n    return random.sample(filtered_tests, sample_size) "

In [None]:
tests = get_tests(n=1, upper_char=300) #get_test_type('math', end=10, lower=0, upper=500)
pprint(tests)

filtered size: 440
['First',
 'figure',
 'out',
 'how',
 'many',
 'square',
 'feet',
 'the',
 'original',
 'bolt',
 'of',
 'fabric',
 'was:',
 '16',
 'feet',
 '*',
 '12',
 'feet',
 '=',
 '<<16*12=192>>192',
 'square',
 'feet',
 'Then',
 'figure',
 'out',
 'how',
 'much',
 'fabric',
 'Ann',
 'took',
 'for',
 'the',
 'living',
 'room',
 'curtains:',
 '4',
 'feet',
 '*',
 '6',
 'feet',
 '=',
 '<<4*6=24>>24',
 'square',
 'feet',
 'Then',
 'figure',
 'out',
 'how',
 'much',
 'fabric',
 'Ann',
 'took',
 'for',
 'the',
 'bathroom',
 'curtains:',
 '2',
 'feet',
 '*',
 '4',
 'feet',
 '=',
 '<<2*4=8>>8',
 'square',
 'feet',
 'Finally,',
 'subtract',
 'the',
 'square',
 'footage',
 'of',
 'both',
 'sets',
 'of',
 'curtains',
 'from',
 'the',
 'total',
 'square',
 'footage:',
 '192',
 '-',
 '24',
 '-',
 '8',
 '=',
 '<<192-24-8=160>>160',
 'square',
 'feet',
 '####',
 '160']


In [94]:
#simple hello world call to kick off the commits
#direct_call(prompt="how do I find the derivative of y=x^2 using python?")

In [95]:
def interactive_chat():
    messages = ["<Start of message history>"]
    count = 0
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Exiting chat.")
            break
        response = call_model_chat_completions(prompt=f"Old messages{messages}, CURRENT USER INPUT:{user_input} <--- ANSWER THIS QUESTION", temperature=0.7)
        count += 1
        messages.append(f"MESSAGE_{count}_[previous user input: {user_input}, previous system response: {response['text']}]")
        if response["ok"]:
            print("Model:", response["text"].strip())
        else:
            print("Error:", response["error"])
        print(messages)
#interactive_chat()

In [None]:
""" def execute_tests():
    rows = []
    for t in tests:
        r = call_model_chat_completions(
            prompt,
            system=system,
            model=model,
            temperature=0.3,
            max_tokens=128
        ) """

' def execute_tests():\n    rows = []\n    for t in tests:\n        r = call_model_chat_completions(\n            prompt,\n            system=system,\n            model=model,\n            temperature=0.3,\n            max_tokens=128\n        ) '

In [198]:
def self_evaluate(question, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.
    Returns True if the model says the prediction matches the expected answer; else False.
    Falls back to a simple normalized string compare if the model's reply is malformed.
    """
    import re

    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading a question-answer pair.

Return exactly True if the PREDICTION would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION:
{question}

PREDICTION:
{prediction}

EXPECTED_ANSWER:
{expected_answer}

Answer with exactly: True or False
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true"):
        return True
    if reply.startswith("false"):
        return False

    # No Fallback yet


In [272]:
def self_evaluate2(question, model_output, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.
    Returns True if the model says the prediction matches the expected answer; else False.
    Falls back to a simple normalized string compare if the model's reply is malformed.
    """
    import re

    system = "You are a strict grader. Reply with exactly Yes or No. No punctuation. No explanation."
    prompt = f"""MODEL_1 thinks this ANSWER is {prediction}, do you agree with MODEL_1 decision?

QUESTION:
{question}

ANSWER:
{model_output}

EXPECTED_ANSWER:
{expected_answer}

-----------------------
MODEL_1 OUTPUT:
{prediction}
-----------------------

Answer with exactly: Yes or No. Do you agree with MODEL_1?
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true") or reply.startswith("yes"):
        return True
    if reply.startswith("false") or reply.startswith("no"):
        return False

    # No Fallback yet


In [191]:
tf_map = {'yes':'true', 'no':'false'}
def map_tf(output):
    out = output.lower().strip('.')
    return tf_map.get(out) if out in tf_map else output

In [None]:

def basic_match_check(test, output):
    exp = test["expected"]
    
    output = map_tf(output)
    
    matches = re.findall(re.escape(str(exp)), output, re.IGNORECASE)
    
    num_matches = len(matches)
    if num_matches > 0:
        #print('MATCH(ES) FOUND:', matches)
        return True
    
    #print('NO MATCH FOUND')
    return False

In [279]:
def seperator(text, tokens_used=None, max_tokens=400):
    if tokens_used is not None:
        print(f'{text} (TOKENS USED: {tokens_used}/{max_tokens})')
        if int(tokens_used) == max_tokens:
            print('MAXED TOKENS REACHED - OUTPUT TRUNCATED')
            return False
    else:
        print(text)
    print('-'*32)
    
    return True

In [273]:
def check_correct(bool1, bool2):
    correctness = bool1 and bool2
    agreement = bool1 == bool2
    
    print('‚úÖ CORRECT') if correctness else print('‚ùå INCORRECT')
    print('üÜó AGREED') if agreement else print('üÜò DISAGREED')
    
    return correctness, agreement

In [292]:
def super_match(test, output):
    expected = test["expected"].lower().split()
    output = output.lower().split()
    
    output_counter = Counter(output)
    
    match_counts = {expword: output_counter.get(expword, 0) for expword in expected}
    total_matches = sum(match_counts.values())
    output_len = len(output)
    print(f"{total_matches}/{output_len}")
    print('match counts:', match_counts)
    
    #return match_counts

In [290]:
def self_evaluate_tests(tests, model=MODEL, grader_model=None, sleep_sec=0.2, verbose=True):
    """
    Run the tests by querying the model for each prompt, then use LLM-as-a-judge
    (self_evaluate) to determine correctness.

    Args:
        tests: list of dicts with keys: id, prompt, expected (and optionally type)
        model: model used to generate predictions
        grader_model: model used to judge correctness (defaults to `model` if None)
        sleep_sec: small delay between calls to be polite to the API
        verbose: if True, print a summary line per test

    Returns:
        rows: list of dicts with fields:
              id, expected, got, correct, status, error
    """
    import time

    judge_model = grader_model or model
    MAX_TOKENS = 400
    final_answers = []
    count = 0
    
    for t in tests:
        count += 1
        # 1) Get model prediction
        #print('prompt:', t['prompt'])
        print('\n','='*64)
        seperator('TEST_CASE')
        print_test(t)
        r = call_model_chat_completions(
            f"{t['prompt']}",
            system="Give a short answer to each prompt, don't explain.",
            model=model,
            temperature=0.3,
            max_tokens=MAX_TOKENS
        )
        got = (r.get("text") or "").strip()
        tokens_used = r.get("tokens_used")
        

        got = map_tf(got)
        
        #If output is truncated and both evals return true, return false
        not_truncated = seperator('\nMODEL_OUTPUT', tokens_used, MAX_TOKENS)
        display(Markdown(f"\n{got}"))
        #print('raw: ', got)
        
        super_match(t, got)
        match_check = basic_match_check(t, got)
        match_check = bool(match_check)
        
        # 2) LLM-as-a-judge: strict True/False
        is_correct = self_evaluate(
            question=t["prompt"],
            prediction=got,
            expected_answer=t["expected"],
            model=judge_model,
        )
        is_correct = bool(is_correct)
        
        seperator('\nEVALUATION')
        print('match check:', match_check)
        print('self_eval:', is_correct)
        correctness, agreement = check_correct(match_check, is_correct)
        
        if not agreement:
            seperator('\nDISAGREEMENT --> SECOND EVAL')
            is_correct2 = self_evaluate2(
                question=t["prompt"],
                model_output=got,
                expected_answer=t["expected"],
                prediction=is_correct,
                model=judge_model
            )
            is_correct2 = bool(is_correct2)
            
            print('self_eval2:', is_correct2)
            correctness, agreement = check_correct(is_correct, is_correct2)
            
            if not not_truncated and correctness:
                correctness = False
                print("‚ùå INCORRECT | BOTH EVALS RETURNED TRUE BUT OUTPUT WAS TRUNCATED")

        final_answers.append(correctness)
        
        if sleep_sec:
            time.sleep(sleep_sec)

    return final_answers

# Example:


In [120]:
import re

In [293]:
test_prompts = get_tests(n=5) #get_test_type(["math"],end=10, upper=300) get_random_tests(n=3, upper=300)
results_llm_judge = self_evaluate_tests(test_prompts, verbose=True, model=MODEL, grader_model=MODEL)
print("\n","="*64)
print(results_llm_judge)

filtered size: 1000

TEST_CASE
--------------------------------
{
  "id": "common_sense_1_207_307",
  "type": "common_sense",
  "prompt": "Approximately what percentage of the global population is made up of the ethnic group Princess Fragrant was produced to improve relations with?",
  "expected": "17%",
  "char_count": 143,
  "exp_word_count": 1
}

MODEL_OUTPUT (TOKENS USED: 4/400)
--------------------------------



10%

0/1
match counts: {'17%': 0}

EVALUATION
--------------------------------
match check: False
self_eval: False
‚ùå INCORRECT
üÜó AGREED

TEST_CASE
--------------------------------
{
  "id": "math_3_107_707",
  "type": "math",
  "prompt": "Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden?",
  "expected": "There are 80/100 * 10 = <<80/100*10=8>>8 more purple flowers than yellow flowers.\nSo in Mark's garden, there are 10 + 8 = <<10+8=18>>18 purple flowers.\nPurple and yellow flowers sum up to 10 + 18 = <<10+18=28>>28 flowers.\nThat means in Mark's garden there are 25/100 * 28 = <<25/100*28=7>>7 green flowers.\nSo in total Mark has 28 + 7 = <<28+7=35>>35 plants in his garden.\n#### 35",
  "char_count": 277,
  "exp_word_count": 69
}

MODEL_OUTPUT (TOKENS USED: 69/40


Yellow flowers: 10  
Purple flowers: 10 + (80% of 10) = 18  
Yellow + Purple = 28  
Green flowers: 25% of 28 = 7  
Total flowers: 10 + 18 + 7 = 35

24/33
match counts: {'there': 0, 'are': 0, '80/100': 0, '*': 0, '10': 3, '=': 4, '<<80/100*10=8>>8': 0, 'more': 0, 'purple': 2, 'flowers': 0, 'than': 0, 'yellow': 2, 'flowers.': 0, 'so': 0, 'in': 0, "mark's": 0, 'garden,': 0, '+': 4, '8': 0, '<<10+8=18>>18': 0, 'and': 0, 'sum': 0, 'up': 0, 'to': 0, '18': 2, '<<10+18=28>>28': 0, 'that': 0, 'means': 0, 'garden': 0, '25/100': 0, '28': 2, '<<25/100*28=7>>7': 0, 'green': 1, 'total': 1, 'mark': 0, 'has': 0, '7': 2, '<<28+7=35>>35': 0, 'plants': 0, 'his': 0, 'garden.': 0, '####': 0, '35': 1}

EVALUATION
--------------------------------
match check: False
self_eval: True
‚ùå INCORRECT
üÜò DISAGREED

DISAGREEMENT --> SECOND EVAL
--------------------------------
self_eval2: True
‚úÖ CORRECT
üÜó AGREED

TEST_CASE
--------------------------------
{
  "id": "planning_4_38_938",
  "type": "planning",
  "prompt": "I am playing with a set of objects. Here are the actions I can do\n\nPaltry object_0 object_1 object_2.\nSip object_0 object_1 object_2.


sip object_10 object_0 object_9  
memory object_0 object_9 object_8  
sip object_11 object_0 object_9  
paltry object_11 object_0 object_9  
memory object_0 object_9 object_7  
paltry object_10 object_0 object_7

0/24
match counts: {'(sip': 0, 'o11': 0, 'o0': 0, 'o8)': 0, '(memory': 0, 'o8': 0, 'o9)': 0, '(paltry': 0}

EVALUATION
--------------------------------
match check: False
self_eval: False
‚ùå INCORRECT
üÜó AGREED

TEST_CASE
--------------------------------
{
  "id": "planning_4_47_947",
  "type": "planning",
  "prompt": "I am playing with a set of blocks where I need to arrange the blocks into stacks. Here are the actions I can do\n\nPick up a block\nUnstack a block from on top of another block\nPut down a block\nStack a block on top of another block\n\nI have the following restrictions on my actions:\nI can only pick up or unstack one block at a time.\nI can only pick up or unstack a block if my hand is empty.\nI can only pick up a block if the block is on the table and the block is clear. A block is clear if the block has no other blocks on top of it and if the block is not picked up.\nI can only unstack a block from on top of another block if the block I am unstacking was really on


unstack the yellow block from on top of the red block  
put down the yellow block  
unstack the red block from on top of the blue block  
stack the red block on top of the orange block  
stack the yellow block on top of the red block  
unstack the white block from the table  
stack the white block on top of the blue block

11/64
match counts: {'(unstack': 0, 'yellow': 3, 'red)': 0, '(stack': 0, 'white)': 0, 'red': 4, 'blue)': 0, 'yellow)': 0, 'blue': 2, 'orange)': 0, '(put-down': 0, '(pick-up': 0, 'white': 2}

EVALUATION
--------------------------------
match check: False
self_eval: False
‚ùå INCORRECT
üÜó AGREED

TEST_CASE
--------------------------------
{
  "id": "math_3_119_719",
  "type": "math",
  "prompt": "James is a first-year student at a University in Chicago. He has a budget of $1000 per semester. He spends 30% of his money on food, 15% on accommodation, 25% on entertainment, and the rest on coursework materials. How much money does he spend on coursework materials?",
  "expected": "Accommodation is 15% * $1000=$<<15*.01*1000=150>>150\nFood is          30% * $1000=$<<30*.01*1000=300>>300\nEntertainment is   25% * $1000=$<<25*.01*1000=250>>250\nCoursework materials are thus $1000-($150+$300+$250) = $300\n#### 300",
  "char_count": 268,
  "exp_word_count": 24
}

MODEL_OUTPUT (TOKENS USED: 5/40


$200

0/1
match counts: {'accommodation': 0, 'is': 0, '15%': 0, '*': 0, '$1000=$<<15*.01*1000=150>>150': 0, 'food': 0, '30%': 0, '$1000=$<<30*.01*1000=300>>300': 0, 'entertainment': 0, '25%': 0, '$1000=$<<25*.01*1000=250>>250': 0, 'coursework': 0, 'materials': 0, 'are': 0, 'thus': 0, '$1000-($150+$300+$250)': 0, '=': 0, '$300': 0, '####': 0, '300': 0}

EVALUATION
--------------------------------
match check: False
self_eval: False
‚ùå INCORRECT
üÜó AGREED

[False, True, False, False, False]
