In [88]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

API_KEY  = os.getenv("OPENAI_API_KEY", "cse476")
API_BASE = os.getenv("API_BASE", "http://10.4.58.53:41701/v1")  
MODEL    = os.getenv("MODEL_NAME", "bens_model")              

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer—no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.3,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Calls an OpenAI-style /v1/chat/completions endpoint and returns:
    { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int, 'error': str or None, 'headers': dict }
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = resp.status_code
        hdrs   = dict(resp.headers)
        if status == 200:
            data = resp.json()
            text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
            return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}
        else:
            # try best-effort to surface error text
            err_text = None
            try:
                err_text = resp.json()
            except Exception:
                err_text = resp.text
            return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}


In [107]:
from IPython.display import Markdown, display

In [89]:
# %% Direct call example
def direct_call(prompt="What is 17 + 28? Answer with just the number.", temperature=0.2, max_tokens=128):
    demo_prompt = prompt
    result = call_model_chat_completions(demo_prompt, temperature=temperature, max_tokens=max_tokens)
    print("OK:", result["ok"], "HTTP:", result["status"])
    print("MODEL SAYS:", (result["text"] or "").strip())

    # Optional: Inspect rate-limit headers if your provider exposes them
    for k in ["x-ratelimit-remaining-requests", "x-ratelimit-limit-requests", "x-request-id"]:
        if k in result["headers"]:
            print(f"{k}: {result['headers'][k]}")


In [90]:
# %% Define three tests: input + expected
my_tests = [
    {
        "id": "math_inequality",
        "type": "numeric",  # grader will prefer numeric extraction
        "prompt": "Solve for the smallest integer n such that 3n + 5 > 26. Answer with just the integer.",
        "expected": "8",    # Because 3n > 21 => n > 7, smallest integer is 8
    },
    {
        "id": "commonsense_ice",
        "type": "text",
        "prompt": (
            "You place an ice cube in a glass of water and mark the water level. "
            "After the ice melts, does the water level rise, fall, or stay the same? "
            "Answer with exactly one of: 'rise', 'fall', 'stay the same'."
        ),
        "expected": "stay the same",
    },
    {
        "id": "logic_race",
        "type": "text",
        "prompt": (
            "In a race, you pass the person in second place. What position are you now in? "
            "Answer with a single word like 'first', 'second', 'third'."
        ),
        "expected": "second",
    },
]


In [91]:
import json
from pprint import pprint
import random

POSSIBLE_TYPES = ['math', 'common_sense', 'planning', 'coding', 'future_prediction']

all_tests = json.load(open("parsed_dev_data.json", "r", encoding="utf-8"))

formatted_tests = []
for i, t in enumerate(all_tests, start=1):
    
    formatted_tests.append({
        "id": t['id'], # domain_domainIndex_domainTestIndex_testIndex
        "type": t['domain'],
        "prompt": t['input'],
        "expected": t['output'],
        "char_count": t['input_char_count']
    })
    
all_tests = formatted_tests

In [92]:
def print_test(test):
    print(json.dumps(test, indent=2, ensure_ascii=False))

#pass test_type as a list of types
def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):
    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]
    return tests[start:end]

def get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):
    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]
    sample_size = min(n, len(filtered_tests)) #prevent error
    return random.sample(filtered_tests, sample_size)

In [93]:
tests = get_random_tests(upper=400, test_type=['math']) #get_test_type('math', end=10, lower=0, upper=500)
pprint(tests)

[{'char_count': 84,
  'expected': '-125',
  'id': 'math_3_272_872',
  'prompt': 'Find the constant term in the expansion of '
            '$$\\left(10x^3-\\frac{1}{2x^2}\\right)^{5}$$',
  'type': 'math'},
 {'char_count': 275,
  'expected': 'Jerry will spend 8 games x 2 hours per game = <<8*2=16>>16 '
              'hours watching one daughter play her games.\n'
              'He will spend 16 x 2 = <<16*2=32>>32 hours watching both '
              'daughters play their games.\n'
              'He will spend 8 games x 4 hours of practice = <<8*4=32>>32 '
              'hours watching one daughter practice.\n'
              'He will spend 32 x 2 = <<32*2=64>>64 hours watching both '
              'daughters practice.\n'
              'He will spend a total of 32 hours watching games + 64 hours '
              'watching practice = <<32+64=96>>96 hours.\n'
              '#### 96',
  'id': 'math_3_110_710',
  'prompt': 'Jerry’s two daughters play softball on different teams. They each '
   

In [94]:
#simple hello world call to kick off the commits
#direct_call(prompt="how do I find the derivative of y=x^2 using python?")

In [95]:
def interactive_chat():
    messages = ["<Start of message history>"]
    count = 0
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Exiting chat.")
            break
        response = call_model_chat_completions(prompt=f"Old messages{messages}, CURRENT USER INPUT:{user_input} <--- ANSWER THIS QUESTION", temperature=0.7)
        count += 1
        messages.append(f"MESSAGE_{count}_[previous user input: {user_input}, previous system response: {response['text']}]")
        if response["ok"]:
            print("Model:", response["text"].strip())
        else:
            print("Error:", response["error"])
        print(messages)
#interactive_chat()

In [96]:
""" def execute_tests():
    rows = []
    for t in tests:
        r = call_model_chat_completions(
            prompt,
            system=system,
            model=model,
            temperature=0.3,
            max_tokens=128
        ) """

' def execute_tests():\n    rows = []\n    for t in tests:\n        r = call_model_chat_completions(\n            prompt,\n            system=system,\n            model=model,\n            temperature=0.3,\n            max_tokens=128\n        ) '

In [97]:
def self_evaluate(question, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.
    Returns True if the model says the prediction matches the expected answer; else False.
    Falls back to a simple normalized string compare if the model's reply is malformed.
    """
    import re

    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading a question-answer pair.

Return exactly True if the PREDICTION would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION:
{question}

PREDICTION:
{prediction}

EXPECTED_ANSWER:
{expected_answer}

Answer with exactly: True or False
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true"):
        return True
    if reply.startswith("false"):
        return False

    # No Fallback yet


In [110]:
def self_evaluate_tests(tests, model=MODEL, grader_model=None, sleep_sec=0.2, verbose=True):
    """
    Run the tests by querying the model for each prompt, then use LLM-as-a-judge
    (self_evaluate) to determine correctness.

    Args:
        tests: list of dicts with keys: id, prompt, expected (and optionally type)
        model: model used to generate predictions
        grader_model: model used to judge correctness (defaults to `model` if None)
        sleep_sec: small delay between calls to be polite to the API
        verbose: if True, print a summary line per test

    Returns:
        rows: list of dicts with fields:
              id, expected, got, correct, status, error
    """
    import time

    judge_model = grader_model or model
    rows = []
    count = 0
    for t in tests:
        count += 1
        # 1) Get model prediction
        #print('prompt:', t['prompt'])
        print_test(t)
        r = call_model_chat_completions(
            f"{t['prompt']}",
            system="Give a short answer to each prompt, don't explain.",
            model=model,
            temperature=0.3,
            max_tokens=300
        )
        got = (r.get("text") or "").strip()
        display(Markdown(f"OUTPUT: \n{got}"))
        # 2) LLM-as-a-judge: strict True/False
        """ is_correct = self_evaluate(
            question=t["prompt"],
            prediction=got,
            expected_answer=t["expected"],
            model=judge_model,
        )

        row = {
            "id": t.get("id", "<unnamed>"),
            "expected": t["expected"],
            "got": got,
            "correct": bool(is_correct),
            "status": r.get("status"),
            "error": r.get("error"),
        }
        
        rows.append(row)
        print(json.dumps(row, indent=2, ensure_ascii=False))
        if verbose:
            mark = "✅" if is_correct else "❌"
            print(f"{mark} {row['id']}: expected={row['expected']!r}, got={row['got']!r} (HTTP {row['status']})")
            if row["error"]:
                print("   error:", row["error"]) """

        if sleep_sec:
            time.sleep(sleep_sec)

    return rows

# Example:


In [112]:
test_prompts = get_random_tests(n=10, upper=300, test_type=["math"])#get_test_type(["math"],end=10, upper=300) get_random_tests(n=3, upper=300)
results_llm_judge = self_evaluate_tests(test_prompts, verbose=True, model=MODEL, grader_model=MODEL)

{
  "id": "math_3_185_785",
  "type": "math",
  "prompt": "The set of points $(x,y,z)$ that satisfy\n\\[2x = 3y = -z\\]is a line.\n\nThe set of points $(x,y,z)$ that satisfy\n\\[6x = -y = -4z\\]is another line.\n\nFind the angle between these lines, in degrees.",
  "expected": "90^\\circ",
  "char_count": 192
}


OUTPUT: 
The angle between the lines is $ \cos^{-1}\left(\frac{1}{\sqrt{3}}\right) $, which is approximately $ 54.7^\circ $.

{
  "id": "math_3_158_758",
  "type": "math",
  "prompt": "Let $a_{0} = 2$ , $a_{1} = 5$ , and $a_{2} = 8$ , and for $n > 2$ define $a_{n}$ recursively to be the remainder when $4(a_{n-1} + a_{n-2} + a_{n-3})$ is divided by $11$ . Find $a_{2018} \\cdot a_{2020} \\cdot a_{2022}$ .",
  "expected": "112",
  "char_count": 219
}


OUTPUT: 
11

{
  "id": "math_3_127_727",
  "type": "math",
  "prompt": "Two squares of a $7\\times 7$ checkerboard are painted yellow, and the rest are painted green. Two color schemes are equivalent if one can be obtained from the other by applying a rotation in the plane board. How many inequivalent color schemes are possible?",
  "expected": "300",
  "char_count": 257
}


OUTPUT: 
13

{
  "id": "math_3_131_731",
  "type": "math",
  "prompt": "Paityn has 20 red hats and 24 blue hats. Her friend Zola has 4/5 times as many red hats as she has and twice the number of blue hats. If they combine all the hats together and share them equally between themselves, calculate the number of hats each gets.",
  "expected": "Paityn has a total of 20 hats + 24 hats = <<20+24=44>>44 hats.\nThe number of red hats that Zola has is 4/5 * 20 hats = <<4/5*20=16>>16 hats\nZola also has 2 * 24 hats = <<2*24=48>>48 blue hats.\nZola has a total of 48 hats + 16 hats = <<48+16=64>>64 hats.\nWhen they combine their hats, they have 64 hats + 44 hats = <<64+44=108>>108 hats\nIf they share the hats equally, each get 108 hats / 2 people = <<108/2=54>>54 hats/person\n#### 54",
  "char_count": 254
}


OUTPUT: 
Paityn has 20 red hats and 24 blue hats.  
Zola has $ \frac{4}{5} \times 20 = 16 $ red hats and $ 2 \times 24 = 48 $ blue hats.  
Total hats = $ (20 + 24) + (16 + 48) = 44 + 64 = 108 $.  
Each gets $ \frac{108}{2} = 54 $ hats.  

**Answer: 54**

{
  "id": "math_3_94_694",
  "type": "math",
  "prompt": "Point $P_{}$ is located inside triangle $ABC$ so that angles $PAB, PBC,$ and $PCA$ are all congruent.  The sides of the triangle have lengths $AB=13, BC=14,$ and $CA=15,$ and the tangent of angle $PAB$ is $m/n,$ where $m_{}$ and $n_{}$ are relatively prime positive integers.  Find $m+n.$",
  "expected": "463",
  "char_count": 288
}


OUTPUT: 
To solve this problem, we recognize that point $ P $ inside triangle $ ABC $ such that $ \angle PAB = \angle PBC = \angle PCA $ is known as the **equal angle point** or **isogonal conjugate** of the incenter. This special point is also known as the **Nagel point** or **equal angle point**, and it has equal angles with the sides of the triangle.

Given triangle $ ABC $ with sides $ AB = 13 $, $ BC = 14 $, and $ CA = 15 $, we are to find the tangent of angle $ PAB $, which is the same as the tangent of the equal angles formed by point $ P $ with the triangle.

This is a known geometric configuration, and the tangent of the equal angle can be found using the formula:

$$
\tan(\angle PAB) = \frac{r}{R - r}
$$

where $ r $ is the inradius and $ R $ is the circumradius of triangle $ ABC $.

### Step 1: Compute the area of triangle $ ABC $

Using Heron's formula:

$$
s = \frac{13 + 14 + 15}{2} = 21
$$
$$
\text{Area} = \sqrt{s(s - a)(s - b)(s - c)} = \sqrt{21 \cdot 8 \cdot 7

{
  "id": "math_3_162_762",
  "type": "math",
  "prompt": "Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?",
  "expected": "He eats 32 from the largest pizzas because 2 x 16 = <<2*16=32>>32\nHe eats 16 from the small pizza because 2 x 8 = <<2*8=16>>16\nHe eats 48 pieces because 32 + 16 = <<32+16=48>>48\n#### 48",
  "char_count": 217
}


OUTPUT: 
24 slices

{
  "id": "math_3_181_781",
  "type": "math",
  "prompt": "Sam memorized six more digits of pi than Carlos memorized. Mina memorized six times as many digits of pi as Carlos memorized. If Mina memorized 24 digits of pi, how many digits did Sam memorize?",
  "expected": "Carlos memorized 24/6=<<24/6=4>>4 digits of pi.\nSam memorized 4+6=10 digits of pi.\n#### 10",
  "char_count": 194
}


OUTPUT: 
12

{
  "id": "math_3_145_745",
  "type": "math",
  "prompt": "Tina makes $18.00 an hour.  If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage.  If she works 10 hours every day for 5 days, how much money does she make?",
  "expected": "She works 8 hours a day for $18 per hour so she makes 8*18 = $<<8*18=144.00>>144.00 per 8-hour shift\nShe works 10 hours a day and anything over 8 hours is eligible for overtime, so she gets 10-8 = <<10-8=2>>2 hours of overtime\nOvertime is calculated as time and a half so and she makes $18/hour so her overtime pay is 18*.5 = $<<18*.5=9.00>>9.00\nHer overtime pay is 18+9 = $<<18+9=27.00>>27.00\nHer base pay is $144.00 per 8-hour shift and she works 5 days and makes 5 * $144 = $<<144*5=720.00>>720.00\nHer overtime pay is $27.00 per hour and she works 2 hours of overtime per day and makes 27*2 = $<<27*2=54.00>>54.00 in overtime pay\n2 hours of overtime pay for 5 days means she makes 5

OUTPUT: 
$270.00

{
  "id": "math_3_108_708",
  "type": "math",
  "prompt": "Leo's assignment was divided into three parts. He finished the first part of his assignment in 25 minutes. It took him twice as long to finish the second part. If he was able to finish his assignment in 2 hours, how many minutes did Leo finish the third part of the assignment?",
  "expected": "It took Leo 25 x 2 = <<25*2=50>>50 minutes to finish the second part of the assignment.\nLeo finished the first and second parts of the assignment in 25 + 50 = <<25+50=75>>75 minutes.\nHe finished the entire assignment in 60 x 2 = <<60*2=120>>120 minutes.\nTherefore, it took Leo 120 - 75 = <<120-75=45>>45 minutes to finish the third part of the assignment.\n#### 45",
  "char_count": 277
}


OUTPUT: 
First part: 25 minutes  
Second part: 2 × 25 = 50 minutes  
Total time: 2 hours = 120 minutes  
Third part: 120 - 25 - 50 = 45 minutes  

45

{
  "id": "math_3_283_883",
  "type": "math",
  "prompt": "Half the value of $3x-9$ is $x+37$. What is the value of $x$?",
  "expected": "83",
  "char_count": 61
}


OUTPUT: 
$ x = 23 $