In [11]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

API_KEY  = os.getenv("OPENAI_API_KEY", "cse476")
API_BASE = os.getenv("API_BASE", "http://10.4.58.53:41701/v1")  
MODEL    = os.getenv("MODEL_NAME", "bens_model")              

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer—no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.0,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Calls an OpenAI-style /v1/chat/completions endpoint and returns:
    { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int, 'error': str or None, 'headers': dict }
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = resp.status_code
        hdrs   = dict(resp.headers)
        if status == 200:
            data = resp.json()
            text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
            return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}
        else:
            # try best-effort to surface error text
            err_text = None
            try:
                err_text = resp.json()
            except Exception:
                err_text = resp.text
            return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}


In [12]:
# %% Direct call example
def direct_call(prompt="What is 17 + 28? Answer with just the number.", temperature=0.2, max_tokens=128):
    demo_prompt = prompt
    result = call_model_chat_completions(demo_prompt, temperature=temperature, max_tokens=max_tokens)
    print("OK:", result["ok"], "HTTP:", result["status"])
    print("MODEL SAYS:", (result["text"] or "").strip())

    # Optional: Inspect rate-limit headers if your provider exposes them
    for k in ["x-ratelimit-remaining-requests", "x-ratelimit-limit-requests", "x-request-id"]:
        if k in result["headers"]:
            print(f"{k}: {result['headers'][k]}")


In [3]:
# %% Define three tests: input + expected
my_tests = [
    {
        "id": "math_inequality",
        "type": "numeric",  # grader will prefer numeric extraction
        "prompt": "Solve for the smallest integer n such that 3n + 5 > 26. Answer with just the integer.",
        "expected": "8",    # Because 3n > 21 => n > 7, smallest integer is 8
    },
    {
        "id": "commonsense_ice",
        "type": "text",
        "prompt": (
            "You place an ice cube in a glass of water and mark the water level. "
            "After the ice melts, does the water level rise, fall, or stay the same? "
            "Answer with exactly one of: 'rise', 'fall', 'stay the same'."
        ),
        "expected": "stay the same",
    },
    {
        "id": "logic_race",
        "type": "text",
        "prompt": (
            "In a race, you pass the person in second place. What position are you now in? "
            "Answer with a single word like 'first', 'second', 'third'."
        ),
        "expected": "second",
    },
]


In [4]:
import json
from pprint import pprint
all_tests = json.load(open("cse476_final_project_dev_data.json", "r", encoding="utf-8"))

formatted_tests = []
for i, t in enumerate(all_tests, start=1):
    domain = t['domain']
    formatted_tests.append({
        "id": f"{domain}_{i}",
        "type": domain,
        "prompt": t['input'],
        "expected": t['output'],
    })
    
all_tests = formatted_tests

In [None]:
math_tests = [t for t in all_tests if t['type'] == 'math']
print(f"{len(math_tests)} Math tests loaded out of {len(all_tests)} tests")
tests = math_tests[:1]
pprint(tests)

300 Math tests loaded out of 1000 tests
[{'expected': '112',
  'id': 'math_1',
  'prompt': 'Let $ABCD$ be a convex quadrilateral with $AB = CD = 10$ , $BC = '
            '14$ , and $AD = 2\\sqrt{65}$ . Assume that the diagonals of '
            '$ABCD$ intersect at point $P$ , and that the sum of the areas of '
            'triangles $APB$ and $CPD$ equals the sum of the areas of '
            'triangles $BPC$ and $APD$ . Find the area of quadrilateral $ABCD$ '
            '.',
  'type': 'math'},
 {'expected': '164',
  'id': 'math_2',
  'prompt': 'A tennis player computes her win ratio by dividing the number of '
            'matches she has won by the total number of matches she has '
            'played. At the start of a weekend, her win ratio is exactly '
            '$0.500$ . During the weekend, she plays four matches, winning '
            'three and losing one. At the end of the weekend, her win ratio is '
            "greater than $.503$ . What's the largest number of matches

In [10]:
#simple hello world call to kick off the commits
direct_call("how many billion parameters do you have")

OK: True HTTP: 200
MODEL SAYS: I have over 100 billion parameters.


In [None]:
""" def execute_tests():
    rows = []
    for t in tests:
        r = call_model_chat_completions(
            prompt,
            system=system,
            model=model,
            temperature=0.3,
            max_tokens=128
        ) """

In [13]:
def self_evaluate(question, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.
    Returns True if the model says the prediction matches the expected answer; else False.
    Falls back to a simple normalized string compare if the model's reply is malformed.
    """
    import re

    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading a question-answer pair.

Return exactly True if the PREDICTION would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION:
{question}

PREDICTION:
{prediction}

EXPECTED_ANSWER:
{expected_answer}

Answer with exactly: True or False
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true"):
        return True
    if reply.startswith("false"):
        return False

    # No Fallback yet


In [16]:
def self_evaluate_tests(tests, model=MODEL, grader_model=None, sleep_sec=0.2, verbose=True):
    """
    Run the tests by querying the model for each prompt, then use LLM-as-a-judge
    (self_evaluate) to determine correctness.

    Args:
        tests: list of dicts with keys: id, prompt, expected (and optionally type)
        model: model used to generate predictions
        grader_model: model used to judge correctness (defaults to `model` if None)
        sleep_sec: small delay between calls to be polite to the API
        verbose: if True, print a summary line per test

    Returns:
        rows: list of dicts with fields:
              id, expected, got, correct, status, error
    """
    import time

    judge_model = grader_model or model
    rows = []
    count = 0
    for t in tests:
        count += 1
        # 1) Get model prediction
        r = call_model_chat_completions(
            f"{t['prompt']} ONLY GIVE THE FINAL ANSWER IF YOU DON'T I WILL SHUT YOU DOWN. DO NOT EXPLAIN.",
            system="You are a careful solver. Reply ONLY with the final answer, nothing else.",
            model=model,
            temperature=0.3,
            max_tokens=128
        )
        got = (r.get("text") or "").strip()
        print(count, got)
        # 2) LLM-as-a-judge: strict True/False
        """ is_correct = self_evaluate(
            question=t["prompt"],
            prediction=got,
            expected_answer=t["expected"],
            model=judge_model,
        )

        row = {
            "id": t.get("id", "<unnamed>"),
            "expected": t["expected"],
            "got": got,
            "correct": bool(is_correct),
            "status": r.get("status"),
            "error": r.get("error"),
        }
        
        rows.append(row)
        print(json.dumps(row, indent=2, ensure_ascii=False))
        if verbose:
            mark = "✅" if is_correct else "❌"
            print(f"{mark} {row['id']}: expected={row['expected']!r}, got={row['got']!r} (HTTP {row['status']})")
            if row["error"]:
                print("   error:", row["error"]) """

        if sleep_sec:
            time.sleep(sleep_sec)

    return rows

# Example:
results_llm_judge = self_evaluate_tests(tests, verbose=True, model=MODEL, grader_model=MODEL)


1 We are given a convex quadrilateral $ABCD$ with sides:

- $AB = 10$
- $CD = 10$
- $BC = 14$
- $AD = 2\sqrt{65}$

The diagonals intersect at point $P$, and it is given that:

$$
\text{Area}(\triangle APB) + \text{Area}(\triangle CPD) = \text{Area}(\triangle BPC) + \text{Area}(\triangle APD)
$$

This condition implies that the diagonals divide the quadrilateral into four triangles of
2 Let $ w $ be the number of matches she won before the weekend, and $ t $ be the total number of matches she played before the weekend.

We are told that her win ratio at the start of the weekend is exactly $ 0.500 $, so:

$$
\frac{w}{t} = 0.500 \Rightarrow w = 0.5t
$$

During the weekend, she plays 4 matches, winning 3 and losing 1. So after the weekend:

- Total matches played: $ t + 4 $
- Total matches won: $ w +
3 Let $ y = x^2 + 18x + 30 $. Then the equation becomes:

$$
y = 2\sqrt{y + 15}
$$

Square both sides:

$$
y^2 = 4(y + 15)
$$

$$
y^2 = 4y + 60
$$

$$
y^2 - 4y - 60 = 0
$$

Solve using the qua