### Import Libraries

In [1]:
import os, json, textwrap, re, time
import pandas as pd
from tqdm import tqdm,trange

import requests 

from typing import Any

### Constants

In [2]:
# Connection settings for the OpenAI-compatible endpoint used by this notebook.
# Each value is read from the environment and falls back to the literal default
# when the corresponding variable is unset.
API_KEY = os.environ.get("OPENAI_API_KEY", "cse476")
API_BASE = os.environ.get("API_BASE", "http://10.4.58.53:41701/v1")
MODEL = os.environ.get("MODEL_NAME", "bens_model")

### Useful Methods

In [3]:
def call_model(prompt: str,
               system: str = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. Reply with only the final answer—no explanation.",
               model: str = MODEL,
               temperature: float = 0.0,
               timeout: int = 60,
               max_tokens: int = 1024) -> dict[str, Any]:
    """
    Calls an OpenAI-style /v1/chat/completions endpoint and returns:
    { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int, 'error': str or None, 'headers': dict }

    Args:
        prompt: Content of the user message.
        system: Content of the system message.
        model: Model name sent in the request payload.
        temperature: Sampling temperature sent in the request payload.
        timeout: Timeout in seconds passed to ``requests.post``.
        max_tokens: Completion token limit sent in the request payload.

    Returns:
        A dict with the keys listed above. ``ok`` is True only when the server
        answered with HTTP 200 and the body contained
        ``choices[0].message.content``. On a transport error ``status`` is -1.
        This function does not raise for network, HTTP, or malformed-body
        failures; they are all reported through ``ok``/``error``.
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}

    status = response.status_code
    hdrs   = dict(response.headers)

    if status != 200:
        # try best-effort to surface error text
        try:
            err_text = response.json()
        except ValueError:
            err_text = response.text
        return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}

    # A 200 response can still carry a body that is not JSON; report it instead of raising.
    try:
        data = response.json()
    except ValueError as e:
        return {"ok": False, "text": None, "raw": None, "status": status,
                "error": f"Invalid JSON in response body: {e}", "headers": hdrs}

    # Guard against an empty `choices` list or a missing/null `message`, which
    # would otherwise escape as IndexError/KeyError/TypeError.
    try:
        text = data["choices"][0]["message"]["content"]
    except (LookupError, TypeError) as e:
        return {"ok": False, "text": None, "raw": data, "status": status,
                "error": f"Malformed completion payload: {e!r}", "headers": hdrs}

    return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}


In [4]:
def normalize_text(text: str) -> str:
    """
    Normalize an answer string so that two answers can be compared exactly.

    Steps, in order:
      1. Treat ``None``/empty as ``""``; strip and lowercase.
      2. Drop a leading list marker such as ``0)``, ``a)`` or ``(2)`` at the
         start of each line.
      3. Replace every character that is not a word character, whitespace,
         ``-`` or ``'`` with a space.
      4. Collapse whitespace runs to a single space.
      5. Map a few whole-string synonyms to a canonical form.

    Args:
        text: Raw answer text (``None`` is accepted and treated as empty).

    Returns:
        The normalized string.
    """
    text = (text or "").strip().lower()

    # Remove bullet markers. The text is already lowercase here, so the marker
    # pattern must match lowercase letters; it is anchored to the start of a line
    # so that digits inside the answer itself (e.g. "f(2)") are left alone.
    text = re.sub(r"(?m)^\s*\(?(?:[a-z]|\d+)\)\s*", "", text)
    # Remove surrounding punctuation and extra whitespace
    text = re.sub(r"[^\w\s\-']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Map common synonyms used in these tests
    synonyms = {
        "unchanged": "stay the same",
        "no change": "stay the same",
        "same": "stay the same",
        "second place": "second",
        "2nd": "second",
        "first place": "first",
        "third place": "third",
    }
    return synonyms.get(text, text)

### Load Dataset

In [6]:
# Location of the dev-set JSON file. Set the DEV_DATA_PATH environment variable to
# run this notebook on another machine; the fallback is the original local path.
PATH = os.getenv(
    "DEV_DATA_PATH",
    '/Users/porter/Programming/final_project_tutorial_and_dev_data/dataset/cse476_final_project_dev_data.json',
)

# Read with an explicit encoding so the result does not depend on the platform default.
with open(PATH, 'r', encoding='utf-8') as file:
    dataset = pd.DataFrame(json.load(file))

### Visualization

In [7]:
# Number of examples per domain.
dataset.loc[:, 'domain'].value_counts()

domain
common_sense         400
math                 300
coding               100
future_prediction    100
planning             100
Name: count, dtype: int64

In [8]:
# Show only the rows whose domain is common_sense.
dataset[dataset['domain'].eq('common_sense')]

Unnamed: 0,input,output,domain
500,Which magazine was started first Arthur's Maga...,Arthur's Magazine,common_sense
501,The Oberoi family is part of a hotel company t...,Delhi,common_sense
502,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,common_sense
503,What nationality was James Henry Miller's wife?,American,common_sense
504,Cadmium Chloride is slightly soluble in this c...,alcohol,common_sense
...,...,...,...
995,What do ugly ducklings become when they grow u...,Ugly ducklings become ducks when they grow up,common_sense
996,What happens if you eat a pomegranate?\nWhat i...,Nothing in particular happens when you eat a p...,common_sense
997,What real child was raised by wolves?\nWhat is...,Marcos Rodriguez Pandoja was raised by wolves,common_sense
998,Can reindeer really fly?\nWhat is the best ans...,"No, reindeer can't really fly",common_sense


In [9]:
# Spot-check one example: send its input to the model, then print the input,
# the model's reply and the reference output, separated by a divider line.
data: pd.Series = dataset.iloc[998]
test = call_model(data['input'])

print(
    data['input'],
    test['text'],
    data['output'],
    sep="\n" + "\n\n============================================================\n\n" + "\n",
)

Can reindeer really fly?
What is the best answer for the question among these?
0) No, reindeer can't really fly
1) No, Santa and his reindeer aren't real
2) Yes, Santa and his reindeer really fly
3) No, reindeer can't really fly
4) Yes, reindeer can really fly
5) No, flying reindeer aren't real




0) No, reindeer can't really fly




No, reindeer can't really fly


### Base Model Scoring Across Domains

In [10]:
"""Generating base model response to questions"""


# Query the model once per dataset prompt and keep the normalized reply.
# A failed request is logged and stored as None so that it cannot be mistaken
# for an empty answer from the model.
model_output: list[Any] = []

for prompt in tqdm(dataset['input'], unit='request'):

    try:
        response: dict[str, Any] = call_model(prompt)
    except Exception as e:
        # Best-effort: keep the run going, but say which kind of error occurred.
        tqdm.write(f"Unexpected error while querying the model: {e!r}")
        model_output.append(None)
        continue

    if not response['ok']:
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"Request failed (status {response['status']}): {response['error']}")
        model_output.append(None)
        continue

    model_output.append(normalize_text(response['text']))

dataset['model_response'] = model_output

n_failed = sum(reply is None for reply in model_output)
if n_failed:
    print(f"{n_failed} of {len(model_output)} requests failed and were recorded as None")

100%|██████████| 1000/1000 [23:44<00:00,  1.42s/request] 


In [12]:
# Preview the first five rows, now including the model_response column.
dataset.head(n=5)

Unnamed: 0,input,output,domain,model_response
0,Let $ABCD$ be a convex quadrilateral with $AB ...,112,math,we are given a convex quadrilateral abcd with ...
1,A tennis player computes her win ratio by divi...,164,math,let x be the number of matches she won before ...
2,What is the product of the real roots of the e...,20,math,the product of the real roots is 15
3,"In $\triangle ABC$ , $AB= 425$ , $BC=450$ , an...",306,math,we are given a triangle triangle abc with side...
4,How many even integers between 4000 and 7000 h...,728,math,answer 432


In [15]:
"""Evaluating result through comparison of answers """


def _parse_verdict(reply):
    """Reduce the grader's free-text reply to 'True' or 'False'; None if neither word appears."""
    found = re.search(r"\b(true|false)\b", (reply or "").lower())
    return found.group(1).capitalize() if found else None


# Both columns hold the strings 'True' / 'False' so they can be compared directly:
#   result     - exact match between the normalized answer and normalized response
#   model_eval - verdict of the model acting as grader (None if no verdict was returned)
compared_result: list[str] = []
model_eval: list[Any] = []

for i in trange(len(dataset)):

    answer: str = normalize_text(str(dataset['output'].iloc[i]))

    # A missing response (failed request) is graded as an empty answer rather
    # than being stringified into the literal text "none"/"nan".
    raw_response = dataset['model_response'].iloc[i]
    model_response: str = "" if pd.isna(raw_response) else normalize_text(str(raw_response))
    
    PROMPT=f'''
        You are a strict grader who excels in differentiating a response is correct or not. If they're correct you respond with True, if not then respond with False. 
        DO NOT overthink anything more than the response itself.

        Given answer: {answer} 
        the response: {model_response} 

        Determine whether the response is correct to the answer or not
    '''
    
    # Canonicalize the reply so "True.", "true" etc. compare equal to 'True'.
    model_eval.append(_parse_verdict(call_model(prompt=PROMPT)['text']))
    compared_result.append(str(answer == model_response))

dataset['result']=compared_result
dataset['model_eval']=model_eval

100%|██████████| 1000/1000 [02:27<00:00,  6.76it/s]


In [16]:
# Save the graded results. The output directory is created first because
# to_csv does not create missing parent directories.
os.makedirs('dataset', exist_ok=True)
dataset.to_csv('dataset/base_result.csv')
dataset.head()

Unnamed: 0,input,output,domain,model_response,result,model_eval
0,Let $ABCD$ be a convex quadrilateral with $AB ...,112,math,we are given a convex quadrilateral abcd with ...,False,False
1,A tennis player computes her win ratio by divi...,164,math,let x be the number of matches she won before ...,False,True
2,What is the product of the real roots of the e...,20,math,the product of the real roots is 15,False,False
3,"In $\triangle ABC$ , $AB= 425$ , $BC=450$ , an...",306,math,we are given a triangle triangle abc with side...,False,False
4,How many even integers between 4000 and 7000 h...,728,math,answer 432,False,False


In [17]:
# Tally of exact-match results, followed by the tally of grader verdicts.
print(
    dataset['result'].value_counts(),
    dataset['model_eval'].value_counts(),
    sep="\n",
)

result
False    913
True      87
Name: count, dtype: int64
model_eval
False    758
True     242
Name: count, dtype: int64


### EDA of Base Model Result

In [None]:
# Rows where the grader verdict and the exact-match result disagree.
dataset[dataset['model_eval'].ne(dataset['result'])]

Unnamed: 0,input,output,domain,model_response,result,model_eval
1,A tennis player computes her win ratio by divi...,164,math,let x be the number of matches she won before ...,False,True
7,"Let $\mathcal{S}$ be the set $\lbrace1,2,3,\ld...",501,math,we are given the set mathcal s 1 2 3 ldots 10 ...,False,True
10,"Suppose that $x,$ $y,$ and $z$ are three posit...",5,math,we are given the following equations 1 xyz 1 2...,False,True
12,For how many positive integers $n$ less than o...,250,math,we are given the identity sin t i cos t n sin ...,False,True
13,"Jenn randomly chooses a number $J$ from $1, 2,...",29,math,we are given that - jenn chooses a number j fr...,False,True
...,...,...,...,...,...,...
953,Where can you find a real psychic?\nWhat is th...,Psychics aren't real,common_sense,you can't find a real psychic since psychics a...,False,True
955,What methods can people use to contact the dea...,There are no methods that people can use to co...,common_sense,people have claimed to contact the dead but th...,False,True
967,Are vampires real?\nWhat is the best answer fo...,"No, vampires are not real",common_sense,no vampires do not exist,False,True
970,Who is the bell-ringer of Notre Dame?\nWhat is...,I have no comment,common_sense,quasimodo is the bell-ringer of notre dame,False,True


In [None]:
# Print the full input, reference output and model response of row i.
i = 0
print(
    *(dataset[column].iloc[i] for column in ('input', 'output', 'model_response')),
    sep="\n",
)


Let $ABCD$ be a convex quadrilateral with $AB = CD = 10$ , $BC = 14$ , and $AD = 2\sqrt{65}$ . Assume that the diagonals of $ABCD$ intersect at point $P$ , and that the sum of the areas of triangles $APB$ and $CPD$ equals the sum of the areas of triangles $BPC$ and $APD$ . Find the area of quadrilateral $ABCD$ .
112
we are given a convex quadrilateral abcd with the following side lengths - ab 10 - cd 10 - bc 14 - ad 2 sqrt 65 also the diagonals intersect at point p and it is given that text area triangle apb text area triangle cpd text area triangle bpc text area triangle apd this implies that the diagonals divide the quadrilateral into four triangles of equal total area and the sum of the areas of opposite triangles is equal this is a key geometric property that often occurs in kites or isosceles trapezoids but here we are given specific side lengths let s denote the area of quadrilateral abcd as a then the total area is a text area triangle apb text area triangle bpc text area triang