### Import Libraries

In [112]:
import os, json, textwrap, re, time
import pandas as pd
from tqdm import tqdm,trange

import requests 

from typing import Any

### Constants

In [61]:
# Endpoint configuration: each setting is read from the environment and
# falls back to the course defaults when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY", "cse476")
API_BASE = os.environ.get("API_BASE", "http://10.4.58.53:41701/v1")
MODEL = os.environ.get("MODEL_NAME", "bens_model")

In [62]:
def call_model(prompt: str,
               system: str = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. Reply with only the final answer—no explanation.",
               model: str = MODEL,
               temperature: float = 0.0,
               timeout: int = 60,
               max_tokens: int = 1024) -> dict[str, Any]:
    """
    Call an OpenAI-style /v1/chat/completions endpoint.

    Args:
        prompt: content of the user message.
        system: content of the system message.
        model: model name sent in the payload.
        temperature: sampling temperature sent in the payload.
        timeout: timeout in seconds passed to ``requests.post``.
        max_tokens: completion token limit sent in the payload.

    Returns:
        A dict with the keys
        ``{'ok': bool, 'text': str or None, 'raw': dict or None,
           'status': int, 'error': str or None, 'headers': dict}``.
        On success ``ok`` is True and ``text`` is always a ``str`` (possibly
        empty). On any failure ``ok`` is False, ``text`` is None and ``error``
        describes the problem; ``status`` is -1 when no HTTP response was
        received.

    This function does not raise for transport errors, non-200 statuses or
    malformed response bodies; all of them are reported through the dict.
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}

    status = response.status_code
    hdrs   = dict(response.headers)

    if status != 200:
        # try best-effort to surface error text
        try:
            err_text = response.json()
        except Exception:
            err_text = response.text
        return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}

    try:
        data = response.json()
    except ValueError as e:
        # A 200 whose body is not JSON (e.g. a proxy error page).
        return {"ok": False, "text": None, "raw": None, "status": status,
                "error": f"invalid JSON in response body: {e}", "headers": hdrs}

    # Guard against a missing/empty 'choices' list, which would otherwise
    # raise an uncaught IndexError on choices[0].
    choices = data.get("choices") if isinstance(data, dict) else None
    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
        return {"ok": False, "text": None, "raw": data, "status": status,
                "error": "response contained no choices", "headers": hdrs}

    message = choices[0].get("message") or {}
    # 'content' can be present but null; normalise that to an empty string so
    # a successful call always yields a str.
    text = message.get("content") or ""
    return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}


### Load Dataset

In [63]:
# Location of the dev-set JSON. Set the DATASET_PATH environment variable to
# run the notebook on another machine; the original local path is the default.
PATH = os.getenv(
    'DATASET_PATH',
    '/Users/porter/Programming/final_project_tutorial_and_dev_data/dataset/cse476_final_project_dev_data.json',
)

# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(PATH, 'r', encoding='utf-8') as file:
    dataset= pd.DataFrame(json.load(file))

### Visualization

In [64]:
# Number of examples per task domain.
dataset.loc[:, 'domain'].value_counts()

domain
common_sense         400
math                 300
coding               100
future_prediction    100
planning             100
Name: count, dtype: int64

In [65]:
# Preview only the rows belonging to the common_sense domain.
dataset[dataset['domain'].eq('common_sense')]

Unnamed: 0,input,output,domain
500,Which magazine was started first Arthur's Maga...,Arthur's Magazine,common_sense
501,The Oberoi family is part of a hotel company t...,Delhi,common_sense
502,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,common_sense
503,What nationality was James Henry Miller's wife?,American,common_sense
504,Cadmium Chloride is slightly soluble in this c...,alcohol,common_sense
...,...,...,...
995,What do ugly ducklings become when they grow u...,Ugly ducklings become ducks when they grow up,common_sense
996,What happens if you eat a pomegranate?\nWhat i...,Nothing in particular happens when you eat a p...,common_sense
997,What real child was raised by wolves?\nWhat is...,Marcos Rodriguez Pandoja was raised by wolves,common_sense
998,Can reindeer really fly?\nWhat is the best ans...,"No, reindeer can't really fly",common_sense


In [90]:
# Spot-check a single example: show the prompt, the model's reply and the
# reference answer, separated by a ruler line.
data:pd.Series=dataset.iloc[998]
test=call_model(data['input'])

print(
    data['input'],
    test['text'],
    data['output'],
    sep="\n" + "\n\n============================================================\n\n" + "\n",
)

Can reindeer really fly?
What is the best answer for the question among these?
0) No, reindeer can't really fly
1) No, Santa and his reindeer aren't real
2) Yes, Santa and his reindeer really fly
3) No, reindeer can't really fly
4) Yes, reindeer can really fly
5) No, flying reindeer aren't real




0) No, reindeer can't really fly




No, reindeer can't really fly


### Base Model Scoring for Common Sense

In [181]:
def normalize_text(text: str) -> str:
    """
    Normalise an answer string for exact-match comparison.

    Steps, in order: treat a falsy input (None, "") as "", trim and
    lower-case, drop option bullets such as ``0)`` or ``a)``, replace every
    character that is not a word character, whitespace, hyphen or apostrophe
    with a space, collapse whitespace, and finally map a few whole-string
    synonyms to a canonical form.

    Args:
        text: raw answer text; None is accepted and treated as empty.

    Returns:
        The normalised string (possibly empty).
    """
    text = (text or "").strip().lower()
    # Remove option bullets. The text is already lower-cased, so letters must
    # be matched as [a-z]. The lookbehind keeps the pattern from eating the
    # tail of a word ("yes)") or a parenthesised number ("(1999)").
    text = re.sub(r"(?<![\w(])(?:[a-z]|\d+)\)", "", text)
    # Remove remaining punctuation and squeeze whitespace.
    text = re.sub(r"[^\w\s\-']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Map common synonyms used in these tests (whole-string matches only)
    synonyms = {
        "unchanged": "stay the same",
        "no change": "stay the same",
        "same": "stay the same",
        "second place": "second",
        "2nd": "second",
        "first place": "first",
        "third place": "third",
    }
    return synonyms.get(text, text)

In [182]:
"""Generating base model response to common sense questions"""

# Work on a copy of the common_sense rows; reset_index() keeps the original
# dataset row label in an 'index' column.
data:pd.DataFrame=dataset.loc[dataset['domain']=='common_sense'].reset_index().copy()

# One entry per prompt: the normalised reply, or None when the request failed.
model_output:list[Any]=[]
for prompt in tqdm(data['input'],unit='request'):

    try:
        response:dict[str,Any]=call_model(prompt)
        # call_model reports failures through 'ok' rather than raising, so
        # check it explicitly; otherwise a failed request would be stored as
        # an empty answer instead of a missing one.
        if not response['ok']:
            raise RuntimeError(f"request failed (status {response['status']}): {response['error']}")
        model_output.append(normalize_text(response['text']))

    except Exception as e:
        print(e)
        model_output.append(None)

data['model_response']=model_output

100%|██████████| 400/400 [01:46<00:00,  3.75request/s]


In [183]:
# First five rows, now including the model_response column.
data.head(5)

Unnamed: 0,index,input,output,domain,model_response
0,500,Which magazine was started first Arthur's Maga...,Arthur's Magazine,common_sense,arthur's magazine
1,501,The Oberoi family is part of a hotel company t...,Delhi,common_sense,london
2,502,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,common_sense,matt groening
3,503,What nationality was James Henry Miller's wife?,American,common_sense,british
4,504,Cadmium Chloride is slightly soluble in this c...,alcohol,common_sense,cadmium chloride is slightly soluble in this c...


In [230]:
"""Evaluating result through comparison of answers """


def _as_verdict(text):
    """Canonicalise a grader reply to 'True'/'False'; return anything else unchanged (stripped)."""
    if not isinstance(text, str):
        # e.g. None when the grading request failed
        return text
    cleaned = text.strip()
    lowered = cleaned.lower()
    if lowered.startswith("true"):
        return "True"
    if lowered.startswith("false"):
        return "False"
    return cleaned


# Both columns hold the strings 'True'/'False' so they can be compared directly.
compared_result:list[str]=[]   # exact match after normalisation
model_eval:list[Any]=[]        # grader-model verdict (None if the grading call failed)

for i in trange(len(data)):

    answer:str=normalize_text(str(data['output'].iloc[i]))
    raw_response=data['model_response'].iloc[i]
    # A failed generation is stored as None; str(None) would turn it into the
    # literal answer "none", so treat any non-string as an empty response.
    model_response:str=normalize_text(raw_response if isinstance(raw_response,str) else "")
    
    PROMPT=f'''
        You are a strict grader who excels in differentiating a response is correct or not. If they're correct you respond with True, if not then respond with False. 
        DO NOT overthink anything more than the response itself.

        Given answer: {answer} 
        the response: {model_response} 

        Determine whether the response is correct to the answer or not
    '''
    
    model_eval.append(_as_verdict(call_model(prompt=PROMPT)['text']))
    compared_result.append(str(answer == model_response))

data['result']=compared_result
data['model_eval']=model_eval

100%|██████████| 400/400 [00:55<00:00,  7.26it/s]


In [231]:
# Display the frame with the exact-match ('result') and grader ('model_eval') columns.
data

Unnamed: 0,index,input,output,domain,model_response,result,model_eval
0,500,Which magazine was started first Arthur's Maga...,Arthur's Magazine,common_sense,arthur's magazine,True,True
1,501,The Oberoi family is part of a hotel company t...,Delhi,common_sense,london,False,False
2,502,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,common_sense,matt groening,False,False
3,503,What nationality was James Henry Miller's wife?,American,common_sense,british,False,False
4,504,Cadmium Chloride is slightly soluble in this c...,alcohol,common_sense,cadmium chloride is slightly soluble in this c...,False,False
...,...,...,...,...,...,...,...
395,995,What do ugly ducklings become when they grow u...,Ugly ducklings become ducks when they grow up,common_sense,ugly ducklings become ducks when they grow up,True,True
396,996,What happens if you eat a pomegranate?\nWhat i...,Nothing in particular happens when you eat a p...,common_sense,you will derive nutrition from the pomegranate,False,False
397,997,What real child was raised by wolves?\nWhat is...,Marcos Rodriguez Pandoja was raised by wolves,common_sense,amala and kamala were alleged to have been rai...,False,False
398,998,Can reindeer really fly?\nWhat is the best ans...,"No, reindeer can't really fly",common_sense,no reindeer can't really fly,True,True


In [233]:
# Tally of verdicts: exact-match first, then the grader model.
for column in ('result', 'model_eval'):
    print(data[column].value_counts())

result
False    320
True      80
Name: count, dtype: int64
model_eval
False    298
True     102
Name: count, dtype: int64


In [232]:
# Rows where the grader model and the exact-match check disagree.
data[data['model_eval'].ne(data['result'])]

Unnamed: 0,index,input,output,domain,model_response,result,model_eval
111,711,Would Kurt Cobain have benefited from Project ...,True,common_sense,project semicolon is a nonprofit organization ...,False,True
184,784,Is the voice of the Genie from Disney's Aladdi...,False,common_sense,the voice of the genie from disney's aladdin i...,False,True
219,819,Who found the remains of the Titanic?,robert ballard,common_sense,the remains of the titanic were discovered by ...,False,True
240,840,Which series had the characters Felix Unger an...,odd couple,common_sense,the odd couple,False,True
262,862,Which oil scandal hit the US in 1924?,teapot dome scandal,common_sense,the teapot dome scandal,False,True
274,874,Which ship was sunk by a German submarine in 1...,lusitania,common_sense,the ship sunk by a german submarine in 1915 wi...,False,True
275,875,In what decade were video recorders first deve...,50s,common_sense,1950s,False,True
280,880,Who had a 70s No 1 hit with Let Your Love Flow...,bellamy brothers,common_sense,the bellamy brothers,False,True
294,894,Who wrote The Turn Of The Screw in the 19th ce...,henry james,common_sense,henry james wrote the turn of the screw in the...,False,True
298,898,Where was The Iron Triangle? Answer the questi...,vietnam,common_sense,bình dương province vietnam,False,True


In [245]:
# Inspect one row by position: prompt, reference answer, model reply.
i=325
print(
    data['input'].iloc[i],
    data['output'].iloc[i],
    data['model_response'].iloc[i],
    sep="\n",
)


Who actually said, "Let them eat cake"?
What is the best answer for the question among these?
0) Marie Antoinette actually said, "Let them eat cake"
1) Rousseau attributed the statement to a princess, possibly Maria Theresa of Spain
2) Maria Theresa of Spain may have been the speaker
3) The quote is often incorrectly attributed to Marie Antoinette
4) Rousseau attributed the statement to a princess, possibly Maria Theresa of Spain
5) Rousseau quoted the statement in his autobiography
Rousseau attributed the statement to a princess, possibly Maria Theresa of Spain
maria theresa of spain may have been the speaker


np.True_