In [2]:
import os
import sys
import re
import json
from tqdm import tqdm
import pandas as pd

# Add the parent directory to sys.path so that src.agent can be imported
sys.path.append(os.path.abspath('..'))

from src.agent import Agent

### Quick test

In [5]:
# Smoke test: build the agent and run one question through each inference-time algorithm.
# Connection settings are read from the environment, falling back to the defaults below.
robotucus = Agent(
    api_key=os.getenv("OPENAI_API_KEY", "cse476"),
    api_base=os.getenv("API_BASE", "http://10.4.58.53:41701/v1"),
    model_name=os.getenv("MODEL_NAME", "bens_model"),
    temperature=0.0
)

# Raw string: backslashes are taken literally, so LaTeX commands need a single
# backslash (a doubled one would reach the model as `\\frac`, i.e. a LaTeX line break).
QUESTION=r"""
In a new school, $40$ percent of the students are freshmen, $30$ percent are sophomores, $20$ percent are juniors, and $10$ percent are seniors. All freshmen are required to take Latin, and $80$ percent of sophomores, $50$ percent of the juniors, and $20$ percent of the seniors elect to take Latin. The probability that a randomly chosen Latin student is a sophomore is $\frac{m}{n}$ , where $m$ and $n$ are relatively prime positive integers. Find $m+n$ .
""".strip()

# Same question and same call budget for every algorithm so the answers are comparable.
self_refine_answer=robotucus.self_refine(QUESTION,max_calls=20)
react_answer = robotucus.react(QUESTION,max_calls=20)
CoT_answer = robotucus.chain_of_thought(QUESTION,max_calls=20)

print(
    "Answer from each type of Inference time algorithm:"
    f"\nSelf refine: {self_refine_answer}"
    f"\nReAct Answer: {react_answer}"
    f"\nCoT Answer: {CoT_answer}"
)


Answer from each type of Inference time algorithm:
Self refine: $\boxed{25}$
ReAct Answer: To solve the problem, we assume there are 100 students in the school. 
CoT Answer: 25


### Load dataset

In [None]:
# Read the development-set JSON file and wrap its records in a DataFrame.
DATA_PATH='./dataset/cse476_final_project_dev_data.json'
# Explicit UTF-8: without it open() falls back to the locale's encoding,
# which can mis-decode non-ASCII characters in the questions.
with open(DATA_PATH,'r',encoding='utf-8') as file:
    json_file:list=json.load(file)
# Built after the file is closed; the parsed data no longer needs the handle.
dataset:pd.DataFrame=pd.DataFrame(json_file)
dataset.head()

In [None]:
# Distinct values of the 'domain' column, in the order value_counts() reports them.
domain_counts = dataset['domain'].value_counts()
domains = domain_counts.index.tolist()
domains

In [None]:
# Create a mini dataset with up to SAMPLE_SIZE rows drawn from every domain.

SAMPLE_SIZE=10
RANDOM_STATE=42

domain_samples = []
for domain in domains:
    domain_rows = dataset.query("domain==@domain")
    domain_samples.append(
        domain_rows
        # Cap n at the group size: sample() raises ValueError when asked for
        # more rows than the group holds (sampling is without replacement).
        .sample(n=min(SAMPLE_SIZE, len(domain_rows)), random_state=RANDOM_STATE)
        # No drop=True: the original row label is kept as an 'index' column.
        .reset_index()
    )

# Concatenate once instead of growing a frame inside the loop; fall back to an
# empty frame when there are no domains, because pd.concat rejects an empty list.
mini_dataset = (pd.concat(domain_samples, ignore_index=True)
                if domain_samples else pd.DataFrame())

print(f"Mini Dataset size: {len(mini_dataset)}")



### Inference Algorithm test

In [None]:
# Agent used for the benchmark runs; each connection setting is taken from the
# environment when present, otherwise from the default given here.
agent_settings = {
    "api_key": os.getenv("OPENAI_API_KEY", "cse476"),
    "api_base": os.getenv("API_BASE", "http://10.4.58.53:41701/v1"),
    "model_name": os.getenv("MODEL_NAME", "bens_model"),
    "temperature": 0.0,
}
robotucus = Agent(**agent_settings)

In [None]:
# Run every inference-time algorithm over the sampled questions, store each
# algorithm's answers in its own column, then save the frame to CSV.
questions = mini_dataset["input"]

cot_answers = []
for question in tqdm(questions):
    cot_answers.append(robotucus.chain_of_thought(question=question))
mini_dataset['CoT'] = cot_answers

self_refine_answers = []
for question in tqdm(questions):
    self_refine_answers.append(robotucus.self_refine(question=question,max_calls=5))
mini_dataset['self_refine'] = self_refine_answers

react_answers = []
for question in tqdm(questions):
    react_answers.append(robotucus.react(question=question))
mini_dataset['react'] = react_answers

mini_dataset.to_csv("./dataset/inference_algo_results.csv",index=False)

In [None]:
# Preview the first five rows, including the new answer columns.
mini_dataset.head(n=5)

In [None]:
# System prompt for the grading calls: tells the model to act as a strict grader
# and to reply with only True or False depending on whether the student's
# response matches the correct answer. Includes two worked examples.
SYSTEM="""
You are a strict grader. Your task is to evaluate if a student's response exactly matches the correct answer.  

- If the student's response is identical to the correct answer, respond with: True  
- If the student's response differs in any way, respond with: False  

Example:  

Question: What is the capital of France?  
Correct Answer: Paris  
Student: Paris  
Response: True  

Question: What is 40 + 2?  
Correct Answer: 42  
Student: 43  
Response: False  

Always respond with only True or False, nothing else.
"""

In [None]:
# Reload the saved inference results from CSV.
mini_dataset = pd.read_csv(filepath_or_buffer="./dataset/inference_algo_results.csv")

In [None]:
# Ask the model, acting as grader (SYSTEM prompt), to judge every algorithm's
# answer for every question. Each algorithm gets one score list, with exactly
# one entry per row of mini_dataset.
GRADED_COLUMNS = ('react', 'CoT', 'self_refine')
scores = {column: [] for column in GRADED_COLUMNS}

# iterrows() is a generator, so tqdm needs an explicit total to show progress.
for _,row in tqdm(mini_dataset.iterrows(), total=len(mini_dataset)):

    question, answer = row['input'], row['output']

    for column in GRADED_COLUMNS:
        user_prompt=f"question:\n{question}\nCorrect Answer: {answer}\nStudent: {row[column]}"
        verdict = robotucus.call_model(user=user_prompt,system=SYSTEM)
        # Always append, using None for an empty reply. Skipping the append
        # would leave the list shorter than the frame and make the column
        # assignment below fail with a length mismatch.
        scores[column].append(verdict['text'] or None)

react_scores, CoT_scores, self_refine_scores = (scores[column] for column in GRADED_COLUMNS)

mini_dataset['react_scores']=react_scores
mini_dataset['CoT_scores']=CoT_scores
mini_dataset['self_refine_scores']=self_refine_scores
    

In [None]:
# Print how often each grader verdict occurs, one table per algorithm.
for score_column in ('react_scores', 'CoT_scores', 'self_refine_scores'):
    print(mini_dataset[score_column].value_counts())

In [None]:
# Show the first ten chain-of-thought answers as a one-column frame.
mini_dataset.loc[:, ['CoT']].head(n=10)