In [1]:
import os
import re
import json

from tqdm import tqdm
import pandas as pd
from src.agent import Agent

In [2]:
# Sample math prompt kept as a raw string so the LaTeX backslashes survive
# verbatim. In a raw string a single backslash is already literal, so the
# fraction must be written as \frac (a doubled backslash would end up in the
# prompt as two characters).
QUESTION = r"""
In a new school, $40$ percent of the students are freshmen, $30$ percent are sophomores, $20$ percent are juniors, and $10$ percent are seniors. All freshmen are required to take Latin, and $80$ percent of sophomores, $50$ percent of the juniors, and $20$ percent of the seniors elect to take Latin. The probability that a randomly chosen Latin student is a sophomore is $\frac{m}{n}$ , where $m$ and $n$ are relatively prime positive integers. Find $m+n$ .
""".strip()

### Quick test

In [3]:
# Smoke test: build an agent from environment settings (falling back to the
# course defaults below) and send it one trivial prompt.
agent_settings = {
    "api_key": os.getenv("OPENAI_API_KEY", "cse476"),
    "api_base": os.getenv("API_BASE", "http://10.4.58.53:41701/v1"),
    "model_name": os.getenv("MODEL_NAME", "bens_model"),
    "temperature": 0.0,
}
robotucus = Agent(**agent_settings)

# The reply is kept in `test` for manual inspection; it is not displayed here.
test = robotucus.chain_of_thought("what is your name")

### Load dataset

In [4]:
# Load the development set from JSON into a DataFrame.
DATA_PATH = './dataset/cse476_final_project_dev_data.json'

# Read explicitly as UTF-8: the records contain non-ASCII characters, and the
# platform default encoding is not UTF-8 everywhere.
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    json_file: list = json.load(file)

# Build the frame after the file handle is closed; only the parsed records
# are needed from here on.
dataset: pd.DataFrame = pd.DataFrame(json_file)
dataset.head()

Unnamed: 0,input,output,domain
0,Let $ABCD$ be a convex quadrilateral with $AB ...,112,math
1,A tennis player computes her win ratio by divi...,164,math
2,What is the product of the real roots of the e...,20,math
3,"In $\triangle ABC$ , $AB= 425$ , $BC=450$ , an...",306,math
4,How many even integers between 4000 and 7000 h...,728,math


In [5]:
# Distinct domain labels, in the order value_counts() returns them
# (most frequent first by default).
domain_counts = dataset['domain'].value_counts()
domains = domain_counts.index.tolist()
domains

['common_sense', 'math', 'coding', 'future_prediction', 'planning']

In [6]:
# Create a stratified mini dataset: SAMPLE_SIZE rows from every domain
# (10 rows x 5 domains = 50 rows with the current data).

SAMPLE_SIZE=10
RANDOM_STATE=42

# Draw one sample per domain, then concatenate once at the end instead of
# growing a DataFrame inside the loop (which re-copies all rows on every
# iteration and starts from an empty frame).
# reset_index() without drop=True keeps each row's original position in an
# 'index' column so samples can be traced back to `dataset`.
domain_samples = [
    dataset.loc[dataset["domain"] == domain]
    .sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)
    .reset_index()
    for domain in domains
]

# pd.concat raises on an empty list; fall back to an empty frame in that case.
mini_dataset = (
    pd.concat(domain_samples, ignore_index=True)
    if domain_samples
    else pd.DataFrame()
)

print(f"Mini Dataset size: {len(mini_dataset)}")



Mini Dataset size: 50


### Inference Algorithm test

In [7]:
# Agent used for the inference-algorithm runs; settings come from the
# environment with the same fallbacks as the quick test.
robotucus = Agent(
    temperature=0.0,
    model_name=os.getenv("MODEL_NAME", "bens_model"),
    api_base=os.getenv("API_BASE", "http://10.4.58.53:41701/v1"),
    api_key=os.getenv("OPENAI_API_KEY", "cse476"),
)

# Prompts to evaluate: the 'input' column of the sampled mini dataset.
questions = mini_dataset["input"]


# Chain-of-thought baseline (currently disabled):
# mini_dataset['CoT'] = [robotucus.chain_of_thought(question=question) for question in tqdm(questions)]

In [9]:
# Run self_refine (max_calls=5) on every sampled question and store the
# responses in a new column. The recorded run took about 34 minutes for
# 50 questions.
self_refine_answers = []
for question in tqdm(questions):
    self_refine_answers.append(robotucus.self_refine(question=question, max_calls=5))
mini_dataset['self_refine'] = self_refine_answers

100%|██████████| 50/50 [34:02<00:00, 40.85s/it]


In [None]:
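# TODO: placeholder for a ReAct run (disabled). Note that the line below still calls
# chain_of_thought, so enabling it as-is would store CoT output under the 'react' column.
# mini_dataset['react']= [robotucus.chain_of_thought(question=question) for question in tqdm(questions)]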

In [10]:
# Persist the sampled questions together with the model responses.
# index=False leaves the DataFrame's row index out of the file.
results_path = "./dataset/inference_algo_results.csv"
mini_dataset.to_csv(results_path, index=False)

In [12]:
# Show only the math rows to compare reference outputs with the responses.
mini_dataset.loc[mini_dataset["domain"] == "math"]

Unnamed: 0,index,input,output,domain,self_refine
10,603,How many positive whole-number divisors does 1...,9,math,"To evaluate the correctness of the statement ""..."
11,666,Expand and simplify completely: \begin{align*}...,x^3+3x-6,math,"To evaluate the correctness of the answer, we ..."
12,452,Arnel had ten boxes of pencils with the same n...,Arnel shared 5 x 8 = <<5*8=40>>40 pencils with...,math,### Review and Analysis:\n\nLet's go through t...
13,9,"A gardener plants three maple trees, four oaks...",106,math,"To evaluate the answer ""116"" without additiona..."
14,633,Consider the geometric sequence $\frac{125}{9}...,\frac{243}{625},math,"To evaluate the answer ""The eighth term of the..."
15,626,In how many ways can $7$ people sit around a r...,144,math,"To evaluate the answer ""576"" without additiona..."
16,496,James has a rainwater collection barrel. For ...,It rained 3+4=<<3+4=7>>7 inches\nSo he collect...,math,Let's analyze the answer step by step to ident...
17,409,Tina makes $18.00 an hour. If she works more ...,She works 8 hours a day for $18 per hour so sh...,math,"To evaluate the answer ""Tina makes $990,"" we n..."
18,5,"For all positive integers $x$ , let \[f(x)=\be...",511,math,"To evaluate the answer ""The sum of the distinc..."
19,475,Dale and Andrew had breakfast at a cafe. A sli...,The cost of Dale's toast is 2 × $1 = $<<2*1=2>...,math,"To evaluate the answer ""£15"" as a response to ..."
