In [1]:
import os
import re
import json

from tqdm import tqdm
import pandas as pd
from src.agent import Agent

In [2]:
# Sample math word problem used below as a one-off smoke test of the agent.
# The literal is a raw string, so LaTeX commands need a single backslash
# (`\frac`); a doubled backslash would be passed through verbatim and read as
# a LaTeX line break followed by the word "frac".
QUESTION=r"""
In a new school, $40$ percent of the students are freshmen, $30$ percent are sophomores, $20$ percent are juniors, and $10$ percent are seniors. All freshmen are required to take Latin, and $80$ percent of sophomores, $50$ percent of the juniors, and $20$ percent of the seniors elect to take Latin. The probability that a randomly chosen Latin student is a sophomore is $\frac{m}{n}$ , where $m$ and $n$ are relatively prime positive integers. Find $m+n$ .
""".strip()

### Quick test

In [3]:
# Connection settings are read from the environment first; the literals are
# the fallbacks used when a variable is unset.
agent_settings = {
    "api_key": os.getenv("OPENAI_API_KEY", "cse476"),
    "api_base": os.getenv("API_BASE", "http://10.4.58.53:41701/v1"),
    "model_name": os.getenv("MODEL_NAME", "bens_model"),
    "temperature": 0.0,
}
robotucus = Agent(**agent_settings)

# Smoke test: run the sample question through the agent's `react` method and
# display whatever it returns.
test = robotucus.react(QUESTION)
test

"Answer wasn't reach in <20 calls"

### Load dataset

In [4]:
# Path to the development data file (JSON).
DATA_PATH='./dataset/cse476_final_project_dev_data.json'

# JSON text is UTF-8 by specification; name the encoding explicitly so the
# load does not depend on the platform's locale default.
with open(DATA_PATH,'r',encoding='utf-8') as file:
    json_file:list=json.load(file)

# Build the frame once the file handle has been released; only parsing needs
# the file to be open.
dataset:pd.DataFrame=pd.DataFrame(json_file)
dataset.head()

Unnamed: 0,input,output,domain
0,Let $ABCD$ be a convex quadrilateral with $AB ...,112,math
1,A tennis player computes her win ratio by divi...,164,math
2,What is the product of the real roots of the e...,20,math
3,"In $\triangle ABC$ , $AB= 425$ , $BC=450$ , an...",306,math
4,How many even integers between 4000 and 7000 h...,728,math


In [5]:
# Distinct domain labels in the order `value_counts` yields them
# (most frequent first).
domain_counts = dataset['domain'].value_counts()
domains = domain_counts.index.tolist()
domains

['common_sense', 'math', 'coding', 'future_prediction', 'planning']

In [6]:
# Create mini dataset: SAMPLE_SIZE rows drawn from every domain

SAMPLE_SIZE=10
RANDOM_STATE=42

# Collect the per-domain samples first and concatenate once at the end.
# Growing a DataFrame with `pd.concat` inside the loop re-copies every
# previously added row on each iteration.
domain_samples = []
for domain in domains:
    domain_rows = dataset[dataset['domain'] == domain]
    domain_samples.append(
        domain_rows
        .sample(n=SAMPLE_SIZE,random_state=RANDOM_STATE)
        .reset_index()  # keeps the row's original label in an `index` column
    )

# `pd.concat` rejects an empty list, so fall back to an empty frame when
# there are no domains at all.
mini_dataset = (pd.concat(domain_samples,ignore_index=True)
                if domain_samples else pd.DataFrame())

print(f"Mini Dataset size: {len(mini_dataset)}")



Mini Dataset size: 50


### Inference Algorithm test

In [13]:
# Re-create the agent for the inference comparison. Each setting comes from
# the environment when present, otherwise from the literal fallback.
agent_settings = {
    "api_key": os.getenv("OPENAI_API_KEY", "cse476"),
    "api_base": os.getenv("API_BASE", "http://10.4.58.53:41701/v1"),
    "model_name": os.getenv("MODEL_NAME", "bens_model"),
    "temperature": 0.0,
}
robotucus = Agent(**agent_settings)

In [14]:
questions = mini_dataset["input"]

# Result column -> callable that answers a single question with that
# strategy. Dict insertion order fixes the run order:
# chain-of-thought, then self-refine, then ReAct.
strategies = {
    'CoT': lambda question: robotucus.chain_of_thought(question=question),
    'self_refine': lambda question: robotucus.self_refine(question=question,max_calls=5),
    'react': lambda question: robotucus.react(question=question),
}

for column, answer_with in strategies.items():
    answers = []
    for question in tqdm(questions):
        answers.append(answer_with(question))
    mini_dataset[column] = answers

# Write the answers to disk; a later cell reloads this CSV for grading.
mini_dataset.to_csv("./dataset/inference_algo_results.csv",index=False)

100%|██████████| 50/50 [06:30<00:00,  7.81s/it]
100%|██████████| 50/50 [37:07<00:00, 44.56s/it]


  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 50/50 [13:44<00:00, 16.48s/it]


In [None]:
# Preview the first five rows of the results frame.
# NOTE(review): the saved output under this cell lists rows 10-19, which this
# call would not produce — the output looks stale; re-run to confirm.
mini_dataset.head(n=5)

Unnamed: 0,index,input,output,domain,react,CoT,self_refine
10,603,How many positive whole-number divisors does 1...,9,math,196 has 9 positive whole-number divisors.,196 has 9 positive whole-number divisors.,196 has 9 positive whole-number divisors.
11,666,Expand and simplify completely: \begin{align*}...,x^3+3x-6,math,Function analysis involves examining the purpo...,$x^3 + 3x - 6$,The expression has been fully simplified and i...
12,452,Arnel had ten boxes of pencils with the same n...,Arnel shared 5 x 8 = <<5*8=40>>40 pencils with...,math,5,There are 5 pencils in each box.,no answer
13,9,"A gardener plants three maple trees, four oaks...",106,math,15,15.142857,116
14,633,Consider the geometric sequence $\frac{125}{9}...,\frac{243}{625},math,"The common ratio is 3/5, and the function repr...",$\frac{97}{250}$,The eighth term of the sequence is $\boxed{\fr...
15,626,In how many ways can $7$ people sit around a r...,144,math,576,144,576
16,496,James has a rainwater collection barrel. For ...,It rained 3+4=<<3+4=7>>7 inches\nSo he collect...,math,James made $126 from selling all the water.,$126,James made $126 by selling 105 gallons of rain...
17,409,Tina makes $18.00 an hour. If she works more ...,She works 8 hours a day for $18 per hour so sh...,math,Tina makes $990.,"$1,350.00",Tina makes $990.
18,5,"For all positive integers $x$ , let \[f(x)=\be...",511,math,2925,13,The sum of the distinct prime factors of $ m $...
19,475,Dale and Andrew had breakfast at a cafe. A sli...,The cost of Dale's toast is 2 × $1 = $<<2*1=2>...,math,£15,£15,£15


In [None]:
# System prompt passed to `robotucus.call_model` in the grading loop. It asks
# the model to act as an exact-match grader and to reply with only the word
# True or False. The text is sent to the model verbatim, so any edit here
# changes the grading behavior.
SYSTEM="""
You are a strict grader. Your task is to evaluate if a student's response exactly matches the correct answer.  

- If the student's response is identical to the correct answer, respond with: True  
- If the student's response differs in any way, respond with: False  

Example:  

Question: What is the capital of France?  
Correct Answer: Paris  
Student: Paris  
Response: True  

Question: What is 40 + 2?  
Correct Answer: 42  
Student: 43  
Response: False  

Always respond with only True or False, nothing else.
"""

In [56]:
# Reload the saved inference answers so grading can run without repeating
# the model calls.
mini_dataset = pd.read_csv(
    filepath_or_buffer="./dataset/inference_algo_results.csv",
)

In [58]:
def _grade(question, answer, response):
    """Ask the grader model whether `response` matches `answer`.

    Builds the user prompt from the question, the reference answer and the
    student response, then sends it to `robotucus.call_model` together with
    the `SYSTEM` grading prompt.

    Returns the reply's `text` field (stripped of surrounding whitespace when
    it is a string), or None when that field is empty. Returning a placeholder
    instead of skipping the row keeps every score list the same length as the
    frame.
    """
    # NOTE(review): the label here is lowercase `question:` while the examples
    # in SYSTEM use `Question:`; left as-is so the prompts sent are unchanged.
    user_prompt=f"question:\n{question}\nCorrect Answer: {answer}\nStudent: {response}"
    reply = robotucus.call_model(user=user_prompt,system=SYSTEM)
    text = reply['text']
    if isinstance(text, str):
        text = text.strip()
    return text or None


# Answer columns to grade, in the order the grader is called for each row.
METHOD_COLUMNS = ('react', 'CoT', 'self_refine')
scores = {method: [] for method in METHOD_COLUMNS}

# `iterrows()` is a generator with no length, so pass `total` to give tqdm a
# real progress bar.
for _,row in tqdm(mini_dataset.iterrows(), total=len(mini_dataset)):
    for method in METHOD_COLUMNS:
        scores[method].append(_grade(row['input'], row['output'], row[method]))

# Keep the original list names available for any later cell that reads them.
react_scores, CoT_scores, self_refine_scores = (scores[method] for method in METHOD_COLUMNS)

# Exactly one entry per row was appended, so these assignments always align.
for method in METHOD_COLUMNS:
    mini_dataset[f'{method}_scores'] = scores[method]
    

50it [00:24,  2.08it/s]


In [None]:
# Tally the grader's verdicts for each strategy, one table per score column.
for score_column in ('react_scores', 'CoT_scores', 'self_refine_scores'):
    verdict_counts = mini_dataset[score_column].value_counts()
    print(verdict_counts)

react_scores
False    38
True     12
Name: count, dtype: int64
CoT_scores
False    42
True      8
Name: count, dtype: int64
self_refine_scores
False    44
True      6
Name: count, dtype: int64


In [73]:
# Show the chain-of-thought answers for the first ten rows, kept as a
# one-column frame.
mini_dataset.head(10)[['CoT']]

Unnamed: 0,CoT
0,William Golding.
1,"The Bellamy Brothers had a 70s No 1 hit with ""..."
2,Film director.
3,The innovation for the car developed by Prince...
4,The name of the Emmy Award winner and two-time...
5,"The population of Lagos, Nigeria, according to..."
6,6) There is no consensus on who first said this
7,The current name of the Atlanta Mansion built ...
8,The provided context does not contain the nece...
9,"Yes, a felony jury can be sufficient for a Bun..."
