In [162]:
import json
import os
import re
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

os.environ["OPENAI_API_KEY"] = "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"

In [242]:
def read_jsonl(path: str):
    with open(path) as fh:
        return [json.loads(line) for line in fh.readlines() if line]
    
pattern = r'[a-zA-Z]\)'
def extract_ai_answer(completion):
    try:
        preprocess_res = completion.lower().split('answer:')[-1].strip()
        
        if ')' in preprocess_res:
            matches = re.findall(pattern, preprocess_res)
            return matches[0].split(')')[0].lower().strip()
                    
        
        if '.' in preprocess_res:
            preprocess_res = preprocess_res.replace('.', "").strip()

        return preprocess_res
    except Exception as e:
        print(e)
        return None
    

In [164]:
test_data = read_jsonl('datasets/AQuA/test.json')
dev_data = read_jsonl('datasets/AQuA/dev.json')

In [165]:
len(test_data)

254

In [166]:
len(dev_data)

254

In [229]:
def run_experiment(nr_examples, model_name, dataset, strategy):
    
    prefix_dic = {'cot' : """You are willing to solve algebraic word problems with multiple choice questions. First decompose the problem into intermediate reasoning steps, then solve and explain each intermediate step by generating the rationale. Then choose the final answer to be only one of the given options. The output should include the rationale and the answer. Follow the examples below to output the solution:\n\n
                           """,
                  'standard' : """"You are willing to solve algebraic word problems with multiple choice questions. Choose only one of the given options as the final answer. Follow the examples below and generate the answer using the format of these examples:\n\n
                               """
                 }
    
    prefix = prefix_dic[strategy]
    count_correct_answers = 0
    dataframes_list = []
    list_contexts = []
    accuracy_results = []

    for seed in [100, 200]:    
        random.seed(seed)

        selected_examples = random.sample(dataset, nr_examples)
        context = ''

        for prompt_example in selected_examples:
            context += f"Question: {prompt_example['question']}\n"
            context += f"Options: {prompt_example['options']}\n"
            
            if strategy == 'cot':
                context += f"Rationale: {prompt_example['rationale']}\n"

            context += f"Answer: {prompt_example['correct']}\n\n"

        list_contexts.append(context)    
   
    for i, context in enumerate(list_contexts):
        print(f'Context {i} \n\n')
        count_correct_answers = 0
        df = pd.DataFrame()

        for example in dataset:
            options_answer = f"\nOptions : {example['options']}\nRationale: \nAnswer: "
            suffix = """\n\nQuestion : {question}""" + options_answer
            template = prefix + context + suffix
            prompt = PromptTemplate(input_variables=["question"], template=template)
            llm = OpenAI(model_name=model_name, temperature=0)
            llm_chain = LLMChain(prompt=prompt, llm=llm, verbose=True)
            model_answer = llm_chain.run(example['question'])
            extracted_ai_answer = extract_ai_answer(model_answer)
            extracted_true_answer = example['correct'].lower()
            print(model_answer)
            if extracted_ai_answer == extracted_true_answer:
                count_correct_answers += 1
            
            row_dic = {'question' : [example['question']],
                       'options' : [example['options']],
                       'true_answer' : [extracted_true_answer],
                       'ai_answer' : [extracted_ai_answer],
                       'ai_completion' : [model_answer]
                       }
            
            if strategy == 'cot':
                row_dic['rationale'] = [model_answer.lower().split('answer:')[0]]
                
            df = pd.concat([df, pd.DataFrame(row_dic)], ignore_index=False)
        accuracy_results.append(count_correct_answers/len(dataset)
        dataframes_list.append(df)
    
    return accuracy_results, dataframes_list
    

In [212]:
res1, res2 = run_experiment(2, 'gpt-3.5-turbo', dev_data, strategy='standard')

Context 0 




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m"You are willing to solve algebraic word problems with multiple choice questions. Choose only one of the given options as the final answer. Follow the examples below and generate the answer using the format of these examples:


                               Question: A man earns RS. 20 on the first day and spends Rs. 15 on the next day. He again earns Rs. 20 on the third day and spends Rs. 15 on the fourth day. If he continues to save like this, how soon will he have Rs. 60 in hand?
Options: ['A)on the 17th day', 'B)on the 27th day', 'C)on the 30th day', 'D)on the 40th day', 'E)on the 44th day']
Answer: A

Question: If 300 billion French francs is equivalent to 50 billion United States dollars, which of the following expressions represents the number of United States dollars equivalent to f francs?
Options: ['A)f/7', 'B)f/6', 'C)f/8', 'D)f/10', 'E)10f']
Answer: B



Question : Three birds are




[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m"You are willing to solve algebraic word problems with multiple choice questions. Choose only one of the given options as the final answer. Follow the examples below and generate the answer using the format of these examples:


                               Question: A man earns RS. 20 on the first day and spends Rs. 15 on the next day. He again earns Rs. 20 on the third day and spends Rs. 15 on the fourth day. If he continues to save like this, how soon will he have Rs. 60 in hand?
Options: ['A)on the 17th day', 'B)on the 27th day', 'C)on the 30th day', 'D)on the 40th day', 'E)on the 44th day']
Answer: A

Question: If 300 billion French francs is equivalent to 50 billion United States dollars, which of the following expressions represents the number of United States dollars equivalent to f francs?
Options: ['A)f/7', 'B)f/6', 'C)f/8', 'D)f/10', 'E)10f']
Answer: B



Question : A


[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m"You are willing to solve algebraic word problems with multiple choice questions. Choose only one of the given options as the final answer. Follow the examples below and generate the answer using the format of these examples:


                               Question: A man earns RS. 20 on the first day and spends Rs. 15 on the next day. He again earns Rs. 20 on the third day and spends Rs. 15 on the fourth day. If he continues to save like this, how soon will he have Rs. 60 in hand?
Options: ['A)on the 17th day', 'B)on the 27th day', 'C)on the 30th day', 'D)on the 40th day', 'E)on the 44th day']
Answer: A

Question: If 300 billion French francs is equivalent to 50 billion United States dollars, which of the following expressions represents the number of United States dollars equivalent to f francs?
Options: ['A)f/7', 'B)f/6', 'C)f/8', 'D)f/10', 'E)10f']
Answer: B



Question : A


[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m"You are willing to solve algebraic word problems with multiple choice questions. Choose only one of the given options as the final answer. Follow the examples below and generate the answer using the format of these examples:


                               Question: In how many arrangements can you arrange 3 green cones and 3 blue cones in a line along the boundary of football ground if no two cones of the same color are to be adjacent?
Options: ['A)6', 'B)12', 'C)36', 'D)72', 'E)144']
Answer: D

Question: John spent 72% of his budget on fruits ( including apples) , and 36% of his budget on apples. What percent of the money spent on Fruits were not spent on apples.
Options: ['A)16%', 'B)25%', 'C)32%', 'D)49%', 'E)50 %']
Answer: E



Question : Kirk sells cars. On two sales, Kirk has received commissions of $850, and $705, and he has one additional sale pending. If Kirk is to re

In [213]:
res1

[0.2, 0.3]

In [214]:
res2[0]

Unnamed: 0,question,options,true_answer,ai_answer
0,Three birds are flying at a fast rate of 900 k...,"[A)32400, B)6000, C)600, D)60000, E)10]",a,c
0,A ship is leaving a port. It takes 240 seconds...,"[A)100 m, B)150 m, C)200 m, D)250 m, E)300 m]",d,c
0,A rectangular piece of cloth 2 feet wide was c...,"[A)6, B)18, C)24, D)36, E)48]",c,c
0,"In the xy-coordinate plane, which of the follo...","[A)(1,1), B)(0,3), C)(2,0), D)(3,6), E)(6,3)]",b,c
0,A travel company wants to charter a plane to t...,"[A)7, B)9, C)13, D)27, E)45]",c,b
0,"Kirk sells cars. On two sales, Kirk has receiv...","[A)$895, B)$905, C)$910, D)$915, E)$945]",b,d
0,A group of 5 friends were to contribute equall...,"[A)$203, B)$202, C)$209, D)$205, E)$204]",e,d
0,"Let A, B and C denote the vertices of a triang...","[A)5.5, B)6, C)7, D)8, E)8.25]",b,c
0,ABC company pays an average of $120 per vehicl...,"[A)300, B)420, C)912, D)1340, E)1500]",d,c
0,Solution A has 5% salt concentration and remai...,"[A)30 litres, B)27 litres, C)25 litres, D)22.5...",d,d


In [215]:
res2[1]

Unnamed: 0,question,options,true_answer,ai_answer
0,Three birds are flying at a fast rate of 900 k...,"[A)32400, B)6000, C)600, D)60000, E)10]",a,c
0,A ship is leaving a port. It takes 240 seconds...,"[A)100 m, B)150 m, C)200 m, D)250 m, E)300 m]",d,c
0,A rectangular piece of cloth 2 feet wide was c...,"[A)6, B)18, C)24, D)36, E)48]",c,c
0,"In the xy-coordinate plane, which of the follo...","[A)(1,1), B)(0,3), C)(2,0), D)(3,6), E)(6,3)]",b,c
0,A travel company wants to charter a plane to t...,"[A)7, B)9, C)13, D)27, E)45]",c,b
0,"Kirk sells cars. On two sales, Kirk has receiv...","[A)$895, B)$905, C)$910, D)$915, E)$945]",b,b
0,A group of 5 friends were to contribute equall...,"[A)$203, B)$202, C)$209, D)$205, E)$204]",e,d
0,"Let A, B and C denote the vertices of a triang...","[A)5.5, B)6, C)7, D)8, E)8.25]",b,b
0,ABC company pays an average of $120 per vehicl...,"[A)300, B)420, C)912, D)1340, E)1500]",d,b
0,Solution A has 5% salt concentration and remai...,"[A)30 litres, B)27 litres, C)25 litres, D)22.5...",d,b


In [230]:
res1, res2 = run_experiment(2, 'gpt-3.5-turbo', dev_data, strategy='cot')

Context 0 




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are willing to solve algebraic word problems with multiple choice questions. First decompose the problem into intermediate reasoning steps, then solve and explain each intermediate step by generating the rationale. Then choose the final answer to be only one of the given options. The output should include the rationale and the answer. Follow the examples below to output the solution:


                           Question: A man earns RS. 20 on the first day and spends Rs. 15 on the next day. He again earns Rs. 20 on the third day and spends Rs. 15 on the fourth day. If he continues to save like this, how soon will he have Rs. 60 in hand?
Options: ['A)on the 17th day', 'B)on the 27th day', 'C)on the 30th day', 'D)on the 40th day', 'E)on the 44th day']
Rationale: For every 2 days, he saves rs.5.
After 16 days, the savings become rs.40.
on the 17th day, he earns rs.20
so 40+20=rs.60.
ANSWER:A




[1m> Finished chain.[0m
The speed of the birds in miles per hour is 900 km/h * 0.6 miles/km = 540 miles/hour.
To convert miles per hour to miles per minute, we divide by 60: 540 miles/hour / 60 minutes/hour = 9 miles/minute.
Therefore, the answer is E) 10, which is the closest option to 9. Answer: E


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are willing to solve algebraic word problems with multiple choice questions. First decompose the problem into intermediate reasoning steps, then solve and explain each intermediate step by generating the rationale. Then choose the final answer to be only one of the given options. The output should include the rationale and the answer. Follow the examples below to output the solution:


                           Question: A man earns RS. 20 on the first day and spends Rs. 15 on the next day. He again earns Rs. 20 on the third day and spends Rs. 15 on the fourth day. If he continues to save like this, how


[1m> Finished chain.[0m
To make a profit, the total revenue from ticket sales must exceed the cost of chartering the plane.
Total revenue = ticket price x number of passengers
Total cost = charter cost + (ticket price x number of current passengers)
Let's set up an equation to solve for the number of additional passengers needed:
200x > 5000 + 12(200)
200x > 8600
x > 43
Since we can't have a fraction of a passenger, we round up to the nearest whole number.
43 + 12 = 55
55 passengers are needed to make a profit, and 12 have already signed up, so we need 43 more.
Answer: D


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are willing to solve algebraic word problems with multiple choice questions. First decompose the problem into intermediate reasoning steps, then solve and explain each intermediate step by generating the rationale. Then choose the final answer to be only one of the given options. The output should include the rationale and the answe


[1m> Finished chain.[0m
Let the total cost of garage parking fees for 5 vehicles be x.
The total cost of outdoor parking fees for 3 vehicles is 3*$120=$360.
The total cost of parking for 8 vehicles is 8*$240=$1920.
So, the total cost of garage parking fees for 5 vehicles is x=$1920-$360=$1560.
Therefore, ABC pays $1560/5=$312 per month in garage parking fees for its vehicles.
Answer: A)300


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are willing to solve algebraic word problems with multiple choice questions. First decompose the problem into intermediate reasoning steps, then solve and explain each intermediate step by generating the rationale. Then choose the final answer to be only one of the given options. The output should include the rationale and the answer. Follow the examples below to output the solution:


                           Question: A man earns RS. 20 on the first day and spends Rs. 15 on the next day. He again earns Rs. 20 


[1m> Finished chain.[0m
Substitute each point into the equation kx + 2y = 6:
A) k(1) + 2(1) = k + 2
B) k(0) + 2(3) = 6
C) k(2) + 2(0) = 2k
D) k(3) + 2(6) = 6k + 12
E) k(6) + 2(3) = 6k + 6
For all of these points to lie on the line for every possible value of k, the expressions above must be equal to 6. 
Solving for each point, we get:
A) k + 2 = 6 --> k = 4
B) 6 = 6 (already satisfies the equation)
C) 2k = 6 --> k = 3
D) 6k + 12 = 6 --> k = -1
E) 6k + 6 = 6 --> k = 0
Therefore, the only point that must lie on the line for every possible value of k is (0,3).
Answer: B.


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are willing to solve algebraic word problems with multiple choice questions. First decompose the problem into intermediate reasoning steps, then solve and explain each intermediate step by generating the rationale. Then choose the final answer to be only one of the given options. The output should include the rationale and the answer. 


[1m> Finished chain.[0m
Let x be the amount paid per month for garage parking fees for each of the five vehicles.
The total amount paid for parking each month is (3*$120)+(5*x)=$360+5x.
The average amount paid per vehicle overall is $240, so (360+5x)/8=$240.
Solving for x, we get x=$300.
Answer: A.


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are willing to solve algebraic word problems with multiple choice questions. First decompose the problem into intermediate reasoning steps, then solve and explain each intermediate step by generating the rationale. Then choose the final answer to be only one of the given options. The output should include the rationale and the answer. Follow the examples below to output the solution:


                           Question: In how many arrangements can you arrange 3 green cones and 3 blue cones in a line along the boundary of football ground if no two cones of the same color are to be adjacent?
Options: ['A