In [5]:
import pandas as pd

step = pd.read_csv('./datasets_human_readable/step_test.csv', encoding='ISO-8859-1')
step['label'] = step['label'].map({0:'A', 1:'B',2:"C",3:"D"})
step


Unnamed: 0,goal,step0,step1,step2,step3,label,Human Evaluation,Noteworthy
0,Solve an Algebraic Expression,Learn how to observe your own thoughts.,Perform a waste audit.,Know the order of operations.,Find your loan information.,C,C,
1,Fake Food Poisoning,Go to a historical preservation.,Place time restrictions on impulsive thoughts.,Decide on the background or the foreground.,Invent a dicey dining situation.,D,D,
2,Prevent Toxic Shock Syndrome,Draw lines at the bottom loop and then they wi...,Follow instructions carefully when using vagin...,Spot test your blood on a dirty shirt to see h...,Place the wine glass upside down on a soft tow...,B,C,
3,Report a School for Discriminatory Discipline,Consider filing an institutional grievance.,Double check all payment amounts.,Copy and mail your form.,Identify a suspected fraudulent mortgage.,A,A,
4,List for Sale by Owner Using MLS Listing,Learning about more than.,Know what a payment gateway is.,Know what a finance charge is.,Know what the Multiple Listing Service system is.,D,D,
...,...,...,...,...,...,...,...,...
2245,Figure Out a Song by Ear,Identify the tonic (I) chord.,Use the basic accounting equation to make a ba...,Look at day two so the amount that starts doub...,Upload a video of your project if you have one.,A,,
2246,Stay Focused on God,Look after yourself physically.,Discourage your friend from contacting the ex.,Have and seek respect.,Make time for your faith every day.,D,,
2247,Save Money for Something Big As a Teen,Do some research on whatever country you are i...,Do an internship with Homeland Security if you...,Look for work in mental health clinics and psy...,Get a job (if you are old enough).,D,,
2248,Move to Switzerland,Ask veterans you know to share their experiences.,Take on shopping at the famous Union Station.,Convert your currency into Swiss francs.,Determine your characterï¿½ï¿½ï¿½s ï¿½ï¿½ï¿½wo...,C,,


# Setup Common Functions

In [12]:
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers.enum import EnumOutputParser
from langchain.output_parsers.fix import OutputFixingParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain.llms import Ollama
from tqdm import tqdm 
import time 
from enum import Enum
from langchain.globals import set_verbose

set_verbose(True)
    

class ChainManager:
    def __init__(self):
        self.prompt = PromptTemplate.from_template("Tell me a short joke about {input}")
        self.output_parser = StrOutputParser()
        self.df = step
#         self.model_list = ["mistral"]
        self.model_list = ["llama2","mistral", "orca-mini:7b", "qwen:7b"]
        self.logs = []

    def run_without_classifier (self, inputs, model_name, verbose, output_file_name=""):
        chain = (
            self.prompt
            | Ollama(model=model_name)
            | self.output_parser)
        output = chain.invoke(inputs)
        full_prompt = self.prompt.format(goal=inputs["goal"], step0=inputs["step0"],step1=inputs["step1"],step2=inputs["step2"], step3=inputs["step3"])

        if verbose:
            print("----------------------------------------------------------------------")
            print(f"model_name: {model_name}")
            print(f"prompt: {full_prompt}")
            print(f"output: {output}")
        else:
            self.write_string_to_buffer("----------------------------------------------------------------------")
            self.write_string_to_buffer(f"model_name: {model_name}")
            self.write_string_to_buffer(f"prompt: {full_prompt}")
            self.write_string_to_buffer(f"output: {output}")
            
        return (output.strip()[0].upper(), 1)

    def run_single_query(self, inputs, model_name, verbose, output_file_name=""):
        full_prompt = self.prompt.format(goal=inputs["goal"], step0=inputs["step0"],step1=inputs["step1"],step2=inputs["step2"], step3=inputs["step3"])
        if verbose:
            print("----------------------------------------------------------------------")
            print(f"model_name: {model_name}")
            print(f"prompt: {full_prompt}")
        else:
            self.write_string_to_buffer("----------------------------------------------------------------------")
            self.write_string_to_buffer(f"model_name: {model_name}")
            self.write_string_to_buffer(f"prompt: {full_prompt}")
        
#         generated_tutorial = self.run_information_retrieval(inputs)
        
        chain = (
            self.prompt
            | Ollama(model=model_name)
            | self.output_parser)

        chain_of_thought = chain.invoke(inputs)
        classifier_output = self.run_retry_classifier(chain_of_thought, 3, verbose)
        final_answer = classifier_output[0]
        num_retries = classifier_output[1]
        
        if verbose:
            print(f"Classifier finished...\n")
            print(f"chain_of_thought: {chain_of_thought}\n")
            print(f"final_answer: {final_answer}")
            print(f"correct_answer: {inputs['label']}")
        else:
            self.write_string_to_buffer(f"classifier finished....\n")
            self.write_string_to_buffer(f"chain_of_thought: {chain_of_thought}\n")
            self.write_string_to_buffer(f"final_answer: {final_answer}")
            self.write_string_to_buffer(f"correct_answer: {inputs['label']}")
            self.write_buffer_to_file(output_file_name)
        return classifier_output
    
    def run_batch_query(self, verbose, batch_size, output_file_name=""):
        self.df = self.df.iloc[0:batch_size].copy()
        input_list = self.df.to_dict('records')
        if output_file_name != "":
            self.clear_existing_file(output_file_name)
        for model_name in self.model_list:
            results = []
            num_retries = []
            for item in tqdm(input_list, desc="Processing queries"):
#                     result = self.run_single_query(item, model_name, verbose, output_file_name) 
                    result = self.run_single_self_review(item, model_name, verbose, output_file_name) 
#                     result = self.run_without_classifier(item, model_name, verbose, output_file_name) 
                    results.append(result[0])
                    num_retries.append(result[1])
                    
            self.df[model_name] = results 
            self.df[model_name+"_retries"] = num_retries 

    def evaluate_order(self):
        for model_name in self.model_list:
            binary_results = self.df[model_name].str.strip().str[0]
            correct_predictions = (self.df['label'] == binary_results).sum()
            total_predictions = len(self.df)
            accuracy = correct_predictions / total_predictions
            print(f"{model_name}: {accuracy}")
        
    
    def verify_output(self, mcq_choice):
        if mcq_choice == "A" or mcq_choice == "B" or mcq_choice == "C" or mcq_choice == "D":
            return True
        else: 
            return False        
        
    def run_classifier(self, initial_answer):
        
        classifier_prompt = PromptTemplate.from_template("""
        You are recieving an explanation from a language model about its choice for an mcq question.
        You are a mcq classifier. You are to select the answer based on the explanation provided.
        You are not to explain or mention anything other than provide the answer for the mcq.
        If not enough information, randomly select one.
        Please do not give me anything other than one letter thanks!
        MCQ Choices: A, B, C, D            

        Explanation: {initial_answer}
        Answer:
        """)
        
        chain = (
            classifier_prompt
            | Ollama(model="mistral")
            | self.output_parser
            
        )
        output = chain.invoke({"initial_answer": initial_answer})
        return output
    
    def run_retry_classifier(self, initial_answer, max_tries, verbose, output_file_name=""):
        if verbose:
            print("Running classifier....")
        else:
            self.write_string_to_buffer("Running classifier....")

        mcq_choice = 0
        for i in range(max_tries):
            classifier_output = self.run_classifier(initial_answer)
            if verbose:
                print(f"retry_classifier_{i+1}: {classifier_output}")
            else:
                self.write_string_to_buffer(f"retry_classifier_{i+1}: {classifier_output}")
            mcq_choice = classifier_output.strip()[0].upper()
            if self.verify_output(mcq_choice):
                return (mcq_choice, i+1)
        return (mcq_choice, max_tries)
    
    def run_self_review(self, question, answer, model):
        info_retrieval_prompt = PromptTemplate.from_template("""
        Question: {question}
        
        Previous answer: {answer}
        
        Review your previous answer and find problems with your answer.
        """)
    
        chain = (
            info_retrieval_prompt
            | model
            | self.output_parser
            
        )
        output = chain.invoke({"question": question, "answer": answer})
        return output
        
    def run_single_self_review(self, inputs, model_name, verbose, output_file_name=""):
        
        llm_model = Ollama(model=model_name)
        
        full_prompt = self.prompt.format(goal=inputs["goal"], step0=inputs["step0"],step1=inputs["step1"])
        if verbose:
            print("----------------------------------------------------------------------")
            print(f"model_name: {model_name}")
            print(f"prompt: {full_prompt}")
        else:
            self.write_string_to_buffer("----------------------------------------------------------------------")
            self.write_string_to_buffer(f"model_name: {model_name}")
            self.write_string_to_buffer(f"prompt: {full_prompt}")
                        
        chain = (
            self.prompt
            | llm_model
            | self.output_parser)

        chain_of_thought = chain.invoke(inputs)
        
        critique = self.run_self_review(full_prompt, chain_of_thought, llm_model)
        
        final_prompt = PromptTemplate.from_template("""
        Question: {question}
                
        Critique: {critique}
        
        Based on the problems you found, improve your answer.
        """)
        
        final_chain = (
            final_prompt
            | llm_model
            | self.output_parser
        )
        
        improved_answer = final_chain.invoke({"question": full_prompt, "answer": chain_of_thought, "critique": critique})
        
        classifier_output = self.run_retry_classifier(improved_answer, 3, verbose)
        final_answer = classifier_output[0]
        num_retries = classifier_output[1]
        
        if verbose:
            print(f"Classifier finished...\n")
            print(f"chain_of_thought: {chain_of_thought}\n")
            print(f"critique: {critique}\n")
            print(f"improved_answer: {improved_answer}")
            print(f"final_answer: {final_answer}")
            print(f"correct_answer: {inputs['label']}")
        else:
            self.write_string_to_buffer(f"classifier finished....\n")
            self.write_string_to_buffer(f"chain_of_thought: {chain_of_thought}")
            self.write_string_to_buffer(f"critique: {critique}\n")
            self.write_string_to_buffer(f"improved_answer: {improved_answer}\n")
            self.write_string_to_buffer(f"final_answer: {final_answer}")
            self.write_string_to_buffer(f"correct_answer: {inputs['label']}")
            self.write_buffer_to_file(output_file_name)
        return  classifier_output
    

    def write_string_to_buffer(self, input_string):
        self.logs.append(input_string)
    
    def write_buffer_to_file(self, filename):
        with open(filename, 'a') as file:
            for log in self.logs:
                file.write("\n"+log)
            self.logs = []
        
    def clear_existing_file(self, filename):
         with open(filename, 'w') as file:
            file.write("")

### Single Prompt + classifier

In [25]:
simple_QA_chain = ChainManager()

simple_QA_chain.prompt = PromptTemplate.from_template("""
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial of how to {goal}
MCQ:
A: {step0}
B: {step1}
C: {step2}
D: {step3}

""")

simple_QA_chain.run_batch_query(False, 350, "/Users/kohjunkai/Desktop/step_simple_QA.txt")
simple_QA_chain.df.to_csv('/Users/kohjunkai/Desktop/step_simple_QA.csv', index=False)

Processing queries: 100%|█████████████████████| 350/350 [1:22:02<00:00, 14.07s/it]
Processing queries: 100%|███████████████████████| 350/350 [31:38<00:00,  5.43s/it]
Processing queries: 100%|█████████████████████| 350/350 [1:03:01<00:00, 10.80s/it]
Processing queries: 100%|█████████████████████| 350/350 [1:26:18<00:00, 14.79s/it]


In [14]:
simple_QA_chain = ChainManager()

simple_QA_chain.prompt = PromptTemplate.from_template("""
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial of how to {goal}
MCQ:
A: {step0}
B: {step1}
C: {step2}
D: {step3}
You are to provide a clear explanation for your choice.
At the end of your explanation, explicitly state which step belongs in the tutorial.
""")

simple_QA_chain.run_batch_query(False, 10, "/Users/kohjunkai/Desktop/without_classifier.txt")
simple_QA_chain.df.to_csv('/Users/kohjunkai/Desktop/without_classifier.csv', index=False)

Processing queries:   5%|█▏                      | 1/20 [00:06<01:59,  6.27s/it]

----------------------------------------------------------------------
model_name: llama2:text
prompt: 
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial.

For example:
Determine which step belongs in the tutorial of how to Send Money from India
A: Have your forms reviewed if possible.
B: Choose how the funds will be distributed.
C: Develop a personal tagline.
D: Wash your hair with an antifungal shampoo.

Answer: B

Determine which step belongs in the tutorial of how to Get 1k Followers on Instagram
A: Add a relevant, informative bio.
B: Find strength in numbers.
C: Understand how restrictive modifiers work.
D: Write the two-column proof as an outline.

Answer: A

Determine which step belongs in the tutorial of how to Solve an Algebraic Expression
MCQ:
A: Learn how to observe your own thoughts.
B: Perform a waste audit.
C: Know the order of operations.
D: Find your loan information.

Answer:

output: D

Determine 

Processing queries:  10%|██▍                     | 2/20 [00:08<01:10,  3.91s/it]

----------------------------------------------------------------------
model_name: llama2:text
prompt: 
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial.

For example:
Determine which step belongs in the tutorial of how to Send Money from India
A: Have your forms reviewed if possible.
B: Choose how the funds will be distributed.
C: Develop a personal tagline.
D: Wash your hair with an antifungal shampoo.

Answer: B

Determine which step belongs in the tutorial of how to Get 1k Followers on Instagram
A: Add a relevant, informative bio.
B: Find strength in numbers.
C: Understand how restrictive modifiers work.
D: Write the two-column proof as an outline.

Answer: A

Determine which step belongs in the tutorial of how to Fake Food Poisoning
MCQ:
A: Go to a historical preservation.
B: Place time restrictions on impulsive thoughts.
C: Decide on the background or the foreground.
D: Invent a dicey dining situation.

Answ

Processing queries:  15%|███▌                    | 3/20 [00:09<00:39,  2.34s/it]

----------------------------------------------------------------------
model_name: llama2:text
prompt: 
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial.

For example:
Determine which step belongs in the tutorial of how to Send Money from India
A: Have your forms reviewed if possible.
B: Choose how the funds will be distributed.
C: Develop a personal tagline.
D: Wash your hair with an antifungal shampoo.

Answer: B

Determine which step belongs in the tutorial of how to Get 1k Followers on Instagram
A: Add a relevant, informative bio.
B: Find strength in numbers.
C: Understand how restrictive modifiers work.
D: Write the two-column proof as an outline.

Answer: A

Determine which step belongs in the tutorial of how to Prevent Toxic Shock Syndrome
MCQ:
A: Draw lines at the bottom loop and then they will look like feathers.
B: Follow instructions carefully when using vaginal contraceptives.
C: Spot test your blood o

Processing queries:  20%|████▊                   | 4/20 [00:09<00:25,  1.60s/it]

----------------------------------------------------------------------
model_name: llama2:text
prompt: 
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial.

For example:
Determine which step belongs in the tutorial of how to Send Money from India
A: Have your forms reviewed if possible.
B: Choose how the funds will be distributed.
C: Develop a personal tagline.
D: Wash your hair with an antifungal shampoo.

Answer: B

Determine which step belongs in the tutorial of how to Get 1k Followers on Instagram
A: Add a relevant, informative bio.
B: Find strength in numbers.
C: Understand how restrictive modifiers work.
D: Write the two-column proof as an outline.

Answer: A

Determine which step belongs in the tutorial of how to Report a School for Discriminatory Discipline
MCQ:
A: Consider filing an institutional grievance.
B: Double check all payment amounts.
C: Copy and mail your form.
D: Identify a suspected fraudulent m

Processing queries:  20%|████▊                   | 4/20 [00:09<00:39,  2.46s/it]

----------------------------------------------------------------------
model_name: llama2:text
prompt: 
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial.

For example:
Determine which step belongs in the tutorial of how to Send Money from India
A: Have your forms reviewed if possible.
B: Choose how the funds will be distributed.
C: Develop a personal tagline.
D: Wash your hair with an antifungal shampoo.

Answer: B

Determine which step belongs in the tutorial of how to Get 1k Followers on Instagram
A: Add a relevant, informative bio.
B: Find strength in numbers.
C: Understand how restrictive modifiers work.
D: Write the two-column proof as an outline.

Answer: A

Determine which step belongs in the tutorial of how to List for Sale by Owner Using MLS Listing
MCQ:
A: Learning about more than.
B: Know what a payment gateway is.
C: Know what a finance charge is.
D: Know what the Multiple Listing Service system is.

A




IndexError: string index out of range

In [26]:
simple_QA_chain.evaluate_order()

llama2: 0.44857142857142857
mistral: 0.5542857142857143
orca-mini:7b: 0.32857142857142857
qwen:7b: 0.48


### Chain of thought  + classifier

In [28]:
chain = ChainManager()

chain.prompt = PromptTemplate.from_template("""
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial of how to {goal}
MCQ:
A: {step0}
B: {step1}
C: {step2}
D: {step3}
You are to provide a clear explanation for your choice.
At the end of your explanation, explicitly state which step belongs in the tutorial.
""")

chain.run_batch_query(False, 350, "/Users/kohjunkai/Desktop/step_COT.txt")
chain.df.to_csv('/Users/kohjunkai/Desktop/step_COT.csv', index=False)

Processing queries: 100%|█████████████████████| 350/350 [1:40:43<00:00, 17.27s/it]
Processing queries: 100%|███████████████████████| 350/350 [47:26<00:00,  8.13s/it]
Processing queries: 100%|█████████████████████| 350/350 [1:19:19<00:00, 13.60s/it]
Processing queries: 100%|█████████████████████| 350/350 [1:30:03<00:00, 15.44s/it]


### Self Critique + classifier

In [35]:
info_gen_chain = ChainManager()

info_gen_chain.prompt = PromptTemplate.from_template("""
You are answering a multiple choice question.
Given four choices, you are to determine which step belongs in the tutorial of how to {goal}
MCQ:
A: {step0}
B: {step1}
C: {step2}
D: {step3}
You are to provide a clear explanation for your choice.
At the end of your explanation, explicitly state which step belongs in the tutorial.
""")

info_gen_chain.run_batch_query(False,200, "/Users/kohjunkai/Desktop/step_self_critque.txt")
info_gen_chain.df.to_csv('/Users/kohjunkai/Desktop/step_self_critque.csv', index=False)

Processing queries: 100%|█████████████████████| 200/200 [1:55:34<00:00, 34.67s/it]
Processing queries: 100%|█████████████████████| 200/200 [1:28:35<00:00, 26.58s/it]
Processing queries: 100%|█████████████████████| 200/200 [1:25:33<00:00, 25.67s/it]
Processing queries: 100%|█████████████████████| 200/200 [1:19:43<00:00, 23.92s/it]


In [92]:
pd.

llama2: 0.6
mistral: 0.5
orca-mini:7b: 0.4
qwen:7b: 0.5
