In [2]:
import pandas as pd

goal = pd.read_csv('./datasets_human_readable/goal_test.csv', encoding='ISO-8859-1')
goal['label'] = goal['label'].map({0:'A', 1:'B',2:"C",3:"D"})
goal


Unnamed: 0,step,goal0,goal1,goal2,goal3,label
0,Have the butcher make them.,Buy iBooks on a Mac,Buy Stuff on Sweatcoin on iPhone or iPad,Buy Microfiber Towels,Buy Burger Patties,D
1,Avoid caffeine and alcohol the day before.,Keep Your Friends Awake for an All Nighter,Listen to Music Without Getting Caught,Feel Alert when You Wake up in the Mornings,Watch TV Without Waking Up Your Siblings,C
2,Exfoliate at least three times a week.,Get Rid of Drug Dealers in Your Neighborhood,Get Rid of and Prevent Flour Mites,Get Rid of Oak Mites,Get Rid of Blackheads and Whiteheads Using Com...,D
3,Learn more than one instrument.,Be a Good Actor or Actress,Be a Good Musician,Be a Good Basketball Player,Be a Good Athlete,B
4,Get psychological help for addictive behaviors.,Fix Your Whole Life,Give Away a Puppy,Spoil Your Wife,Spoil Your Cat,A
...,...,...,...,...,...,...
1698,Look for destinations and ports of call that w...,"Enjoy a Trip to Nakhchivan, Azerbaijan",Enjoy a Caribbean Cruise With Children,Enjoy a Visit to a Water Park,Enjoy a Museum,B
1699,Rent out a restaurant or an entertainment center.,Plan a Cruise,Plan a Preï¿½ï¿½ï¿½Candy Hallowee,Plan a One Direction Party,Plan a Sweet 16 Party,D
1700,"Lay out specific, relatable, achievable expect...",Gut a Pig,Kill a Stinkhorn Fungus,Gut a Squirrel,Ground Your Child,D
1701,Bring earplugs and a sleeping mask if youï¿½ï¿...,Find Cheap Hotels in Napa Valley,Find Study Material Online,Find Festivals and Special Events in New York,Find Hostels in Europe,D


# Setup Common Functions

In [26]:
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers.enum import EnumOutputParser
from langchain.output_parsers.fix import OutputFixingParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain.llms import Ollama
from tqdm import tqdm 
import time 
from enum import Enum
    

class ChainManager:
    def __init__(self):
        self.prompt = PromptTemplate.from_template("Tell me a short joke about {input}")
        self.output_parser = StrOutputParser()
        self.df = goal
        self.model_list = ["llama2:text","mistral", "orca-mini:7b", "qwen:7b"]
        self.logs = []
        
    def run_single_query(self, inputs, model_name, verbose, output_file_name=""):
        full_prompt = self.prompt.format(step=inputs["step"], goal0=inputs["goal0"],goal1=inputs["goal1"],goal2=inputs["goal2"], goal3=inputs["goal3"])
        if verbose:
            print("----------------------------------------------------------------------")
            print(f"model_name: {model_name}")
            print(f"prompt: {full_prompt}")
        else:
            self.write_string_to_buffer("----------------------------------------------------------------------")
            self.write_string_to_buffer(f"model_name: {model_name}")
            self.write_string_to_buffer(f"prompt: {full_prompt}")
                
        chain = (
            self.prompt
            | Ollama(model=model_name)
            | self.output_parser)

        chain_of_thought = chain.invoke(inputs)
        classifier_output = self.run_retry_classifier(chain_of_thought, 3, verbose)
        final_answer = classifier_output[0]
        num_retries = classifier_output[1]
        
        if verbose:
            print(f"Classifier finished...\n")
            print(f"chain_of_thought: {chain_of_thought}\n")
            print(f"final_answer: {final_answer}")
            print(f"correct_answer: {inputs['label']}")
        else:
            self.write_string_to_buffer(f"classifier finished....\n")
            self.write_string_to_buffer(f"chain_of_thought: {chain_of_thought}\n")
            self.write_string_to_buffer(f"final_answer: {final_answer}")
            self.write_string_to_buffer(f"correct_answer: {inputs['label']}")
            self.write_buffer_to_file(output_file_name)
        return classifier_output
    
    def run_batch_query(self, verbose, batch_size, output_file_name=""):
        self.df = self.df.iloc[0:0+batch_size].copy()
        input_list = self.df.to_dict('records')
        if output_file_name != "":
            self.clear_existing_file(output_file_name)
        for model_name in self.model_list:
            results = []
            num_retries = []
            for item in tqdm(input_list, desc="Processing queries"):
#                     result = self.run_single_query(item, model_name, verbose, output_file_name) 
                    result = self.run_single_self_review(item, model_name, verbose, output_file_name) 
                    results.append(result[0])
                    num_retries.append(result[1])
                    
            self.df[model_name] = results 
            self.df[model_name+"_retries"] = num_retries 

    def evaluate_order(self):
        for model_name in self.model_list:
            binary_results = self.df[model_name].str.strip().str[0]
            correct_predictions = (self.df['label'] == binary_results).sum()
            total_predictions = len(self.df)
            accuracy = correct_predictions / total_predictions
            print(f"{model_name}: {accuracy}")
        
    
    def verify_output(self, mcq_choice):
        if mcq_choice == "A" or mcq_choice == "B" or mcq_choice == "C" or mcq_choice == "D":
            return True
        else: 
            return False        
        
    def run_classifier(self, initial_answer):
        
        classifier_prompt = PromptTemplate.from_template("""
        You are recieving an explanation from a language model about its choice for an mcq question.
        You are a mcq classifier. You are to select the answer based on the explanation provided.
        You are not to explain or mention anything other than provide the answer for the mcq.
        If not enough information, randomly select one.
        Please do not give me anything other than one letter thanks!
        MCQ Choices: A, B, C, D            

        Explanation: {initial_answer}
        Answer:
        """)
        
        chain = (
            classifier_prompt
            | Ollama(model="mistral")
            | self.output_parser
            
        )
        output = chain.invoke({"initial_answer": initial_answer})
        return output
    
    def run_retry_classifier(self, initial_answer, max_tries, verbose, output_file_name=""):
        if verbose:
            print("Running classifier....")
        else:
            self.write_string_to_buffer("Running classifier....")

        mcq_choice = 0
        for i in range(max_tries):
            classifier_output = self.run_classifier(initial_answer)
            if verbose:
                print(f"retry_classifier_{i+1}: {classifier_output}")
            else:
                self.write_string_to_buffer(f"retry_classifier_{i+1}: {classifier_output}")
            mcq_choice = classifier_output.strip()[0].upper()
            if self.verify_output(mcq_choice):
                return (mcq_choice, i+1)
        return (mcq_choice, max_tries)
    
    def run_self_review(self, question, answer, model):
        info_retrieval_prompt = PromptTemplate.from_template("""
        Question: {question}
        
        Previous answer: {answer}
        
        Review your previous answer and find problems with your answer.
        """)
    
        chain = (
            info_retrieval_prompt
            | model
            | self.output_parser
            
        )
        output = chain.invoke({"question": question, "answer": answer})
        return output
        
       
    def run_single_self_review(self, inputs, model_name, verbose, output_file_name=""):
        
        llm_model = Ollama(model=model_name)
        
        full_prompt = self.prompt.format(step=inputs["step"], goal0=inputs["goal0"],goal1=inputs["goal1"],goal2=inputs["goal2"], goal3=inputs["goal3"])
        if verbose:
            print("----------------------------------------------------------------------")
            print(f"model_name: {model_name}")
            print(f"prompt: {full_prompt}")
        else:
            self.write_string_to_buffer("----------------------------------------------------------------------")
            self.write_string_to_buffer(f"model_name: {model_name}")
            self.write_string_to_buffer(f"prompt: {full_prompt}")
                        
        chain = (
            self.prompt
            | llm_model
            | self.output_parser)

        chain_of_thought = chain.invoke(inputs)
        
        critique = self.run_self_review(full_prompt, chain_of_thought, llm_model)
        
        final_prompt = PromptTemplate.from_template("""
        Question: {question}
                
        Critique: {critique}
        
        Based on the problems you found, improve your answer.
        """)
        
        final_chain = (
            final_prompt
            | llm_model
            | self.output_parser
        )
        
        improved_answer = final_chain.invoke({"question": full_prompt, "answer": chain_of_thought, "critique": critique})
        
        classifier_output = self.run_retry_classifier(improved_answer, 3, verbose)
        final_answer = classifier_output[0]
        num_retries = classifier_output[1]
        
        if verbose:
            print(f"Classifier finished...\n")
            print(f"chain_of_thought: {chain_of_thought}\n")
            print(f"critique: {critique}\n")
            print(f"improved_answer: {improved_answer}")
            print(f"final_answer: {final_answer}")
            print(f"correct_answer: {inputs['label']}")
        else:
            self.write_string_to_buffer(f"classifier finished....\n")
            self.write_string_to_buffer(f"chain_of_thought: {chain_of_thought}")
            self.write_string_to_buffer(f"critique: {critique}\n")
            self.write_string_to_buffer(f"improved_answer: {improved_answer}\n")
            self.write_string_to_buffer(f"final_answer: {final_answer}")
            self.write_string_to_buffer(f"correct_answer: {inputs['label']}")
            self.write_buffer_to_file(output_file_name)
        return  classifier_output
    
    def write_string_to_buffer(self, input_string):
        self.logs.append(input_string)
    
    def write_buffer_to_file(self, filename):
        with open(filename, 'a') as file:
            for log in self.logs:
                file.write("\n"+log)
            self.logs = []
        
    def clear_existing_file(self, filename):
         with open(filename, 'w') as file:
            file.write("")

### Single Prompt + classifier

In [19]:
simple_QA_chain = ChainManager()

simple_QA_chain.prompt = PromptTemplate.from_template("""
You are answering a multiple choice question.
You are given a step that is included in a one of the four tutorial choices. 
Given four choices, you are to determine which tutorial this step belongs to.
Step: {step}
MCQ:
A: {goal0}
B: {goal1}
C: {goal2}
D: {goal3}
""")

simple_QA_chain.run_batch_query(False, 350, "/Users/kohjunkai/Desktop/goal_simple_QA.txt")
simple_QA_chain.df.to_csv('/Users/kohjunkai/Desktop/goal_simple_QA.csv', index=False)

Processing queries: 100%|█████████████████████| 350/350 [1:13:58<00:00, 12.68s/it]
Processing queries: 100%|███████████████████████| 350/350 [26:30<00:00,  4.54s/it]
Processing queries: 100%|█████████████████████| 350/350 [1:01:02<00:00, 10.46s/it]
Processing queries: 100%|█████████████████████| 350/350 [1:27:34<00:00, 15.01s/it]


In [22]:
simple_QA_chain.evaluate_order()

llama2: 0.5857142857142857
mistral: 0.6285714285714286
orca-mini:7b: 0.52
qwen:7b: 0.6857142857142857


### Chain of thought  + classifier

In [21]:
chain = ChainManager()

chain.prompt = PromptTemplate.from_template("""
You are answering a multiple choice question. 
You are given a step that is included in a one of the four tutorial choices. 
Given four choices, you are to determine which tutorial this step belongs to.
Step: {step}
MCQ:
A: {goal0}
B: {goal1}
C: {goal2}
D: {goal3}
You are to provide a clear explanation for your choice..
At the end of your explanation, explicitly state which tutorial the step belongs to.
""")

chain.run_batch_query(False, 350, "/Users/kohjunkai/Desktop/goal_COT.txt")
chain.df.to_csv('/Users/kohjunkai/Desktop/goal_COT.csv', index=False)

Processing queries: 100%|█████████████████████| 350/350 [1:29:22<00:00, 15.32s/it]
Processing queries: 100%|███████████████████████| 350/350 [44:56<00:00,  7.70s/it]
Processing queries: 100%|█████████████████████| 350/350 [1:19:02<00:00, 13.55s/it]
Processing queries: 100%|█████████████████████| 350/350 [1:40:59<00:00, 17.31s/it]


In [None]:
chain.df.to_csv('order_results.csv', index=False)

### Self Critique + classifier

In [28]:
critique_chain = ChainManager()

critique_chain.prompt = PromptTemplate.from_template("""
You are answering a multiple choice question. 
You are given a step that is included in a one of the four tutorial choices. 
Given four choices, you are to determine which tutorial this step belongs to.
Step: {step}
MCQ:
A: {goal0}
B: {goal1}
C: {goal2}
D: {goal3}
You are to provide a clear explanation for your choice..
At the end of your explanation, explicitly state which tutorial the step belongs to.
""")

critique_chain.run_batch_query(False,200, "/Users/kohjunkai/Desktop/goal_self_critique.txt")
critique_chain.df.to_csv('/Users/kohjunkai/Desktop/goal_self_critique.csv', index=False)

Processing queries: 100%|█████████████████████| 200/200 [1:39:34<00:00, 29.87s/it]
Processing queries: 100%|█████████████████████| 200/200 [1:22:05<00:00, 24.63s/it]
Processing queries: 100%|█████████████████████| 200/200 [1:15:58<00:00, 22.79s/it]
Processing queries: 100%|█████████████████████| 200/200 [1:29:12<00:00, 26.76s/it]


In [92]:
info_gen_chain.evaluate_order()

llama2: 0.6
mistral: 0.5
orca-mini:7b: 0.4
qwen:7b: 0.5
