# Generated COTs

In [None]:
!pip install instructor



In [None]:
!pip install datasets



In [None]:
import json
import random
import instructor
import re
from pathlib import Path
from typing import List, TextIO

import openai
from datasets import load_dataset
# from fire import Fire
from pydantic import BaseModel
from tqdm import tqdm


def remove_substrings_with_double_angle_brackets(input_string):
    """Return ``input_string`` with every ``<<...>>`` span deleted.

    A span is ``<<``, one or more characters other than ``>``, then ``>>``.
    Text outside such spans is returned unchanged; ``<<>>`` (empty body) is
    not a span and is left in place.
    """
    annotation = re.compile(r"<<[^>]+>>")
    return annotation.sub("", input_string)


class ReasonSample(BaseModel):
    """One reasoning example: a question plus optional explanation/answer text."""

    # Problem statement; the only field without a default.
    question: str
    # Reference reasoning text; empty when not supplied.
    explanation: str = ""
    # Reference final answer; empty when not supplied.
    answer: str = ""
    # Contrasting (incorrect) reasoning text; empty when not supplied.
    wrong_explanation: str = ""
    # Final answer that goes with `wrong_explanation`; empty when not supplied.
    wrong_answer: str = ""
    # Presumably a slot for a model prediction — nothing in this file writes it; confirm.
    pred: str = ""


class ReasonData(BaseModel):
    """A list of :class:`ReasonSample` items plus loaders for JSONL files and GSM8K."""

    samples: List[ReasonSample]

    @classmethod
    def load(cls, path: str):
        """Load samples from a JSON Lines file.

        Args:
            path: File with one JSON object per line; each object's keys are
                passed to ``ReasonSample`` as keyword arguments.

        Returns:
            A ``ReasonData`` holding the samples in file order.
        """
        samples = []
        # Explicit encoding so the result does not depend on the platform default.
        with open(path, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                # instead of failing inside json.loads.
                if not line.strip():
                    continue
                samples.append(ReasonSample(**json.loads(line)))

        return cls(samples=samples)

    @classmethod
    def load_gsm8k_test(cls, path: str = "gsm8k", subset: str = "main", split="test"):
        """Load a GSM8K split through ``datasets.load_dataset``.

        Each record's ``"answer"`` field is split on ``"####"`` into the
        explanation and the final answer; ``<<...>>`` spans are removed from
        the explanation and all three text fields are stripped.

        Raises:
            ValueError: If an ``"answer"`` field does not contain exactly one
                ``"####"`` separator (the two-way unpacking fails).
        """
        samples = []
        for raw in load_dataset(path, subset, split=split):
            explanation, answer = raw["answer"].split("####")
            explanation = remove_substrings_with_double_angle_brackets(explanation)
            samples.append(
                ReasonSample(
                    question=raw["question"].strip(),
                    explanation=explanation.strip(),
                    answer=answer.strip(),
                )
            )

        return cls(samples=samples)

    @classmethod
    def load_gsm8k_incoherent_objects(cls, split: str = "test", sample: bool = False):
        """Return hand-written contrastive demonstrations or the GSM8K test data.

        Args:
            split: Only the value ``"train"`` is significant, and only
                together with ``sample=True``.
            sample: When true and ``split == "train"``, return the four
                built-in demonstrations (each with a correct and a wrong
                explanation/answer).

        Returns:
            The four demonstrations, or otherwise the result of
            ``load_gsm8k_test()`` called with its defaults.
        """
        if split == "train" and sample:
            samples = [
                ReasonSample(
                    question="There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
                    explanation="There are 15 trees originally. Then there were 21 trees after the Grove workers planted some more. So there must have been 21 - 15 = 6 trees that were planted.",
                    answer="6",
                    wrong_explanation="There are 21 - 15 = 6 trees originally. Then there were 15 trees after the Grove workers planted some more. So there must have been 21 trees that were planted.",
                    wrong_answer="21",
                ),
                ReasonSample(
                    question="If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot??",
                    explanation="There are originally 3 cars. Then 2 more cars arrive. Now 3 + 2 = 5 cars are in the parking lot.",
                    answer="5",
                    wrong_explanation="There are originally 3 + 2 = 5 cars. Then 3 more cars arrive. Now 2 cars are in the parking lot",
                    wrong_answer="2",
                ),
                ReasonSample(
                    question="Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
                    explanation="Originally, Leah had 32 chocolates and her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39 pieces left in total.",
                    answer="39",
                    wrong_explanation="Originally, Leah had 32 + 42 = 74 chocolates and her sister had 32. So in total they had 74 - 35 = 39. After eating 35, they had 42 pieces left in total",
                    wrong_answer="42",
                ),
                ReasonSample(
                    question="Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
                    explanation="Jason had 20 lollipops originally. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8 lollipops.",
                    answer="8",
                    wrong_explanation="Jason had 20 - 12 = 8 lollipops originally. Then he had 20 after giving some to Denny. So he gave Denny 12 lollipops",
                    wrong_answer="12",
                ),
            ]
            return cls(samples=samples)
        else:
            # NOTE(review): `split` is not forwarded, so every other argument
            # combination yields the default ("test") split — confirm intended.
            return cls.load_gsm8k_test()

    @classmethod
    def load_from_name(cls, name: str, **kwargs):
        """Dispatch to a loader by dataset name.

        Args:
            name: Dataset identifier; only ``"gsm8k"`` is recognised.
            **kwargs: Forwarded to ``load_gsm8k_incoherent_objects``.

        Raises:
            KeyError: If ``name`` is not a recognised dataset.
        """
        if name == "gsm8k":
            return cls.load_gsm8k_incoherent_objects(**kwargs)
        else:
            raise KeyError(name)


class Prompter(BaseModel):
    """Builds a few-shot prompt of question/answer pairs."""

    def run(self, data_train: ReasonData, sample_test: ReasonSample) -> str:
        """Render the demonstrations followed by the open test question.

        Each training sample contributes a ``Question:``/``Answer:`` pair
        terminated by a blank line; the test question ends with a dangling
        ``"Answer: "`` for the model to complete.
        """
        shots = [
            f"Question: {demo.question}\nAnswer: {demo.answer}\n\n"
            for demo in data_train.samples
        ]
        return "".join(shots) + f"Question: {sample_test.question}\nAnswer: "

    @staticmethod
    def get_answer(text: str) -> str:
        """Return the piece of ``text`` after the first ``"Answer: "`` marker.

        The piece ends at the next marker, if any. When the marker is absent,
        ``text`` is returned unchanged.
        """
        pieces = text.split("Answer: ")
        return pieces[1] if len(pieces) >= 2 else text


class ChainThoughtPrompter(Prompter):
    """Few-shot prompt whose demonstrations include an explanation before the answer."""

    def run(self, data_train: ReasonData, sample_test: ReasonSample) -> str:
        """Render question/explanation/answer demonstrations and the test question.

        The prompt ends with a dangling ``"Explanation: "`` for the model to
        complete.
        """
        shots = [
            f"Question: {demo.question}\n"
            f"Explanation: {demo.explanation}\n"
            f"Answer: {demo.answer}\n\n"
            for demo in data_train.samples
        ]
        return "".join(shots) + f"Question: {sample_test.question}\nExplanation: "

    def get_explanation(self, text: str) -> str:
        """Return the part of ``text`` that precedes the first ``"\\nAnswer: "``."""
        assert self is not None
        head, _, _ = text.partition("\nAnswer: ")
        return head


class ContrastiveChainThoughtPrompter(Prompter):
    """Few-shot prompt whose demonstrations pair a correct and a wrong explanation."""

    def run(self, data_train: ReasonData, sample_test: ReasonSample) -> str:
        """Render contrastive demonstrations followed by the open test question.

        Each demonstration lists the question, the correct explanation and
        answer, then the wrong explanation and wrong answer. The prompt ends
        with a dangling ``"Explanation: "`` for the model to complete.
        """
        shots = [
            f"Question: {demo.question}\n"
            f"Explanation: {demo.explanation}\n"
            f"Answer: {demo.answer}\n"
            f"Wrong explanation: {demo.wrong_explanation}\n"
            f"Wrong Answer: {demo.wrong_answer}\n\n"
            for demo in data_train.samples
        ]
        return "".join(shots) + f"Question: {sample_test.question}\nExplanation: "



In [None]:
# Load GSM8K via the name dispatcher. Because split is "test", the
# `split == "train" and sample` branch is not taken, so `sample=True` has no
# effect here and the loader's default GSM8K data is returned.
data = ReasonData.load_from_name("gsm8k", split="test", sample=True)
# Keep a random subset of 20 samples; no seed is set in this cell, so the
# subset differs between runs.
data.samples = random.sample(data.samples, k=20)

In [None]:
# Draw one sample to serve as the test query (a one-element list).
# NOTE(review): it is drawn from `data.samples`, the same pool the example
# questions are later taken from, so the test question is also a candidate
# demonstration — confirm this is intended.
data_test = random.sample(data.samples, k=1)

In [None]:
data_test

[ReasonSample(question='Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?', explanation='If Seattle has 20 sheep, Charleston has 4 * 20 sheep = 80 sheep\nToulouse has twice as many sheep as Charleston, which is 2 * 80 sheep = 160 sheep\nTogether, the three has 20 sheep + 160 sheep + 80 sheep = 260 sheep', answer='260', wrong_explanation='', wrong_answer='', pred='')]

In [None]:
# Question text of every sampled test item.
questions_test = [sample.question for sample in data_test]

In [None]:
questions_test

['Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?']

In [None]:
# Question text of every sample kept in `data`.
questions_gsm8k = [sample.question for sample in data.samples]

In [None]:
len(questions_gsm8k)

20

In [None]:
from scipy.stats import entropy
def generate_uncertainty_qes(answer):
    """Summarise how much a collection of predicted answers disagree.

    Args:
        answer: Iterable of hashable predicted answers.

    Returns:
        A dict with, in this key order:
        ``'entropy'`` - ``scipy.stats.entropy`` of the per-answer counts;
        ``'occurrence'`` - plain dict mapping each distinct answer to its
        count, in first-seen order;
        ``'disagreement'`` - the number of distinct answers.
    """
    from collections import Counter

    # Counter keeps first-seen order; convert back to a plain dict so the
    # returned structure has the same type as before.
    occurrence = dict(Counter(answer))
    return {
        'entropy': entropy(list(occurrence.values())),
        'occurrence': occurrence,
        'disagreement': len(occurrence),
    }

In [None]:
uncertainty_records_sorted = sorted(uncertainty_records, key=lambda x: x['entropy'], reverse=True)

In [None]:
uncertainty_records = [generate_uncertainty_qes(answer) for answer in pred_answers]

In [None]:
import instructor
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from textwrap import dedent
import asyncio
import nest_asyncio
from collections import Counter
import random
import os


# Read the API key from the environment instead of embedding it in the
# notebook: a key committed to source must be treated as leaked and rotated.
# Raises KeyError when OPENAI_API_KEY is not set.
client = AsyncOpenAI(
    base_url="https://api.chatanywhere.tech/v1",
    api_key=os.environ["OPENAI_API_KEY"],
)

# Wrap the client with instructor; the wrapped client is the one later called
# with `response_model=...`.
client = instructor.from_openai(client)

# Structured output requested from the model by `re2` (passed as `response_model`).
class Response(BaseModel):
    # The question as returned by the model.
    question: str
    # Reasoning text; its length is the ranking key in `complexity_based_consistency`.
    # The capitalised name is read verbatim by later cells.
    Reasoning_content: str
    # Final answer text.
    answer: str


async def re2(query, thinking_prompt):
    """Request one structured ``Response`` for ``query`` from ``gpt-4o-mini``.

    A single system message is sent: ``"Read the question again: "``, the
    query, a space and ``thinking_prompt``.

    Returns:
        The value awaited from ``client.chat.completions.create`` with
        ``response_model=Response``.
    """
    system_message = {
        "role": "system",
        "content": f"Read the question again: {query} {thinking_prompt}",
    }
    return await client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=Response,
        messages=[system_message],
    )

async def complexity_based_consistency(
    query: str, thinking_prompt: str, samples: int, top_k: int
):
    """Sample several reasoning chains concurrently and keep the longest ones.

    Args:
        query: Question forwarded to ``re2``.
        thinking_prompt: Prompt suffix forwarded to ``re2``.
        samples: Number of ``re2`` calls to issue.
        top_k: Number of responses to keep.

    Returns:
        Up to ``top_k`` responses, ordered by descending length of
        ``Reasoning_content``; ties keep their original relative order.
    """

    def reasoning_length(resp):
        return len(resp.Reasoning_content)

    pending = [re2(query, thinking_prompt) for _ in range(samples)]
    completed = await asyncio.gather(*pending)
    ranked = sorted(completed, key=reasoning_length, reverse=True)
    return ranked[:top_k]

thinking_prompt = "Let's think step by step."
number_of_reasoning_chains = 5
top_k_to_sample = 3
# Get the event loop
loop = asyncio.get_event_loop()
# Patch the loop once, before the loop over questions, so run_until_complete
# can be called even when the loop is already running (as in a notebook).
nest_asyncio.apply(loop)
responses = []
# Questions are processed one after another; only the chains for a single
# question are requested concurrently.
for query in questions_gsm8k:
    response = loop.run_until_complete(
        complexity_based_consistency(
            query, thinking_prompt, number_of_reasoning_chains, top_k_to_sample
        )
    )
    # One list of kept responses per question.
    responses.append(response)


In [None]:
# Print the question field of every kept response, grouped per query.
for kept in responses:
    for resp in kept:
        print(resp.question)

Jen got 3 fish. They each need $1 worth of food a day. How much does she spend on food in the month of May?
Jen got 3 fish. They each need $1 worth of food a day. How much does she spend on food in the month of May?
Jen got 3 fish.  They each need $1 worth of food a day.  How much does she spend on food in the month of May?
In November, a toy was $40. In December, the price increased by 80%. In January, the price decreased by 50%. What was the price of the toy after it was discounted in January?
In November, a toy was $40. In December, the price increased by 80%. In January, the price decreased by 50%. What was the price of the toy after it was discounted in January?
In November, a toy was $40. In December, the price increased by 80%. In January, the price decreased by 50%. What was the price of the toy after it was discounted in January?
How much will the store have received after all the balls are sold?
How much will the store have received after all the balls are sold?
How much will

In [None]:
import numpy as np
import torch
import torch.nn as nn
from scipy.stats import entropy
from transformers import RobertaTokenizerFast

_ROBERTA_TOKENIZER = None


def _roberta_tokenizer():
    """Return the shared RoBERTa tokenizer, loading it on first use only."""
    global _ROBERTA_TOKENIZER
    if _ROBERTA_TOKENIZER is None:
        _ROBERTA_TOKENIZER = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
    return _ROBERTA_TOKENIZER


def similarity(qa, qb):
    """Cosine similarity between the zero-padded token-ID vectors of two strings.

    NOTE: this compares raw token IDs, not embeddings, so the score depends
    on which vocabulary indices occupy which positions rather than on meaning.

    Args:
        qa: First text.
        qb: Second text.

    Returns:
        A 0-dimensional float tensor holding the cosine similarity.
    """
    # Loading the tokenizer is expensive; reuse one instance across calls.
    tokenizer = _roberta_tokenizer()
    qa_tokens = tokenizer.encode(qa, return_tensors='pt')  # batch of one sequence of IDs
    qb_tokens = tokenizer.encode(qb, return_tensors='pt')
    cos = nn.CosineSimilarity(dim=0, eps=1e-6)

    # Drop the batch dimension and convert the integer IDs to float.
    qa_tokens = qa_tokens[0].type(torch.float32)
    qb_tokens = qb_tokens[0].type(torch.float32)

    # Right-pad the shorter vector with zeros so both have the same length.
    max_len = max(len(qa_tokens), len(qb_tokens))
    qa_tokens = torch.cat([qa_tokens, torch.zeros(max_len - len(qa_tokens), dtype=torch.float32)])
    qb_tokens = torch.cat([qb_tokens, torch.zeros(max_len - len(qb_tokens), dtype=torch.float32)])

    return cos(qa_tokens, qb_tokens)


In [None]:
# Flatten the per-query response lists into one list of plain dicts.
question_answering = [
    {'question': resp.question, 'reasoning content': resp.Reasoning_content, 'answer': resp.answer}
    for kept in responses
    for resp in kept
]

In [None]:
unique_questions = []
seen_questions = set()

# Keep the first entry for each distinct question, in encounter order.
for item in question_answering:
    # The model sometimes returns the question with different spacing (e.g.
    # double spaces after a full stop); collapse whitespace so such variants
    # count as the same question.
    question_key = " ".join(item['question'].split())
    if question_key not in seen_questions:
        unique_questions.append(item)
        seen_questions.add(question_key)

[{'question': 'Jen got 3 fish. They each need $1 worth of food a day. How much does she spend on food in the month of May?', 'reasoning content': 'Jen has 3 fish and each fish requires $1 worth of food per day. Therefore, the total daily cost for feeding the fish is 3 fish * $1 = $3 per day. The month of May has 31 days. To find the total cost for the month of May, multiply the daily cost by the number of days: $3 * 31 days = $93. Thus, Jen spends a total of $93 on food for her fish in May.', 'answer': '$93'}, {'question': 'Jen got 3 fish.  They each need $1 worth of food a day.  How much does she spend on food in the month of May?', 'reasoning content': 'Each fish needs $1 worth of food a day, and she has 3 fish. Therefore, she spends $3 a day on food for the fish. The month of May has 31 days. So, to find out the total amount spent in May, we multiply the daily spending by the number of days in May: 3 dollars/day * 31 days = 93 dollars.', 'answer': '$93'}, {'question': 'In November, 

In [None]:
# The first (and, with k=1 sampling, only) test question; used as the comparison target below.
question_test = questions_test[0]

In [None]:
# Preview the first two de-duplicated entries.
for entry in unique_questions[:2]:
    print(entry['question'])
    print(entry['reasoning content'])
    print(entry['answer'])

Jen got 3 fish. They each need $1 worth of food a day. How much does she spend on food in the month of May?
Jen has 3 fish and each fish requires $1 worth of food per day. Therefore, the total daily cost for feeding the fish is 3 fish * $1 = $3 per day. The month of May has 31 days. To find the total cost for the month of May, multiply the daily cost by the number of days: $3 * 31 days = $93. Thus, Jen spends a total of $93 on food for her fish in May.
$93
Jen got 3 fish.  They each need $1 worth of food a day.  How much does she spend on food in the month of May?
Each fish needs $1 worth of food a day, and she has 3 fish. Therefore, she spends $3 a day on food for the fish. The month of May has 31 days. So, to find out the total amount spent in May, we multiply the daily spending by the number of days in May: 3 dollars/day * 31 days = 93 dollars.
$93


In [None]:
# Score every candidate question against the test question.
sims = []
for question in unique_questions:
    print(question['question'])
    print(question['reasoning content'])
    # Build a fresh dict per candidate rather than mutating and resetting a
    # shared placeholder filled with type objects.
    sims.append({
        'question': question['question'],
        'reasoning content': question['reasoning content'],
        'similarity': similarity(question['question'], question_test),
        'answer': question['answer'],
    })

Jen got 3 fish. They each need $1 worth of food a day. How much does she spend on food in the month of May?
Jen has 3 fish and each fish requires $1 worth of food per day. Therefore, the total daily cost for feeding the fish is 3 fish * $1 = $3 per day. The month of May has 31 days. To find the total cost for the month of May, multiply the daily cost by the number of days: $3 * 31 days = $93. Thus, Jen spends a total of $93 on food for her fish in May.
Jen got 3 fish.  They each need $1 worth of food a day.  How much does she spend on food in the month of May?
Each fish needs $1 worth of food a day, and she has 3 fish. Therefore, she spends $3 a day on food for the fish. The month of May has 31 days. So, to find out the total amount spent in May, we multiply the daily spending by the number of days in May: 3 dollars/day * 31 days = 93 dollars.
In November, a toy was $40. In December, the price increased by 80%. In January, the price decreased by 50%. What was the price of the toy after

In [None]:
sims

[{'question': 'Jen got 3 fish. They each need $1 worth of food a day. How much does she spend on food in the month of May?',
  'reasoning content': 'Jen has 3 fish and each fish requires $1 worth of food per day. Therefore, the total daily cost for feeding the fish is 3 fish * $1 = $3 per day. The month of May has 31 days. To find the total cost for the month of May, multiply the daily cost by the number of days: $3 * 31 days = $93. Thus, Jen spends a total of $93 on food for her fish in May.',
  'similarity': tensor(0.9894),
  'answer': '$93'},
 {'question': 'Jen got 3 fish.  They each need $1 worth of food a day.  How much does she spend on food in the month of May?',
  'reasoning content': 'Each fish needs $1 worth of food a day, and she has 3 fish. Therefore, she spends $3 a day on food for the fish. The month of May has 31 days. So, to find out the total amount spent in May, we multiply the daily spending by the number of days in May: 3 dollars/day * 31 days = 93 dollars.',
  'sim

In [None]:
# Collect the scalar similarity of every candidate into one 1-D tensor.
similarities = torch.tensor([item['similarity'].item() for item in sims])

# Select the top-k entries; torch.topk raises if k exceeds the number of
# candidates, so never ask for more than exist.
k = min(5, len(sims))
topk_values, topk_indices = torch.topk(similarities, k)

# Retrieve top-k entries based on indices (converted to plain ints for list indexing)
top_k_results = [sims[idx] for idx in topk_indices.tolist()]


In [None]:
topk_indices

tensor([ 0,  1, 17, 35, 37])

In [None]:
similarities[:5]

tensor([0.9894, 0.9873, 0.0449, 0.6600, 0.2846])

In [None]:
top_k_results

[{'question': 'Jen got 3 fish. They each need $1 worth of food a day. How much does she spend on food in the month of May?',
  'reasoning content': 'Jen has 3 fish and each fish requires $1 worth of food per day. Therefore, the total daily cost for feeding the fish is 3 fish * $1 = $3 per day. The month of May has 31 days. To find the total cost for the month of May, multiply the daily cost by the number of days: $3 * 31 days = $93. Thus, Jen spends a total of $93 on food for her fish in May.',
  'similarity': tensor(0.9894),
  'answer': '$93'},
 {'question': 'Jen got 3 fish.  They each need $1 worth of food a day.  How much does she spend on food in the month of May?',
  'reasoning content': 'Each fish needs $1 worth of food a day, and she has 3 fish. Therefore, she spends $3 a day on food for the fish. The month of May has 31 days. So, to find out the total amount spent in May, we multiply the daily spending by the number of days in May: 3 dollars/day * 31 days = 93 dollars.',
  'sim

In [None]:
def generate_question_and_answer_pair(
    question_and_answers: list[dict[str, str]]
) -> list[str]:
    """
    Generates a list of question and answer pairs in a specific format.

    The block is assembled directly instead of dedenting an interpolated
    template: ``dedent`` after interpolation stops removing the indentation
    as soon as a field contains a line break followed by unindented text,
    which left the tags indented for multi-line reasoning content.

    Args:
        question_and_answers: A list of dictionaries, where each dictionary
            has the keys ``'question'``, ``'reasoning content'`` and
            ``'answer'`` (other keys are ignored).

    Returns:
        A list of strings, one per input dictionary. Each string starts and
        ends with a newline and wraps the three fields in XML-like tags, one
        tag per line and with no leading indentation.
    """
    return [
        "\n<example>\n"
        f"<question>{question['question']}</question>\n"
        f"<reasoning_content>{question['reasoning content']}</reasoning_content>\n"
        f"<answer>{question['answer']}</answer>\n"
        "</example>\n"
        for question in question_and_answers  # one formatted block per dictionary
    ]

In [None]:
# One XML-like <example> block (a string) per top-k entry.
formatted_examples  = generate_question_and_answer_pair(top_k_results)

In [None]:
# Single prompt-ready string: all example blocks separated by a newline.
formatted = "\n".join(formatted_examples)

In [None]:
formatted

"\n<example>\n<question>Jen got 3 fish. They each need $1 worth of food a day. How much does she spend on food in the month of May?</question>\n<reasoning_content>Jen has 3 fish and each fish requires $1 worth of food per day. Therefore, the total daily cost for feeding the fish is 3 fish * $1 = $3 per day. The month of May has 31 days. To find the total cost for the month of May, multiply the daily cost by the number of days: $3 * 31 days = $93. Thus, Jen spends a total of $93 on food for her fish in May.</reasoning_content>\n<answer>$93</answer>\n</example>\n\n\n<example>\n<question>Jen got 3 fish.  They each need $1 worth of food a day.  How much does she spend on food in the month of May?</question>\n<reasoning_content>Each fish needs $1 worth of food a day, and she has 3 fish. Therefore, she spends $3 a day on food for the fish. The month of May has 31 days. So, to find out the total amount spent in May, we multiply the daily spending by the number of days in May: 3 dollars/day * 

In [None]:
# Structured output requested by `generate_response`.
# NOTE(review): this rebinds the name `Response` defined in an earlier cell;
# `re2` looks the name up when it is called, so calling it after this cell
# uses this schema, which has no `Reasoning_content` field.
class Response(BaseModel):
    # Presumably the example the model judged most helpful (the prompt asks for it) — confirm.
    reference: str
    # The target question as returned by the model.
    question: str
    # Reasoning text for the target question.
    reasoning_content: str
    # Final answer text.
    answer: str


In [None]:
# Target question sent to the model; the bare name below displays it as the cell output.
query = questions_test[0]
query

'Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?'

In [None]:
def generate_response(examples, query: str):
    """Ask the model to pick the most helpful reference and answer ``query``.

    Args:
        examples: The formatted reference examples, either as one string or
            as an iterable of strings (joined with newlines).
        query: The target question.

    Returns:
        The parsed ``Response`` returned by the instructor-wrapped client.

    Note:
        This is a synchronous function: it drives the coroutine itself with
        ``asyncio.run`` and must not be awaited or wrapped in another
        ``asyncio.run``. Inside an already running event loop (e.g. a
        notebook) this presumably relies on ``nest_asyncio`` having been
        applied — confirm.
    """
    if not isinstance(examples, str):
        examples = "\n".join(examples)

    # Dedent the template BEFORE interpolating. The examples start at column
    # 0, so dedenting the already-interpolated text finds no common margin
    # and leaves every prompt line indented.
    template = dedent(
        """
        <reference>
        {examples}
        which one reference would be the most helpful for you to answer the target question?
        </reference>
        <target question>
        {query}
        </target question>
        Let's think step by step
        """
    )
    prompt = template.format(examples=examples, query=query)

    return asyncio.run(
        client.chat.completions.create(
            model="gpt-4o-mini",
            response_model=Response,
            messages=[{"role": "user", "content": prompt}],
        )
    )

In [None]:
# Query the model with the joined example string and the target question.
rp = generate_response(formatted, query)

In [None]:
rp.model_dump_json(indent=2)

'{\n  "reference": "<example>\\n<question>How many days does Mark have to save his money for a bike?</question>\\n<reasoning_content>Mark currently has $50 and needs a total of $300 for the bike. First, we need to find out how much more money he needs to save. To do this, we subtract the amount he has from the cost of the bike: 300 - 50 = 250. Next, we need to determine how many days it will take him to earn the required amount. Since Mark earns $10 per day, we divide the amount he needs to save by his daily earnings: 250 ÷ 10 = 25. Therefore, Mark has to save for 25 days to be able to buy the bike.</reasoning_content>\\n<answer>25 days</answer>\\n</example>",\n  "question": "Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?",\n  "reasoning_content": "To find out how many sheep Toulouse, Charleston, and Seattle have together, we can break this down i

In [None]:
rp.reference

'<example>\n<question>How many days does Mark have to save his money for a bike?</question>\n<reasoning_content>Mark currently has $50 and needs a total of $300 for the bike. First, we need to find out how much more money he needs to save. To do this, we subtract the amount he has from the cost of the bike: 300 - 50 = 250. Next, we need to determine how many days it will take him to earn the required amount. Since Mark earns $10 per day, we divide the amount he needs to save by his daily earnings: 250 ÷ 10 = 25. Therefore, Mark has to save for 25 days to be able to buy the bike.</reasoning_content>\n<answer>25 days</answer>\n</example>'

In [None]:
rp.question

'Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?'

In [None]:
# `reasoning_content` is a plain string; iterating over it yields single
# characters, which have no `model_dump_json`. Print the text directly.
print(rp.reasoning_content)

AttributeError: 'str' object has no attribute 'model_dump_json'

In [None]:
rp.answer

'260 sheep.'

In [None]:
query = question_test
# generate_response is synchronous (it calls asyncio.run itself) and returns
# the parsed Response, so it is called directly rather than wrapped in another
# asyncio.run. It is given the joined example string `formatted`, not the list.
response = generate_response(formatted, query)
print(response.answer)

The most helpful reference would be the example relating to Jen's fish food expenses ($93 for the month of May), as it directly involves calculating total expenses over a month.


Exception ignored in: <coroutine object AsyncInstructor.create at 0x78a1d7dbb990>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/google/colab/_variable_inspector.py", line 27, in run
KeyError: '__builtins__'
Exception ignored in: <coroutine object AsyncInstructor.create at 0x78a1d7dbb990>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/google/colab/_variable_inspector.py", line 27, in run
KeyError: '__builtins__'


In [None]:
# Re-run the request for the test question with the joined example string.
query = question_test
response = generate_response(formatted, query)

Exception ignored in: <coroutine object AsyncInstructor.create at 0x78a1d7dbbd80>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/google/colab/_variable_inspector.py", line 27, in run
KeyError: '__builtins__'
Exception ignored in: <coroutine object AsyncInstructor.create at 0x78a1d7dbbd80>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/google/colab/_variable_inspector.py", line 27, in run
KeyError: '__builtins__'
