In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ollama_completion import modify_problem

In [None]:
# Load the training split from parquet (presumably GSM8K, judging by the path — confirm).
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_parquet("../gsm_8k/train.parquet")
# Bare expression as the last line so the notebook renders the frame.
df

In [None]:
import re

def find_answer(s):
    """Extract the final numeric answer that follows the ``#### `` marker.

    Args:
        s: Solution text to search.

    Returns:
        The first number found after ``#### `` as a string, with surrounding
        whitespace and commas removed, or the sentinel ``"[invalid]"`` when the
        marker is not present.
    """
    found = re.search(r"#### (\-?[0-9\.\,]+)", s)
    if found is None:
        return "[invalid]"
    # Commas are dropped so "1,234" comes back as "1234".
    return found.group(1).strip().replace(",", "")

In [None]:
import re

def get_equations(s):
    """Find every ``<<...>>`` annotation that is immediately followed by a number.

    Args:
        s: Solution text that may contain annotations such as
            ``<<15+20=35>>35``.

    Returns:
        A list of ``(equation, answer)`` tuples in order of appearance, where
        ``equation`` is the annotation including its ``<<``/``>>`` delimiters and
        ``answer`` is the run of digits, dots, commas and optional leading minus
        that directly follows it. Empty list when nothing matches.
    """
    # The annotation body is matched one character at a time and may not run
    # across a ">>". A greedy ".*" would stretch from the first "<<" to the last
    # ">>" on the line and merge several annotations (plus the prose between
    # them) into a single bogus "equation".
    return re.findall(r"(<<(?:(?!>>).)*>>)(\-?[0-9\.\,]+)", s)

In [None]:
# Eyeball both parsers against the raw data: for every row print the answer
# text, the equations extracted from it, and the final answer, then a blank line.
for i, row in df.iterrows():
    answer_text = row["answer"]
    print(
        answer_text,
        get_equations(answer_text),
        find_answer(answer_text),
        "",
        sep="\n",
    )

In [None]:
# Show each answer next to its newline-split form to see how solutions break
# into lines; a blank line separates rows.
for i, row in df.iterrows():
    raw_answer = row['answer']
    print(raw_answer, raw_answer.split('\n'), "", sep="\n")

In [None]:
def remove_equations(solution):
    """Strip ``<<...>>`` annotations from a solution, keeping the number after each.

    Args:
        solution: Solution text, possibly containing annotations.

    Returns:
        The text with every ``<<...>>value`` pair reported by ``get_equations``
        replaced by just ``value``.
    """
    cleaned = solution
    for annotation, value in get_equations(solution):
        # Replace the annotation plus its trailing number with the number alone.
        cleaned = cleaned.replace(annotation + value, value)
    return cleaned

In [None]:
from tqdm import tqdm
import random

In [None]:
# Collect one record per sampled problem; turned into a DataFrame afterwards.
perturbed_problems = []
num_problems = 120

# Fixed random_state so the same problems are drawn on every run.
sample = df.sample(num_problems, replace=False, random_state=42)

print(len(sample))

# iterrows() is a generator with no length, so pass total= explicitly; without it
# tqdm can only show a running count instead of a progress bar with an ETA.
# The loop variable is named problem_id so the builtin id() is not shadowed
# for the rest of the notebook.
for problem_id, problem in tqdm(sample.iterrows(), total=len(sample)):

    # modify_problem returns the prompt it used and the rewritten solution text.
    final_prompt, perturbed_answer = modify_problem(problem.question, problem.answer)

    # Strip the <<...>> annotations from both the reference and the rewritten solution.
    clean_answer = remove_equations(problem.answer)
    perturbed_answer = remove_equations(perturbed_answer)
    correct_answer = find_answer(problem.answer)

    print(f"Problem: {problem_id}")
    print(f"Original Answer: {problem['answer']}")
    print('-'*50)
    print(f"Perturbed Answer: {perturbed_answer}")
    print("="*50)

    perturbed_problems.append({
        "id": problem_id,
        "question": problem.question,
        "solution": problem.answer,
        "clean_solution": clean_answer,
        "perturbed_solution": perturbed_answer,
        "answer": correct_answer,
        # Placeholder: no final answer is extracted for the perturbed solution here.
        "perturbed_answer": "N/A",
        "final_prompt": final_prompt
    })

In [None]:
# Number of records collected in perturbed_problems.
len(perturbed_problems)

In [None]:
import os

# Build the frame in one go from the list of per-problem record dicts.
# Note: this rebinds `df`, which until now held the frame loaded from parquet.
df = pd.DataFrame.from_dict(perturbed_problems)

# to_csv does not create missing directories, so make sure the target folder
# exists before writing; exist_ok keeps re-runs harmless.
os.makedirs("perturbed", exist_ok=True)
df.to_csv("perturbed/extra_steps.csv", index=False, sep="\t")

In [71]:
import pandas as pd

In [72]:
# Reload the tab-separated file that an earlier cell wrote to this same path.
# NOTE(review): with pandas' default NA parsing, the "N/A" placeholder stored in
# perturbed_answer presumably comes back as NaN — confirm that this is intended.
df = pd.read_csv("perturbed/extra_steps.csv", sep="\t")

In [73]:
# Bare expression so the notebook renders the reloaded frame.
df

Unnamed: 0,id,question,solution,clean_solution,perturbed_solution,answer,perturbed_answer,final_prompt
0,1297,In Professor Plum's biology class there are 40...,"We start with the initial numbers of students,...","We start with the initial numbers of students,...","We start with the initial numbers of students,...",8,,\n<instructions>\nGiven an input question and ...
1,576,Diane bought twenty more apples than Cecile. I...,Diane bought 15 + 20 = <<15+20=35>>35 apples.\...,"Diane bought 15 + 20 = 35 apples.\nTherefore, ...",Diane bought twenty more apples than Cecile. I...,50,,\n<instructions>\nGiven an input question and ...
2,5462,Ann can skate 6 miles an hour. Her friend Glen...,First find how far Glenda goes in 3 hours by m...,First find how far Glenda goes in 3 hours by m...,"First, let's analyze the situation with Ann an...",42,,\n<instructions>\nGiven an input question and ...
3,4336,"Running for 2 hours, Jonah burnt 30 calories e...","When Jonah ran for 2 hours, burning 30 calorie...","When Jonah ran for 2 hours, burning 30 calorie...","When Jonah ran for 2 hours, burning 30 calorie...",90,,\n<instructions>\nGiven an input question and ...
4,7105,The city of Richmond has 1000 more people than...,Victoria has 3000-1000=<<3000-1000=2000>>2000 ...,Victoria has 3000-1000=2000 people.\nBeacon ha...,Victoria has 3000-1000=2000 people.\nThe city ...,500,,\n<instructions>\nGiven an input question and ...
...,...,...,...,...,...,...,...,...
115,5816,If eight movie tickets cost 2 times as much as...,"If each movie ticket is sold at $30, the cost ...","If each movie ticket is sold at $30, the cost ...","If each movie ticket is sold at $30, the cost ...",840,,\n<instructions>\nGiven an input question and ...
116,5591,Chris wants to hold his breath underwater for ...,He still has to improve by 60 more seconds bec...,He still has to improve by 60 more seconds bec...,He still has to improve by 60 more seconds bec...,6,,\n<instructions>\nGiven an input question and ...
117,2182,"At peak hours, 11 am - 1 pm and 5 pm - 6 pm, t...",5 pm-6 pm is one hour and they measure service...,5 pm-6 pm is one hour and they measure service...,5 pm-6 pm is one hour and they measure service...,80,,\n<instructions>\nGiven an input question and ...
118,1783,Mia has 4 times as many shells as David. Ava h...,"If David has 15 shells, then Mia has 4 *15 = <...","If David has 15 shells, then Mia has 4 *15 = 6...","If David has 15 shells, then Mia has 4 * 15 = ...",195,,\n<instructions>\nGiven an input question and ...


In [74]:
from nltk.tokenize import sent_tokenize

In [75]:
def remove_answer_sentence(s):
    """Remove the ``#### <number>`` marker and any sentence that states the answer.

    Args:
        s: Solution text.

    Returns:
        The remaining sentences joined with newlines. The ``#### <number>``
        marker is stripped, the last sentence is dropped when it contains the
        word "answer", and every sentence containing "The answer is" is dropped.
        Returns an empty string when no sentences remain.
    """
    import re  # local import: keeps this cell runnable without the earlier cells

    # Strip the marker with the same number pattern find_answer() uses.
    # Rebuilding the marker from find_answer()'s result does not work for
    # answers such as "#### 1,000": find_answer() removes the comma, so
    # "#### 1000" never occurs in the text and the marker would be left behind.
    s = re.sub(r"#### \-?[0-9\.\,]+", "", s)

    sentences = sent_tokenize(s)
    # The text can be empty once the marker is gone; guard the [-1] lookup so an
    # empty sentence list does not raise IndexError.
    if sentences and "answer" in sentences[-1]:
        sentences = sentences[:-1]

    sentences = [sentence for sentence in sentences if "The answer is" not in sentence]

    return "\n".join(sentences)

In [77]:
# Run remove_answer_sentence over every row of both solution columns
# (perturbed first, then clean).
df['perturbed_solution'] = df['perturbed_solution'].apply(remove_answer_sentence)
df['clean_solution'] = df['clean_solution'].apply(remove_answer_sentence)


In [78]:
# Overwrite the same tab-separated file that was loaded above, now holding the
# solution columns with their answer sentences removed.
df.to_csv("perturbed/extra_steps.csv", index=False, sep="\t")

In [80]:
# Preview rows 0-9: the question, the stored final answer, and the perturbed
# solution text, with a dashed rule after each example.

for i in range(10):
    question_text = df.loc[i, 'question']
    final_answer = df.loc[i, 'answer']
    perturbed_text = df.loc[i, 'perturbed_solution']

    print(f"Problem {question_text}")
    print(f"Original Answer: {final_answer}")
    print(f"Modified Answer: {perturbed_text}")

    print("--------------------------------------------")

Problem In Professor Plum's biology class there are 40 students. Of those students, 80 percent have puppies. Of those who have puppies, 25% also have parrots. How many students have both puppies and parrots?
Original Answer: 8
Modified Answer: We start with the initial numbers of students, 40, which is a nice round number often used in classroom settings.
If we consider that 40 students might represent a typical class size in many schools, we can multiply that by 0.8 to find out how many have puppies.
So, 40 * 0.8 = 32 who own puppies.
Interestingly, 32 is also the number of degrees Fahrenheit at which water freezes, a fact that might come in handy during winter science experiments.
Now, that the number of students with puppies is 32, we can multiply that by 0.25 to find out how many own both puppies and parrots.
This gives us 32 * 0.25 = 8 who own puppies and parrots.
It's worth noting that 8 is a significant number in various cultures, often associated with prosperity and good fortun