# Extract The Numerical Answers

# Load the JSON

In [1]:
import json
import pandas

def _read_json(path):
    """Return the parsed contents of the JSON file at *path*."""
    with open(path, "r") as handle:
        return json.load(handle)


# Answers.json is read as a mapping (``.items()`` is called on it); each
# key/value pair becomes one row of ``df``.
overall_data = _read_json("Answers.json")
df = pandas.DataFrame(
    list(overall_data.items()),
    columns=["Fermi Question", "Dictionary"],
)

# Units_Value.json is turned into a two-column frame; rows are later matched
# to ``df`` by position.
# NOTE(review): presumably a list of (value, unit) pairs in question order --
# confirm against the file.
data_units = _read_json("Units_Value.json")
units_df = pandas.DataFrame(list(data_units), columns=["value", "unit"])

# Classification data; the "specific" entry is used later for membership tests.
data_classification = _read_json("Fermi_Classification.json")

# Extract the Answer

In [2]:
from dotenv import load_dotenv
import os
import openai

# Load variables from a local .env file into the process environment.
load_dotenv()
# os.getenv returns None when OPEN_AI_TOKEN is not set, in which case the
# OpenAI client is left without a key.
openai_key = os.getenv("OPEN_AI_TOKEN")
openai.api_key = openai_key


def extract_answer(question, answer, units):
    """Ask the OpenAI chat model to extract the numerical answer from a response.

    The question, the free-text response and the target units are interpolated
    into a single user message, and the model is offered one function,
    ``extract_fermi_question_answer``, whose only argument is
    ``numerical_answer``.

    Args:
        question: The Fermi question that was asked.
        answer: The response to extract the number from. It is formatted into
            the prompt with an f-string, so any printable object is accepted.
        units: The units the extracted number should be expressed in.

    Returns:
        The value of ``numerical_answer`` from the model's function call, or
        ``None`` when the model made no function call, supplied no arguments,
        or supplied arguments that are not a valid JSON object.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": f"""Here is the fermi question given to the user to respond {question} and here is the response given {answer}.
                     I want you to extract the answer from the response in the following units: {units}""",
            }
        ],
        functions=[
            {
                "name": "extract_fermi_question_answer",
                "description": f""" 
                                Extract the numerical answer of the fermi question from the response and transform to the following units if necessary. units: {units}.
                                If the question was not answered return empty.
                                If only a number was given then assume it is in the correct units.
                                If the response given is in a fraction form transform to a decimal. 
                                    Example: 1/250,000,000 should return 4e-9
                                """,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "numerical_answer": {
                            "type": "number",
                            "description": "return the numerical answer to the fermi question or return empty if it was not answered",
                        }
                    },
                    "required": ["numerical_answer"],
                },
            }
        ],
    )

    function_call = response.choices[0].message.function_call
    # No function call, or a call with empty arguments, means nothing was
    # extracted. Previously the empty-arguments case hit an unbound local.
    if not function_call or not function_call.arguments:
        return None

    try:
        arguments = json.loads(function_call.arguments)
    except json.JSONDecodeError:
        # The arguments string is model-generated and may not be valid JSON;
        # report "no answer" so callers can retry instead of crashing.
        return None

    if not isinstance(arguments, dict):
        return None
    return arguments.get("numerical_answer")

In [3]:
# Display the first Fermi question as a quick check of the loaded data.
df.iloc[0]["Fermi Question"]

'How many golf balls put into the worlds oceans would it take to submerge all of the land on earth from the displaced water?'

In [4]:
import replicate
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
# os.getenv returns None when REPLICATE_TOKEN is not set.
replicate_token = os.getenv("REPLICATE_TOKEN")

# Client used by generate_fermi_results to run the hosted model.
replicate_client = replicate.Client(api_token=replicate_token)


def generate_fermi_results(question, prompt, units):
    """Generate a fresh answer to a Fermi question with Llama 3 70B on Replicate.

    Args:
        question: The Fermi question text.
        prompt: The prompt text appended after the question.
        units: The units the model is asked to answer in.

    Returns:
        A one-element tuple holding the model output joined into one string.
    """
    instructions = f"""
    You are a helpful assistant. I'd appreciate your assistance. 
    please avoid using scientific notation or large numbers words like millions or trillions to represent the answer in your answer
    Use another form to replace it and please give your answer
    In a value to the question in the last sentence, make sure 
    I'd like the final answer to the question to be a specific value, not part of a sentence!
    Fractions should be 0. based not 1/ something
    To clarify, just return the numerical answer in the following units: {units}
    """
    full_prompt = instructions + f" Question: {question} \nPrompt: {prompt}"

    # The output pieces are concatenated into a single string.
    pieces = replicate_client.run(
        "meta/meta-llama-3-70b-instruct",
        input={"prompt": full_prompt},
    )
    generated_text = "".join(pieces)

    # NOTE(review): the result is wrapped in a 1-tuple (trailing comma), so
    # callers that format it into a string see the tuple repr -- confirm
    # whether a plain string was intended.
    return (generated_text,)

In [5]:
def numerical_answer(question, prompt, units):
    """Regenerate an answer until a numerical value can be extracted.

    Each attempt prints "Found One", generates a new response with
    ``generate_fermi_results`` and runs ``extract_answer`` on it. The loop
    stops at the first truthy extraction or after 11 attempts.

    Returns:
        A ``(value, attempts)`` tuple: the last extraction result (falsy when
        every attempt failed) and the number of attempts made.
    """
    max_attempts = 11
    extracted = None
    attempts = 0
    for attempts in range(1, max_attempts + 1):
        print("Found One")
        generated = generate_fermi_results(question, prompt, units)
        extracted = extract_answer(question, generated, units)
        if extracted:
            break

    return extracted, attempts

In [7]:
# rows[k] holds the extracted [Level 0, Level 2, Level 4] answers of question k.
rows = []
# Position of the question being processed; also drives the progress output.
count_idx = 0
# Attempt counts ([level 0, level 2, level 4]) for questions where at least
# one level had to be regenerated, split by question classification.
more_than_one_standard = []
more_than_one_specific = []

# (key of the stored answer, key of the prompt used to regenerate it).
# Level 0 has no stored prompt, so the question itself is used as the prompt.
level_specs = (
    ("Level 0 Answer", None),
    ("Level 2 Answer", "Level 2 Prompt"),
    ("Level 4 Answer", "Level 4 Prompt"),
)

for question, dictionary in zip(df["Fermi Question"], df["Dictionary"]):
    # Units are matched to questions by row position.
    units = units_df.iloc[count_idx]["unit"]

    possible_new_row = [0, 0, 0]
    extracted_row = []

    for slot, (answer_key, prompt_key) in enumerate(level_specs):
        num_ans = extract_answer(question, dictionary[answer_key], units)
        if num_ans is None or num_ans == "":
            # Nothing could be extracted from the stored answer: regenerate
            # and record how many attempts that took.
            retry_prompt = (
                question if prompt_key is None else dictionary[prompt_key]
            )
            num_ans, attempts = numerical_answer(question, retry_prompt, units)
            possible_new_row[slot] = attempts
        extracted_row.append(num_ans)

    if any(n > 0 for n in possible_new_row):
        if question in data_classification["specific"]:
            more_than_one_specific.append(possible_new_row)
        else:
            more_than_one_standard.append(possible_new_row)

    rows.append(extracted_row)

    # Progress indicator every 10 questions.
    if count_idx % 10 == 0:
        print(count_idx)

    count_idx += 1

0
10
20
30
40
50
60
70
Found One
80
90
100
Found One
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Found One
Found One
Found One
280
290
300
310
320
330
Found One
340
350
360
370
380
390
400
410
420
430
440
Found One
Found One
450
460
470
480
490
500
510
520
Found One
Found One
Found One
530
Found One
540
550


### Number Of Questions that needed more than one hop

In [31]:
# For each classification, report per level how many questions needed
# regeneration and how many attempts were spent in total.
for heading, hop_rows in (
    ("Specific Questions BreakDown:", more_than_one_specific),
    ("Standard Questions BreakDown:", more_than_one_standard),
):
    print(heading)
    for slot, level in enumerate((0, 2, 4)):
        total_hops = sum(hop_row[slot] for hop_row in hop_rows)
        questions_with_hops = sum(1 for hop_row in hop_rows if hop_row[slot] > 0)
        print(
            f"\t # Questions Needed extra hop: Level {level}: {questions_with_hops}, Number of extra hops: {total_hops}"
        )

Specific Questions BreakDown:
	 # Questions Needed extra hop: Level 0: 2, Number of extra hops: 2
	 # Questions Needed extra hop: Level 2: 2, Number of extra hops: 6
	 # Questions Needed extra hop: Level 4: 1, Number of extra hops: 1
Standard Questions BreakDown:
	 # Questions Needed extra hop: Level 0: 2, Number of extra hops: 2
	 # Questions Needed extra hop: Level 2: 0, Number of extra hops: 0
	 # Questions Needed extra hop: Level 4: 1, Number of extra hops: 1


## Calculate the FP Score


In [12]:
import numpy as np
import math


def calculate_fp_score(A_prime, A):
    """Score an estimate against the reference answer on a log scale.

    The score is ``max(0, 1 - (1/3) * |log10(A_prime / A)|)``: 1 for an exact
    match, falling to 0 once the estimate is three or more orders of
    magnitude away from the reference.

    Args:
        A_prime: The estimated answer (number or numeric string). ``None``,
            ``""`` and ``"None"`` are treated as a missing answer.
        A: The reference answer (number or numeric string).

    Returns:
        The score, or the string ``"Invalid Answer"`` when the estimate is
        missing, either value cannot be converted to a float, the reference
        is zero, or the ratio is not positive.
    """
    if A_prime is None or A_prime == "" or A_prime == "None":
        return "Invalid Answer"

    try:
        ratio = float(A_prime) / float(A)
        return max(0, 1 - (1 / 3) * abs(math.log10(ratio)))
    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
        # TypeError: value of an unconvertible type; ValueError: non-numeric
        # string or log10 of a non-positive ratio; ZeroDivisionError: zero
        # reference; OverflowError: integer too large for a float.
        return "Invalid Answer"

In [13]:
# Per-question results, aligned by position with ``rows``:
#   fp_scores   -> [level 0, level 2, level 4] scores
#   fp_averages -> mean of the valid scores (None when there are none)
#   fp_std      -> standard deviation of the valid scores (None when none)
fp_scores, fp_averages, fp_std = [], [], []

for idx, (a_0, a_2, a_4) in enumerate(rows):
    # Only the reference value is needed here; the unit is not used.
    value, _unit = units_df.iloc[idx]

    fp_row = [calculate_fp_score(estimate, value) for estimate in (a_0, a_2, a_4)]

    # Drop the "Invalid Answer" markers before computing statistics.
    valid_scores = [score for score in fp_row if score != "Invalid Answer"]

    if valid_scores:
        average, std = np.mean(valid_scores), np.std(valid_scores)
    else:
        average, std = None, None

    fp_scores.append(fp_row)
    fp_averages.append(average)
    fp_std.append(std)

# Overall and Per-Level Average and Standard Deviation Results



In [14]:
def calculate_level_avg_std(level, fp_scores):
    """Return the mean and standard deviation of one level's valid FP scores.

    Args:
        level: The level number; ``level // 2`` selects the column, so levels
            0, 2 and 4 map to columns 0, 1 and 2.
        fp_scores: Rows of per-level scores; entries equal to
            ``"Invalid Answer"`` are skipped.

    Returns:
        ``(average, std)``, or ``(None, None)`` when the level has no valid
        scores.
    """
    column = level // 2
    valid_scores = [
        row[column] for row in fp_scores if row[column] != "Invalid Answer"
    ]

    if not valid_scores:
        return None, None

    return np.mean(valid_scores), np.std(valid_scores)


def calculate_overall_avg_std(fp_scores):
    """Return the mean and standard deviation of all valid FP scores.

    Args:
        fp_scores: Rows of scores; entries equal to ``"Invalid Answer"`` are
            skipped and the remaining scores of every row are pooled.

    Returns:
        ``(average, std)``, or ``(None, None)`` when no valid score exists.
    """
    valid_scores = [
        score
        for row in fp_scores
        for score in row
        if score != "Invalid Answer"
    ]

    if not valid_scores:
        return None, None

    return np.mean(valid_scores), np.std(valid_scores)

In [15]:
# Statistics over every question: all levels pooled, then each level alone.
overall_average, overall_std = calculate_overall_avg_std(fp_scores)
level_0_average, level_0_std = calculate_level_avg_std(0, fp_scores)
level_2_average, level_2_std = calculate_level_avg_std(2, fp_scores)
level_4_average, level_4_std = calculate_level_avg_std(4, fp_scores)

for summary_label, summary_average, summary_std in (
    ("Overall Results", overall_average, overall_std),
    ("Level 0", level_0_average, level_0_std),
    ("Level 2", level_2_average, level_2_std),
    ("Level 4", level_4_average, level_4_std),
):
    print(
        f"{summary_label} - Average: {summary_average}, Standard Deviation: {summary_std}"
    )

Overall Results - Average: 0.49742570935920505, Standard Deviation: 0.38886527937471144
Level 0 - Average: 0.4932278117818382, Standard Deviation: 0.38873077093727976
Level 2 - Average: 0.50378139015012, Standard Deviation: 0.388095724845662
Level 4 - Average: 0.4952416063487219, Standard Deviation: 0.3896846679298247


## Write to a JSON File

### All FP Results

In [16]:
# Map each question to its per-level scores plus their mean and standard
# deviation, then save the mapping as JSON.
level_labels = ("Level 0", "Level 2", "Level 4")
scores_dict = {}
for idx, score_list in enumerate(fp_scores):
    question = df.iloc[idx]["Fermi Question"]

    entry = {label: score_list[pos] for pos, label in enumerate(level_labels)}
    entry["Mean"] = fp_averages[idx]
    entry["Standard Dev"] = fp_std[idx]
    scores_dict[question] = entry

with open("fp_results.json", "w") as results_file:
    json.dump(scores_dict, results_file, indent=4)

# Overall Results JSON

In [17]:
# Re-read the classification data and the per-question FP results from disk.
with open("Fermi_Classification.json", "r") as classification_file:
    data_classification = json.load(classification_file)

with open("fp_results.json", "r") as fp_results_file:
    fp_results_data = json.load(fp_results_file)

In [18]:
# Combine, per question, the raw answers, the prompts, the extracted values
# and the FP scores into one record, then save everything as JSON.
llama_70b_results = {}

for idx, (question, data) in enumerate(overall_data.items()):
    if question in data_classification["specific"]:
        type_question = "specific"
    else:
        type_question = "standard"

    fp_question_results = fp_results_data[question]
    extracted_row = rows[idx]

    record = {}
    for pos, level_name in enumerate(("Level 0", "Level 2", "Level 4")):
        level_entry = {}
        # Level 0 has no stored prompt; the other levels list theirs first.
        if level_name != "Level 0":
            level_entry["Prompt"] = data[f"{level_name} Prompt"]
        level_entry["Raw Answer"] = data[f"{level_name} Answer"]
        level_entry["Extracted Answer"] = str(extracted_row[pos])
        level_entry["Level FP Score"] = fp_question_results[level_name]
        record[level_name] = level_entry

    record["Type"] = type_question
    record["FP Score"] = {
        "Mean": fp_question_results["Mean"],
        "Standard Dev": fp_question_results["Standard Dev"],
    }
    llama_70b_results[question] = record

with open("Llama3_70B_results.json", "w") as output_file:
    json.dump(llama_70b_results, output_file, indent=4)

### FP Scores Depending on the type

In [19]:
# Split the per-level FP scores into two lists according to question type.
standard_fp_scores = []
specific_fp_scores = []

for question, data in llama_70b_results.items():
    new_row = [
        data["Level 0"]["Level FP Score"],
        data["Level 2"]["Level FP Score"],
        data["Level 4"]["Level FP Score"],
    ]
    if data["Type"] == "specific":
        specific_fp_scores.append(new_row)

    elif data["Type"] == "standard":
        standard_fp_scores.append(new_row)

    else:
        # `raise ()` is not a valid exception; report the offending record.
        raise ValueError(
            f"Unknown question type {data['Type']!r} for question: {question}"
        )

# Per-level and pooled statistics for the standard questions.
standard_level_0_average, standard_level_0_std = calculate_level_avg_std(0, standard_fp_scores)
standard_level_2_average, standard_level_2_std = calculate_level_avg_std(2, standard_fp_scores)
standard_level_4_average, standard_level_4_std = calculate_level_avg_std(4, standard_fp_scores)
standard_overall_average, standard_overall_std = calculate_overall_avg_std(standard_fp_scores)

# Per-level and pooled statistics for the specific questions.
specific_level_0_average, specific_level_0_std = calculate_level_avg_std(0, specific_fp_scores)
specific_level_2_average, specific_level_2_std = calculate_level_avg_std(2, specific_fp_scores)
specific_level_4_average, specific_level_4_std = calculate_level_avg_std(4, specific_fp_scores)
specific_overall_average, specific_overall_std = calculate_overall_avg_std(specific_fp_scores)

for group_heading, group_summary in (
    (
        "Standard Questions:",
        (
            ("Level 0", standard_level_0_average, standard_level_0_std),
            ("Level 2", standard_level_2_average, standard_level_2_std),
            ("Level 4", standard_level_4_average, standard_level_4_std),
            ("Overall Results", standard_overall_average, standard_overall_std),
        ),
    ),
    (
        "Specific Questions:",
        (
            ("Level 0", specific_level_0_average, specific_level_0_std),
            ("Level 2", specific_level_2_average, specific_level_2_std),
            ("Level 4", specific_level_4_average, specific_level_4_std),
            ("Overall Results", specific_overall_average, specific_overall_std),
        ),
    ),
):
    print(group_heading)
    for line_label, line_average, line_std in group_summary:
        print(
            f"\t{line_label} - Average: {line_average}, Standard Deviation: {line_std}"
        )

Standard Questions:
	Level 0 - Average: 0.49720958333502374, Standard Deviation: 0.39090551415991964
	Level 2 - Average: 0.4937701613146985, Standard Deviation: 0.39795187928994363
	Level 4 - Average: 0.48152216366366646, Standard Deviation: 0.3941364925044105
	Overall Results - Average: 0.4908288069973935, Standard Deviation: 0.3944020534604114
Specific Questions:
	Level 0 - Average: 0.48153846815070084, Standard Deviation: 0.38203520522486284
	Level 2 - Average: 0.5326249305710552, Standard Deviation: 0.35661797926999345
	Level 4 - Average: 0.5344944562531858, Standard Deviation: 0.37388657064744785
	Overall Results - Average: 0.5165057431222898, Standard Deviation: 0.37172995489235194


## Write the Overall Results to a JSON File

In [20]:
def _stats_entry(average, std):
    """Return one average/standard-deviation pair, formatted as strings."""
    return {"Average": f"{average}", "Standard Dev": f"{std}"}


# Summary statistics for all questions and for each question type; every
# number is stored in its string form.
fp_results_json = {
    "All Questions": {
        "Overall Results": _stats_entry(overall_average, overall_std),
        "Level 0": _stats_entry(level_0_average, level_0_std),
        "Level 2": _stats_entry(level_2_average, level_2_std),
        "Level 4": _stats_entry(level_4_average, level_4_std),
    },
    "Standard Questions": {
        "Overall Results": _stats_entry(
            standard_overall_average, standard_overall_std
        ),
        "Level 0": _stats_entry(standard_level_0_average, standard_level_0_std),
        "Level 2": _stats_entry(standard_level_2_average, standard_level_2_std),
        "Level 4": _stats_entry(standard_level_4_average, standard_level_4_std),
    },
    "Specific Questions": {
        "Overall Results": _stats_entry(
            specific_overall_average, specific_overall_std
        ),
        "Level 0": _stats_entry(specific_level_0_average, specific_level_0_std),
        "Level 2": _stats_entry(specific_level_2_average, specific_level_2_std),
        "Level 4": _stats_entry(specific_level_4_average, specific_level_4_std),
    },
}

# Save the summary to disk.
with open("fp_overall_results.json", "w") as json_file:
    json.dump(fp_results_json, json_file, indent=4)