# Clean the dataset
This notebook cleans the dataset:
- Convert the answer of open-end numerical questions into e-notation (e.g. from `2 × 10^-3` to `2e-3`)
- Remove duplicate question/answer pairs

### Import

In [1]:
import pandas as pd
import numpy as np

import re
import json

### Read JSON

In [2]:
# Load the merged dataset from the notebook's working directory.
df = pd.read_json("merged_formatted_dataset.json")

# Rows whose `options` list is empty go to `open_end_numerical_df`;
# all remaining rows go to `others`.
has_no_options = df["options"].map(len).eq(0)
open_end_numerical_df = df.loc[has_no_options].copy()
others = df.loc[~has_no_options].copy()

# Answers of the rows without options, kept aside for reformatting.
ans = open_end_numerical_df["answer"]

### Formatting
We use the following script to convert the answers of the __open-ended numerical questions__ to e-notation, e.g. from `2 × 10^-3` to `2e-3`.

This change allows future programs to compare the model's output with the dataset's output by using something like `float('2e-3') == 0.002`.

In [3]:
def replace_superscript(text):
    """Convert superscript digits and minus in the leading token of ``text`` to ASCII.

    ``text`` is split on single spaces. Only the first piece is translated,
    and only when it contains a character matching the regex ``\\d``; every
    other piece is returned untouched.
    """
    # Superscript characters paired positionally with their plain replacements.
    to_plain = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹⁻", "0123456789-")

    head, *tail = text.split(" ")

    # A leading token with no digit in it is left exactly as it was.
    if re.search(r"\d", head):
        head = head.translate(to_plain)

    return " ".join([head, *tail])


def replace_to_e_notation(answer):
    """Rewrite "× 10" style powers of ten in ``answer`` as e-notation.

    The substitutions run in the order listed (caret forms before bare forms),
    after which ``replace_superscript`` is applied to the result.

    NOTE(review): no call to this function appears in the visible cells.
    """
    substitutions = (
        ("×10^", "e"),
        (" × 10^", "e"),
        ("×10", "e"),
        (" × 10", "e"),
        ("10^", "1e"),
    )
    for old, new in substitutions:
        answer = answer.replace(old, new)
    return replace_superscript(answer)

def format_power_10(answer):
    """Normalise powers of ten in ``answer`` to the ``<mantissa>*10^<exp>`` form.

    Steps, in order:

    1. The "×10^", " × 10^", "×10", " × 10" and " x 10^" spellings become ``*10^``.
    2. ``replace_superscript`` converts superscript characters in the first token.
    3. E-notation in the first token (``2e-3``) becomes ``2*10^-3``.
    4. A first token containing ``π`` gets the LaTeX pi command instead and is
       wrapped in ``$...$``.

    Only the first space-separated token is affected by steps 3 and 4.
    """
    for spelling in ("×10^", " × 10^", "×10", " × 10", " x 10^"):
        answer = answer.replace(spelling, "*10^")
    answer = replace_superscript(answer)

    head, *tail = answer.split(" ")

    # Only an "e" between a digit and an (optionally signed) exponent digit is
    # e-notation. A plain `"e" in head` test would also rewrite the "e" inside
    # ordinary words such as "zero".
    head = re.sub(r"(?<=\d)e(?=[+-]?\d)", "*10^", head)

    if "π" in head:
        # The backslash is escaped explicitly; a bare "\p" is an invalid escape
        # sequence that newer Python versions warn about.
        head = "$" + head.replace("π", "\\pi") + "$"

    return " ".join([head, *tail])

# Run the power-of-ten normalisation over every answer in `ans`.
ans = ans.map(format_power_10)

Then we merge it back into the original dataset.

In [4]:
# Put the reformatted answers back into the frame (aligned on the index).
open_end_numerical_df = open_end_numerical_df.assign(answer=ans)

In [5]:
# Stack the two subsets again and renumber the rows from zero.
cleaned_dataset = pd.concat([open_end_numerical_df, others], ignore_index=True)

In [6]:
# Preview the recombined frame; pandas elides the middle rows in the rendered output.
cleaned_dataset

Unnamed: 0,question,options,CoT,answer
0,The plates of a capacitor are charged to a pot...,[],,0.01 C
1,How much work is required to charge a 10 µF ca...,[],,0.05 J
2,"A soccer ball, at rest on the ground, is kicke...",[],,1 s
3,A solid sphere (I = 0.06 kg·m^2) spins freely ...,[],,2 s
4,A beam of electrons has speed 10^7 m/s. It is ...,[],,1 m
...,...,...,...,...
1992,The magnetic field lines about a current-carry...,"[circles, radial lines, eddy currents, energy ...",,circles
1993,An inflated balloon with a heavy rock tied to ...,"[increases., decreases., remains largely uncha...",,decreases.
1994,Boiling and freezing occur when water is subje...,"[decreased temperatures, decreased atmospheric...",,decreased atmospheric pressure
1995,When you bend the branch of a tree by hanging ...,"[tension., compression., Both., Neither.]",,tension.


### Remove duplicate question/answer pairs from the dataset

In [7]:
# Two rows count as duplicates when both question and answer are identical;
# the first occurrence of each pair is the one that is kept.
qa = cleaned_dataset.loc[:, ["question", "answer"]]
is_first_occurrence = ~qa.duplicated(keep="first")
cleaned_dataset = cleaned_dataset.loc[is_first_occurrence].copy()

### Clean the dataset

#### Manually format 4 answers

In [8]:
# Remove the space used as a thousands separator inside the number.
cleaned_dataset.loc[cleaned_dataset["answer"].eq("16 000 J"), "answer"] = "16000 J"

In [9]:
# Remove the space used as a thousands separator inside the number.
cleaned_dataset.loc[cleaned_dataset["answer"].eq("36 750 N"), "answer"] = "36750 N"

In [10]:
# Put a space between the value and its unit.
cleaned_dataset.loc[cleaned_dataset["answer"].eq("-36m"), "answer"] = "-36 m"

In [11]:
# Strip the trailing full stop from this answer.
cleaned_dataset.loc[cleaned_dataset["answer"].eq("32q."), "answer"] = "32q"

#### Break a question into two

In [12]:
# Pull out the row(s) whose answer bundles two quantities and show the first match.
data = cleaned_dataset.loc[cleaned_dataset["answer"].eq("40 N, 2.67 m/s²")].reset_index(drop=True)
first_match = data.iloc[0]
(first_match["question"], first_match["CoT"], first_match["answer"])

('A 15 kg box is pushed with a force of 60 N. If the frictional force is 20 N, what is the net force and the resulting acceleration?',
 'Net force = 60 N − 20 N = 40 N; then a = F/m = 40 N ÷ 15 kg.',
 '40 N, 2.67 m/s²')

In [13]:
# The combined question is replaced by two single-quantity questions that share
# the original chain of thought.
q1 = {
    "question": "A 15 kg box is pushed with a force of 60 N. If the frictional force is 20 N, what is the net force?",
    "options": [],
    "CoT": "Net force = 60 N − 20 N = 40 N; then a = F/m = 40 N ÷ 15 kg.",
    "answer": "40 N",
}
q2 = {
    "question": "A 15 kg box is pushed with a force of 60 N. If the frictional force is 20 N, what is the resulting acceleration?",
    "options": [],
    "CoT": "Net force = 60 N − 20 N = 40 N; then a = F/m = 40 N ÷ 15 kg.",
    "answer": "2.67 m/s²",
}

# Filter the combined row out with a boolean mask and renumber while appending.
# Without `ignore_index=True` the two new rows would carry the labels 0 and 1,
# which can duplicate labels already in the frame and make any later
# label-based drop remove unrelated rows.
cleaned_dataset = pd.concat(
    [cleaned_dataset[cleaned_dataset["answer"] != "40 N, 2.67 m/s²"], pd.DataFrame([q1, q2])],
    ignore_index=True,
)

#### Break a question into two

In [14]:
# Pull out the row(s) whose answer covers both area and volume and show the first match.
data = cleaned_dataset.loc[cleaned_dataset["answer"].eq("9 and the volume by 27.")].reset_index(drop=True)
first_match = data.iloc[0]
(first_match["question"], first_match["CoT"], first_match["answer"])

('When you scale up an object to 3 times its linear size, the surface area increases by',
 '',
 '9 and the volume by 27.')

In [15]:
# The combined statement is replaced by two fill-in-the-blank questions, one per quantity.
q1 = {
    "question": "When you scale up an object to 3 times its linear size, the surface area increases by () times.",
    "options": [],
    "CoT": "",
    "answer": "9",
}
q2 = {
    "question": "When you scale up an object to 3 times its linear size, the volume increases by () times.",
    "options": [],
    "CoT": "",
    "answer": "27",
}

# Filter the combined row out with a boolean mask and renumber while appending.
# Without `ignore_index=True` the two new rows would carry the labels 0 and 1,
# which can duplicate labels already in the frame and make any later
# label-based drop remove unrelated rows.
cleaned_dataset = pd.concat(
    [cleaned_dataset[cleaned_dataset["answer"] != "9 and the volume by 27."], pd.DataFrame([q1, q2])],
    ignore_index=True,
)

#### Dealing with questions with "ohm"

In [16]:
def replace_ohm(answer):
    """Swap every literal ``ohms`` in ``answer`` for the ``Ω`` symbol."""
    word, symbol = "ohms", "Ω"
    return answer.replace(word, symbol)

# Apply the word-to-symbol replacement to every answer.
cleaned_dataset["answer"] = cleaned_dataset["answer"].map(replace_ohm)

In [17]:
def add_space_to_ohm(answer):
    """Separate ``Ω`` from the text it is attached to, e.g. ``10Ω`` -> ``10 Ω``.

    The answer is only touched when its first space-separated token contains
    ``Ω``. A space is inserted only where the symbol directly follows a
    non-space character, so an ``Ω`` that is already spaced, or that starts the
    string, does not pick up an extra blank.
    """
    if "Ω" not in answer.split(" ")[0]:
        return answer
    # The lookbehind skips symbols that already have whitespace (or nothing) before them.
    return re.sub(r"(?<=\S)Ω", " Ω", answer)

# Apply the value/symbol spacing to every answer.
cleaned_dataset["answer"] = cleaned_dataset["answer"].map(add_space_to_ohm)

#### Replace `Joules` with `J`

In [18]:
def replace_joule(answer):
    """Abbreviate every literal ``Joules`` in ``answer`` to ``J``."""
    word, symbol = "Joules", "J"
    return answer.replace(word, symbol)

# Apply the unit abbreviation to every answer.
cleaned_dataset["answer"] = cleaned_dataset["answer"].map(replace_joule)

#### Rewrite `m/s²`, `m/s2`, `m²`, `m³` in caret form, i.e. `xx^y`

In [19]:
def replace_msx(answer):
    """Write the unit as ``m/s^2`` wherever ``answer`` has ``m/s2`` or ``m/s²``."""
    for variant in ("m/s2", "m/s²"):
        answer = answer.replace(variant, "m/s^2")
    return answer

# Apply the acceleration-unit rewrite to every answer.
cleaned_dataset["answer"] = cleaned_dataset["answer"].map(replace_msx)

In [20]:
def replace_mx(answer):
    """Rewrite ``m²`` and ``m³`` in ``answer`` with a caret exponent (``m^2``, ``m^3``)."""
    caret_forms = {"m²": "m^2", "m³": "m^3"}
    for superscript_form, caret_form in caret_forms.items():
        answer = answer.replace(superscript_form, caret_form)
    return answer

# Apply the area/volume-unit rewrite to every answer.
cleaned_dataset["answer"] = cleaned_dataset["answer"].map(replace_mx)

#### Treat degree `°` as a unit

In [21]:
def add_space_to_degree(answer):
    """Separate ``°`` from the text it is attached to, e.g. ``30°`` -> ``30 °``.

    The answer is only touched when its first space-separated token contains
    ``°``. A space is inserted only where the sign directly follows a
    non-space character, so a ``°`` that is already spaced, or that starts the
    string, does not pick up an extra blank.
    """
    if "°" not in answer.split(" ")[0]:
        return answer
    # The lookbehind skips signs that already have whitespace (or nothing) before them.
    return re.sub(r"(?<=\S)°", " °", answer)

# Apply the value/degree-sign spacing to every answer.
cleaned_dataset["answer"] = cleaned_dataset["answer"].map(add_space_to_degree)

#### Remove `.` from `minutes.` and `years.`

In [22]:
def remove_full_stop(answer):
    """Strip the full stop that directly follows ``minutes`` or ``years`` in ``answer``."""
    for unit in ("minutes", "years"):
        answer = answer.replace(unit + ".", unit)
    return answer

# Apply the full-stop removal to every answer.
cleaned_dataset["answer"] = cleaned_dataset["answer"].map(remove_full_stop)

#### Rephrase the question and answer

In [23]:
# Give the question an explicit sign convention so the answer can be a signed number.
ans = "300 J out of the system"
ques = "A gas undergoes an expansion-compression cycle. If, plotted on a P-V diagram, the cycle is counterclockwise and the work is 300 J in magnitude, what was the heat transfer during this cycle? Take into the system as positive."

# Locate the row(s) once, before the answer text is overwritten.
is_target = cleaned_dataset["answer"].eq(ans)
cleaned_dataset.loc[is_target, "question"] = ques
cleaned_dataset.loc[is_target, "answer"] = "-300 J"

In [24]:
# Give the question an explicit sign convention so the answer can be a signed number.
ans = "25 m/s, downward"
ques = "A stone is thrown vertically upward with an initial speed of 5 m/s. What is the velocity of the stone 3 seconds later? Take upward as positive."

# Locate the row(s) once, before the answer text is overwritten.
is_target = cleaned_dataset["answer"].eq(ans)
cleaned_dataset.loc[is_target, "question"] = ques
cleaned_dataset.loc[is_target, "answer"] = "-24.4 m/s"

In [25]:
# Turn the statement into a fill-in-the-blank question with a bare number as its answer.
ans = "100 times as much"
ques = "Imagine you're standing on the surface of a shrinking planet. If it shrinks to one-tenth its original diameter with no change in mass on the shrunken surface you'd weigh () times as much."

# Locate the row(s) once, before the answer text is overwritten.
is_target = cleaned_dataset["answer"].eq(ans)
cleaned_dataset.loc[is_target, "question"] = ques
cleaned_dataset.loc[is_target, "answer"] = "100"

In [26]:
# Turn the statement into a fill-in-the-blank question with a bare number as its answer.
ans = "100 times greater"
ques = "Compared with a sound of 60 decibels, a sound of 80 decibels has an intensity () times greater."

# Locate the row(s) once, before the answer text is overwritten.
is_target = cleaned_dataset["answer"].eq(ans)
cleaned_dataset.loc[is_target, "question"] = ques
cleaned_dataset.loc[is_target, "answer"] = "100"

In [27]:
# Restate this answer in J/(kg·K).
ans = "4.18 J/g°C"
is_target = cleaned_dataset["answer"].eq(ans)
cleaned_dataset.loc[is_target, "answer"] = "4184 J/(kg·K)"

In [28]:
# Replace the symbolic inverse-tangent answer with its value in degrees.
ans = "tan–1 4"
is_target = cleaned_dataset["answer"].eq(ans)
cleaned_dataset.loc[is_target, "answer"] = "75.96 °"

#### Remove one entry because the same question appears twice with different units in the answer

In [29]:
# Keep every row except the one(s) with this answer. A boolean filter is used
# instead of dropping by index label: rows appended with `pd.concat` keep their
# own labels, so the index may hold repeated labels, and a label-based drop
# would remove every row sharing the label.
cleaned_dataset = cleaned_dataset.loc[cleaned_dataset["answer"] != "7.2*10^5 V/m"].copy()

#### Remove a factual open-ended question

In [30]:
# Keep every row except the one(s) with this question. A boolean filter is used
# instead of dropping by index label: rows appended with `pd.concat` keep their
# own labels, so the index may hold repeated labels, and a label-based drop
# would remove every row sharing the label.
cleaned_dataset = cleaned_dataset.loc[
    cleaned_dataset["question"] != "What is the truth table for an XOR gate?"
].copy()

In [31]:
# Renumber the rows from zero, then report how many remain.
cleaned_dataset = cleaned_dataset.reset_index(drop=True)
cleaned_dataset.shape[0]

1997

### Save as JSON

In [32]:
# One dict per row; `ensure_ascii=False` writes non-ASCII characters
# literally instead of as \u escapes.
records = cleaned_dataset.to_dict(orient="records")
with open("cleaned_dataset.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)