# Reasoning Dataset Creation

## Libraries

In [None]:
from datasets import load_dataset
import pandas as pd
import os
import random
import tqdm as tqdm
import re

## Final Reasoning Dataset Structure

In [None]:
# Column schema used for every per-source frame and for the merged dataset
columns = ['question', 'reasoning', 'answer', 'source']

# Empty frame that already carries the target schema
df_final = pd.DataFrame(columns=columns)

## NuminaMath-CoT

In [None]:
# Fetch the NuminaMath-CoT dataset through `load_dataset`
print("Loading NuminaMath-CoT dataset...")
ds_math = load_dataset("AI-MO/NuminaMath-CoT")
print("Dataset loaded successfully.")

# Materialise the "train" split as a pandas DataFrame
print("Converting train set to pandas DataFrame...")
math_train_split = ds_math["train"]
train_df_math = math_train_split.to_pandas()
print("Train set converted successfully.")

In [None]:
# Decrease the size of the dataset to 100k rows, shuffling it (currently disabled)
#train_df_math = train_df_math.sample(n=100000, random_state=42).reset_index(drop=True)

# Patterns are compiled once because extract_answer is called for every dataset row.
_BOXED_RE = re.compile(r"\$?\\boxed\{((?:[^{}]|\{[^{}]*\})*)\}\$?", re.DOTALL)
_CONCLUSION_RE = re.compile(r"Conclusion[:\-]?\s*(.*)", re.IGNORECASE | re.DOTALL)
_BLACKSQUARE_RE = re.compile(r"\$\\blacksquare\$")
_LATEX_COMMAND_RE = re.compile(r"\\[a-zA-Z]+")
_LABEL_WRAPPER_RE = re.compile(r"[\s{}()\[\]$\\.,:;]")


def _find_raw_answer(solution):
    """Return the raw answer text located in `solution`, or None if no marker is present."""
    boxed = _BOXED_RE.search(solution)
    if boxed:
        return boxed.group(1).strip()

    conclusion = _CONCLUSION_RE.search(solution)
    if conclusion:
        return conclusion.group(1).strip()

    marker = _BLACKSQUARE_RE.search(solution)
    if marker:
        preceding_lines = solution[:marker.start()].splitlines()
        # Up to two lines above the marker; a single line is returned on its own.
        return "\n".join(preceding_lines[-2:]).strip() if preceding_lines else None

    return None


def _option_label(raw_answer):
    """Return 'A'-'D' when `raw_answer` is a bare label such as B, (B) or \\text{B}; else None."""
    without_commands = _LATEX_COMMAND_RE.sub("", raw_answer)
    bare = _LABEL_WRAPPER_RE.sub("", without_commands)
    return bare if bare in ("A", "B", "C", "D") else None


def extract_answer(solution, question):
    r"""
    Resolve the multiple-choice answer of `question` from its worked `solution`.

    The raw answer text is located by trying, in order:
    1. The first \boxed{...} (with or without surrounding $), allowing one level of
       nested braces and line breaks inside the box.
    2. Everything following the word 'Conclusion' (case-insensitive).
    3. The two lines above $\blacksquare$ (or the single line, if only one exists).

    The raw text is accepted only when it is a bare option label A-D, optionally
    wrapped in LaTeX commands, braces, parentheses or punctuation. An answer that
    merely contains one of those capitals (e.g. `5\text{ Dollars}`) is not a label.
    The label is then looked up in `question` as `<label>: <option text>`.

    Args:
        solution: Worked solution text.
        question: Problem statement, expected to list options as `A: ...`, `B: ...`.

    Returns:
        "<label>. <option text>" when both the label and its option are found,
        otherwise None. Callers that drop None therefore keep only multiple-choice
        problems whose options use the `X:` layout.
    """
    raw_answer = _find_raw_answer(solution)
    if not raw_answer:
        return None

    label = _option_label(raw_answer)
    if label is None:
        return None

    # The lookbehind stops `A:` from matching inside a word such as `AREA:`.
    # The lookahead ends the option text at the next `X:` label or at end of line,
    # so single-line option lists yield only the chosen option.
    option_match = re.search(
        rf"(?<![A-Za-z0-9]){label}:\s*(.*?)(?=\s+[A-D]:|$)",
        question,
        re.MULTILINE,
    )
    if option_match is None:
        return None

    # NOTE(review): the other sources in this file format answers as "A: text";
    # the ". " separator here is kept as-is - confirm whether that is intended.
    return f"{label}. {option_match.group(1).strip()}"
    



# Build the NuminaMath portion of the dataset, keeping only the rows for which
# `extract_answer` resolves an answer.
print(train_df_math.shape)

rows = []
removed_rows = 0
# itertuples over just the needed columns avoids the per-row Series that iterrows builds
math_records = train_df_math[['problem', 'solution', 'source']].itertuples(index=False, name=None)
for question, reasoning, source in tqdm.tqdm(math_records, total=len(train_df_math)):
    answer = extract_answer(reasoning, question)
    if answer is None:
        removed_rows += 1
        continue
    rows.append({
        'question': question,
        'reasoning': reasoning,
        'answer': answer,
        'source': source
    })

filtered_df_math = pd.DataFrame(rows, columns=columns)

print(f"Removed {removed_rows} rows without valid answers.")
print(f"Filtered DataFrame shape: {filtered_df_math.shape}")


In [None]:
# Show one sample value per column as a quick sanity check
print("First row of each column:")
for column_name in columns:
    first_value = filtered_df_math[column_name].iloc[0]
    print(f"{column_name}: {first_value}")

## allenai/sciq

In [None]:
# The progress message names the dataset actually being loaded
print("Loading SciQ dataset...")
ds_sciq = load_dataset("allenai/sciq")
print("Dataset loaded successfully.")

# Load the Train Set
print("Converting train set to pandas DataFrame...")
train_df_sciq = ds_sciq["train"].to_pandas()
print("Train set converted successfully.")

In [None]:
def create_options(distractor1, distractor2, distractor3, answer):
    """
    Shuffle the three distractors together with the correct answer and label them A-D.

    Returns a tuple `(options_str, labelled_answer)`:
    - `options_str` holds one `<label>: <option>` line per option, joined by newlines;
    - `labelled_answer` is `<label>: <answer>` for the label that was assigned to the
      correct answer.
    """
    shuffled = [distractor1, distractor2, distractor3, answer]
    random.shuffle(shuffled)

    labelled = list(zip("ABCD", shuffled))
    options_str = "\n".join(f"{label}: {text}" for label, text in labelled)

    # When a distractor has the same text as the answer, the last matching label wins.
    answer_label = None
    for label, text in labelled:
        if text == answer:
            answer_label = label

    return options_str, f"{answer_label}: {answer}"
    
# Seed the module-level RNG so the option order produced by `create_options` is
# reproducible, matching the fixed random_state=42 used for DataFrame sampling
# elsewhere in this file.
random.seed(42)

rows = []
sciq_fields = ['question', 'support', 'distractor1', 'distractor2', 'distractor3', 'correct_answer']
# itertuples over just the needed columns avoids the per-row Series that iterrows builds
sciq_records = train_df_sciq[sciq_fields].itertuples(index=False, name=None)
for question, reasoning, distractor1, distractor2, distractor3, answer in tqdm.tqdm(
    sciq_records, total=len(train_df_sciq)
):
    options_str, labelled_answer = create_options(distractor1, distractor2, distractor3, answer)
    rows.append({
        'question': question + "\n" + options_str,
        'reasoning': reasoning,
        'answer': labelled_answer,
        'source': "SciQ"
    })
filtered_df_sciq = pd.DataFrame(rows, columns=columns)
print(f"Filtered SciQ DataFrame shape: {filtered_df_sciq.shape}")
    

In [None]:
# Show one sample value per column as a quick sanity check
print("First row of each column:")
for field in columns:
    sample_value = filtered_df_sciq[field].iloc[0]
    print(f"{field}: {sample_value}")

## deepmind/aqua_rat

In [None]:
# The progress message names the dataset actually being loaded
print("Loading AQUA-RAT dataset...")
ds_rat = load_dataset("deepmind/aqua_rat")
print("Dataset loaded successfully.")

# Load the Train Set
print("Converting train set to pandas DataFrame...")
train_df_rat = ds_rat["train"].to_pandas()
print("Train set converted successfully.")

In [None]:
def format_options_and_answer(options, answer):
    """
    Reformat `LABEL)value` options and expand the answer label.

    Each entry of `options` (e.g. "A)21") is split on its first ")" and rendered as
    "A: 21"; the rendered lines are joined with newlines. If `answer` equals one of
    the labels (e.g. "E"), it is replaced by that option's rendered line ("E: 23");
    otherwise it is returned unchanged.

    Returns a tuple `(options_str, answer)`.
    """
    rendered_lines = []
    resolved_answer = answer
    for raw_option in options:
        label, value = (part.strip() for part in raw_option.split(")", 1))
        rendered = f"{label}: {value}"
        if resolved_answer == label:
            resolved_answer = rendered
        rendered_lines.append(rendered)

    return "\n".join(rendered_lines), resolved_answer


# Work on a seeded sample of at most 20k rows; the cap keeps `sample` from raising
# when fewer rows are available.
train_df_rat = train_df_rat.sample(
    n=min(20000, len(train_df_rat)), random_state=42
).reset_index(drop=True)

rows = []
# itertuples over just the needed columns avoids the per-row Series that iterrows builds
rat_records = train_df_rat[['question', 'rationale', 'options', 'correct']].itertuples(index=False, name=None)
for question, reasoning, options, answer in tqdm.tqdm(rat_records, total=len(train_df_rat)):
    options_str, answer = format_options_and_answer(options, answer)
    rows.append({
        'question': question + "\n" + options_str,
        'reasoning': reasoning,
        'answer': answer,
        'source': "AQUA-RAT"
    })
filtered_df_rat = pd.DataFrame(rows, columns=columns)
print(f"Filtered AQUA-RAT DataFrame shape: {filtered_df_rat.shape}")
    
    
    

In [None]:
# Show one sample value per column as a quick sanity check
print("First row of each column:")
for name in columns:
    head_value = filtered_df_rat[name].iloc[0]
    print(f"{name}: {head_value}")

## openlifescienceai/medmcqa

In [None]:
# The progress message names the dataset actually being loaded
print("Loading MedMCQA dataset...")
ds_med = load_dataset("openlifescienceai/medmcqa")
print("Dataset loaded successfully.")

# Load the Train Set
print("Converting train set to pandas DataFrame...")
train_df_med = ds_med["train"].to_pandas()
print("Train set converted successfully.")

# Keep only rows that have an explanation (`exp`) and are single-choice questions
has_explanation = train_df_med['exp'].notnull()
is_single_choice = train_df_med['choice_type'] == 'single'
train_df_med = train_df_med[has_explanation & is_single_choice].reset_index(drop=True)

In [None]:
# Work on a seeded sample of at most 10k rows; the cap keeps `sample` from raising
# when fewer rows are available.
train_df_med = train_df_med.sample(
    n=min(10000, len(train_df_med)), random_state=42
).reset_index(drop=True)

option_labels = ("A", "B", "C", "D")
rows = []
skipped_rows = 0
med_fields = ['question', 'exp', 'opa', 'opb', 'opc', 'opd', 'cop']
# itertuples over just the needed columns avoids the per-row Series that iterrows builds
med_records = train_df_med[med_fields].itertuples(index=False, name=None)
for question, reasoning, *option_texts, cop in tqdm.tqdm(med_records, total=len(train_df_med)):
    # `cop` is used as a 0-based index into the options. Any other value would leave
    # a raw number in the string-typed `answer` column, so such rows are skipped.
    if cop not in (0, 1, 2, 3):
        skipped_rows += 1
        continue
    option_lines = [f"{label}: {text}" for label, text in zip(option_labels, option_texts)]
    rows.append({
        'question': question + "\n" + "\n".join(option_lines),
        'reasoning': reasoning,
        'answer': option_lines[int(cop)],
        'source': "MedMCQA"
    })
filtered_df_med = pd.DataFrame(rows, columns=columns)
if skipped_rows:
    print(f"Skipped {skipped_rows} rows with a correct-option index outside 0-3.")
print(f"Filtered MedMCQA DataFrame shape: {filtered_df_med.shape}")

In [None]:
# Show one sample value per column as a quick sanity check
print("First row of each column:")
for key in columns:
    leading_value = filtered_df_med[key].iloc[0]
    print(f"{key}: {leading_value}")
    

## Making the Final Dataset

In [None]:
# Stack the four per-source frames into one dataset with a fresh index
source_frames = [filtered_df_math, filtered_df_sciq, filtered_df_rat, filtered_df_med]
df_final = pd.concat(source_frames, ignore_index=True)

# Shuffle all rows with a fixed seed, then renumber the index
shuffled = df_final.sample(frac=1, random_state=42)
df_final = shuffled.reset_index(drop=True)

print(f"Final DataFrame shape: {df_final.shape}")
print(df_final.head())

In [None]:
# Split the final DataFrame into train (first 95% of rows) and test (remainder)
train_size = int(0.95 * len(df_final))
df_train = df_final.iloc[:train_size]
df_test = df_final.iloc[train_size:]

# Save the train and test sets to parquet files
train_file = "reasoning_dataset_train.parquet"
test_file = "reasoning_dataset_test.parquet"
directory = "datasets"
# exist_ok avoids the separate existence check and tolerates a pre-existing directory
os.makedirs(directory, exist_ok=True)
train_path = os.path.join(directory, train_file)
test_path = os.path.join(directory, test_file)
df_train.to_parquet(train_path, index=False)
df_test.to_parquet(test_path, index=False)
print(f"Train DataFrame saved to {train_path}")
print(f"Test DataFrame saved to {test_path}")
