In [35]:
import json
import random
import re

from datasets import load_dataset, concatenate_datasets, DatasetDict

# unsloth/OpenMathReasoning

In [36]:
# Load the "cot" split and strip it down to the two columns the formatting
# step reads ("problem" and "generated_solution").
reasoning_dataset_hf = load_dataset("unsloth/OpenMathReasoning", split="cot")
_unused_reasoning_columns = [
    name
    for name in reasoning_dataset_hf.column_names
    if name not in ("problem", "generated_solution")
]
reasoning_dataset_hf = reasoning_dataset_hf.remove_columns(_unused_reasoning_columns)
# Prompt templates keyed by file path. Filled on first use so the JSON file is
# read once instead of once per mapped example.
_PROMPT_TEMPLATE_CACHE = {}


def _load_prompt_templates(path="open_qsts_prompts.json"):
    """Return the prompt templates stored in ``path``, parsing the file only once."""
    if path not in _PROMPT_TEMPLATE_CACHE:
        # JSON text is UTF-8 by specification; do not depend on the locale default.
        with open(path, "r", encoding="utf-8") as f:
            _PROMPT_TEMPLATE_CACHE[path] = json.load(f)
    return _PROMPT_TEMPLATE_CACHE[path]


def prepare_reasoning_dataset_hf(example):
    """Turn one OpenMathReasoning row into a question/answer/source record.

    A prompt template is drawn at random from ``open_qsts_prompts.json`` and its
    ``<question>`` placeholder is replaced with ``example["problem"]``.

    Args:
        example: Mapping with string fields ``"problem"`` and
            ``"generated_solution"``.

    Returns:
        dict with keys ``"question"`` (the filled-in template), ``"answer"``
        (``example["generated_solution"]`` unchanged) and ``"source"``.

    Raises:
        OSError: if the template file cannot be opened on first use.
    """
    prompt = random.choice(_load_prompt_templates())
    return {
        "question": prompt.replace("<question>", example["problem"]),
        "answer": example["generated_solution"],
        "source": "unsloth/OpenMathReasoning",
    }

# prepare_reasoning_dataset_hf picks its prompt template with random.choice, so
# seed Python's global RNG right before mapping; otherwise every run of the
# notebook builds a different dataset. 42 matches the seed used for the final
# shuffle of the combined training set.
random.seed(42)
reasoning_dataset_hf = reasoning_dataset_hf.map(
    prepare_reasoning_dataset_hf,
    remove_columns=["problem", "generated_solution"],
)
reasoning_dataset_hf

Dataset({
    features: ['question', 'answer', 'source'],
    num_rows: 192523
})

# FineTome-100k

In [37]:
# Keep only conversations made of exactly two messages, so that entries 0 and 1
# can be read as a single question/answer pair.
conversational_data = load_dataset("mlabonne/FineTome-100k")["train"].filter(
    lambda row: len(row["conversations"]) == 2
)
def clean_conversational_data(example):
    """Flatten the first two messages of a conversation into one record.

    Args:
        example: Mapping whose ``"conversations"`` entry is a sequence of
            messages, each carrying its text under ``"value"``.

    Returns:
        dict with ``"source"``, ``"question"`` (text of message 0) and
        ``"answer"`` (text of message 1), in that key order.
    """
    turns = example["conversations"]
    record = {"source": "mlabonne/FineTome-100k"}
    record["question"] = turns[0]["value"]
    record["answer"] = turns[1]["value"]
    return record

# Swap every original column for the flattened source/question/answer fields.
conversational_data = conversational_data.map(
    function=clean_conversational_data,
    remove_columns=conversational_data.column_names,
)
conversational_data

Dataset({
    features: ['source', 'question', 'answer'],
    num_rows: 77831
})

# TULU

In [38]:
# Load the train split of each Tulu 3 persona subset, in the same order as the
# variable names on the left.
(
    tulu_dataset_math,
    tulu_dataset_code,
    tulu_dataset_algebra,
    tulu_dataset_math_grade,
) = (
    load_dataset(repo_id)["train"]
    for repo_id in (
        "allenai/tulu-3-sft-personas-math",
        "allenai/tulu-3-sft-personas-code",
        "allenai/tulu-3-sft-personas-algebra",
        "allenai/tulu-3-sft-personas-math-grade",
    )
)

def clean_tulu_data(example):
    """Flatten the first two chat messages of a Tulu row into one record.

    Args:
        example: Mapping whose ``"messages"`` entry is a sequence of messages,
            each carrying its text under ``"content"``.

    Returns:
        dict with ``"source"``, ``"question"`` (content of message 0) and
        ``"answer"`` (content of message 1), in that key order.
    """
    messages = example["messages"]
    # NOTE(review): the label names the tulu-3-sft-mixture, while the rows are
    # loaded from the tulu-3-sft-personas-* repos -- confirm this is intended.
    record = {"source": "allenai/tulu-3-sft-mixture"}
    record["question"] = messages[0]["content"]
    record["answer"] = messages[1]["content"]
    return record

# Apply the same flattening to each Tulu subset, replacing all of its original
# columns with source/question/answer.
(
    tulu_dataset_math,
    tulu_dataset_code,
    tulu_dataset_algebra,
    tulu_dataset_math_grade,
) = (
    subset.map(clean_tulu_data, remove_columns=subset.column_names)
    for subset in (
        tulu_dataset_math,
        tulu_dataset_code,
        tulu_dataset_algebra,
        tulu_dataset_math_grade,
    )
)



# MCQs

In [47]:
# Take the train split and drop the rows labelled deepmind/aqua_rat; that
# dataset is loaded and formatted on its own in a separate cell.
mcq_dataset = load_dataset("HAissa/MCQ_dataset")["train"].filter(
    lambda row: row["source"] != "deepmind/aqua_rat"
)
def add_mcq_prompts(example):
    """Normalise option labels of an MCQ row from ``A)`` style to ``A.`` style.

    In the question, a label is rewritten only where it opens a line
    (``"\\nA)"`` .. ``"\\nE)"``). In the answer, a label is rewritten only when
    the letter stands alone, i.e. is not the tail of a longer word or number.
    A plain ``str.replace("A)", "A.")`` would also corrupt text such as
    ``"(DNA)"`` into ``"(DNA."``.

    Args:
        example: Mapping with string fields ``"question"``, ``"answer"`` and a
            ``"source"`` field that is passed through unchanged.

    Returns:
        dict with keys ``"source"``, ``"question"`` and ``"answer"``.
    """
    # Option lines in the question: "\nX)" -> "\nX." for X in A..E.
    question = re.sub(r"\n([A-E])\)", r"\n\1.", example["question"])
    # Answer label: "X)" -> "X." unless X is preceded by a letter or digit.
    answer = re.sub(r"(?<![A-Za-z0-9])([A-E])\)", r"\1.", example["answer"])
    return {
        "source": example["source"],
        "question": question,
        "answer": answer,
    }
# Rewrite the option labels row by row; the mapped output replaces all of the
# original columns.
mcq_dataset = mcq_dataset.map(
    function=add_mcq_prompts,
    remove_columns=mcq_dataset.column_names,
)
mcq_dataset

README.md:   0%|          | 0.00/580 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/149M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/775k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/6.18M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/331322 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5280 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/19388 [00:00<?, ? examples/s]

Filter:   0%|          | 0/331322 [00:00<?, ? examples/s]

Map:   0%|          | 0/233855 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'source'],
    num_rows: 233855
})

# AQuA-RAT

In [41]:
# Train split of AQuA-RAT; it is formatted by prepare_aqua_data.
aqua_dataset = load_dataset(path="deepmind/aqua_rat")["train"]
def prepare_aqua_data(example):
    """Format one AQuA-RAT row as a question with options plus an explained answer.

    The options are appended to the question, one per line, and line-leading
    ``A)`` .. ``E)`` labels are rewritten as ``A.`` .. ``E.``. The answer is the
    option selected by ``example["correct"]`` followed by the rationale.

    Only the leading label of the answer option is rewritten. Replacing every
    ``"X)"`` substring would also damage the option text itself, e.g.
    ``"A)(A+B)/2"`` would become ``"A.(A+B./2"``.

    Args:
        example: Mapping with ``"question"`` (str), ``"options"`` (sequence of
            strings such as ``"A)21"``), ``"rationale"`` (str) and ``"correct"``
            (a single letter; ``"A"`` selects ``options[0]``).

    Returns:
        dict with keys ``"source"``, ``"question"`` and ``"answer"``.
    """
    options = example["options"]
    # One option per line after the question text.
    question = example["question"] + "".join(f"\n{choice}" for choice in options)
    # "\nX)" -> "\nX." for X in A..E.
    question = re.sub(r"\n([A-E])\)", r"\n\1.", question)
    # The letter's offset from "A" is the index of the correct option.
    answer_value = options[ord(example["correct"]) - ord("A")]
    # Rewrite the label at the very start of the option and nothing else.
    answer_value = re.sub(r"^([A-E])\)", r"\1.", answer_value, count=1)
    return {
        "source": "deepmind/aqua_rat",
        "question": question,
        "answer": f"{answer_value}\nExplanation: {example['rationale']}",
    }

# Format every row, dropping the original columns, then show the first record
# as a spot check.
aqua_dataset = aqua_dataset.map(
    function=prepare_aqua_data,
    remove_columns=aqua_dataset.column_names,
)
aqua_dataset[0]


{'question': "Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?\nA.21\nB.21.5\nC.22\nD.22.5\nE.23",
 'source': 'deepmind/aqua_rat',
 'answer': 'E.23\nExplanation: If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.'}

# GSM8K

In [42]:
# Train split of the "main" configuration of GSM8K.
gsm8k = load_dataset(path="openai/gsm8k", name="main")["train"]
def prepare_gsm8k_data(example):
    """Tag a GSM8K row with its source, passing question and answer through.

    Args:
        example: Mapping with ``"question"`` and ``"answer"`` fields.

    Returns:
        dict with ``"source"``, ``"question"`` and ``"answer"``, in that key
        order; the two text fields are copied unchanged.
    """
    # NOTE(review): the label says "cais/gsm8k" but the data is loaded from
    # "openai/gsm8k" -- confirm which id downstream consumers expect.
    return dict(
        source="cais/gsm8k",
        question=example["question"],
        answer=example["answer"],
    )

# Keep only the source/question/answer fields produced by prepare_gsm8k_data.
gsm8k = gsm8k.map(
    function=prepare_gsm8k_data,
    remove_columns=gsm8k.column_names,
)

# StackMathQA

In [43]:
# Train split of the "stackmathqa400k" configuration of StackMathQA.
stack_math = load_dataset(path="math-ai/StackMathQA", name="stackmathqa400k")["train"]
def prepare_stack_math_data(example):
    """Rename a StackMathQA row's ``Q``/``A`` fields and tag its source.

    Args:
        example: Mapping with ``"Q"`` (question text) and ``"A"`` (answer text).

    Returns:
        dict with ``"source"``, ``"question"`` (from ``"Q"``) and ``"answer"``
        (from ``"A"``), in that key order.
    """
    return dict(
        source="math-ai/StackMathQA",
        question=example["Q"],
        answer=example["A"],
    )

# Keep only the source/question/answer fields produced by prepare_stack_math_data.
stack_math = stack_math.map(
    function=prepare_stack_math_data,
    remove_columns=stack_math.column_names,
)


# Combine all

In [44]:
# Stack all prepared training sets and shuffle with a fixed seed so the row
# order is repeatable.
# NOTE(review): conversational_data (FineTome) is prepared earlier in the
# notebook but is not part of this list -- confirm the omission is intended.
combined_train_dataset = concatenate_datasets(
    [
        tulu_dataset_math,
        tulu_dataset_code,
        tulu_dataset_algebra,
        tulu_dataset_math_grade,
        mcq_dataset,
        aqua_dataset,
        gsm8k,
        stack_math,
        reasoning_dataset_hf,
    ]
).shuffle(seed=42)
combined_train_dataset

Dataset({
    features: ['source', 'question', 'answer'],
    num_rows: 1186257
})

In [45]:
# Load the MCQ DatasetDict once and read both held-out splits from it, instead
# of calling load_dataset separately for each split.
# NOTE(review): validation and test are used exactly as loaded. Unlike the
# train MCQ rows, they are neither passed through add_mcq_prompts nor filtered
# on source == "deepmind/aqua_rat" -- confirm this asymmetry is intended.
mcq_eval_splits = load_dataset("HAissa/MCQ_dataset")
validation = mcq_eval_splits["validation"]
test = mcq_eval_splits["test"]
total_dataset = DatasetDict({
    "train": combined_train_dataset,
    "validation": validation,
    "test": test,
})
total_dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'question', 'answer'],
        num_rows: 1186257
    })
    validation: Dataset({
        features: ['question', 'answer', 'source'],
        num_rows: 5280
    })
    test: Dataset({
        features: ['question', 'answer', 'source'],
        num_rows: 19388
    })
})

In [46]:
# Publish all three splits to the Hub; the returned commit info is the cell output.
total_dataset.push_to_hub(repo_id="HAissa/MNLP_M3_mcqa_dataset")

Uploading the dataset shards:   0%|          | 0/8 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/149 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/149 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/149 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/149 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/149 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/149 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/149 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/149 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/HAissa/MNLP_M3_mcqa_dataset/commit/66de2783860a41fd71277b4d04e834c83d866ab7', commit_message='Upload dataset', commit_description='', oid='66de2783860a41fd71277b4d04e834c83d866ab7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/HAissa/MNLP_M3_mcqa_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='HAissa/MNLP_M3_mcqa_dataset'), pr_revision=None, pr_num=None)