In [1]:
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset

# One accumulator per source dataset. Each starts out as its own empty list
# and is filled by the cells below with normalized question records (dicts).
mmlu_auxiliary_data, arc_easy_data, scienceqa_data, mathqa_data, openbookqa_data = (
    [] for _ in range(5)
)

In [2]:
# MMLU subject names that are treated as STEM. The cells below keep only
# MMLU rows whose subject appears in this list. Order matches the listing
# below (one name per line).
stem_subsets = """
    abstract_algebra
    anatomy
    astronomy
    college_biology
    college_chemistry
    college_computer_science
    college_mathematics
    college_physics
    computer_security
    conceptual_physics
    electrical_engineering
    elementary_mathematics
    high_school_biology
    high_school_chemistry
    high_school_computer_science
    high_school_mathematics
    high_school_physics
    high_school_statistics
    machine_learning
""".split()

# Convert the STEM rows of the auto-labelled MMLU auxiliary-train set into the
# shared record layout and add them to `mmlu_auxiliary_data`.
mmlu_aux_source = "kz919/mmlu-auxiliary-train-auto-labelled"
data = load_dataset(mmlu_aux_source, split="train")

# Answers are stored as integer positions; records carry the letter instead.
int_to_char_ans = dict(enumerate("ABCD"))

# Ids are numbered over the rows that survive the filter, not over all rows.
stem_rows = (row for row in data if row["task"] in stem_subsets)
mmlu_auxiliary_data.extend(
    {
        "dataset": mmlu_aux_source,
        "id": f"mmlu_auxiliary_train_auto_labelled_{idx}",
        "question": row["question"],
        "choices": row["choices"],
        "answer": int_to_char_ans[row["answer"]],
    }
    for idx, row in enumerate(stem_rows)
)

In [3]:
# Build the MMLU validation records from the STEM subjects of the
# `cais/mmlu` validation split ("all" config).
data = load_dataset("cais/mmlu", 'all', split="validation")

# Answers are stored as integer positions; records carry the letter instead.
int_to_char_ans = dict(enumerate("ABCD"))

# Ids are numbered over the rows that survive the subject filter.
stem_validation_rows = (row for row in data if row["subject"] in stem_subsets)
mmlu_validation_data = [
    {
        "dataset": "cais/mmlu",
        "id": f"mmlu_{idx}",
        "question": row["question"],
        "choices": row["choices"],
        "answer": int_to_char_ans[row["answer"]],
    }
    for idx, row in enumerate(stem_validation_rows)
]

In [4]:
# Convert the ARC-Easy train split into the shared record layout.
#
# ARC answer keys are not always letters: part of the upstream data labels its
# options "1", "2", ... instead of "A", "B", .... Those keys are mapped to the
# letter at the same position so that `answer` is a letter, as it is for the
# other subsets built in this notebook. Letter keys pass through unchanged.
arc_numeric_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}

data = load_dataset("allenai/ai2_arc", "ARC-Easy", split="train")
for data_point in data:
    answer_key = data_point["answerKey"]
    arc_easy_data.append({
        "dataset": "allenai/ai2_arc",
        "id": f"arc_easy_{data_point['id']}",
        "question": data_point["question"],
        "choices": data_point["choices"]["text"],
        "answer": arc_numeric_to_letter.get(answer_key, answer_key)
    })

In [5]:
# Convert the ARC-Easy validation split into the shared record layout.
#
# ARC answer keys are not always letters: part of the upstream data labels its
# options "1", "2", ... instead of "A", "B", .... Those keys are mapped to the
# letter at the same position so that `answer` is a letter, as it is for the
# other subsets built in this notebook. Letter keys pass through unchanged.
arc_numeric_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}

data = load_dataset("allenai/ai2_arc", "ARC-Easy", split="validation")
arc_easy_data_validation = []
for data_point in data:
    answer_key = data_point["answerKey"]
    arc_easy_data_validation.append({
        "dataset": "allenai/ai2_arc",
        "id": f"arc_easy_{data_point['id']}",
        "question": data_point["question"],
        "choices": data_point["choices"]["text"],
        "answer": arc_numeric_to_letter.get(answer_key, answer_key)
    })

In [6]:
# Build the ScienceQA training pool from the upstream "train" and "test"
# splits. Only text-only (no image), natural-science, closed-choice questions
# are kept.
#
# The counter is shared across both splits so every record appended to
# `scienceqa_data` gets a distinct id. Restarting it for the second split
# would hand out "scienceqa_0", "scienceqa_1", ... a second time.
# `int_to_char_ans` (integer position -> letter) comes from an earlier cell.
cnt = 0
for split_name in ("train", "test"):
    data = load_dataset("derek-thomas/ScienceQA", split=split_name)
    for data_point in data:
        if data_point["image"] is not None or data_point["subject"] != "natural science" or data_point["task"] != "closed choice":
            continue
        scienceqa_data.append({
            "dataset": "derek-thomas/ScienceQA",
            "id": f"scienceqa_{cnt}",
            "question": data_point["question"],
            "choices": data_point["choices"],
            "answer": int_to_char_ans[data_point["answer"]]
        })
        cnt += 1

In [7]:
# Build the ScienceQA validation records from the upstream validation split.
# `int_to_char_ans` (integer position -> letter) comes from an earlier cell.
data = load_dataset("derek-thomas/ScienceQA", split="validation")

# Keep text-only, natural-science, closed-choice questions. Ids are numbered
# over the rows that pass this filter.
kept_rows = (
    row
    for row in data
    if row["image"] is None
    and row["subject"] == "natural science"
    and row["task"] == "closed choice"
)
scienceqa_data_validation = [
    {
        "dataset": "derek-thomas/ScienceQA",
        "id": f"scienceqa_{idx}",
        "question": row["question"],
        "choices": row["choices"],
        "answer": int_to_char_ans[row["answer"]],
    }
    for idx, row in enumerate(kept_rows)
]

In [8]:
import re

# MathQA train split; it is converted to records further down in this cell.
data = load_dataset("allenai/math_qa", split="train")

# Running index used to number the MathQA record ids.
cnt = 0

# MathQA's `correct` field is a lower-case letter; records use upper case.
char_to_char_ans = {letter: letter.upper() for letter in "abcde"}

def extract_choices(choices_str):
    """Split a labelled options string into the list of choice texts.

    The input is expected to look like ``"a ) 38 , b ) 27.675 , c ) 30"``:
    each choice is introduced by a lower-case label ``a``-``e`` followed by
    ``)``, and choices are separated by commas.

    A label is only recognised when it opens the string or directly follows a
    comma (whitespace, quotes or an opening bracket may sit in between), and
    only when it is the next letter after the previously accepted label. The
    text of a choice is everything up to the next accepted label, so commas
    inside a choice (``"1,000"``, ``"( c , d )"``) no longer truncate it.

    Args:
        choices_str: The raw options string.

    Returns:
        A list with one cleaned-up string per choice, in label order. The
        list is empty when no label is found.
    """
    label_pattern = r"(?:^|,)[\s\[\'\"]*([a-e])\s*\)"

    # Collect label matches, skipping look-alikes that break the a, b, c, ...
    # sequence (e.g. the ", d )" inside a choice such as "( c , d )").
    labels = []
    for match in re.finditer(label_pattern, choices_str):
        if not labels or ord(match.group(1)) == ord(labels[-1].group(1)) + 1:
            labels.append(match)

    choices = []
    for pos, match in enumerate(labels):
        # A choice runs until the separator of the next label, or to the end.
        stop = labels[pos + 1].start() if pos + 1 < len(labels) else len(choices_str)
        text = choices_str[match.end():stop]
        # Clean up whitespace and the spaces placed before dots and commas
        choices.append(text.strip().replace(' .', '.').replace(' ,', ','))
    return choices

# Convert every row of the loaded MathQA split into the shared record layout
# (plus the rationale as `context`) and add it to `mathqa_data`.
# Ids are numbered from 0 in row order.
mathqa_data.extend(
    {
        "dataset": "allenai/math_qa",
        "id": f"mathqa_{idx}",
        "question": example["Problem"],
        "choices": extract_choices(example["options"]),
        "answer": char_to_char_ans[example["correct"]],
        "context": example["Rationale"],
    }
    for idx, example in enumerate(data)
)

In [9]:
# Build the MathQA validation records from the upstream validation split.
# `extract_choices` and `char_to_char_ans` come from the previous cell.
data = load_dataset("allenai/math_qa", split="validation")

# Ids are numbered from 0 in row order; the rationale is kept as `context`.
mathqa_data_validation = [
    {
        "dataset": "allenai/math_qa",
        "id": f"mathqa_{idx}",
        "question": example["Problem"],
        "choices": extract_choices(example["options"]),
        "answer": char_to_char_ans[example["correct"]],
        "context": example["Rationale"],
    }
    for idx, example in enumerate(data)
]

In [12]:
# Build the OpenBookQA training records from the "additional" config, which
# supplies the `fact1` field stored here as `context`.
data = load_dataset("allenai/openbookqa", "additional", split="train")

# Ids are numbered from 0 in row order.
openbookqa_data = [
    {
        "dataset": "allenai/openbookqa",
        "id": f"openbookqa_{idx}",
        "question": example["question_stem"],
        "choices": example["choices"]["text"],
        "answer": example["answerKey"],
        "context": example["fact1"],
    }
    for idx, example in enumerate(data)
]

In [14]:
# Build the OpenBookQA validation records from the upstream "validation" and
# "test" splits of the "additional" config (`fact1` is stored as `context`).
#
# Both splits go into the same list. The list is created once, before the
# loop: re-creating it between the two splits would silently throw away every
# validation record and leave only the test rows.
# NOTE(review): merging both splits is inferred from the shared id counter —
# confirm that the test rows are meant to be part of the validation set.
openbookqa_data_validation = []
cnt = 0  # shared across both splits so ids stay unique

for split_name in ("validation", "test"):
    data = load_dataset("allenai/openbookqa", "additional", split=split_name)
    for data_point in data:
        openbookqa_data_validation.append({
            "dataset": "allenai/openbookqa",
            "id": f"openbookqa_{cnt}",
            "question": data_point["question_stem"],
            "choices": data_point["choices"]["text"],
            "answer": data_point["answerKey"],
            "context": data_point["fact1"],
        })
        cnt += 1

In [15]:
def push_to_hf(subset_name, train_dataset, validation_dataset=None):
    """Upload one subset to the `igzi/MNLP_M2_mcqa_dataset` Hub repository.

    Args:
        subset_name: Passed to `push_to_hub` as `config_name`.
        train_dataset: List of record dicts; becomes the "train" split.
        validation_dataset: Optional list of record dicts; when given (not
            None) it becomes the "validation" split, otherwise only "train"
            is uploaded.
    """
    splits = {"train": Dataset.from_list(train_dataset)}
    if validation_dataset is not None:
        splits["validation"] = Dataset.from_list(validation_dataset)

    DatasetDict(splits).push_to_hub(
        "igzi/MNLP_M2_mcqa_dataset", config_name=subset_name
    )

In [16]:
# Upload every subset, one Hub config per source dataset, in this order.
subsets_to_push = [
    ("MMLU", mmlu_auxiliary_data, mmlu_validation_data),
    ("ARC-Easy", arc_easy_data, arc_easy_data_validation),
    ("ScienceQA", scienceqa_data, scienceqa_data_validation),
    ("MathQA", mathqa_data, mathqa_data_validation),
    ("OpenBookQA", openbookqa_data, openbookqa_data_validation),
]
for config_name, train_records, validation_records in subsets_to_push:
    push_to_hf(config_name, train_records, validation_records)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]