In [6]:
# Script to load, sample, unify, and push MCQA datasets to the Hugging Face Hub.
# Replace HF_USERNAME with your own Hugging Face username.

HF_USERNAME = "NicoHelemon"
REPO_ID = f"{HF_USERNAME}/MNLP_M3_mcqa_dataset"

import os

# Never store an access token in the notebook: read it from the environment and
# fall back to an interactive, non-echoing prompt only when it is missing.
# NOTE(review): the token that used to be hardcoded on this line is exposed and
# should be revoked in the Hugging Face account settings.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

from datasets import load_dataset, concatenate_datasets
import random
from collections import deque

# Number of training examples to draw from each source dataset.
SUBSETS = dict(
    openbookqa=4900,
    sciq=10000,
    race=50000,
    mmlu_aux=85100,
    aqua_rat=50000,
    medmcqa=50000,
)

# SUBSETS = {
#     'openbookqa': 10,
#     'sciq':       10,
#     #'race':       10,
#     'mmlu_aux':   10,
#     'aqua_rat':   10,
#     'medmcqa' :   10
# }

# 1. Load the train, validation and test split of every source dataset.

# Hub path and configuration name (None = no explicit configuration) per source.
_HUB_SOURCES = {
    'openbookqa': ("allenai/openbookqa", "additional"),
    'sciq':       ("allenai/sciq", None),
    'race':       ("ehovy/race", "all"),
    'mmlu_aux':   ("cais/mmlu", "all"),
    'aqua_rat':   ("NicoHelemon/aqua_rat_4", None),
    'medmcqa':    ("openlifescienceai/medmcqa", None),
}


def _load_all(split_for):
    """Load one split per source; ``split_for(key)`` gives the split name to request."""
    return {
        key: load_dataset(path, name=config, split=split_for(key))
        for key, (path, config) in _HUB_SOURCES.items()
    }


# The training data of the 'mmlu_aux' source is requested as 'auxiliary_train';
# every other source is requested as 'train'.
raw_datasets = _load_all(lambda key: 'auxiliary_train' if key == 'mmlu_aux' else 'train')

raw_val_datasets = _load_all(lambda key: 'validation')

raw_test_datasets = _load_all(lambda key: 'test')

def _eval_budget(eval_sets, keys):
    """Size the evaluation samples relative to the training subset sizes.

    Returns a 3-tuple:
      * eval/train length ratio of every source in ``keys``,
      * the smallest of those ratios, capped at 0.05,
      * per-source sample size ``max(1, int(SUBSETS[name] * ratio))``.
    """
    ratios = {}
    for key in keys:
        ratios[key] = len(eval_sets[key]) / len(raw_datasets[key])
    shared_ratio = min(0.05, min(ratios.values()))
    sizes = {}
    for key in SUBSETS:
        # max(1, ...) guarantees at least one example per source
        sizes[key] = max(1, int(SUBSETS[key] * shared_ratio))
    return ratios, shared_ratio, sizes


# Validation: one common ratio for all sources, limited by the scarcest one.
actual_ratios, val_ratio, VAL_SUBSETS = _eval_budget(raw_val_datasets, raw_datasets)

# Test: same rule, computed from the test splits.
actual_test_ratios, test_ratio, TEST_SUBSETS = _eval_budget(raw_test_datasets, raw_test_datasets)

# Sample every split down to its target size.

def _shuffled_head(dataset, size, seed=42):
    """Shuffle ``dataset`` with a fixed seed and keep its first ``size`` rows.

    ``size`` is clamped to ``len(dataset)`` so that asking for more rows than
    are available returns the whole (shuffled) dataset instead of raising.
    """
    return dataset.shuffle(seed=seed).select(range(min(size, len(dataset))))


sampled_datasets = {}
for name, ds in raw_datasets.items():
    subset_size = SUBSETS[name]
    total = len(ds)
    # A training set that is already small enough is kept as is (unshuffled).
    sampled_datasets[name] = _shuffled_head(ds, subset_size) if subset_size < total else ds
    print(f"{name}: selected {len(sampled_datasets[name])} of {total}")

sampled_val_datasets = {}
for name, ds in raw_val_datasets.items():
    sampled_val_datasets[name] = _shuffled_head(ds, VAL_SUBSETS[name])
    print(f"{name} val: selected {len(sampled_val_datasets[name])} of {len(ds)}")

sampled_test_datasets = {}
for name, ds in raw_test_datasets.items():
    # A source without a configured test size keeps its whole test split.
    sampled_test_datasets[name] = _shuffled_head(ds, TEST_SUBSETS.get(name, len(ds)))
    print(f"{name} test: selected {len(sampled_test_datasets[name])} of {len(ds)}")

# 2. Mapping function to unify examples

def unify_example(example, source, rng=None):
    """Convert one raw example from ``source`` into the shared MCQA record.

    Args:
        example: Mapping with the raw fields of a single example. The fields
            read depend on ``source`` (see the branches below).
        source: One of 'openbookqa', 'sciq', 'race', 'mmlu_aux', 'aqua_rat',
            'medmcqa'.
        rng: Optional object exposing ``randint(a, b)``; used only for 'sciq'
            to pick the position of the correct answer. Defaults to the
            module-level ``random`` generator.

    Returns:
        dict with the keys 'question', 'options', 'rationale', 'label'
        (upper-case letter), 'label_idx' (0-based index into 'options') and
        'dataset' (the ``source`` value).

    Raises:
        ValueError: if ``source`` is not one of the supported names.
    """
    record = {
        'question': None,
        'options': [],
        'rationale': '',
        'label': None,
        'label_idx' : None,
        'dataset': source
    }
    if source == 'openbookqa':
        record['question'] = example['question_stem']
        record['options'] = list(example['choices']['text'])
        record['label'] = example['answerKey']
        record['label_idx'] = ord(example['answerKey']) - ord('A')
        record['rationale'] = "Key fact:\n" + example['fact1']
    elif source == 'sciq':
        record['question'] = example['question']
        options = deque([
            example['correct_answer'],
            example['distractor1'],
            example['distractor2'],
            example['distractor3'],
        ])
        # The correct answer starts at index 0; rotating right by `shift`
        # moves it to index `shift`, which therefore is the label index.
        picker = random if rng is None else rng
        shift = picker.randint(0, 3)
        options.rotate(shift)
        record['options'] = list(options)
        record['label'] = chr(ord('A') + shift)
        record['label_idx'] = shift
        record['rationale'] = "Supporting evidence:\n" + example['support']
    elif source == 'race':
        record['question'] = example['question']
        record['options'] = example['options']
        record['label'] = example['answer']
        record['label_idx'] = ord(example['answer']) - ord('A')
        record['rationale'] = "Article passage (for context):\n"  + example['article']
    elif source == 'mmlu_aux':
        record['question'] = example['question']
        record['options'] = example['choices']
        record['label'] = chr(ord('A') + example['answer'])
        record['label_idx'] = example['answer']
    elif source == 'aqua_rat':
        record['question'] = example['question']
        # Drop the two-character prefix of every option string.
        # NOTE(review): presumably a letter marker such as "A)" - confirm.
        record['options'] = [opt[2:] for opt in example['options']]
        record['rationale'] = "Step-by-step solution:\n" + example.get('rationale', '')
        record['label'] = example['correct']
        record['label_idx'] = ord(example['correct']) - ord('A')
    elif source == 'medmcqa':
        record['question'] = example['question']
        record['options']  = [example['opa'], example['opb'], example['opc'], example['opd']]
        cop = example['cop']
        record['label_idx'] = cop
        # Upper-case letter, consistent with every other source. An index
        # outside 0-3 cannot name one of the four options, so the label is left
        # empty instead of becoming an arbitrary character.
        # NOTE(review): the medmcqa test split is believed to ship cop == -1
        # (no gold answer) - confirm.
        has_gold_answer = cop is not None and 0 <= cop <= 3
        record['label'] = chr(ord('A') + cop) if has_gold_answer else ''
        record['rationale'] = f"Explanation:\n{example['exp']}" if example['exp'] is not None else ''
    else:
        # An unknown source used to yield a record without question or label.
        raise ValueError(f"Unsupported source dataset: {source!r}")
    return record

# 3. Convert every sampled split to the unified schema and concatenate the sources.

def _unify_and_concat(datasets_by_name):
    """Map ``unify_example`` over every dataset; return (per-source list, concatenation)."""
    unified = [
        # fn_kwargs hands the source name to unify_example explicitly instead of
        # relying on a lambda that closes over the loop variable.
        ds.map(unify_example, fn_kwargs={'source': name}, remove_columns=ds.column_names)
        for name, ds in datasets_by_name.items()
    ]
    return unified, concatenate_datasets(unified)


# unify_example places the correct sciq answer with the `random` module; seed it
# so that a fresh run reproduces the same option order.
random.seed(42)

unified_datasets, combined = _unify_and_concat(sampled_datasets)
print(f"Total unified examples: {len(combined)}")

# 4. Same processing for the validation subsets
unified_val_datasets, val_combined = _unify_and_concat(sampled_val_datasets)
print(f"Total unified val examples: {len(val_combined)}")

# 5. Same processing for the test subsets
unified_test, test_combined = _unify_and_concat(sampled_test_datasets)
print(f"Total unified test examples: {len(test_combined)}")

# 6. Push the multi-column dataset to the Hub
from datasets import DatasetDict

# Bundle the three unified splits so that one call uploads all of them.
hub_splits = {
    "train":      combined,
    "validation": val_combined,
    "test":       test_combined,
}
ds = DatasetDict(hub_splits)

ds.push_to_hub(REPO_ID)

for message in (
    f"Pushed train + validation + test splits to {REPO_ID}",
    f"Dataset successfully pushed to {REPO_ID}",
):
    print(message)

openbookqa: selected 4900 of 4957
sciq: selected 10000 of 11679
race: selected 50000 of 87866
mmlu_aux: selected 85100 of 99842
aqua_rat: selected 50000 of 83671
medmcqa: selected 50000 of 182822
openbookqa val: selected 13 of 500
sciq val: selected 27 of 1000
race val: selected 136 of 4887
mmlu_aux val: selected 231 of 1531
aqua_rat val: selected 136 of 228
medmcqa val: selected 136 of 4183
Total unified examples: 250000
Total unified val examples: 679


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Total unified test examples: 654


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/250 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/585 [00:00<?, ?B/s]

Pushed train + validation + test splits to NicoHelemon/MNLP_M3_mcqa_dataset
Dataset successfully pushed to NicoHelemon/MNLP_M3_mcqa_dataset


In [3]:
# Script to create aqua_rat_4: drop every example whose correct answer is E,
# then remove option E from the remaining examples.
# Replace HF_USERNAME with your own Hugging Face username.

HF_USERNAME = "NicoHelemon"
REPO_ID = f"{HF_USERNAME}/aqua_rat_4"

import os
from datasets import load_dataset, DatasetDict

# Never store an access token in the notebook: read it from the environment and
# fall back to an interactive, non-echoing prompt only when it is missing.
# NOTE(review): the token that used to be hardcoded on this line is exposed and
# should be revoked in the Hugging Face account settings.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

# 1. Load every split of the deepmind/aqua_rat dataset (no `split=` argument is
#    given; the saved output of this cell shows three splits being processed)
datasets = load_dataset("deepmind/aqua_rat")

# 2. Predicate used to discard every example whose correct answer is 'E'
def is_not_e(example):
    """Return True when the example's 'correct' field is anything other than 'E'."""
    answer_letter = example['correct']
    return answer_letter != 'E'

# Run the is_not_e predicate over the loaded data; the result holds only the
# examples for which it returned True.
datasets_filtered = datasets.filter(is_not_e)

# 3. Strip the final entry of the options list (option E) from each remaining example
def drop_last_option(example):
    """Remove the last element of ``example['options']`` in place and return the example."""
    all_options = example['options']
    # everything except the final entry
    example['options'] = all_options[:-1]
    return example

# Apply the option-trimming step to the filtered data.
datasets_cleaned = datasets_filtered.map(drop_last_option)

# 4. Upload the cleaned dataset to the Hub repository named by REPO_ID
datasets_cleaned.push_to_hub(REPO_ID)

done_message = f"Dataset pushed to: {REPO_ID}"
print(done_message)

Filter:   0%|          | 0/97467 [00:00<?, ? examples/s]

Filter:   0%|          | 0/254 [00:00<?, ? examples/s]

Filter:   0%|          | 0/254 [00:00<?, ? examples/s]

Map:   0%|          | 0/83671 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to: NicoHelemon/aqua_rat_4


In [4]:
# requirements:
#   pip install datasets huggingface_hub

import os
from datasets import load_dataset, DatasetDict, concatenate_datasets

# target repo on the Hugging Face Hub
HF_USERNAME = "NicoHelemon"
REPO_ID      = f"{HF_USERNAME}/mmlu_STEM"

# Never store an access token in the notebook: read it from the environment and
# fall back to an interactive, non-echoing prompt only when it is missing.
# NOTE(review): the token that used to be hardcoded on this line is exposed and
# should be revoked in the Hugging Face account settings.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

# MMLU subject configurations to merge, in the order they are loaded.
stem_subsets = (
    # mathematics and logic
    [
        "abstract_algebra",
        "elementary_mathematics",
        "high_school_mathematics",
        "college_mathematics",
        "high_school_statistics",
        "formal_logic",
    ]
    # natural sciences
    + [
        "anatomy",
        "astronomy",
        "college_biology",
        "high_school_biology",
        "college_chemistry",
        "high_school_chemistry",
        "conceptual_physics",
        "college_physics",
        "high_school_physics",
    ]
    # health and medicine
    + [
        "medical_genetics",
        "nutrition",
        "human_aging",
        "virology",
        "clinical_knowledge",
    ]
    # engineering and computing
    + [
        "electrical_engineering",
        "college_computer_science",
        "high_school_computer_science",
        "computer_security",
        "machine_learning",
    ]
)

# split names read from every subject configuration
splits = ["test", "validation", "dev"]

# one bucket of per-subject datasets for each split name
merged = {}
for split in splits:
    merged[split] = []

# NOTE(review): the output saved with this cell is `KeyError: 'train'` without a
# traceback. No lookup written in this cell uses the key 'train', so the error
# presumably comes from inside a library call - TODO re-run and capture the
# full traceback.
for subject in stem_subsets:
    subject_splits = load_dataset("cais/mmlu", subject)
    for split, bucket in merged.items():
        # write the subject name into a "subject" column of every row
        bucket.append(subject_splits[split].map(lambda ex: {"subject": subject}))

# concatenate the per-subject pieces of every split
final_ds = DatasetDict()
for split in splits:
    final_ds[split] = concatenate_datasets(merged[split])

# (optional) shuffle every split before uploading, e.g.:
# final_ds = final_ds.shuffle(seed=42)

# upload to the Hub repo, authenticating with the token held in HF_TOKEN
hub_token = os.environ["HF_TOKEN"]
final_ds.push_to_hub(
    REPO_ID,
    token=hub_token,
    private=False,  # switch to True to keep the repository private
)

print(f"✅ Dataset successfully pushed to https://huggingface.co/{REPO_ID}")

KeyError: 'train'