In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
from datasets import load_dataset

# MMLU-Pro 데이터셋 로드
dataset = load_dataset("TIGER-Lab/MMLU-Pro")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/4.15M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/45.3k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/12032 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/70 [00:00<?, ? examples/s]

In [3]:
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'options', 'answer', 'answer_index', 'cot_content', 'category', 'src'],
        num_rows: 12032
    })
    validation: Dataset({
        features: ['question_id', 'question', 'options', 'answer', 'answer_index', 'cot_content', 'category', 'src'],
        num_rows: 70
    })
})


In [4]:
from datasets import load_dataset

# MMLU-Pro 데이터셋 로드
dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")

# 데이터셋의 구조 확인
print("데이터셋 예제:", dataset[0])
print("데이터셋 열 이름:", dataset.column_names)

데이터셋 예제: {'question_id': 70, 'question': 'Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.', 'options': ['Safe practices, Fear, Jealousy, Trivial', 'Unsafe practices, Distress, Joy, Trivial', 'Safe practices, Wants, Jealousy, Trivial', 'Safe practices, Distress, Fear, Trivial', 'Unsafe practices, Wants, Jealousy, Serious', 'Safe practices, Distress, Jealousy, Serious', 'Safe practices, Wants, Fear, Serious', 'Unsafe practices, Wants, Fear, Trivial', 'Unsafe practices, Distress, Fear, Serious'], 'answer': 'I', 'answer_index': 8, 'cot_content': '', 'category': 'business', 'src': 'ori_mmlu-business_ethics'}
데이터셋 열 이름: ['question_id', 'question', 'options', 'answer', 'answer_index', 'cot_content', 'category', 'src']


In [8]:
import pandas as pd
from datasets import load_dataset

# 1. 데이터셋 로드
dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")

# 2. 데이터프레임 변환
df = pd.DataFrame(dataset)

# 3. 필요한 열 선택 및 결합
# "question"과 "options"를 결합하여 prompts 생성, "answer"를 ANSWER로 사용
if "question" in df.columns and "options" in df.columns and "answer_index" in df.columns:
    # 원하는 도메인 필터링
    target_domains = ["law", "psychology", "business", "philosophy", "history"]
    filtered_df = df[df['category'].str.lower().isin(target_domains)]

    # QUESTION 생성
    filtered_df["QUESTION"] = filtered_df.apply(
        lambda row: f"QUESTION{row['question_id']}) {row['question']}\n" +
                    "\n".join([f"({chr(65 + i)}) {opt}" for i, opt in enumerate(row["options"])]),
        axis=1
    )

    # ANSWER 값을 (A), (B), ... 형식으로 변환
    filtered_df["answer"] = filtered_df["answer_index"].apply(lambda idx: f"({chr(65 + idx)})")

    # 도메인별 균등 샘플링
    sampled_dfs = []
    for domain in target_domains:
        domain_df = filtered_df[filtered_df['category'].str.lower() == domain]
        sampled_dfs.append(domain_df.sample(n=2, random_state=42))  # 각 도메인에서 2개씩 샘플링
    balanced_df = pd.concat(sampled_dfs)

    # 두 개의 버전을 위한 데이터프레임 생성
    final_df_basic = balanced_df[["QUESTION", "answer"]]
    final_df_basic.columns = ["QUESTION", "ANSWER"]  # 열 이름을 첨부 파일 양식에 맞게 변경

    final_df_with_category = balanced_df[["QUESTION", "answer", "category"]]
    final_df_with_category.columns = ["QUESTION", "ANSWER", "CATEGORY"]  # 열 이름을 추가적으로 변경
else:
    raise Exception("데이터셋에 'question', 'options', 또는 'answer_index' 열이 없습니다.")

# 4. CSV 파일 저장
output_path_basic = "mmlu_test_samples.csv"
output_path_with_category = "mmlu_test_samples_with_category.csv"

final_df_basic.to_csv(output_path_basic, index=False, encoding="utf-8-sig")
final_df_with_category.to_csv(output_path_with_category, index=False, encoding="utf-8-sig")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["QUESTION"] = filtered_df.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["answer"] = filtered_df["answer_index"].apply(lambda idx: f"({chr(65 + idx)})")
