# Datasets preprocess
Process various types of datasets into a unified format: sample items from each dataset, attach the corresponding tags, and store every sampled item as its own JSON file.

Format:
QA questions
```json
{
    "problem": "xxx",
    "solution": "xxx",
    "tags": [
        "xxx/xxx"
    ]
}
```
Instruction following
```json
{
    "prompt": "xxx",
    "eval_instruction": "xxx",
    "tags": [
        "xxx/xxx"
    ]
}
```
Coding
```json
{
    "problem": "xxx",
    "test_case": "xxx",
    "dataset": "xxx",
    "tags": [
        "xxx/xxx"
    ]
}
```


In [1]:
import json
import os
import random
import csv
import re

In [3]:
def load_from_json_recursive(path):
    """Load every ``.json`` file found under ``path``, including subdirectories.

    Args:
        path: Root directory to walk.

    Returns:
        A list holding one parsed JSON document per file. Directories and
        file names are visited in sorted order so the result does not depend
        on the order in which the filesystem lists entries.

    Raises:
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    data = []
    for root, dirs, files in os.walk(path):
        # Sorting ``dirs`` in place also fixes the order in which os.walk
        # descends into subdirectories.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".json"):
                # JSON interchange is UTF-8 (RFC 8259); do not fall back to
                # the platform's locale encoding.
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    data.append(json.load(f))
    return data

def load_from_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry no
        # record; json.loads would raise on them, so they are skipped.
        return [json.loads(line) for line in file if line.strip()]


def load_from_csv_recursive(path):
    """Collect the rows of every ``.csv`` file found under ``path``.

    Args:
        path: Root directory to walk, including subdirectories.

    Returns:
        A flat list of rows (each a list of strings) from all CSV files.
        Directories and file names are visited in sorted order; rows keep
        their order within each file. No header handling is performed.
    """
    data = []
    for root, dirs, files in os.walk(path):
        # Sorting ``dirs`` in place also fixes the order in which os.walk
        # descends into subdirectories.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".csv"):
                # newline='' lets the csv module do its own newline handling,
                # which is required for quoted fields containing line breaks.
                with open(os.path.join(root, file), 'r',
                          encoding='utf-8', newline='') as f:
                    data.extend(csv.reader(f))
    return data

## Reasoning

### Math

MATH

In [6]:
# MATH: sample "Level 5" problems and store them as QA items.
sample_size = 100
data_dir = '../benchmark/raw/MATH'
output_dir = '../benchmark/public/reasoning/math'
tags = ["reasoning/math"]
os.makedirs(output_dir, exist_ok=True)

# Each loaded JSON document is treated as one problem record.
data = load_from_json_recursive(data_dir)

# Keep only the records whose difficulty level is "Level 5".
hard_data = [record for record in data if record["level"] in ["Level 5"]]

# Draw sample_size distinct records; random.sample raises ValueError when
# fewer records than that are available.
sampled_data = random.sample(hard_data, sample_size)

# Write one JSON file per sampled record: math_0.json, math_1.json, ...
for idx, record in enumerate(sampled_data):
    qa_item = {
        "problem": record["problem"],
        "solution": record["solution"],
        "tags": tags,
    }
    target_path = os.path.join(output_dir, f'math_{idx}.json')
    with open(target_path, 'w', encoding='utf-8') as fh:
        json.dump(qa_item, fh, indent=4, ensure_ascii=False)

GSM8K

In [7]:
# GSM8K: sample from the combined test + train splits and store QA items.
sample_size = 50
data_dir = '../benchmark/raw/GSM8K'
output_dir = '../benchmark/public/reasoning/math'
tags = ["reasoning/math"]
os.makedirs(output_dir, exist_ok=True)

test_data = load_from_jsonl(os.path.join(data_dir, 'test.jsonl'))
train_data = load_from_jsonl(os.path.join(data_dir, 'train.jsonl'))

# Both splits form a single sampling pool (test records first).
combined_data = test_data + train_data

# Draw sample_size distinct records; raises ValueError if the pool is smaller.
sampled_data = random.sample(combined_data, sample_size)

# Write one JSON file per sampled record: gsm8k_0.json, gsm8k_1.json, ...
for idx, record in enumerate(sampled_data):
    qa_item = {
        "problem": record["question"],
        "solution": record["answer"],
        "tags": tags,
    }
    target_path = os.path.join(output_dir, f'gsm8k_{idx}.json')
    with open(target_path, 'w', encoding='utf-8') as fh:
        json.dump(qa_item, fh, indent=4, ensure_ascii=False)

### Coding

HumanEval

In [9]:
# HumanEval: sample problems and store them in the coding format.
sample_size = 50
output_dir = '../benchmark/public/reasoning/coding'
os.makedirs(output_dir, exist_ok=True)
data_dir = '../benchmark/raw/HumanEval'
tags = ["reasoning/coding"]

data = load_from_jsonl(os.path.join(data_dir, 'human-eval-v2-20210705.jsonl'))

# Draw sample_size distinct records; raises ValueError if fewer are available.
sampled_data = random.sample(data, sample_size)

for i, item in enumerate(sampled_data):
    entry_point = item["entry_point"]

    # The coding format has no solution field, so the canonical solution is
    # intentionally not read or stored here.
    formatted_data = {
        "problem": item['prompt'],
        # Test source followed by a call to check(<entry_point>); presumably
        # the "test" field defines check() - confirm against the dataset.
        "test_case": item["test"] + "\n" + f"check({entry_point})",
        "dataset": "humaneval",
        "tags": tags
    }

    file_name = f'humaneval_{i}.json'

    output_path = os.path.join(output_dir, file_name)
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(formatted_data, outfile, indent=4, ensure_ascii=False)

MBPP

In [10]:
# MBPP: sample problems and store them in the coding format.
sample_size = 50
output_dir = '../benchmark/public/reasoning/coding'
os.makedirs(output_dir, exist_ok=True)
data_dir = '../benchmark/raw/MBPP'
tags = ["reasoning/coding"]

data = load_from_jsonl(os.path.join(data_dir, 'mbpp.jsonl'))

# Draw sample_size distinct records; raises ValueError if fewer are available.
sampled_data = random.sample(data, sample_size)

for i, item in enumerate(sampled_data):
    # All test statements joined into one block, one statement per line.
    test_code = '\n'.join(item["test_list"])

    # The problem statement shows the tests so the expected function name and
    # signature can be read from them.
    input_data = f"{item['text']}\n\nYour code should pass these tests:\n\n{test_code}\n\n"

    # The coding format has no solution field, so the reference code is
    # intentionally not read or stored here.
    formatted_data = {
        "problem": input_data,
        "test_case": test_code,
        "dataset": "mbpp",
        "tags": tags
    }

    file_name = f'mbpp_{i}.json'

    output_path = os.path.join(output_dir, file_name)
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(formatted_data, outfile, indent=4, ensure_ascii=False)

APPS

In [17]:
# APPS: sample "competition" problems and store them in the coding format.
sample_size = 100
data_dir = '../benchmark/raw/APPS'
output_dir = '../benchmark/public/reasoning/coding'
tags = ["reasoning/coding"]
os.makedirs(output_dir, exist_ok=True)

data = load_from_jsonl(os.path.join(data_dir, 'test.jsonl'))

# Difficulty values: introductory, interview and competition.
# Only "competition" records are kept.
hard_data = [record for record in data if record["difficulty"] in ["competition"]]

# Draw sample_size distinct records; raises ValueError if fewer are available.
sampled_data = random.sample(hard_data, sample_size)

# Write one JSON file per sampled record: apps_0.json, apps_1.json, ...
for idx, record in enumerate(sampled_data):
    coding_item = {
        "problem": record["question"],
        "test_case": record["input_output"],
        "dataset": "apps",
        "tags": tags,
    }
    target_path = os.path.join(output_dir, f'apps_{idx}.json')
    with open(target_path, 'w', encoding='utf-8') as fh:
        json.dump(coding_item, fh, indent=4, ensure_ascii=False)

### Planning

LLM-Plan

In [35]:
# LLM-Plan: flatten all instance lists, sample, and store QA items.
sample_size = 100
data_dir = '../benchmark/raw/LLM-Plan'
output_dir = '../benchmark/public/reasoning/planning'
tags = ["reasoning/planning"]
os.makedirs(output_dir, exist_ok=True)

instances = load_from_json_recursive(data_dir)

# Every loaded document carries its records under the "instances" key;
# merge them into one flat sampling pool.
data = []
for document in instances:
    data.extend(document["instances"])

# Draw sample_size distinct records; raises ValueError if fewer are available.
sampled_data = random.sample(data, sample_size)

# Write one JSON file per sampled record: planning_0.json, planning_1.json, ...
for idx, record in enumerate(sampled_data):
    qa_item = {
        "problem": record["query"],
        "solution": record["ground_truth_plan"],
        "tags": tags,
    }
    target_path = os.path.join(output_dir, f'planning_{idx}.json')
    with open(target_path, 'w', encoding='utf-8') as fh:
        json.dump(qa_item, fh, indent=4, ensure_ascii=False)

### Commonsense

BBH

In [36]:
# BBH: flatten all example lists, sample, and store QA items.
sample_size = 100
data_dir = '../benchmark/raw/BBH'
output_dir = '../benchmark/public/reasoning/commonsense'
tags = ["reasoning/commonsense"]
os.makedirs(output_dir, exist_ok=True)

examples = load_from_json_recursive(data_dir)

# Every loaded document carries its records under the "examples" key;
# merge them into one flat sampling pool.
data = []
for document in examples:
    data.extend(document["examples"])

# Draw sample_size distinct records; raises ValueError if fewer are available.
sampled_data = random.sample(data, sample_size)

# Write one JSON file per sampled record: bbh_0.json, bbh_1.json, ...
for idx, record in enumerate(sampled_data):
    qa_item = {
        "problem": record["input"],
        "solution": record["target"],
        "tags": tags,
    }
    target_path = os.path.join(output_dir, f'bbh_{idx}.json')
    with open(target_path, 'w', encoding='utf-8') as fh:
        json.dump(qa_item, fh, indent=4, ensure_ascii=False)

## Knowledge

### World_Knowledge (Offline)

MMLU

In [39]:
# MMLU: sample multiple-choice rows and store them as QA items.
sample_size = 100
output_dir = '../benchmark/public/knowledge/world_knowledge'
os.makedirs(output_dir, exist_ok=True)
data_dir = '../benchmark/raw/MMLU'
tags = ["knowledge/world_knowledge"]

data = load_from_csv_recursive(data_dir)

# A usable row needs a question, four options and an answer (6 columns).
# Filter BEFORE sampling so that exactly sample_size files are written and
# the file numbering has no gaps; filtering after sampling would silently
# drop short rows from the sample.
valid_rows = [row for row in data if len(row) >= 6]

# Draw sample_size distinct rows; raises ValueError if fewer are available.
sampled_data = random.sample(valid_rows, sample_size)

for i, item in enumerate(sampled_data):
    question = item[0]
    # Columns 1-4 are the options, labelled A-D in order.
    choices = [f"{letter}. {option}" for letter, option in zip("ABCD", item[1:5])]
    answer = item[5]
    input_str = question + "\n" + "\n".join(choices) + "\nAmong A through D, the answer is"
    formatted_data = {
        "problem": input_str,
        "solution": answer,
        "tags": tags
    }

    file_name = f'mmlu_{i}.json'

    output_path = os.path.join(output_dir, file_name)
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(formatted_data, outfile, indent=4, ensure_ascii=False)

### Domain_Specific_Knowledge (Offline)

AGIEval

In [5]:
# AGIEval: build prompts per dataset family, then sample and store QA items.
sample_size = 100
output_dir = '../benchmark/public/knowledge/domain_specific_knowledge'
os.makedirs(output_dir, exist_ok=True)
data_dir = '../benchmark/raw/AGIEval'
tags = ["knowledge/domain_specific_knowledge"]

# Dataset names (file stems) grouped by the prompt template they use.
english_qa_datasets = ["lsat-ar", "lsat-lr", "lsat-rc", "logiqa-en", "sat-math", "sat-en", "aqua-rat", "sat-en-without-passage", "gaokao-english"]
chinese_qa_datasets = ["logiqa-zh", "jec-qa-kd", "jec-qa-ca", "gaokao-chinese", "gaokao-geography", "gaokao-history", "gaokao-biology", "gaokao-chemistry", "gaokao-physics", "gaokao-mathqa"]
english_cloze_datasets = ["math"]
chinese_cloze_datasets = ["gaokao-mathcloze"]

# Not used in this cell; kept because other code may refer to them.
multi_choice_datasets = ["jec-qa-kd", "jec-qa-ca", "gaokao-physics"]
math_output_datasets = {"gaokao-mathcloze", "math"}

# Letters used to name the last answer option in the prompt.
option_string = "ABCDEFG"

data = []

for file in os.listdir(data_dir):
    if not file.endswith(".jsonl"):
        continue
    filename, _ = os.path.splitext(file)

    is_english_qa = filename in english_qa_datasets
    is_chinese_qa = filename in chinese_qa_datasets
    is_english_cloze = filename in english_cloze_datasets
    is_chinese_cloze = filename in chinese_cloze_datasets
    if not (is_english_qa or is_chinese_qa or is_english_cloze or is_chinese_cloze):
        # No prompt template exists for this dataset. Skip it rather than
        # hitting a NameError or re-appending the previous item's
        # problem/solution under a new record.
        continue

    data_one = load_from_jsonl(os.path.join(data_dir, file))
    for item in data_one:
        # A missing or null passage contributes nothing to the prompt.
        passage = item["passage"] if item.get("passage") is not None else ""
        if is_english_qa:
            count = len(item.get("options", []))
            # A single "options" entry is treated as five choices (A-E).
            count = 5 if count == 1 else count
            problem = passage + "Q: " + item["question"] + " " + "Answer Choices: " + " ".join(item.get("options", [])) + "\n" + \
                   "A: Among A through {}, the answer is".format(option_string[count - 1])
            solution = item["label"]

        elif is_chinese_qa:
            count = len(item.get("options", []))
            # A single "options" entry is treated as four choices (A-D).
            count = 4 if count == 1 else count
            problem = passage + "问题：" + item["question"] + " " + "选项：" + " ".join(item.get("options", [])) + "\n" + \
                   "答案：从A到{}, 我们应选择".format(option_string[count - 1])
            solution = item["label"]

        elif is_english_cloze:
            problem = passage + "Q: " + item["question"] + "\n" + "A: The answer is"
            solution = item["answer"]

        else:  # is_chinese_cloze
            problem = passage + "问题：" + item["question"] + "\n" + "答案："
            solution = item["answer"]

        formatted_data = {
            "problem": problem,
            "solution": solution,
            "tags": tags
        }
        data.append(formatted_data)

# Draw sample_size distinct items; raises ValueError if fewer are available.
sampled_data = random.sample(data, sample_size)
for i, item in enumerate(sampled_data):
    file_name = f'agieval_{i}.json'

    output_path = os.path.join(output_dir, file_name)
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(item, outfile, indent=4, ensure_ascii=False)
