In [2]:
from datasets import load_dataset, Dataset, DatasetDict
from pprint import pprint

# Data Split

In [3]:
ds = load_dataset("openbmb/UltraFeedback", split="train")
ds[0]

Downloading readme: 100%|██████████| 15.4k/15.4k [00:00<00:00, 26.3MB/s]
Downloading data: 100%|██████████| 168M/168M [00:04<00:00, 41.3MB/s]
Downloading data: 100%|██████████| 25.9M/25.9M [00:00<00:00, 34.7MB/s]
Downloading data: 100%|██████████| 240M/240M [00:06<00:00, 37.7MB/s]
Downloading data: 100%|██████████| 313M/313M [00:07<00:00, 40.1MB/s]
Downloading data: 100%|██████████| 9.99M/9.99M [00:00<00:00, 20.0MB/s]
Downloading data: 100%|██████████| 182M/182M [00:04<00:00, 39.7MB/s]
Downloading data files: 100%|██████████| 1/1 [00:31<00:00, 31.24s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 426.03it/s]
Generating train split: 63967 examples [00:03, 20536.10 examples/s]
  table = cls._concat_blocks(blocks, axis=0)


{'source': 'evol_instruct',
 'instruction': 'Can you write a C++ program that prompts the user to enter the name of a country and checks if it borders the Mediterranean Sea? Here\'s some starter code to help you out:\n#include <iostream>\n#include <string>\nusing namespace std;\nint main() {\n    string country;\n    // prompt user for input\n    cout << "Enter the name of a country: ";\n    cin >> country;\n    // check if country borders the Mediterranean Sea\n    // [C++ code]\n    return 0;\n}',
 'models': ['alpaca-7b', 'pythia-12b', 'starchat', 'vicuna-33b'],
 'completions': [{'annotations': {'helpfulness': {'Rating': '2',
     'Rationale': 'The response is clear and not lengthy, but it lacks useful and comprehensive information.',
     'Rationale For Rating': 'The code is partially incorrect as it checks if the country name ends with "Mediterranean" instead of checking if it borders the Mediterranean Sea, which may cause confusion.',
     'Type': ['1', '3']},
    'honesty': {'Rat

In [8]:
dd = ds.train_test_split(test_size=1000, seed=42)

In [13]:
dd.push_to_hub("heegyu/UltraFeedback-split")

Creating parquet from Arrow format: 100%|██████████| 32/32 [00:05<00:00,  6.09ba/s]
Creating parquet from Arrow format: 100%|██████████| 32/32 [00:05<00:00,  6.10ba/s]s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:20<00:00, 10.13s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  6.05ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]


# Max-margin dataset

In [6]:
new_dd = DatasetDict()
new_dd["train"] = ds
for k in new_dd:
    ds = new_dd[k]
    items = []

    for item in ds:
        completions = sorted(item["completions"], key=lambda x: x["overall_score"], reverse=True)
        if len(completions) < 2:
            continue
        
        chosen, rejected = completions[0], completions[-1]

        items.append({
            "instruction": item["instruction"],
            "chosen": chosen["response"],
            "chosen_critique": chosen["critique"],
            "chosen_score": chosen["overall_score"],
            "rejected": rejected["response"],
            "rejected_critique": rejected["critique"],
            "rejected_score": rejected["overall_score"],
        })

    new_dd[k] = Dataset.from_list(items)

In [7]:
print(new_dd)
# pprint(new_dd['train'][0])
new_dd.push_to_hub("heegyu/Ultrafeedback-max-margin-critique")

DatasetDict({
    train: Dataset({
        features: ['instruction', 'chosen', 'chosen_critique', 'chosen_score', 'rejected', 'rejected_critique', 'rejected_score'],
        num_rows: 63966
    })
})


Creating parquet from Arrow format: 100%|██████████| 64/64 [00:01<00:00, 48.24ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:07<00:00,  7.22s/it]


# Every Pair

In [20]:
new_dd = DatasetDict()
for k in dd:
    ds = dd[k]
    items = []

    for item in ds:
        completions = sorted(item["completions"], key=lambda x: x["overall_score"], reverse=True)
        if len(completions) < 2:
            continue
        
        for i, chosen in enumerate(completions[:-1]):
            for rejected in completions[i + 1:]:
                if abs(chosen["overall_score"] - rejected["overall_score"]) < 1.0:
                    continue

                items.append({
                    "instruction": item["instruction"],
                    "chosen": chosen["response"],
                    "chosen_critique": chosen["critique"],
                    "chosen_score": chosen["overall_score"],
                    "rejected": rejected["response"],
                    "rejected_critique": rejected["critique"],
                    "rejected_score": rejected["overall_score"],
                })
            chosen, rejected = completions[0], completions[-1]

            items.append({
                "instruction": item["instruction"],
                "chosen": chosen["response"],
                "chosen_critique": chosen["critique"],
                "chosen_score": chosen["overall_score"],
                "rejected": rejected["response"],
                "rejected_critique": rejected["critique"],
                "rejected_score": rejected["overall_score"],
            })

    new_dd[k] = Dataset.from_list(items)

In [21]:
print(new_dd)
# pprint(new_dd['train'][0])
# new_dd.push_to_hub("heegyu/Ultrafeedback-split-dpo-max-margin")

DatasetDict({
    train: Dataset({
        features: ['instruction', 'chosen', 'chosen_critique', 'chosen_score', 'rejected', 'rejected_critique', 'rejected_score'],
        num_rows: 436176
    })
    test: Dataset({
        features: ['instruction', 'chosen', 'chosen_critique', 'chosen_score', 'rejected', 'rejected_critique', 'rejected_score'],
        num_rows: 6892
    })
})


# Critique 생성

In [10]:
new_dd = DatasetDict()
for k in dd:
    ds = dd[k]
    items = []

    for item in ds:
        for response in item["completions"]:
            items.append({
                "instruction": item["instruction"],
                "output": response["response"],
                "critique": response["critique"],
                "overall_score": response["overall_score"]
            })

    new_dd[k] = Dataset.from_list(items)

In [11]:
new_dd

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'critique', 'overall_score'],
        num_rows: 251864
    })
    test: Dataset({
        features: ['instruction', 'output', 'critique', 'overall_score'],
        num_rows: 4000
    })
})

In [12]:
new_dd.push_to_hub("heegyu/Ultrafeedback-split-critiques")

Creating parquet from Arrow format: 100%|██████████| 126/126 [00:00<00:00, 130.86ba/s]
Creating parquet from Arrow format: 100%|██████████| 126/126 [00:00<00:00, 131.69ba/s]t]
Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:13<00:00,  6.78s/it]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 122.60ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]
