In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from collections import Counter

from datasets import load_dataset
import json

In [None]:
# First, download and unzip the data following the instructions in the
# Personalized Soups git repo [https://github.com/joeljang/RLPHF].
# Then replace the placeholder below with the directory of the unzipped data;
# later cells read rm_training/<profile>.json and koala_eval_50.json from it.
psoups_data_path = "your/downloaded/original/psoups/data/path"

In [3]:
# This notebook uses the data psoups provides for reward model training; the
# JSON files are under rm_training/ in the unzipped data.
# `preference_prompts` maps each preference profile (P1A, P1B, P2A, P2B, P3A, P3B)
# to its preference prompt. Within each numbered pair, the A and B prompts ask
# for contrasting response styles.
# A later cell strips these strings from every `user_input` via str.replace,
# so editing the text here changes what gets removed from the inputs.
preference_prompts = {
    "P1A": "Generate a response that can be easily understood by an elementary school student.",
    "P1B": "Generate a response that only a PhD Student in that specific field could understand.",
    "P2A": "Generate a response that is concise and to the point without being verbose.",
    "P2B": "Generate a response that is very informative without missing any background information.",
    "P3A": "Generate a response that is friendly, witty, funny, and humorous, like a close friend.",
    "P3B": "Generate a response in an unfriendly manner.",
}

In [None]:
# Load each profile's reward-model training file into its own Dataset,
# keyed by the profile id (same keys and order as `preference_prompts`).
preference_datasets = {}
for preference_profile in preference_prompts:
    # os.path.join keeps path handling consistent with the rest of the notebook
    data_path = os.path.join(psoups_data_path, "rm_training", f"{preference_profile}.json")
    preference_datasets[preference_profile] = load_dataset(
        "json", data_files=data_path, split="train"  # only has train split
    )

# number of rows in each json file
# len(dataset) gives the row count directly, without first materializing
# an entire text column into Python just to count it.
for preference_profile, dataset in preference_datasets.items():
    print(f"{preference_profile}: {len(dataset)}")

P1A: 48300
P1B: 48825
P2A: 47350
P2B: 49220
P3A: 49155
P3B: 48565


In [6]:
# Remove duplicate comparisons within each preference profile and collect the
# survivors from all profiles into one flat list of records.
dataset_wo_duplicates = []

# "P1A", "P1B", "P2A", "P2B", "P3A", "P3B" correspond to user ids 1, 2, 3, 4, 5, 6
for profile_user_id, preference_profile in enumerate(
    ["P1A", "P1B", "P2A", "P2B", "P3A", "P3B"], start=1  # user id starts from 1
):
    dataset = preference_datasets[preference_profile]
    preference_prompt = preference_prompts[preference_profile]

    # (question, response_j, response_k) triples already kept for this profile.
    # A tuple key compares the three fields exactly, so two different comparisons
    # can never be merged the way a delimiter-joined string key could allow.
    seen = set()

    for question, response_j, response_k in zip(
        dataset["user_input"], dataset["completion_a"], dataset["completion_b"]
    ):
        # skip degenerate pairs where both responses are the same text
        if response_j == response_k:
            continue

        key = (question, response_j, response_k)
        if key in seen:
            continue
        seen.add(key)

        dataset_wo_duplicates.append({
            "user_id": profile_user_id,
            # remove preference prompt from the input
            "user_input": question.replace(preference_prompt, "").strip(),
            "completion_a": response_j,
            "completion_b": response_k,
        })

    # number of unique, non-degenerate comparisons kept for this profile
    print(f"{preference_profile} comparisons #: {len(seen)}")

print(f"total comparisons #: {len(dataset_wo_duplicates)}")

P1A comparisons #: 8959
P1B comparisons #: 9069
P2A comparisons #: 8239
P2B comparisons #: 8626
P3A comparisons #: 9356
P3B comparisons #: 9222
total comparisons #: 53471


In [7]:
# Tally how many deduplicated comparisons belong to each user id.
comparison_user_ids = [comparison["user_id"] for comparison in dataset_wo_duplicates]
user_id, comparison_cnt = np.unique(comparison_user_ids, return_counts=True)

# np.unique returns the sorted distinct ids, with counts aligned by position
for idx in range(len(user_id)):
    print(f"User {user_id[idx]} comparisons #: {comparison_cnt[idx]}")

User 1 comparisons #: 8959
User 2 comparisons #: 9069
User 3 comparisons #: 8239
User 4 comparisons #: 8626
User 5 comparisons #: 9356
User 6 comparisons #: 9222


In [8]:
# Write the deduplicated comparisons (a list of dict records) to a JSON file
# in the current working directory.
# The path is a plain string (it has no placeholders), and the encoding is
# stated explicitly so the file does not depend on the platform default.
with open("./allcombo_8_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(dataset_wo_duplicates, f)

In [9]:
# The 50 prompts in koala_eval_50.json will be used for evaluation.
# Copy that file from the downloaded psoups data into the current working
# directory so it is available for future evaluation.
koala_eval_src = os.path.join(psoups_data_path, "koala_eval_50.json")
koala_eval_dst = "./koala_eval_50.json"
shutil.copy(koala_eval_src, koala_eval_dst)

'./koala_eval_50.json'