In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import yaml
import pandas as pd
import os
import random
import secrets
import string

In [2]:
def generate_unique_strings(count, length):
    unique_strings = set()
    characters = string.ascii_letters + string.digits
    while len(unique_strings) < count:
        random_string = "".join(secrets.choice(characters) for _ in range(length))
        unique_strings.add(random_string)
    return list(unique_strings)

In [3]:
def load_config(config_file):
    with open(config_file, "r") as file:
        config = yaml.safe_load(file)
    return config

config_path = os.path.join(os.getcwd(), "..", "config.yaml")
with open(config_path, "r") as file:
    config = yaml.safe_load(file)

hf_token = config["huggingface"]["token"]

In [4]:
ds = load_dataset("Intel/orca_dpo_pairs", token=hf_token)
df = Dataset.to_pandas(ds["train"])
df.head()

Unnamed: 0,system,question,chosen,rejected
0,,You will be given a definition of a task first...,"[\n [""AFC Ajax (amateurs)"", ""has ground"", ""Sp...","Sure, I'd be happy to help! Here are the RDF ..."
1,You are an AI assistant. You will be given a t...,Generate an approximately fifteen-word sentenc...,Midsummer House is a moderately priced Chinese...,Sure! Here's a sentence that describes all th...
2,"You are a helpful assistant, who always provid...",What happens next in this paragraph?\n\nShe th...,C. She then dips the needle in ink and using t...,"Ooh, let me think! *giggle* Okay, I know what..."
3,You are an AI assistant. You will be given a t...,Please answer the following question: I want t...,"Based on the passage, discuss the primary moti...",Certainly! Here's a detailed and long answer ...
4,You are an AI assistant that helps people find...,James runs a TV show and there are 5 main char...,"James pays the minor characters $15,000 each e...","Sure, I'd be happy to help! To calculate how ..."


In [5]:
ids = generate_unique_strings(len(df), 22)
print(f"Generated {len(ids)} Unique IDs.")

Generated 12859 Unique IDs.


In [6]:
df = df[["question", "chosen", "rejected"]]

prompts = []
response_as = []
response_bs = []
winners = []
model_as = []
model_bs = []
languages = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    prompts.append(row["question"])
    response_a = row["chosen"]
    response_b = row["rejected"]
    if random.random() < 0.5:
        response_a, response_b = response_b, response_a
        winners.append("model_b")
    else:
        winners.append("model_a")
    response_as.append(response_a)
    response_bs.append(response_b)

    model_as.append("Unknown")
    model_bs.append("Unknown")
    languages.append("English")

df["response_a"] = response_as
df["response_b"] = response_bs
df["prompt"] = prompts
df["winner"] = winners
df["id"] = ids
df["model_a"] = model_as
df["model_b"] = model_bs
df["language"] = languages

df.drop(columns=["question", "chosen", "rejected"], inplace=True)
df = df[["id", "prompt", "response_a", "response_b", "winner", "model_a", "model_b", "language"]]
df.reset_index(drop=True, inplace=True)
print(df.shape)
df.head()

100%|██████████| 12859/12859 [00:00<00:00, 15403.43it/s]

(12859, 8)





Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,xiflH2qS4SDbJvmg5FAfl4,You will be given a definition of a task first...,"Sure, I'd be happy to help! Here are the RDF ...","[\n [""AFC Ajax (amateurs)"", ""has ground"", ""Sp...",model_b,Unknown,Unknown,English
1,jOZsf5cxM7CadMhJFW9cSD,Generate an approximately fifteen-word sentenc...,Midsummer House is a moderately priced Chinese...,Sure! Here's a sentence that describes all th...,model_a,Unknown,Unknown,English
2,Lu15FKiQyzgk7dndRSClPP,What happens next in this paragraph?\n\nShe th...,"Ooh, let me think! *giggle* Okay, I know what...",C. She then dips the needle in ink and using t...,model_b,Unknown,Unknown,English
3,ytv7gSRABcHbdBZsHBaw48,Please answer the following question: I want t...,Certainly! Here's a detailed and long answer ...,"Based on the passage, discuss the primary moti...",model_b,Unknown,Unknown,English
4,8tcrDV8juDilmm7dywrnFD,James runs a TV show and there are 5 main char...,"Sure, I'd be happy to help! To calculate how ...","James pays the minor characters $15,000 each e...",model_b,Unknown,Unknown,English


In [7]:
df["winner"].value_counts()

winner
model_b    6461
model_a    6398
Name: count, dtype: int64

In [8]:
df.to_parquet("Intel-orca_dpo_pairs.parquet", index=False)