In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import yaml
import pandas as pd
import os
import random
import secrets
import string

In [2]:
def generate_unique_strings(count, length):
    """Generate ``count`` distinct random alphanumeric strings.

    Each string consists of ``length`` characters drawn with ``secrets.choice``
    from the ASCII letters and digits.

    Args:
        count: Number of distinct strings to produce. Values <= 0 yield an
            empty list.
        length: Number of characters in every string.

    Returns:
        A list of ``count`` distinct strings. The list is built from a set, so
        its order is arbitrary.

    Raises:
        ValueError: If ``count`` is larger than the number of distinct strings
            that exist for ``length``. Without this check the generation loop
            could never finish.
    """
    characters = string.ascii_letters + string.digits

    # A non-positive length can only ever produce the empty string, hence max(..., 0).
    capacity = len(characters) ** max(length, 0)
    if count > capacity:
        raise ValueError(
            f"Cannot generate {count} unique strings of length {length}: "
            f"only {capacity} distinct values exist."
        )

    unique_strings = set()
    while len(unique_strings) < count:
        random_string = "".join(secrets.choice(characters) for _ in range(length))
        # The set silently discards collisions, so the loop retries until enough exist.
        unique_strings.add(random_string)
    return list(unique_strings)

In [3]:
def load_config(config_file):
    """Read a YAML file and return its parsed contents.

    Args:
        config_file: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(config_file, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

# config.yaml is looked up one directory above the current working directory.
config_path = os.path.join(os.getcwd(), "..", "config.yaml")
# Reuse the helper defined above instead of repeating its open/parse logic.
config = load_config(config_path)

# Hugging Face access token, read from the "huggingface" section of the config.
hf_token = config["huggingface"]["token"]

In [4]:
# Fetch argilla/dpo-mix-7k and merge its train and test splits into one frame.
ds = load_dataset("argilla/dpo-mix-7k", token=hf_token)
df_train = ds["train"].to_pandas()
df_test = ds["test"].to_pandas()
# ignore_index=True gives the combined frame a fresh 0..n-1 index, so rows from
# the two splits cannot end up sharing the same index label.
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

Unnamed: 0,dataset,chosen,rejected,chosen_rating,rejected_rating
0,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Q:Question: how old julio cesar ...,[{'content': 'Q:Question: how old julio cesar ...,10.0,1.0
1,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Q:Information: - The Assistant ...,[{'content': 'Q:Information: - The Assistant ...,10.0,9.0
2,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'If But slowly and doggedly he we...,[{'content': 'If But slowly and doggedly he we...,10.0,9.0
3,argilla/distilabel-capybara-dpo-7k-binarized,"[{'content': '""Cherry blossom festivals""', 'ro...","[{'content': '""Cherry blossom festivals""', 'ro...",4.0,2.0
4,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'You are given a list of queries ...,[{'content': 'You are given a list of queries ...,8.0,6.0


In [5]:
# Keep only the pairs whose chosen rating exceeds the rejected rating by at least 4.
rating_gap = df["chosen_rating"].sub(df["rejected_rating"])
has_clear_winner = rating_gap.ge(4.0)
df = df[has_clear_winner]
print(df.shape)
df.head()

(1436, 5)


Unnamed: 0,dataset,chosen,rejected,chosen_rating,rejected_rating
0,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Q:Question: how old julio cesar ...,[{'content': 'Q:Question: how old julio cesar ...,10.0,1.0
6,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Definition: Given a sentence in ...,[{'content': 'Definition: Given a sentence in ...,10.0,0.0
7,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'How does the sentence end? See o...,[{'content': 'How does the sentence end? See o...,10.0,1.0
14,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Q:Suggest a movie title for the ...,[{'content': 'Q:Suggest a movie title for the ...,10.0,1.0
22,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Two brothers had bike repair sho...,[{'content': 'Two brothers had bike repair sho...,10.0,1.0


In [6]:
# Keep only rows whose "chosen" conversation contains exactly two entries.
chosen_length = df["chosen"].map(len)
has_two_messages = chosen_length.eq(2)
df = df[has_two_messages]
print(df.shape)
df.head()

(1291, 5)


Unnamed: 0,dataset,chosen,rejected,chosen_rating,rejected_rating
0,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Q:Question: how old julio cesar ...,[{'content': 'Q:Question: how old julio cesar ...,10.0,1.0
6,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Definition: Given a sentence in ...,[{'content': 'Definition: Given a sentence in ...,10.0,0.0
7,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'How does the sentence end? See o...,[{'content': 'How does the sentence end? See o...,10.0,1.0
14,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Q:Suggest a movie title for the ...,[{'content': 'Q:Suggest a movie title for the ...,10.0,1.0
22,argilla/distilabel-intel-orca-dpo-pairs,[{'content': 'Two brothers had bike repair sho...,[{'content': 'Two brothers had bike repair sho...,10.0,1.0


In [7]:
# One random 21-character identifier for every row currently in df.
row_count = len(df)
ids = generate_unique_strings(count=row_count, length=21)
print(f"Generated {len(ids)} Unique IDs.")

Generated 1291 Unique IDs.


In [8]:
# Flatten every (chosen, rejected) conversation pair into one pairwise-comparison row.
# Rows are collected as dicts and turned into a brand-new DataFrame at the end, so
# all columns always have the same length and nothing is assigned onto a slice of
# the previously filtered frame.
OUTPUT_COLUMNS = ["id", "prompt", "response_a", "response_b", "winner", "model_a", "model_b", "language"]

# Dedicated seeded generator: the A/B position swap is identical on every run and
# the global `random` state is left untouched.
rng = random.Random(42)

records = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    chosen, rejected = row["chosen"], row["rejected"]

    # Element 0 of each conversation is treated as the prompt, element 1 as the reply.
    prompt = chosen[0]["content"]
    if prompt != rejected[0]["content"]:
        # The two conversations do not share a prompt, so the pair is unusable.
        # Skip the whole row rather than appending to only some of the columns.
        print(f"Diffrent Prompts in Single Row: {idx}")
        continue

    # Default layout: the preferred ("chosen") reply sits in slot A.
    response_a = chosen[1]["content"]
    response_b = rejected[1]["content"]
    winner = "model_a"
    if rng.random() < 0.5:
        # Swap the slots about half of the time so the winner is not always A.
        response_a, response_b = response_b, response_a
        winner = "model_b"

    records.append(
        {
            # ids was generated with one entry per input row; rows that are
            # skipped simply leave unused spares at the end.
            "id": ids[len(records)],
            "prompt": prompt,
            "response_a": response_a,
            "response_b": response_b,
            "winner": winner,
            "model_a": "Unknown",
            "model_b": "Unknown",
            "language": "English",
        }
    )

# Passing columns= fixes the column order and keeps the schema even when no row survives.
df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)
print(df.shape)
df.head()

100%|██████████| 1291/1291 [00:00<00:00, 14600.43it/s]

(1291, 8)





Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,cXKrCdseUJ8va4PKf4QrT,Q:Question: how old julio cesar chavez when he...,"No, the answer provided does not directly answ...","Yes, the information you found on Google is c...",model_b,Unknown,Unknown,English
1,yEaDBnhKgKRZdOtZajBX9,"Definition: Given a sentence in German, provid...",一般的に葉は長さが1.5mmから4mmで、幅は0.2mmから0.7mmです。\n\nTo c...,Sure! Here's how I used the definition to com...,model_a,Unknown,Unknown,English
2,pwJXHWcTw8GcA3NxgtIQa,How does the sentence end? See options at the ...,"Sure, I'd be happy to help you with the task!...","To find out how the sentence ends, we need to ...",model_b,Unknown,Unknown,English
3,eeVFSlegHdvcUC1mYj1mu,Q:Suggest a movie title for the following movi...,"Based on the provided movie plot, here are so...","""Rings of Destiny: The Battle for Middle-earth""",model_a,Unknown,Unknown,English
4,amALFkWY1JHZZoDhenwjw,Two brothers had bike repair shops in neighbor...,"Sure, I'd be happy to help! Based on the info...",Bob's Town\nB: Rob's Town\n\nTo answer this qu...,model_b,Unknown,Unknown,English


In [9]:
# Write the final frame to a Parquet file in the working directory, without the index.
output_path = "argilla-dpo-mix-7k.parquet"
df.to_parquet(output_path, index=False)