In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import yaml
import os
import random
import secrets
import string

In [2]:
def generate_unique_strings(count, length):
    """Generate distinct random alphanumeric strings.

    Each string is built from ASCII letters and digits using the ``secrets``
    module, and candidates are collected in a set until ``count`` distinct
    values exist.

    Args:
        count: Number of distinct strings to produce. Zero or a negative
            value yields an empty list.
        length: Number of characters in each string.

    Returns:
        A list of ``count`` distinct strings. The list is built from a set,
        so its order is arbitrary.

    Raises:
        ValueError: If ``count`` is larger than the number of distinct
            strings of the requested length, which would otherwise make the
            collection loop spin forever.
    """
    characters = string.ascii_letters + string.digits

    # A non-positive length can only ever produce the empty string, so the
    # capacity is 1 in that case (len(characters) ** 0).
    capacity = len(characters) ** max(length, 0)
    if count > capacity:
        raise ValueError(
            f"Cannot generate {count} unique strings of length {length}: "
            f"only {capacity} distinct values exist."
        )

    unique_strings = set()
    while len(unique_strings) < count:
        random_string = "".join(secrets.choice(characters) for _ in range(length))
        unique_strings.add(random_string)
    return list(unique_strings)

In [3]:
def load_config(config_file):
    """Read a YAML file and return its parsed contents.

    Args:
        config_file: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file.
    """
    # Pin the encoding so parsing does not depend on the platform's default
    # locale encoding (YAML config files are conventionally UTF-8).
    with open(config_file, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

# The config file is looked up one directory above the current working
# directory; reuse the load_config helper instead of repeating its body.
config_path = os.path.join(os.getcwd(), "..", "config.yaml")
config = load_config(config_path)

# Hugging Face access token, read from the "huggingface" section of the config.
hf_token = config["huggingface"]["token"]

In [4]:
# Fetch the dataset (authenticated with the configured token) and convert
# its "train" split to a pandas DataFrame for the steps below.
ds = load_dataset("mlabonne/orpo-dpo-mix-40k", token=hf_token)
train_split = ds["train"]
df = train_split.to_pandas()
df.head()

Unnamed: 0,source,chosen,rejected,prompt,question
0,Airoboros,"[{'content': 'The setting is an otherworldly, ...","[{'content': 'The setting is an otherworldly, ...","The setting is an otherworldly, yet eerily fam...","The setting is an otherworldly, yet eerily fam..."
1,EverythingLM,[{'content': 'How many colors are traditionall...,[{'content': 'How many colors are traditionall...,How many colors are traditionally recognized i...,How many colors are traditionally recognized i...
2,EverythingLM,"[{'content': 'In a basket, there are 20 orange...","[{'content': 'In a basket, there are 20 orange...","In a basket, there are 20 oranges, 60 apples, ...","In a basket, there are 20 oranges, 60 apples, ..."
3,Airoboros,[{'content': 'Which famous physicist developed...,[{'content': 'Which famous physicist developed...,Which famous physicist developed the theory of...,Which famous physicist developed the theory of...
4,GOAT,[{'content': 'Find 40 * 865. Exclude words; sh...,[{'content': 'Find 40 * 865. Exclude words; sh...,Find 40 * 865. Exclude words; show only the math.,Find 40 * 865. Exclude words; show only the math.


In [5]:
# Peek at the first "chosen" conversation: its opening message and how many
# messages it contains.
first_conversation = df["chosen"][0]
first_conversation[0], len(first_conversation)

({'content': 'The setting is an otherworldly, yet eerily familiar, metropolis known as "Zephyria." It\'s a city suspended in the ether, floating amidst nebulous clouds of cosmic dust. The architecture here is surreal and alien, with buildings that twist and spiral like strands of DNA, reaching towards the cosmos. Streets are paved with luminescent cobblestones, casting soft hues of blues and purples, reflecting off iridescent structures. Strange vegetation, vibrant and bioluminescent, thrives everywhere, creating a stark contrast against the deep indigo sky.\n\nNow, immerse me in this ethereal journey through Zephyria.',
  'role': 'user'},
 6)

In [6]:
# Keep only rows whose "chosen" conversation has exactly two messages.
has_two_messages = df["chosen"].map(len).eq(2)
df = df.loc[has_two_messages]
print(df.shape)
df.head()

(39758, 5)


Unnamed: 0,source,chosen,rejected,prompt,question
4,GOAT,[{'content': 'Find 40 * 865. Exclude words; sh...,[{'content': 'Find 40 * 865. Exclude words; sh...,Find 40 * 865. Exclude words; show only the math.,Find 40 * 865. Exclude words; show only the math.
9,TheoremQA,[{'content': 'Are groups Z_4 * Z_2 and D_4 iso...,[{'content': 'Are groups Z_4 * Z_2 and D_4 iso...,Are groups Z_4 * Z_2 and D_4 isomorphic?,Are groups Z_4 * Z_2 and D_4 isomorphic?
10,GOAT,[{'content': '8824 * 1334? Only respond with m...,[{'content': '8824 * 1334? Only respond with m...,8824 * 1334?\nOnly respond with math and no wo...,8824 * 1334?\nOnly respond with math and no wo...
16,GOAT,[{'content': 'Assist me in calculating 9319357...,[{'content': 'Assist me in calculating 9319357...,Assist me in calculating 9319357631 plus 595. ...,Assist me in calculating 9319357631 plus 595. ...
17,TheoremQA,[{'content': 'A curve with a 120 m radius on a...,[{'content': 'A curve with a 120 m radius on a...,A curve with a 120 m radius on a level road is...,A curve with a 120 m radius on a level road is...


In [7]:
# One 20-character identifier per remaining row.
row_count = len(df)
ids = generate_unique_strings(row_count, 20)
print(f"Generated {len(ids)} Unique IDs.")

Generated 39758 Unique IDs.


In [8]:
# Work on an explicit copy of the three needed columns so that adding columns
# below never writes into a view of the filtered frame (SettingWithCopy).
pairs = df[["chosen", "rejected", "prompt"]].copy()

# `ids` must have been generated for exactly this set of rows.
assert len(ids) == len(pairs), "ids must contain one identifier per row"

# Dedicated, seeded generator so the A/B side assignment is the same on every
# run and does not depend on (or disturb) the global `random` state.
WINNER_SEED = 42
rng = random.Random(WINNER_SEED)

response_as = []
response_bs = []
winners = []

# Iterate over the two columns directly instead of DataFrame.iterrows(),
# which builds a Series per row.
for chosen, rejected in tqdm(zip(pairs["chosen"], pairs["rejected"]), total=len(pairs)):
    # Index 1 is the second message of each conversation.
    # NOTE(review): only "chosen" was filtered to two messages earlier;
    # this assumes "rejected" has the same layout - confirm.
    response_a = chosen[1]["content"]
    response_b = rejected[1]["content"]
    # Randomly swap sides so the preferred answer is not always "model_a".
    if rng.random() < 0.5:
        response_a, response_b = response_b, response_a
        winners.append("model_b")
    else:
        winners.append("model_a")
    response_as.append(response_a)
    response_bs.append(response_b)

# Lists are assigned positionally; constant columns are broadcast from scalars.
df = (
    pairs.assign(
        id=ids,
        response_a=response_as,
        response_b=response_bs,
        winner=winners,
        model_a="Unknown",
        model_b="Unknown",
        language="English",
    )[["id", "prompt", "response_a", "response_b", "winner", "model_a", "model_b", "language"]]
    .reset_index(drop=True)
)
print(df.shape)
df.head()

100%|██████████| 39758/39758 [00:02<00:00, 13660.54it/s]

(39758, 8)





Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,LCUdcjmse8ilKETCdmEQ,Find 40 * 865. Exclude words; show only the math.,"34,600\n\n40 x 865 = 34,600 \n\nNote: The resp...",40 * 865 = 34600,model_b,Unknown,Unknown,English
1,GrGmrY4A9jxeM5XKk67r,Are groups Z_4 * Z_2 and D_4 isomorphic?,"To determine if two groups are isomorphic, we...","No, groups $Z_4 \times Z_2$ and $D_4$ are not ...",model_a,Unknown,Unknown,English
2,1e5loqnfIzmumPKd6AP6,8824 * 1334?\nOnly respond with math and no wo...,11959396\n,11752411964\n \n(8824 X 1334 = 11752411964)\n...,model_a,Unknown,Unknown,English
3,xLedm02YZJ3WUDLEpjiY,Assist me in calculating 9319357631 plus 595. ...,The result of adding 9319357631 and 595 is 931...,The sum of 9319357631 and 595 is 9319358226.,model_b,Unknown,Unknown,English
4,cvlpeKs8A5yH0xBiWaWb,A curve with a 120 m radius on a level road is...,"Newton's Laws of Motion, also known as Newton'...",To determine the minimum coefficient of static...,model_b,Unknown,Unknown,English


In [9]:
# Write the final frame to a Parquet file, without the index.
output_path = "mlabonne-orpo-dpo-mix-40k.parquet"
df.to_parquet(output_path, index=False)