In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import yaml
import os
import secrets
import string

In [2]:
def generate_unique_strings(count, length):
    """Return ``count`` distinct random strings of ``length`` characters each.

    Every string is drawn from ASCII letters and digits using the ``secrets``
    module. Duplicates are discarded and redrawn until enough distinct values
    exist. The order of the returned list is arbitrary (it comes from a set).

    Args:
        count: Number of distinct strings wanted. Zero or a negative value
            yields an empty list.
        length: Number of characters in each string.

    Returns:
        A list containing ``count`` unique strings.

    Raises:
        ValueError: If ``count`` is larger than the number of different
            strings that can exist for the given ``length``.
    """
    characters = string.ascii_letters + string.digits

    # Guard against a request that can never be satisfied: without it the
    # rejection-sampling loop below would spin forever (e.g. count=2, length=0).
    capacity = len(characters) ** max(length, 0)
    if count > capacity:
        raise ValueError(
            f"Cannot generate {count} unique strings of length {length}: "
            f"only {capacity} distinct values exist."
        )

    unique_strings = set()
    while len(unique_strings) < count:
        random_string = "".join(secrets.choice(characters) for _ in range(length))
        unique_strings.add(random_string)
    return list(unique_strings)

In [3]:
def load_config(config_file):
    """Open the YAML file at ``config_file`` and return its parsed contents."""
    with open(config_file, "r") as handle:
        return yaml.safe_load(handle)

# The configuration file is expected one directory above the working directory.
config_path = os.path.join(os.getcwd(), "..", "config.yaml")
# Reuse load_config so the file is opened and parsed in exactly one place.
config = load_config(config_path)

# The Hugging Face access token is read from the config file instead of being
# written into the notebook.
hf_token = config["huggingface"]["token"]

In [4]:
# Fetch the pairwise-preference dataset (authenticated with the configured token)
# and turn its "train" split into a pandas DataFrame for row-wise processing.
ds = load_dataset("RLHFlow/pair-preference-Skywork-80K-v0.1", token=hf_token)
df = ds["train"].to_pandas()
df.head()

Unnamed: 0,messages
0,[{'content': '[CONTEXT] <turn> user  In tria...
1,[{'content': '[CONTEXT] <turn> user  I died ...
2,[{'content': '[CONTEXT] <turn> user  This pr...
3,[{'content': '[CONTEXT] <turn> user  ### Ove...
4,[{'content': '[CONTEXT] <turn> user  Perhaps...


In [5]:
# Inspect the first message of the first row: its "content" is the flattened
# text that the parsing loop further down splits into prompt and responses.
df["messages"][0][0]

{'content': "[CONTEXT] \n\n<turn> user\n In triangle $ABC,$ $AB = 3,$ $AC = 6,$ and $\\cos \\angle A = \\frac{1}{8}.$  Find the length of angle bisector $\\overline{AD}.$\n [RESPONSE A] ## Step 1: To solve for the length of the angle bisector AD in triangle ABC, we can use the Angle Bisector Theorem.\nThe Angle Bisector Theorem states that in a triangle, an angle bisector divides the opposite side into segments that are proportional to the adjacent sides. However, to find the length of AD directly, we might need additional steps involving the cosine of angle A.\n\n## Step 2: Given that $\\cos \\angle A = \\frac{1}{8}$, we can apply the Law of Cosines to triangle ABC to find side BC or use a property involving the angle bisector and cosine to find AD.\nThe Law of Cosines states that in a triangle with sides of length a, b, and c opposite to angles A, B, and C, respectively, $c^2 = a^2 + b^2 - 2ab\\cos(C)$. However, directly finding BC doesn't help us find AD without further relationship

In [6]:
# Inspect the second message of the first row: its "content" is what the
# parsing loop compares against "A" to decide the winner.
df["messages"][0][1]

{'content': 'A', 'role': 'assistant'}

In [7]:
# Same inspection for the second row, to check the layout on another sample.
df["messages"][1][0]

{'content': '[CONTEXT] \n\n<turn> user\n I died in 1992. This I\'m sure of. But I\'ve somehow awoken again in a land and time I\'m so foreign of. I have a strange itching and burning sensation deriving from the small pits of hell within me and it\'s mindless. I\'m effortlessly and aimlessly walking to wherever I seek the odd fulfillment of flesh, blood, a pulse, anything. I sniff out the living and slowly conquer them with my thoughtlessness power and I obtain my objective- human flesh. Why am I doing this? my hunger, it never even goes away it just dulls. And I can\'t really speak, I\'m muted and restricted to only grunts as I\'m trying to ask the others around me what happened. I see the humans running from me but I just want some answers. Why are they scared? What\'s happened to me? Why are the other monsters around me all so calm? On the inside I\'m panicking but all I can do is mumble a serious of grunts and breaths as I try to find answers. Hell is empty. We walk the Earth and He

In [8]:
# One random 50-character identifier per dataframe row.
ids = generate_unique_strings(count=len(df), length=50)
print(f"Generated {len(ids)} Unique IDs.")

Generated 81959 Unique IDs.


In [9]:
# Literal markers that delimit the three sections inside each flattened message string.
# The user prompt starts right after this header.
prompt_start = "[CONTEXT] \n\n<turn> user\n"
# Response A's marker is matched together with the newline and space that precede it.
response_A_start = "\n [RESPONSE A]"
# Response B's marker is matched without any leading whitespace.
response_B_start = "[RESPONSE B]"

In [10]:
def _split_pair(text):
    """Split one flattened comparison string into (prompt, response_a, response_b).

    The sections are located with the notebook-level markers ``prompt_start``,
    ``response_A_start`` and ``response_B_start``. Each marker is searched for
    only *after* the previous one, so the three slices never overlap and a
    prompt that happens to contain the text of a later marker cannot shift
    where response B is cut.

    Raises:
        ValueError: If any of the three markers cannot be found.
    """
    prompt_at = text.find(prompt_start)
    if prompt_at == -1:
        raise ValueError("prompt marker not found")
    prompt_from = prompt_at + len(prompt_start)

    a_at = text.find(response_A_start, prompt_from)
    if a_at == -1:
        raise ValueError("response A marker not found")
    a_from = a_at + len(response_A_start)

    b_at = text.find(response_B_start, a_from)
    if b_at == -1:
        raise ValueError("response B marker not found")
    b_from = b_at + len(response_B_start)

    return (
        text[prompt_from:a_at].strip(),
        text[a_from:b_at].strip(),
        text[b_from:].strip(),
    )


# One list per output column. Every row contributes exactly one entry to each
# list (None placeholders when parsing fails) so the lists always stay the same
# length as `df` and `ids` and can be attached as columns afterwards.
prompts = []
response_as = []
response_bs = []
winners = []
model_as = []
model_bs = []
languages = []
failed_rows = []  # index labels of rows that could not be parsed

for idx, row in tqdm(df.iterrows(), total=len(df)):
    try:
        messages = row["messages"]
        prompt, response_a, response_b = _split_pair(messages[0]["content"])
        # The second message carries the preferred side; anything other than
        # "A" is treated as a win for response B.
        winner = "model_a" if messages[1]["content"] == "A" else "model_b"
    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as exc:
        # Only data-shape problems are handled here; KeyboardInterrupt and
        # other unrelated errors propagate instead of being swallowed.
        print(f"Error row {idx}: {exc}")
        failed_rows.append(idx)
        prompt = response_a = response_b = winner = None
        model_a = model_b = language = None
    else:
        # The source data carries no model names or language tag.
        model_a = model_b = "Unknown"
        language = "English"

    prompts.append(prompt)
    response_as.append(response_a)
    response_bs.append(response_b)
    winners.append(winner)
    model_as.append(model_a)
    model_bs.append(model_b)
    languages.append(language)

if failed_rows:
    print(
        f"{len(failed_rows)} of {len(df)} rows could not be parsed; "
        "their derived fields are None."
    )

100%|██████████| 81959/81959 [00:06<00:00, 11764.95it/s]


In [11]:
# Attach the generated ids and the parsed fields, then keep only the export
# columns in their final order. Selecting the columns also discards the raw
# "messages" column; because nothing is dropped in place, re-running this cell
# no longer fails with a KeyError on the already-removed column.
export_columns = ["id", "prompt", "response_a", "response_b", "winner", "model_a", "model_b", "language"]
df = (
    df.assign(
        id=ids,
        prompt=prompts,
        response_a=response_as,
        response_b=response_bs,
        winner=winners,
        model_a=model_as,
        model_b=model_bs,
        language=languages,
    )[export_columns]
    .reset_index(drop=True)
)
print(df.shape)
df.head()

(81959, 8)


Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,jxvVlc8FY9pXR0uA4QgfJ5RAGNsiW28OMIbupTpCD3QAnj...,"In triangle $ABC,$ $AB = 3,$ $AC = 6,$ and $\c...",## Step 1: To solve for the length of the angl...,If $AD$ is an angle bisector then \n${\sqrt{3}...,model_a,Unknown,Unknown,English
1,JCVKF8jouUph0EtVp2SDMXmaePjwd11nBi1EDNdAih4Ix0...,I died in 1992. This I'm sure of. But I've som...,"You are experiencing what is known as ""The Hun...",This is a short story about a character who ha...,model_b,Unknown,Unknown,English
2,jJyKlLEciPQirm0942nsIWsiVy9PpEMj1vTW9ZPG8TV18j...,This problem is from the coding challenge on e...,"Your solution is correct, but the more elegant...","Your solution is already quite good, but I agr...",model_a,Unknown,Unknown,English
3,u2VRlijE7T77RllHjJ9xRxeOpigV6ST2WrTuW4CpkUoNsr...,### Overview\nThe goal of this project is to c...,I can help you implement the game logic and ga...,Here's a possible implementation of the game:\...,model_a,Unknown,Unknown,English
4,Jg39bEVCEFfQZjZaOTgrvxhuSAool4bkZfutVZUIQ7fYK8...,Perhaps you can offer some insight into this. ...,The COVID-19 pandemic has given us a glimpse i...,"If the entire world was in pandemic mode, but ...",model_b,Unknown,Unknown,English


In [12]:
# Count how many rows fall under each winner label.
df["winner"].value_counts()

winner
model_b    41952
model_a    40007
Name: count, dtype: int64

In [13]:
# Write the reshaped frame to a Parquet file (relative path), leaving out the DataFrame index.
df.to_parquet("pair-preference-Skywork-80K-v0.1.parquet", index=False)