In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import yaml
import os
import json

In [2]:
def load_config(config_file):
    with open(config_file, "r") as file:
        config = yaml.safe_load(file)
    return config

config_path = os.path.join(os.getcwd(), "..", "config.yaml")
with open(config_path, "r") as file:
    config = yaml.safe_load(file)

hf_token = config["huggingface"]["token"]

In [3]:
ds = load_dataset("lmarena-ai/arena-human-preference-55k", token=hf_token)
df = Dataset.to_pandas(ds["train"])
df.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [4]:
df = df[df["winner_tie"] != 1]
df.shape

(39716, 9)

In [5]:
print(json.loads(df["prompt"][0])[0])
print(json.loads(df["prompt"][0])[1])

Is it morally right to try to have a certain percentage of females on managerial positions?
OK, does pineapple belong on a pizza? Relax and give me fun answer.


In [6]:
df["prompt"] = df["prompt"].apply(json.loads)
df["response_a"] = df["response_a"].apply(json.loads)
df["response_b"] = df["response_b"].apply(json.loads)

df = df[df["prompt"].apply(lambda x: len(x) == 1)]

In [7]:
df = df[["id", "prompt", "response_a", "response_b", "winner_model_a", "winner_model_b", "model_a", "model_b"]]
df.head()

Unnamed: 0,id,prompt,response_a,response_b,winner_model_a,winner_model_b,model_a,model_b
3,96401,[How can I create a test set for a very rare c...,[Creating a test set for a very rare category ...,[When building a classifier for a very rare ca...,1,0,llama-2-13b-chat,mistral-7b-instruct
4,198779,[What is the best way to travel from Tel-Aviv ...,[The best way to travel from Tel Aviv to Jerus...,[The best way to travel from Tel-Aviv to Jerus...,0,1,koala-13b,gpt-3.5-turbo-0314
5,292873,"[Construct a rap battle, in the style of Epic ...","[[Zeus]\nYo, it's the king of the gods on the ...","[(Verse 1 - Zeus)\n\nI'm the king of the gods,...",0,1,vicuna-13b,gpt-4-0314
6,313413,[Why water is not used in bath tub?],[Water is actually used in a bath tub. A bath ...,[Water is not used in a bath tub because it is...,1,0,mixtral-8x7b-instruct-v0.1,vicuna-13b
7,370945,"[""Bacteria is life on Mars but a heartbeat isn...",[Dune],[This quote seems to be referencing the debate...,0,1,gemini-pro,claude-2.0


In [8]:
prompts = []
response_as = []
response_bs = []
winners = []
languages = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    prompts.append(row["prompt"][0])
    response_as.append(row["response_a"][0])
    response_bs.append(row["response_b"][0])
    if row["winner_model_a"] == 1:
        winners.append("model_a")
    elif row["winner_model_b"] == 1:
        winners.append("model_b")
    languages.append("English")

df["prompt"] = prompts
df["response_a"] = response_as
df["response_b"] = response_bs
df["winner"] = winners
df["language"] = languages

df.drop(columns=["winner_model_a", "winner_model_b"], inplace=True)
df = df[["id", "prompt", "response_a", "response_b", "winner", "model_a", "model_b", "language"]]
print(df.shape)
df.head()

100%|██████████| 34297/34297 [00:02<00:00, 12638.28it/s]

(34297, 8)





Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
3,96401,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,model_a,llama-2-13b-chat,mistral-7b-instruct,English
4,198779,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,model_b,koala-13b,gpt-3.5-turbo-0314,English
5,292873,"Construct a rap battle, in the style of Epic R...","[Zeus]\nYo, it's the king of the gods on the m...","(Verse 1 - Zeus)\n\nI'm the king of the gods, ...",model_b,vicuna-13b,gpt-4-0314,English
6,313413,Why water is not used in bath tub?,Water is actually used in a bath tub. A bath t...,Water is not used in a bath tub because it is ...,model_a,mixtral-8x7b-instruct-v0.1,vicuna-13b,English
7,370945,"""Bacteria is life on Mars but a heartbeat isn'...",Dune,This quote seems to be referencing the debate ...,model_b,gemini-pro,claude-2.0,English


In [9]:
df["winner"].value_counts()

winner
model_a    17312
model_b    16985
Name: count, dtype: int64

In [10]:
df.to_parquet("lmarena-ai-arena-human-preference-55k.parquet", index=False)