In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import yaml
import os

In [2]:
def load_config(config_file):
    """Read the YAML file at ``config_file`` and return its parsed contents.

    The file is opened in text read mode and parsed with ``yaml.safe_load``;
    whatever object the parser produces is returned unchanged.
    """
    with open(config_file, "r") as stream:
        return yaml.safe_load(stream)

# config.yaml is expected one directory above the current working directory.
config_path = os.path.join(os.getcwd(), "..", "config.yaml")

# Reuse the load_config helper instead of repeating its open/safe_load body.
config = load_config(config_path)

# Hugging Face access token, read from the config file rather than hardcoded here.
hf_token = config["huggingface"]["token"]

In [3]:
# Fetch the dataset from the Hugging Face Hub, authenticating with the
# token read from the config file.
ds = load_dataset("lmsys/chatbot_arena_conversations", token=hf_token)

# Convert the "train" split to a pandas DataFrame for the steps below.
train_split = ds["train"]
df = train_split.to_pandas()
df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,turn,anony,language,tstamp,openai_moderation,toxic_chat_tag
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...,1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,tie,arena_user_973,[{'content': 'Why did my parent not invite me ...,[{'content': 'Why did my parent not invite me ...,1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,model_b,arena_user_973,"[{'content': 'Fuji vs. Nikon, which is better?...","[{'content': 'Fuji vs. Nikon, which is better?...",1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,model_b,arena_user_973,[{'content': 'How to build an arena for chatbo...,[{'content': 'How to build an arena for chatbo...,1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'When is it today?', 'role': 'use...","[{'content': 'When is it today?', 'role': 'use...",1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."


In [4]:
# Discard rows whose "winner" is either of the two tie labels, keeping
# every other row.
tie_labels = ["tie", "tie (bothbad)"]
is_tie = df["winner"].isin(tie_labels)
df = df[~is_tie]
print(df.shape)

(23294, 13)


In [5]:
# Inspect the first remaining conversation. Use positional access (.iloc):
# after the row filtering above, index label 0 is only present if that
# particular row survived, so a label lookup with [0] can raise KeyError.
sample_conversation = df["conversation_a"].iloc[0]
print(sample_conversation)
print(len(sample_conversation))
print(sample_conversation[0]["content"])  # first message
print(sample_conversation[1]["content"])  # second message

[{'content': 'What is the difference between OpenCL and CUDA?', 'role': 'user'}
 {'content': 'OpenCL and CUDA are two different programming models that are used for parallel computing.OpenCL is a general-purpose并行编程接口 that allows developers to write parallel code that can run on any platform that supportsCL, which includes most modern operating systems and computer systems, including Windows, Linux, and macOS. It provides a lower-level, more flexible API that is more suitable for building large-scale distributed computing systems.CUDA is a specific implementation ofOpenCL that is designed for performance and scalability in devices with multiple GPU(s). It was developed by Nvidia and is widely used for scientific computing, machine learning, and图形图像处理. The CUDA API provides a high-level, more efficient way to write parallel code that can run on top of a GPU, while still maintaining the ability to write code that can be easily adapted for other parallel computing platforms.', 'role': 'as

In [6]:
# Keep only rows whose conversation_a holds exactly two messages.
message_counts = df["conversation_a"].map(len)
has_two_messages = message_counts == 2
df = df[has_two_messages]
print(df.shape)

(20491, 13)


In [7]:
# Restrict the frame to the columns needed for the flattened output.
df = df[["question_id", "model_a", "model_b", "winner", "language", "conversation_a", "conversation_b"]]

prompts = []
response_as = []
response_bs = []
kept_positions = []  # positional indices of rows whose two conversations share a prompt

for pos, (idx, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
    prompt1 = row["conversation_a"][0]["content"]
    prompt2 = row["conversation_b"][0]["content"]
    if prompt1 != prompt2:
        # Skip the whole row. Previously only the prompt was skipped while both
        # responses were still appended, so the lists ended up with different
        # lengths and the column assignment below raised ValueError.
        print(f"Diffrent Prompts in Single Row: {idx}")
        continue

    kept_positions.append(pos)
    prompts.append(prompt1)
    response_as.append(row["conversation_a"][1]["content"])
    response_bs.append(row["conversation_b"][1]["content"])

# Select the retained rows by position and take an explicit copy so the column
# assignments act on an independent frame instead of a slice of the old one.
df = df.iloc[kept_positions].copy()
df["response_a"] = response_as
df["response_b"] = response_bs
df["prompt"] = prompts

# Drop the raw conversation columns and expose question_id as "id"
# (non-inplace so re-running the cell does not mutate a shared object).
df = df.drop(columns=["conversation_a", "conversation_b"]).rename(columns={"question_id": "id"})

df = df[["id", "prompt", "response_a", "response_b", "winner", "model_a", "model_b", "language"]]
df.head()

100%|██████████| 20491/20491 [00:01<00:00, 13842.14it/s]


Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,58210e39b3fd4441a2bd4a518bb44c2d,What is the difference between OpenCL and CUDA?,OpenCL and CUDA are two different programming ...,OpenCL and CUDA are both programming languages...,model_b,chatglm-6b,koala-13b,English
2,90bfd142157948aba01931726c888e7f,"Fuji vs. Nikon, which is better?",Both Fuji and Nikon are popular camera brands ...,This is a subjective question and the answer d...,model_b,koala-13b,oasst-pythia-12b,English
3,a7c5accc53e649a3bc6b2e41d962ebc4,How to build an arena for chatbots?,Building an arena for chatbots can be done by ...,Building an arena for chatbots is a great way ...,model_b,vicuna-13b,oasst-pythia-12b,English
4,adf27e819a3c494cb6e993f0c660e097,When is it today?,"I'm sorry, I cannot determine the current date...","Today is February 23, 2023.",model_a,vicuna-13b,koala-13b,English
5,c0fc42c6f5f14f2aa5a89f71f8553730,Count from 1 to 10 with step = 3,"1, 4, 7, 10\n\nCounting with a step of 3 means...","1, 4, 7, 10",model_a,vicuna-13b,koala-13b,English


In [8]:
# Number of rows for each value of the "winner" column.
winner_counts = df["winner"].value_counts()
winner_counts

winner
model_a    10343
model_b    10148
Name: count, dtype: int64

In [9]:
# Number of rows for each value of the "language" column.
language_counts = df["language"].value_counts()
language_counts

language
English       18320
German          430
Spanish         344
French          285
Portuguese      178
              ...  
Hawaiian          1
Maori             1
Malagasy          1
Manx              1
Yoruba            1
Name: count, Length: 71, dtype: int64

In [10]:
# Write the flattened frame to a Parquet file in the working directory,
# without storing the DataFrame index.
output_path = "lmsys-chatbot_arena_conversations.parquet"
df.to_parquet(output_path, index=False)