In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import json
import re

In [None]:
# Load the "all" configuration of aigrant/tw_chatbot_arena and convert its
# train split to a pandas DataFrame in one step.
taiwan = load_dataset("aigrant/tw_chatbot_arena", name="all")["train"].to_pandas()
taiwan.head()

In [None]:
# Work on an independent (deep) copy so the loaded `taiwan` frame is untouched.
df = taiwan.copy(deep=True)

In [None]:
# One-hot encode the "winner" column: each flag column is 1 when the winner
# label is one of the listed values, otherwise 0.
winner_flag_labels = {
    "winner_model_a": ("model_a",),
    "winner_model_b": ("model_b",),
    "winner_tie": ("tie", "tie (bothbad)"),
}
for flag_column, accepted in winner_flag_labels.items():
    # Bind `accepted` as a default so each lambda keeps its own label tuple.
    df[flag_column] = df["winner"].apply(
        lambda x, accepted=accepted: 1 if x in accepted else 0
    )
df.head()

In [None]:
# Split every battle into the user prompts shared by both sides and the
# assistant replies of each model.
#
# Each conversation is a sequence of messages indexed by "role" and "content".
# A row is rejected when the two conversations differ in length or when the
# user prompts extracted from side A and side B are not equal. Rejected rows
# are removed from `df` at the end so that `prompts`, `response_as` and
# `response_bs` stay aligned one-to-one with the remaining rows (otherwise
# assigning them as columns would fail with a length mismatch).
prompts = []
response_as = []
response_bs = []
skipped_rows = []  # index labels of rows that failed validation

for idx, row in tqdm(df.iterrows(), total=len(df)):
    conversation_a = row["conversation_a"]
    conversation_b = row["conversation_b"]

    if len(conversation_a) != len(conversation_b):
        print(f"Error: conversation_a and conversation_b have different lengths at index {idx}")
        skipped_rows.append(idx)
        continue

    prompt_a, response_a = [], []
    prompt_b, response_b = [], []

    # Same role dispatch for both sides; an unknown role is reported but the
    # message is otherwise ignored.
    for conversation, prompt_turns, response_turns in (
        (conversation_a, prompt_a, response_a),
        (conversation_b, prompt_b, response_b),
    ):
        for message in conversation:
            if message["role"] == "user":
                prompt_turns.append(message["content"])
            elif message["role"] == "assistant":
                response_turns.append(message["content"])
            else:
                print(f"Error: unexpected role at index {idx}")

    # Both models must have been asked exactly the same questions.
    if prompt_a != prompt_b:
        print(f"Error: prompt_a and prompt_b differ at index {idx}")
        skipped_rows.append(idx)
        continue

    prompts.append(prompt_a)
    response_as.append(response_a)
    response_bs.append(response_b)

if skipped_rows:
    # Keep the frame in step with the collected lists.
    df = df.drop(index=skipped_rows)
    print(f"Dropped {len(skipped_rows)} invalid rows; {len(df)} rows remain")

In [None]:
# Attach the extracted prompt / response lists as new columns (one list per row).
df = df.assign(
    prompt=prompts,
    response_a=response_as,
    response_b=response_bs,
)
df.head()

In [None]:
# Show the column labels currently present in `df`.
df.columns

In [None]:
# Remove the columns that are no longer needed and expose "question_id" as "id".
columns_to_drop = [
    "winner",
    "judge",
    "conversation_a",
    "conversation_b",
    "turn",
    "anony",
    "language",
    "tstamp",
]
df = df.drop(columns=columns_to_drop).rename(columns={"question_id": "id"})
df.head()

In [None]:
# Snapshot the processed frame under the `taiwan` name and display it.
taiwan = df.copy(deep=True)
taiwan

In [None]:
# Inspect the Python type of the value stored in the "prompt" column at label 0.
type(taiwan["prompt"][0])

In [None]:
# Write the processed frame to disk as a JSON array with one object per row.
taiwan.to_json(path_or_buf="taiwan_arena.json", orient="records")

In [None]:
# Remove rows whose prompt repeats an earlier row (the first occurrence wins).
# The "prompt" cells are Python lists, which are unhashable, so
# drop_duplicates(subset=["prompt"]) raises TypeError; compare a hashable
# tuple view of each prompt instead.
prompt_keys = taiwan["prompt"].map(tuple)
taiwan = taiwan.loc[~prompt_keys.duplicated(keep="first")]
taiwan.shape

In [None]:
# Write the current `taiwan` frame to a second file, one JSON object per row.
taiwan.to_json(
    path_or_buf="taiwan_arena_no_prompt_duplicate.json",
    orient="records",
)

In [None]:
# Load the train split of mncai/ko-chatbot-arena straight into a pandas DataFrame.
korean = load_dataset("mncai/ko-chatbot-arena")["train"].to_pandas()
korean.head()

In [None]:
# Work on an independent (deep) copy so the loaded `korean` frame is untouched.
df = korean.copy(deep=True)

In [None]:
# Derive three 0/1 indicator columns from the "winner" label; both tie
# variants ("tie" and "tie (bothbad)") map to winner_tie.
for indicator, matching_labels in (
    ("winner_model_a", ("model_a",)),
    ("winner_model_b", ("model_b",)),
    ("winner_tie", ("tie", "tie (bothbad)")),
):
    # Default argument pins the label tuple for this iteration's lambda.
    df[indicator] = df["winner"].apply(
        lambda x, matching_labels=matching_labels: 1 if x in matching_labels else 0
    )
df.head()

In [None]:
# Exclude the row labelled 42 before parsing.
# NOTE(review): the reason for excluding this row is not recorded here —
# presumably its chats do not parse cleanly; confirm and document.
# errors="ignore" makes the cell safe to re-run once the row is already gone.
df = df.drop(index=[42], errors="ignore")

In [None]:
# Split every battle into the user prompts shared by both sides and the
# replies of each model.
#
# "chats_a" / "chats_b" are strings; every `content='...'` / `content="..."`
# occurrence is extracted in order. Even positions are treated as user prompts
# and odd positions as model replies.
# NOTE(review): assumes messages strictly alternate user/assistant starting
# with the user, and that a content string never contains its own delimiter
# quote (an escaped quote would end the match early) — confirm on the data.
#
# A row is rejected when the two sides yield a different number of contents or
# when their prompts are not equal. Rejected rows are removed from `df` at the
# end so that `prompts`, `response_as` and `response_bs` stay aligned
# one-to-one with the remaining rows (otherwise assigning them as columns
# would fail with a length mismatch).
content_pattern = re.compile(r"content=(['\"])(.*?)\1")

prompts = []
response_as = []
response_bs = []
skipped_rows = []  # index labels of rows that failed validation

for idx, row in tqdm(df.iterrows(), total=len(df)):
    # findall returns (quote, text) pairs; only the text is kept.
    contents_a = [text for _quote, text in content_pattern.findall(row["chats_a"])]
    contents_b = [text for _quote, text in content_pattern.findall(row["chats_b"])]

    if len(contents_a) != len(contents_b):
        print(f"Error: conversation_a and conversation_b have different lengths at index {idx}")
        skipped_rows.append(idx)
        continue

    # Even positions -> prompts, odd positions -> responses.
    prompt_a, response_a = contents_a[0::2], contents_a[1::2]
    prompt_b, response_b = contents_b[0::2], contents_b[1::2]

    # Both models must have been asked exactly the same questions.
    if prompt_a != prompt_b:
        print(f"Error: prompt_a and prompt_b differ at index {idx}")
        skipped_rows.append(idx)
        continue

    prompts.append(prompt_a)
    response_as.append(response_a)
    response_bs.append(response_b)

if skipped_rows:
    # Keep the frame in step with the collected lists.
    df = df.drop(index=skipped_rows)
    print(f"Dropped {len(skipped_rows)} invalid rows; {len(df)} rows remain")

In [None]:
# Add the parsed prompt / response lists to the frame, one list per row.
df = df.assign(
    prompt=prompts,
    response_a=response_as,
    response_b=response_bs,
)
df.head()

In [None]:
# Show the column labels currently present in `df`.
df.columns

In [None]:
# Discard the columns that are no longer needed, then turn the "IP" column
# into an "id" column whose values are replaced by a running 0..n-1 counter.
unused_columns = ['winner', 'turn', 'chats_a', 'chats_b', 'time_stamp']
df = (
    df.drop(columns=unused_columns)
    .rename(columns={"IP": "id"})
    .assign(id=lambda frame: range(len(frame)))
)
df.head()

In [None]:
# Snapshot the processed frame as `korean` and write it out as a JSON array
# with one object per row.
korean = df.copy(deep=True)
korean.to_json(path_or_buf="korean_arena.json", orient="records")