In [1]:
import pandas as pd
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils import get_response_openai

In [None]:
# Show the CPU count reported by the OS; generate_integers uses the same value as its group size.
print(os.cpu_count())

In [3]:
# Load train.parquet from ../original_data into a DataFrame.
train = pd.read_parquet("../original_data/train.parquet")

In [None]:
# Display the 20 most frequent values of the "language" column (value_counts sorts by count, descending).
train["language"].value_counts().head(20)

In [5]:
# Map every value of the "language" column to its number of occurrences in train.
language_counts = train["language"].value_counts().to_dict()

# Persist the mapping as UTF-8 JSON, keeping non-ASCII language names readable.
with open("language_counts.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(language_counts, ensure_ascii=False, indent=4))

In [None]:
# Load the external lmarena-ai arena human-preference parquet file and report its (rows, columns) shape.
lmarena_ai_arena_human_preference_55k = pd.read_parquet("../external_data/lmarena-ai-arena-human-preference-55k.parquet")
print(lmarena_ai_arena_human_preference_55k.shape)

In [None]:
def word_count(text):
    """Count the whitespace-separated words in ``text``.

    Args:
        text: The text to measure. Anything that is not text (``None``,
            float NaN from a missing cell, numbers, ...) counts as zero words.

    Returns:
        int: Number of tokens produced by ``text.split()``; 0 for empty,
        whitespace-only, or non-text input.
    """
    # A plain truthiness test is not enough here: float NaN is truthy, so a
    # missing cell would reach .split() and raise AttributeError.
    if not isinstance(text, (str, bytes)):
        return 0
    return len(text.split())

# Keep only rows whose prompt and both responses are at most 512
# whitespace-separated words (per word_count), then drop rows with any
# missing value and renumber the index from 0.
lmarena_ai_arena_human_preference_55k = (
    lmarena_ai_arena_human_preference_55k
    .loc[
        lambda frame: (frame["prompt"].apply(word_count) <= 512)
        & (frame["response_a"].apply(word_count) <= 512)
        & (frame["response_b"].apply(word_count) <= 512)
    ]
    .dropna()
    .reset_index(drop=True)
)

print(lmarena_ai_arena_human_preference_55k.shape)

In [None]:
# Preview the first rows of lmarena_ai_arena_human_preference_55k.
lmarena_ai_arena_human_preference_55k.head()

In [None]:
# Target languages for augmentation: every entry of language_counts seen
# fewer than 200 times, leaving out English and the "unknown" / "xx" / "zzp" labels.
languages = []

for language, count in language_counts.items():
    if language in ("English", "unknown", "xx", "zzp") or count >= 200:
        continue
    languages.append(language)

print(len(languages))

In [None]:
# Preview the first rows of lmarena_ai_arena_human_preference_55k.
lmarena_ai_arena_human_preference_55k.head()

In [None]:
# Which group of target languages this run handles; passed to generate_integers
# and embedded in the output CSV file name.
group_id = 0

def generate_integers(id):
    """Return the positions in ``languages`` that belong to group ``id``.

    The module-level ``languages`` list is split into consecutive groups of
    ``os.cpu_count()`` entries; group ``id`` starts at ``id * os.cpu_count()``.

    Args:
        id: Zero-based group number.

    Returns:
        list[int]: Ascending indices, every one a valid index into
        ``languages``. The last group may be shorter than the others, and the
        list is empty when the group starts past the end of ``languages``.
    """
    # os.cpu_count() is documented to return None when the count cannot be
    # determined; fall back to groups of one in that case.
    group_size = os.cpu_count() or 1
    start = id * group_size
    # Exclusive upper bound clamped to len(languages), so the final group can
    # never contain the out-of-range index len(languages).
    stop = min(start + group_size, len(languages))
    return list(range(start, stop))

# Indices into `languages` handled by this run (group number group_id).
group_index = generate_integers(group_id)
print(group_index)

In [12]:
# Number of source rows sliced off for each target language in the translation loop.
num_each_language = 1

In [13]:
# Empty accumulator with the same columns as the source frame; translated rows are appended to it below.
translation_aug_lmarena_ai_arena_human_preference_55k = pd.DataFrame(columns=lmarena_ai_arena_human_preference_55k.columns)


def translate_row(row, target_language):
    """Translate the prompt and both responses of one row into ``target_language``.

    Each text field is sent to ``get_response_openai`` (model ``gpt-4o-mini``)
    in its own request. The remaining fields are copied through unchanged.

    Args:
        row: Mapping-like record (a dict or a pandas Series) with the keys
            ``id``, ``prompt``, ``response_a``, ``response_b``, ``winner``,
            ``model_a`` and ``model_b``; ``language`` is optional.
        target_language: Language name interpolated into the translation prompt.

    Returns:
        dict: On success, the row with the three text fields replaced by
        whatever ``get_response_openai`` returned (presumably the translated
        string; verify against utils) and ``language`` set to
        ``target_language``. If anything raises, the error is printed and the
        untranslated row is returned instead, with ``language`` taken from the
        row or ``None`` when the row has no such field.
    """

    def _translate(text):
        # One request per field; the prompt wording is shared by all three calls.
        return get_response_openai(
            model="gpt-4o-mini",
            prompt=f"Translate the following text to {target_language}:\n{text}\nOnly return your translation, do not include any other text.",
        )

    try:
        translated_prompt = _translate(row["prompt"])
        translated_response_a = _translate(row["response_a"])
        translated_response_b = _translate(row["response_b"])

        return {
            "id": row["id"],
            "prompt": translated_prompt,
            "response_a": translated_response_a,
            "response_b": translated_response_b,
            "winner": row["winner"],
            "model_a": row["model_a"],
            "model_b": row["model_b"],
            "language": target_language
        }
    except Exception as e:
        print(f"Error with row {row['id']}: {e}")
        return {
            "id": row["id"],
            "prompt": row["prompt"],
            "response_a": row["response_a"],
            "response_b": row["response_b"],
            "winner": row["winner"],
            "model_a": row["model_a"],
            "model_b": row["model_b"],
            # .get() rather than [...]: a missing "language" field must not
            # raise a second exception from inside this handler.
            "language": row.get("language")
        }


def process_language_chunk(language_id, chunk_data):
    """Translate every row of ``chunk_data`` into ``languages[language_id]``.

    Each row becomes its own ``translate_row`` task on a thread pool.

    Args:
        language_id: Position of the target language in the module-level
            ``languages`` list.
        chunk_data: DataFrame whose rows are translated.

    Returns:
        list: One ``translate_row`` result per row, in the order the tasks
        finished (not necessarily row order).
    """
    target_language = languages[language_id]
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(translate_row, source_row, target_language)
            for _, source_row in chunk_data.iterrows()
        ]
        # as_completed yields each future once it is done; result() re-raises
        # any exception the worker raised.
        return [finished.result() for finished in as_completed(pending)]


# Translate one slice of num_each_language source rows per target language in
# this group: language i gets rows [i * num_each_language, (i + 1) * num_each_language).
translated_records = []
for language_id in group_index:
    lmarena_ai_arena_human_preference_55k_chunk = lmarena_ai_arena_human_preference_55k.iloc[num_each_language * language_id: num_each_language * (language_id + 1)]
    translated_results = process_language_chunk(language_id, lmarena_ai_arena_human_preference_55k_chunk)
    translated_records.extend(translated_results)

# Append all records with a single concat; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
translation_aug_lmarena_ai_arena_human_preference_55k = pd.concat([translation_aug_lmarena_ai_arena_human_preference_55k, pd.DataFrame(translated_records)], ignore_index=True)

In [None]:
# Preview the first rows of the accumulated translation results.
translation_aug_lmarena_ai_arena_human_preference_55k.head()

In [None]:
# Count result rows per "language" value (translate_row sets it to the target language on success).
translation_aug_lmarena_ai_arena_human_preference_55k["language"].value_counts()

In [16]:
# Write this group's results to a CSV whose name ends in group_id; the DataFrame index is not written.
translation_aug_lmarena_ai_arena_human_preference_55k.to_csv(f"translation_aug_equal_lmarena-ai-arena-human-preference-55k-{group_id}.csv", index=False)