In [1]:
import os
import random
import secrets
import string

import requests
from datasets import load_dataset, Dataset
from tqdm import tqdm

In [2]:
# Hugging Face API token. Read from the HF_TOKEN environment variable so the
# secret never has to be pasted into (and committed with) the notebook; falls
# back to the empty string when the variable is not set.
hf_token = os.environ.get("HF_TOKEN", "")

# Generation round, used only in the output file names.
# NOTE: this name shadows the builtin round(); it is kept because the
# per-language cells reference it when building their parquet file names.
round = 1

# Half-open slice [start_idx, end_idx) applied to each shuffled language subset.
start_idx = 0
end_idx = 1

# Candidate models for response_b. Each entry is listed exactly once so that
# random.choice(models) samples them uniformly (a repeated entry would be
# picked proportionally more often).
models = [
    "mistralai/Mistral-Nemo-Instruct-2407",
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-3.3-70B-Instruct",
    "Qwen/Qwen2.5-72B-Instruct",
    "01-ai/Yi-1.5-34B-Chat",
    "microsoft/Phi-3.5-mini-instruct",
    "google/gemma-2-27b-it",
    "google/gemma-2-9b-it",
    "Qwen/QwQ-32B-Preview",
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    "meta-llama/Llama-3.1-70B-Instruct",
]

In [3]:
def generate_unique_strings(count, length):
    """Return ``count`` distinct random alphanumeric strings of ``length`` characters.

    Characters are drawn with ``secrets.choice`` from ASCII letters and digits.
    The strings are collected in a set, so the order of the returned list is
    arbitrary.

    Args:
        count: Number of distinct strings to produce. Values <= 0 yield ``[]``.
        length: Number of characters in each string.

    Returns:
        A list of ``count`` unique strings.

    Raises:
        ValueError: If ``count`` is larger than the number of distinct strings
            that exist for ``length`` (the sampling loop could never finish).
    """
    characters = string.ascii_letters + string.digits

    # A non-positive length can only ever produce the empty string, hence the clamp.
    capacity = len(characters) ** max(length, 0)
    if count > capacity:
        raise ValueError(
            f"Cannot generate {count} unique strings of length {length}: "
            f"only {capacity} distinct strings exist."
        )

    unique_strings = set()
    while len(unique_strings) < count:
        random_string = "".join(secrets.choice(characters) for _ in range(length))
        unique_strings.add(random_string)
    return list(unique_strings)

In [None]:
# Load the Aya dataset and materialise its "train" split as a pandas DataFrame.
ds = load_dataset("CohereForAI/aya_dataset")
df = ds["train"].to_pandas()
print(df.shape)
df.head()

In [None]:
# Distinct values of the "language" column; the per-language cells filter on these labels.
df.loc[:, "language"].unique()

In [None]:
# Build the Vietnamese comparison set: response_a is the dataset's own target
# text, response_b is generated by a randomly chosen model from `models` via
# the Hugging Face inference endpoint.
vie_df = df[df["language"] == "Vietnamese"]
print(vie_df.shape)
vie_df = vie_df.reset_index(drop=True)

# Shuffle with a fixed seed so the [start_idx, end_idx) slice is reproducible.
vie_df = vie_df.sample(frac=1.0, random_state=7)
vie_df = vie_df.iloc[start_idx:end_idx].reset_index(drop=True)
print(vie_df.shape)

vie_df = vie_df[["inputs", "targets", "language"]]

ids = generate_unique_strings(len(vie_df), 25)
headers = {"Authorization": f"Bearer {hf_token}"}

# Row labels of the requests that succeeded. After the reset_index above these
# labels equal the row positions, so they can index both `ids` and `.iloc`.
kept_rows = []
prompts = []
response_as = []
response_bs = []
model_as = []
model_bs = []

for idx, row in tqdm(vie_df.iterrows(), total=len(vie_df)):
    try:
        prompt = row["inputs"]
        response_a = row["targets"]
        model_a = "Unknown"
        model_b_name = random.choice(models)
        model_b = model_b_name.split("/")[-1]

        API_URL = f"https://api-inference.huggingface.co/models/{model_b_name}"
        payload = {"inputs": prompt}

        # The timeout keeps a stalled connection from hanging the cell forever;
        # raise_for_status() reports HTTP failures as such instead of letting
        # them surface as an opaque KeyError when indexing the JSON body.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        response_b = response.json()[0]["generated_text"]
    except Exception as e:
        print(f"Error: {e} Row {idx}")
        continue

    kept_rows.append(idx)
    prompts.append(prompt)
    response_as.append(response_a)
    response_bs.append(response_b)
    model_as.append(model_a)
    model_bs.append(model_b)

# Keep only the rows whose request succeeded so every new column has exactly
# one value per remaining row (a failed request would otherwise make the
# column assignments fail with a length mismatch).
vie_df = (
    vie_df.iloc[kept_rows]
    .drop(columns=["inputs", "targets"])
    .reset_index(drop=True)
    .assign(
        id=[ids[i] for i in kept_rows],
        prompt=prompts,
        response_a=response_as,
        response_b=response_bs,
        model_a=model_as,
        model_b=model_bs,
    )
)

vie_df = vie_df[["id", "prompt", "response_a", "response_b", "model_a", "model_b", "language"]]
vie_df.to_parquet(f"huggingface_api_vie_{round}_{start_idx}_{end_idx}.parquet", index=False)
vie_df.head()

In [None]:
# Build the Simplified Chinese comparison set: response_a is the dataset's own
# target text, response_b is generated by a randomly chosen model from `models`
# via the Hugging Face inference endpoint. Output rows are labelled "Chinese".
zh_df = df[df["language"] == "Simplified Chinese"]
print(zh_df.shape)
zh_df = zh_df.reset_index(drop=True)

# Shuffle with a fixed seed so the [start_idx, end_idx) slice is reproducible.
zh_df = zh_df.sample(frac=1.0, random_state=7)
zh_df = zh_df.iloc[start_idx:end_idx].reset_index(drop=True)
print(zh_df.shape)

zh_df = zh_df[["inputs", "targets"]]

ids = generate_unique_strings(len(zh_df), 26)
headers = {"Authorization": f"Bearer {hf_token}"}

# Row labels of the requests that succeeded. After the reset_index above these
# labels equal the row positions, so they can index both `ids` and `.iloc`.
kept_rows = []
prompts = []
response_as = []
response_bs = []
model_as = []
model_bs = []

for idx, row in tqdm(zh_df.iterrows(), total=len(zh_df)):
    try:
        prompt = row["inputs"]
        response_a = row["targets"]
        model_a = "Unknown"
        model_b_name = random.choice(models)
        model_b = model_b_name.split("/")[-1]

        API_URL = f"https://api-inference.huggingface.co/models/{model_b_name}"
        payload = {"inputs": prompt}

        # The timeout keeps a stalled connection from hanging the cell forever;
        # raise_for_status() reports HTTP failures as such instead of letting
        # them surface as an opaque KeyError when indexing the JSON body.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        response_b = response.json()[0]["generated_text"]
    except Exception as e:
        print(f"Error: {e} Row {idx}")
        continue

    kept_rows.append(idx)
    prompts.append(prompt)
    response_as.append(response_a)
    response_bs.append(response_b)
    model_as.append(model_a)
    model_bs.append(model_b)

# Keep only the rows whose request succeeded so every new column has exactly
# one value per remaining row (a failed request would otherwise make the
# column assignments fail with a length mismatch).
zh_df = (
    zh_df.iloc[kept_rows]
    .drop(columns=["inputs", "targets"])
    .reset_index(drop=True)
    .assign(
        id=[ids[i] for i in kept_rows],
        prompt=prompts,
        response_a=response_as,
        response_b=response_bs,
        model_a=model_as,
        model_b=model_bs,
        language="Chinese",
    )
)

zh_df = zh_df[["id", "prompt", "response_a", "response_b", "model_a", "model_b", "language"]]
zh_df.to_parquet(f"huggingface_api_zh_{round}_{start_idx}_{end_idx}.parquet", index=False)
zh_df.head()

In [None]:
# Build the Russian comparison set: response_a is the dataset's own target
# text, response_b is generated by a randomly chosen model from `models` via
# the Hugging Face inference endpoint.
ru_df = df[df["language"] == "Russian"]
print(ru_df.shape)
ru_df = ru_df.reset_index(drop=True)

# Shuffle with a fixed seed so the [start_idx, end_idx) slice is reproducible.
ru_df = ru_df.sample(frac=1.0, random_state=7)
ru_df = ru_df.iloc[start_idx:end_idx].reset_index(drop=True)
print(ru_df.shape)

ru_df = ru_df[["inputs", "targets", "language"]]

ids = generate_unique_strings(len(ru_df), 27)
headers = {"Authorization": f"Bearer {hf_token}"}

# Row labels of the requests that succeeded. After the reset_index above these
# labels equal the row positions, so they can index both `ids` and `.iloc`.
kept_rows = []
prompts = []
response_as = []
response_bs = []
model_as = []
model_bs = []

for idx, row in tqdm(ru_df.iterrows(), total=len(ru_df)):
    try:
        prompt = row["inputs"]
        response_a = row["targets"]
        model_a = "Unknown"
        model_b_name = random.choice(models)
        model_b = model_b_name.split("/")[-1]

        API_URL = f"https://api-inference.huggingface.co/models/{model_b_name}"
        payload = {"inputs": prompt}

        # The timeout keeps a stalled connection from hanging the cell forever;
        # raise_for_status() reports HTTP failures as such instead of letting
        # them surface as an opaque KeyError when indexing the JSON body.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        response_b = response.json()[0]["generated_text"]
    except Exception as e:
        print(f"Error: {e} Row {idx}")
        continue

    kept_rows.append(idx)
    prompts.append(prompt)
    response_as.append(response_a)
    response_bs.append(response_b)
    model_as.append(model_a)
    model_bs.append(model_b)

# Keep only the rows whose request succeeded so every new column has exactly
# one value per remaining row (a failed request would otherwise make the
# column assignments fail with a length mismatch).
ru_df = (
    ru_df.iloc[kept_rows]
    .drop(columns=["inputs", "targets"])
    .reset_index(drop=True)
    .assign(
        id=[ids[i] for i in kept_rows],
        prompt=prompts,
        response_a=response_as,
        response_b=response_bs,
        model_a=model_as,
        model_b=model_bs,
    )
)

ru_df = ru_df[["id", "prompt", "response_a", "response_b", "model_a", "model_b", "language"]]
ru_df.to_parquet(f"huggingface_api_ru_{round}_{start_idx}_{end_idx}.parquet", index=False)
ru_df.head()