In [1]:
import os
import random
import secrets
import string

import pandas as pd
import requests
import yaml
from datasets import load_dataset, Dataset
from openai import OpenAI
from tqdm import tqdm

In [None]:
# API credentials are read from config.yaml, located one directory above the
# current working directory.
config_path = os.path.join(os.getcwd(), "..", "config.yaml")

with open(config_path, "r") as config_file:
    config = yaml.safe_load(config_file)

# Expected layout: openai.api_key, openai.organization, huggingface.token.
openai_section = config["openai"]
openai_api_key = openai_section["api_key"]
openai_organization = openai_section["organization"]
hf_token = config["huggingface"]["token"]

# Shared OpenAI client used by the response helper defined further down.
client_openai = OpenAI(
    api_key=openai_api_key,
    organization=openai_organization,
)

In [2]:
# Half-open slice [start_idx, end_idx) of the shuffled dataset handled in this
# run; both values are also embedded in the output file name.
start_idx = 0
end_idx = 3

# Candidate models to pair against each other. Names containing "gpt" are
# served through the OpenAI client; every other entry is a Hugging Face repo id.
# Each model appears exactly once so that sampling two entries can never pair a
# model with itself or give one model extra sampling weight.
models = [
    "mistralai/Mistral-Nemo-Instruct-2407",
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-3.3-70B-Instruct",
    "Qwen/Qwen2.5-72B-Instruct",
    "01-ai/Yi-1.5-34B-Chat",
    "microsoft/Phi-3.5-mini-instruct",
    "google/gemma-2-27b-it",
    "google/gemma-2-9b-it",
    "Qwen/QwQ-32B-Preview",
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    "meta-llama/Llama-3.1-70B-Instruct",
    "gpt-4o-mini",
    "gpt-3.5-turbo-0125",
    "gpt-4o",
]

In [3]:
def get_response_openai_prompt(model, prompt, temperature=0.7, top_p=0.95, max_tokens=1024):
    """Send a single-turn user prompt to an OpenAI chat model and return its reply.

    Args:
        model: Model name forwarded to the chat completions endpoint.
        prompt: Text sent as the only ``user`` message.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        max_tokens: Forwarded to the API as ``max_completion_tokens``.

    Returns:
        The first choice's message content with surrounding whitespace stripped.

    Raises:
        ValueError: If the completion has no choices or the first choice
            carries no text content.
    """
    completion = client_openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        top_p=top_p,
        max_completion_tokens=max_tokens
    )
    if not completion.choices:
        raise ValueError(f"Model {model!r} returned no choices")

    # The message content is nullable, so check it before stripping to avoid an
    # opaque AttributeError on None.
    content = completion.choices[0].message.content
    if content is None:
        raise ValueError(f"Model {model!r} returned no text content")
    return content.strip()

In [4]:
def generate_unique_strings(count, length):
    """Generate ``count`` distinct random alphanumeric strings of ``length`` characters.

    Characters are drawn with ``secrets.choice`` from ASCII letters and digits.

    Args:
        count: Number of distinct strings to produce; values <= 0 yield ``[]``.
        length: Number of characters per string; values <= 0 yield empty strings.

    Returns:
        A list of ``count`` unique strings, in no particular order.

    Raises:
        ValueError: If ``count`` exceeds the number of distinct strings that
            exist for the given ``length``, which would otherwise loop forever.
    """
    characters = string.ascii_letters + string.digits

    # Only len(characters) ** length distinct strings exist; asking for more
    # could never terminate the collection loop below.
    capacity = len(characters) ** max(length, 0)
    if count > capacity:
        raise ValueError(
            f"Cannot generate {count} unique strings of length {length}: "
            f"only {capacity} distinct values exist"
        )

    unique_strings = set()
    while len(unique_strings) < count:
        unique_strings.add("".join(secrets.choice(characters) for _ in range(length)))
    return list(unique_strings)

In [None]:
# Load the "wenbopan/Chinese-dpo-pairs" dataset and convert its "train" split
# into a pandas DataFrame for the sampling step that follows.
ds = load_dataset("wenbopan/Chinese-dpo-pairs")
train_split = ds["train"]
df = train_split.to_pandas()
print(df.shape)
df.head()

In [None]:
# Endpoint prefix and request timeout for the Hugging Face hosted inference API.
HF_API_BASE = "https://api-inference.huggingface.co/models"
HF_TIMEOUT_SECONDS = 120


def _query_hf_model(model, prompt):
    """Return the text generated for ``prompt`` by a Hugging Face hosted model."""
    http_response = requests.post(
        f"{HF_API_BASE}/{model}",
        headers={"Authorization": f"Bearer {hf_token}"},
        json={"inputs": prompt},
        timeout=HF_TIMEOUT_SECONDS,  # never hang the whole run on one stalled request
    )
    # Surface the server's own error text instead of failing later with an
    # unhelpful KeyError when the payload is an error object.
    if not http_response.ok:
        raise RuntimeError(
            f"HF request for {model} failed "
            f"({http_response.status_code}): {http_response.text[:200]}"
        )
    # NOTE(review): "generated_text" may echo the prompt depending on the
    # endpoint's defaults — confirm whether that is wanted in the dataset.
    return http_response.json()[0]["generated_text"]


def _generate_response(model, prompt, label):
    """Query ``model`` via the matching backend; return (response_text, model_name)."""
    if "gpt" in model:
        text = get_response_openai_prompt(model=model, prompt=prompt)
        print(f"OpenAI Response {label}:", text)
        return text, model
    text = _query_hf_model(model, prompt)
    print(f"HF Response {label}:", text)
    # Store only the repo name, without the organisation prefix.
    return text, model.split("/")[-1]


# Shuffle deterministically, then take this run's slice of prompts.
zh_sample = (
    df.sample(frac=1.0, random_state=7)
    .iloc[start_idx:end_idx]
    .reset_index(drop=True)
)
print(zh_sample.shape)

# De-duplicate while preserving order so a sampled pair is always two
# different models.
candidate_models = list(dict.fromkeys(models))

# One record per prompt for which BOTH responses were obtained. Failed rows are
# skipped entirely, so the output frame is built from these records rather than
# by assigning columns onto the (possibly longer) sampled frame.
records = []
for idx, row in tqdm(zh_sample.iterrows(), total=len(zh_sample)):
    try:
        prompt = row["prompt"]
        print("Prompt:", prompt)
        selected_models = random.sample(candidate_models, 2)
        print("Selected Models:", selected_models)

        response_a, model_a = _generate_response(selected_models[0], prompt, "A")
        response_b, model_b = _generate_response(selected_models[1], prompt, "B")
    except Exception as e:
        # Best effort: report the failure and move on to the next prompt.
        print(f"Error: {e} Row {idx}")
        continue

    records.append(
        {
            "prompt": prompt,
            "response_a": response_a,
            "response_b": response_b,
            "model_a": model_a,
            "model_b": model_b,
            "language": "Chinese",
        }
    )

output_columns = ["id", "prompt", "response_a", "response_b", "model_a", "model_b", "language"]
record_ids = generate_unique_strings(count=len(records), length=15)
zh_df = pd.DataFrame(
    [{"id": record_id, **record} for record_id, record in zip(record_ids, records)],
    columns=output_columns,  # keeps the schema stable even when every row failed
)

zh_df.to_parquet(f"huggingface_api_Chinese-dpo-pairs_zh_{start_idx}_{end_idx}.parquet", index=False)
zh_df.head()