In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
from utils import get_response_openai_prompt
import yaml
import pandas as pd
import os
import random
import secrets
import string

In [2]:
def generate_unique_strings(count, length):
    """Generate ``count`` distinct random alphanumeric strings.

    Each string is built from ASCII letters and digits chosen with
    ``secrets.choice``.

    Args:
        count: Number of distinct strings to produce. Values <= 0 yield an
            empty list.
        length: Number of characters in each string. Values <= 0 yield the
            empty string.

    Returns:
        A list containing ``count`` unique strings, in arbitrary (set) order.

    Raises:
        ValueError: If ``count`` is larger than the number of distinct
            strings of the requested length, which could never be satisfied.
    """
    characters = string.ascii_letters + string.digits

    # Without this guard the loop below would spin forever once every
    # possible string has already been generated.
    capacity = len(characters) ** max(length, 0)
    if count > capacity:
        raise ValueError(
            f"Cannot generate {count} unique strings of length {length}: "
            f"only {capacity} distinct values exist."
        )

    unique_strings = set()
    while len(unique_strings) < count:
        # Duplicates are absorbed by the set, so we simply retry until full.
        unique_strings.add(
            "".join(secrets.choice(characters) for _ in range(length))
        )
    return list(unique_strings)

In [3]:
def load_config(config_file):
    """Read a YAML configuration file and return its parsed content.

    Args:
        config_file: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_file, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

# config.yaml is expected one directory above the current working directory.
# Parse it through the load_config helper rather than repeating the
# open/safe_load sequence inline.
config_path = os.path.join(os.getcwd(), "..", "config.yaml")
config = load_config(config_path)

# Hugging Face access token, passed to load_dataset when fetching the corpus.
hf_token = config["huggingface"]["token"]

In [None]:
# Fetch the OpenO1-SFT dataset (authenticated with the configured token) and
# materialise its "train" split as a pandas DataFrame.
ds = load_dataset("O1-OPEN/OpenO1-SFT", token=hf_token)
train_split = ds["train"]
df = train_split.to_pandas()
print(df.shape)
df.head()

In [None]:
# Draw a reproducible 1000-row sample, keep its second half (positions
# 500-999), and renumber the rows from zero.
df = (
    df.sample(1000, random_state=3407)
    .iloc[500:1000]
    .reset_index(drop=True)
)
print(df.shape)
df.head()

In [None]:
# One random 10-character identifier for every sampled row.
ids = generate_unique_strings(count=len(df), length=10)
print(f"Generated {len(ids)} Unique IDs.")

In [7]:
# Empty frame with the output schema; the generation loop fills it in.
result_columns = [
    "prompt",
    "response_a",
    "response_b",
    "model_a",
    "model_b",
    "winner",
    "language",
]
new_data = pd.DataFrame(columns=result_columns)

In [8]:
# Model name passed to every get_response_openai_prompt call below and recorded
# as both model_a and model_b in the generated rows.
model = "gpt-4o-mini"

In [9]:
# System prompt for the translation step.
translation_system = "You are a great translator. You need to provide the most accurate and high-quality translations possible."
# Russian prompt pair. *_system1 asks for serious, accurate, high-quality answers
# (used for the "chosen" response); *_system2 asks for superficial, inaccurate,
# low-quality answers (used for the "rejected" response).
russian_system1 = "Вам нужно серьезно отвечать на вопросы пользователей, предоставляя точные и качественные ответы в пределах ваших возможностей."
russian_system2 = "Вам нужно поверхностно отвечать на вопросы пользователя, предоставляя неточные и низкокачественные ответы."
# Chinese prompt pair with the same serious / careless meaning.
chinese_system1 = "你需要认真地回复用户的问题，力所能及地提供精确，高质量的回答。"
chinese_system2 = "你需要很敷衍地回答用户的问题，给出不精确的，低质量的答复。"
# Vietnamese prompt pair with the same serious / careless meaning.
# NOTE(review): the variable names are misspelled ("vitnamese"); they are kept
# as-is here because the generation loop refers to them by these names.
vitnamese_system1 = "Bạn cần trả lời các câu hỏi của người dùng một cách nghiêm túc, cung cấp câu trả lời chính xác và chất lượng cao trong khả năng của mình."
vitnamese_system2 = "Bạn cần trả lời câu hỏi của người dùng một cách hời hợt, đưa ra các câu trả lời không chính xác và chất lượng thấp."

In [None]:
def _build_preference_record(instruction, language, serious_system, careless_system):
    """Create one pairwise-preference record for ``instruction`` in ``language``.

    The instruction is translated first, then answered twice: once under the
    "serious" system prompt (the chosen response) and once under the
    "careless" system prompt (the rejected response). Which of the two ends up
    in the ``response_a`` slot is decided by a coin flip.

    Args:
        instruction: Source instruction text to translate.
        language: Target language name; used in the translation prompt and
            stored in the record's ``language`` field.
        serious_system: System prompt used to obtain the chosen response.
        careless_system: System prompt used to obtain the rejected response.

    Returns:
        A dict with the keys prompt, response_a, response_b, model_a, model_b,
        winner and language, or ``None`` if any of the three
        ``get_response_openai_prompt`` calls returned ``None`` (presumably a
        failed request), in which case the sample is skipped.
    """
    translated_instruction = get_response_openai_prompt(
        model=model,
        system=translation_system,
        prompt=f"Translate the following instruction to {language}: {instruction}.\nYou should only return the translation.",
    )
    if translated_instruction is None:
        return None

    chosen_response = get_response_openai_prompt(
        model=model,
        system=serious_system,
        prompt=translated_instruction,
        max_tokens=512,
    )
    if chosen_response is None:
        return None

    rejected_response = get_response_openai_prompt(
        model=model,
        system=careless_system,
        prompt=translated_instruction,
        max_tokens=512,
    )
    if rejected_response is None:
        return None

    # Randomise the slot of the better answer so that position alone does not
    # reveal the winner.
    chosen_first = random.random() < 0.5
    return {
        "prompt": translated_instruction,
        "response_a": chosen_response if chosen_first else rejected_response,
        "response_b": rejected_response if chosen_first else chosen_response,
        "model_a": model,
        "model_b": model,
        "winner": "model_a" if chosen_first else "model_b",
        "language": language,
    }


# (language label, serious prompt, careless prompt) for each third of the
# sample: first third Russian, second third Chinese, remainder Vietnamese.
language_plan = [
    ("Russian", russian_system1, russian_system2),
    ("Chinese", chinese_system1, chinese_system2),
    ("Vietnamese", vitnamese_system1, vitnamese_system2),
]
first_cut = len(df) // 3
second_cut = 2 * len(df) // 3

# Collect plain dicts and build the frame once at the end; concatenating a
# one-row DataFrame per iteration copies the whole frame every time.
records = []
for position, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
    # Use the row's position rather than its index label so the split does not
    # depend on the index having been reset beforehand.
    if position < first_cut:
        language, serious_system, careless_system = language_plan[0]
    elif position < second_cut:
        language, serious_system, careless_system = language_plan[1]
    else:
        language, serious_system, careless_system = language_plan[2]

    record = _build_preference_record(
        row["instruction"], language, serious_system, careless_system
    )
    if record is not None:
        records.append(record)

# Rebuilding the frame from scratch keeps this cell idempotent: re-running it
# does not append duplicates to rows produced by an earlier run.
new_data = pd.DataFrame(
    records,
    columns=["prompt", "response_a", "response_b", "model_a", "model_b", "winner", "language"],
)

In [None]:
# The generation loop skips a sample whenever an API call returns None, so
# new_data can hold fewer rows than there are pre-generated IDs. Attach only as
# many IDs as there are rows; assigning the full list would raise a
# length-mismatch ValueError after a single skipped sample.
new_data["id"] = ids[: len(new_data)]

# Reorder the columns into the final output layout with a clean 0..n-1 index.
df = new_data[
    ["id", "prompt", "response_a", "response_b", "winner", "model_a", "model_b", "language"]
].reset_index(drop=True)
print(df.shape)
df.head()

In [12]:
# Persist the generated preference pairs next to the notebook, without the
# DataFrame index.
output_path = "./self-generation-zh-ru-vi-2.parquet"
df.to_parquet(output_path, index=False)