In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
from utils import get_response_openai_prompt
import yaml
import pandas as pd
import os
import random
import secrets
import string

In [2]:
def generate_unique_strings(count, length):
    """Generate ``count`` distinct random alphanumeric strings.

    Each string is ``length`` characters long and drawn from ASCII letters
    and digits using the ``secrets`` module.

    Args:
        count: Number of distinct strings to produce. Values <= 0 yield ``[]``.
        length: Number of characters in each string.

    Returns:
        A list of ``count`` unique strings, in arbitrary order.

    Raises:
        ValueError: If ``count`` is larger than the number of distinct strings
            that exist for this ``length``; without this check the sampling
            loop below would never terminate.
    """
    characters = string.ascii_letters + string.digits

    # A non-positive length can only ever produce the empty string.
    capacity = len(characters) ** max(length, 0)
    if count > capacity:
        raise ValueError(
            f"Cannot generate {count} unique strings of length {length}: "
            f"only {capacity} distinct values exist."
        )

    unique_strings = set()
    # Keep sampling until enough distinct values exist; the set absorbs collisions.
    while len(unique_strings) < count:
        random_string = "".join(secrets.choice(characters) for _ in range(length))
        unique_strings.add(random_string)
    return list(unique_strings)

In [3]:
def load_config(config_file):
    """Parse the YAML file at ``config_file`` and return its contents.

    The file is opened in text read mode and parsed with ``yaml.safe_load``.
    """
    with open(config_file, "r") as handle:
        return yaml.safe_load(handle)

# config.yaml is looked up one directory above the current working directory.
config_path = os.path.join(os.getcwd(), "..", "config.yaml")
# Use the load_config helper rather than repeating its open/parse logic inline.
config = load_config(config_path)

# Hugging Face access token, read from the `huggingface.token` config entry.
hf_token = config["huggingface"]["token"]

In [4]:
# Fetch the OpenO1-SFT dataset with the configured token, then turn its
# "train" split into a pandas DataFrame for the steps that follow.
ds = load_dataset("O1-OPEN/OpenO1-SFT", token=hf_token)
train_split = ds["train"]
df = train_split.to_pandas()
df.head()

Unnamed: 0,instruction,output
0,Consider a regular octagon. How many different...,"<Thought>\nAlright, I need to figure out how m..."
1,Create a Python class that encodes a given num...,"<Thought>\nAlright, I need to create a Python ..."
2,Write a Python function named `is_valid_binary...,"<Thought>\nAlright, I need to write a Python f..."
3,Let X be a subset of a Banach space Y with the...,A nice result in functional analysis!\n\nThe s...
4,Identify the correct option for document fetch...,"<Thought>\nAlright, the task at hand is to ide..."


In [5]:
# Draw a fixed-seed sample of 500 rows and renumber them from 0, so the row
# index can be compared against fractions of len(df) later on.
df = df.sample(n=500, random_state=3407).reset_index(drop=True)
print(df.shape)
df.head()

(500, 2)


Unnamed: 0,instruction,output
0,"Using the jQuery library, write a snippet to a...","<Thought>\nAlright, I need to write a jQuery s..."
1,Suppose your friend has entered a chess tourna...,Here is a MATLAB code that calculates the prob...
2,"On a recent day in Oregon, at 9 a.m. the tempe...","<Thought>\nAlright, let's dive into this probl..."
3,Derivation on Hom Functor for Commutative Diag...,A classic result in homological algebra!\n\nLe...
4,I am making a script for automated tests. But ...,"<Thought>\nAlright, I need to help the user cr..."


In [6]:
# One 11-character random identifier per sampled row.
ids = generate_unique_strings(count=len(df), length=11)
print(f"Generated {len(ids)} Unique IDs.")

Generated 500 Unique IDs.


In [7]:
# Empty frame that fixes the schema of the generated preference-pair rows.
pair_columns = [
    "prompt",
    "response_a",
    "response_b",
    "model_a",
    "model_b",
    "winner",
    "language",
]
new_data = pd.DataFrame(columns=pair_columns)

In [8]:
# Model name passed as `model=` to every get_response_openai_prompt call in
# this notebook and recorded in the model_a / model_b columns of each row.
model = "gpt-4o-mini"

In [9]:
# System prompt used for the instruction-translation calls.
translation_system = "You are a great translator. You need to provide the most accurate and high-quality translations possible."
# For each language, `*_system1` asks for a serious, accurate, high-quality
# answer and `*_system2` asks for a superficial, inaccurate, low-quality one.
# Russian: (1) "answer the user's questions seriously, giving the most accurate
# and high-quality answers"; (2) "answer superficially, giving inaccurate and
# low-quality answers".
russian_system1 = "Вам нужно серьезно отвечать на вопросы пользователя, предоставляя максимально точные и качественные ответы."
russian_system2 = "Вам нужно поверхностно отвечать на вопросы пользователя, предоставляя неточные и низкокачественные ответы."
# Chinese: (1) "reply to the user's question carefully, providing precise,
# high-quality answers as far as you are able"; (2) "answer perfunctorily,
# giving imprecise, low-quality replies".
chinese_system1 = "你需要认真回复用户的问题，力所能及地提供精确，高质量的回答。"
chinese_system2 = "你需要很敷衍地回答用户的问题，给出不精确的，低质量的答复。"
# Vietnamese: (1) "answer the user's question seriously, providing the most
# accurate and highest-quality answer possible"; (2) "answer superficially,
# giving inaccurate and low-quality answers".
# NOTE: the variable names are spelled "vitnamese"; they are kept as-is here
# because other cells reference them by this name.
vitnamese_system1 = "Bạn cần nghiêm túc trả lời câu hỏi của người dùng, cung cấp câu trả lời chính xác và chất lượng cao nhất có thể."
vitnamese_system2 = "Bạn cần trả lời câu hỏi của người dùng một cách hời hợt, đưa ra các câu trả lời không chính xác và chất lượng thấp."

In [10]:
# Column order of the generated preference-pair table.
PAIR_COLUMNS = ["prompt", "response_a", "response_b", "model_a", "model_b", "winner", "language"]

# (language name, system prompt for the chosen answer, system prompt for the
# rejected answer). The sampled rows are split into three consecutive slices,
# one per entry, in this order. The language name is used both in the
# translation prompt and as the stored `language` label, so it is spelled
# "Vietnamese" here (the earlier "Vitnamese" spelling was a typo).
LANGUAGE_SPECS = [
    ("Russian", russian_system1, russian_system2),
    ("Chinese", chinese_system1, chinese_system2),
    ("Vietnamese", vitnamese_system1, vitnamese_system2),
]


def _spec_for_row(idx, total):
    """Return the LANGUAGE_SPECS entry for row ``idx`` out of ``total`` rows.

    Rows with ``idx < total // 3`` use the first language, rows with
    ``idx < 2 * total // 3`` the second, and all remaining rows the third.
    """
    if idx < total // 3:
        return LANGUAGE_SPECS[0]
    if idx < 2 * total // 3:
        return LANGUAGE_SPECS[1]
    return LANGUAGE_SPECS[2]


def _build_pair(instruction, language, chosen_system, rejected_system):
    """Translate one instruction and produce a chosen/rejected answer pair.

    Args:
        instruction: Source instruction text to translate.
        language: Target language name, inserted into the translation prompt
            and stored in the ``language`` field.
        chosen_system: System prompt used to produce the preferred answer.
        rejected_system: System prompt used to produce the dispreferred answer.

    Returns:
        A dict keyed by PAIR_COLUMNS, or ``None`` if any of the three model
        calls returned ``None``.
    """
    translated_instruction = get_response_openai_prompt(
        model=model,
        system=translation_system,
        prompt=f"Translate the following instruction to {language}: {instruction}.\nYou should only return the translation."
    )
    if translated_instruction is None:
        return None

    chosen_response = get_response_openai_prompt(
        model=model,
        system=chosen_system,
        prompt=translated_instruction,
        max_tokens=512
    )
    if chosen_response is None:
        return None

    rejected_response = get_response_openai_prompt(
        model=model,
        system=rejected_system,
        prompt=translated_instruction,
        max_tokens=512
    )
    if rejected_response is None:
        return None

    # Randomize which slot holds the chosen answer so the position of the
    # winner carries no signal.
    chosen_first = random.random() < 0.5
    return {
        "prompt": translated_instruction,
        "response_a": chosen_response if chosen_first else rejected_response,
        "response_b": rejected_response if chosen_first else chosen_response,
        "model_a": model,
        "model_b": model,
        "winner": "model_a" if chosen_first else "model_b",
        "language": language,
    }


# Collect plain dicts and build the frame once at the end; concatenating a
# one-row frame per iteration copies the whole table every time.
records = []
skipped = 0
for idx, row in tqdm(df.iterrows(), total=len(df)):
    language, chosen_system, rejected_system = _spec_for_row(idx, len(df))
    record = _build_pair(row["instruction"], language, chosen_system, rejected_system)
    if record is None:
        # A model call returned None: drop the row, but keep count so the
        # shortfall is visible instead of silent.
        skipped += 1
        continue
    records.append(record)

# Rebuilt from scratch on every run, so re-executing this cell does not
# append duplicate rows to a previously filled frame.
new_data = pd.DataFrame(records, columns=PAIR_COLUMNS)
if skipped:
    print(f"Skipped {skipped} of {len(df)} rows because a model call returned None.")

100%|██████████| 500/500 [2:00:47<00:00, 14.49s/it]  


In [11]:
# `ids` was generated with one entry per sampled row, but the generation loop
# skips rows whose model calls returned None, so `new_data` can be shorter
# than `ids`. Assigning the full list would then fail with a length-mismatch
# ValueError, so only as many IDs as there are generated rows are used.
if len(new_data) > len(ids):
    raise ValueError(
        f"Need {len(new_data)} IDs but only {len(ids)} were generated."
    )
new_data["id"] = ids[: len(new_data)]

# Selecting with a column list yields a new frame with `id` first and the
# remaining columns in their final order.
df = new_data[
    ["id", "prompt", "response_a", "response_b", "winner", "model_a", "model_b", "language"]
].reset_index(drop=True)
print(df.shape)
df.head()

(500, 8)


Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,tKegigmKje9,"Используя библиотеку jQuery, напишите фрагмент...","Вот пример кода на jQuery, который делает то, ...","Вот пример кода на jQuery, который выполняет в...",model_b,gpt-4o-mini,gpt-4o-mini,Russian
1,JJOXqqi6kk8,"Предположим, что ваш друг участвует в шахматно...",Конечно! Чтобы рассчитать вероятность победы в...,"Конечно! Вот простой пример кода на MATLAB, ко...",model_a,gpt-4o-mini,gpt-4o-mini,Russian
2,iTaiVE8eXQz,В недавний день в Орегоне в 9 утра температура...,"Чтобы найти, насколько теплее было в 2 часа дн...",Если в 9 утра температура была 50 градусов по ...,model_a,gpt-4o-mini,gpt-4o-mini,Russian
3,cxlSWSlhf7Z,Производная на функторе Хом для коммутативных ...,Ваша просьба касается довольно специфической т...,"Чтобы понять, почему коммутативность диаграммы...",model_b,gpt-4o-mini,gpt-4o-mini,Russian
4,bNxFy9H1WHJ,Я пишу скрипт для автоматизированных тестов. Н...,"Конечно! Вот простой bash-скрипт, который пров...",Конечно! Вы можете использовать следующий bash...,model_b,gpt-4o-mini,gpt-4o-mini,Russian


In [12]:
# Optional spot-check of one generated pair (left disabled):
# df["response_a"][1], df["response_b"][1]
# Write the final table to a Parquet file in the working directory, without
# the DataFrame index.
df.to_parquet("./self-generation-zh-ru-vi.parquet", index=False)