In [None]:
import json
import yaml
from pathlib import Path

# Load the project configuration. The file is decoded as UTF-8 explicitly
# rather than with the platform's default encoding, so non-ASCII config values
# are read the same way on every machine.
with open("../config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# The config file is read from the parent of the current working directory;
# the configured directories are resolved against that same parent directory.
project_root = Path.cwd().parent
SENTENCES_DIR = project_root / config["SENTENCES_DIR"]
API_QUERIES_DIR = project_root / config["API_QUERIES_DIR"]

In [None]:
# Settings for the batch input files (simple per-sentence translation).
# The endpoint path is fixed here; the model, system prompt and token limit
# are read from the back-translation section of the loaded config.
API_URL = "/v1/chat/completions"

_back_translation_cfg = config["data_processing"]["back_translation"]
MODEL = _back_translation_cfg["model"]
SYS_PROMPT = _back_translation_cfg["system_prompt"]
MAX_TOKENS = _back_translation_cfg["max_tokens"]


def prompt(sentence: str, lang_name: str) -> str:
    """Build the user message asking for an English translation.

    Args:
        sentence: The sentence to be translated.
        lang_name: Name of the sentence's language, inserted into the
            instruction text.

    Returns:
        The instruction line, a newline, then the sentence unchanged.
    """
    user_message = f"Translate the following {lang_name} sentence into English:\n{sentence}"
    return user_message


# Simple per-sentence translation: for every configured language, read its
# sentence records and write one chat-completion request per sentence to a
# JSONL file (one JSON object per line).
# NOTE(review): the custom_id/method/url/body layout looks like the OpenAI
# Batch API input format -- confirm against the consumer of these files.
for lang_code, lang_config in config["LANGUAGE"].items():
    lang_sents_file = SENTENCES_DIR / f"{lang_code}_sentences.jsonl"
    lang_queries_file = API_QUERIES_DIR / f"{lang_code}_queries.jsonl"

    with open(lang_sents_file, "r", encoding="utf-8") as file:
        # Ignore blank lines: json.loads raises on an empty/whitespace-only
        # line, so a stray empty line would otherwise abort the whole run.
        lang_sents = [json.loads(line) for line in file if line.strip()]

    # Languages without any sentences get no query file at all.
    if not lang_sents:
        continue

    # Opening a file for writing does not create missing parent directories,
    # so make sure the output directory exists first.
    lang_queries_file.parent.mkdir(parents=True, exist_ok=True)

    lang_name = lang_config["name"]  # same for every sentence of this language
    with open(lang_queries_file, "w", encoding="utf-8") as out_file:
        for idx, sent in enumerate(lang_sents):
            # custom_id is the language code plus the sentence's position
            # among the loaded records.
            query_id = f"{lang_code}_{idx}"
            messages = [
                {"role": "system", "content": SYS_PROMPT},
                {"role": "user", "content": prompt(sent["text"], lang_name)},
            ]
            query = {
                "custom_id": query_id,
                "method": "POST",
                "url": API_URL,
                "body": {
                    "model": MODEL,
                    "messages": messages,
                    "max_tokens": MAX_TOKENS,
                },
            }
            # ensure_ascii=False keeps non-ASCII text readable in the output
            # instead of escaping it as \uXXXX sequences.
            out_file.write(json.dumps(query, ensure_ascii=False) + "\n")
