# Back Translation

This notebook creates parallel data by translating sentences from target languages to English using OpenAI's Batch API. It generates batch query files, submits them for processing, and formats the results into parallel corpus files.

## Setup

### Imports

In [None]:
import yaml
import os
import json
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv

### Configuration and Paths

In [None]:
# Project root is the parent of the notebook's working directory; the
# configuration file and every data directory are resolved against it.
project_root = Path.cwd().parent

# Load configuration. The encoding is pinned so non-ASCII text in the YAML
# (for example in the system prompt) is decoded identically on every
# platform instead of depending on the locale's default encoding.
with open(project_root / "config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Set up project paths (each config entry is joined onto the project root)
SENTENCES_DIR = project_root / config["SENTENCES_DIR"]
API_QUERIES_DIR = project_root / config["API_QUERIES_DIR"]
PARALLEL_DATA_DIR = project_root / config["PARALLEL_DATA_DIR"]

# Translation configuration
# API_URL is the endpoint path written into the "url" field of every batch request.
API_URL = "/v1/chat/completions"
back_translation_config = config["data_processing"]["back_translation"]
MODEL = back_translation_config["model"]
SYSTEM_PROMPT = back_translation_config["system_prompt"]
MAX_TOKENS = back_translation_config["max_tokens"]

## Helper Functions

In [None]:
def create_translation_prompt(sentence: str, lang_name: str) -> str:
    """Build the user message that asks for an English translation.

    Args:
        sentence: Text to be translated; placed on its own line after the
            instruction.
        lang_name: Name of the language the sentence is written in; inserted
            verbatim into the instruction.

    Returns:
        The instruction line, a newline, then the sentence.
    """
    instruction = f"Translate the following {lang_name} sentence into English:"
    return f"{instruction}\n{sentence}"

## Step 1: Create Batch Query Files

Generate JSONL files with translation queries for the OpenAI Batch API.

In [None]:
# Build one JSONL batch-input file per configured language. Every line is a
# single chat-completion request whose custom_id ("<lang_code>_<index>")
# records the position of the sentence within the language's sentence file.

# The output directory may not exist on a fresh checkout.
API_QUERIES_DIR.mkdir(parents=True, exist_ok=True)

for lang_code, lang_config in config["LANGUAGES"].items():
    lang_sents_file = SENTENCES_DIR / f"{lang_code}_sentences.jsonl"
    lang_queries_file = API_QUERIES_DIR / f"{lang_code}_queries.jsonl"

    if lang_sents_file.exists():
        with open(lang_sents_file, "r", encoding="utf-8") as file:
            # Blank lines (e.g. a trailing newline) carry no record.
            lang_sents = [json.loads(line) for line in file if line.strip()]
    else:
        # One language without extracted sentences should not abort the rest.
        print(f"Skipping {lang_code}: {lang_sents_file} not found")
        lang_sents = []

    if not lang_sents:
        # The submission step decides what to upload by checking whether the
        # query file exists, so a file left over from an earlier run must not
        # survive when there is nothing to translate any more.
        if lang_queries_file.exists():
            lang_queries_file.unlink()
            print(f"Removed stale query file for {lang_code}")
        continue

    # Looked up once per language rather than once per sentence.
    lang_name = lang_config["name"]

    with open(lang_queries_file, "w", encoding="utf-8") as out_file:
        for idx, sent in enumerate(lang_sents):
            query = {
                "custom_id": f"{lang_code}_{idx}",
                "method": "POST",
                "url": API_URL,
                "body": {
                    "model": MODEL,
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {
                            "role": "user",
                            "content": create_translation_prompt(sent["text"], lang_name),
                        },
                    ],
                    "max_tokens": MAX_TOKENS,
                },
            }
            # ensure_ascii=False keeps the source text readable in the file.
            out_file.write(json.dumps(query, ensure_ascii=False) + "\n")

print("Batch query files created successfully!")

## Step 2: Submit Batch Jobs

Upload query files to OpenAI and submit batch translation jobs.

In [None]:
# The API key is read from the OPENAI_APIKEY environment variable;
# load_dotenv() is called first so the variable can come from a .env file.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_APIKEY"))

# Maps language code -> batch object returned by the API for that language.
batch_info = {}

for lang_code in config["LANGUAGES"]:
    lang_queries_file = API_QUERIES_DIR / f"{lang_code}_queries.jsonl"

    # No query file means there is nothing to translate for this language.
    if not lang_queries_file.exists():
        continue

    # Upload file. The context manager closes the handle even when the upload
    # raises, instead of leaving one open file per language behind.
    with open(lang_queries_file, "rb") as query_file:
        batch_input_file = client.files.create(file=query_file, purpose="batch")

    # Create batch job. The endpoint is taken from API_URL so it always agrees
    # with the "url" field written into the individual request lines.
    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint=API_URL,
        completion_window="24h",
        metadata={"description": f"backtranslation batch for {lang_code}"},
    )
    batch_info[lang_code] = batch
    print(f"Submitted batch for {lang_code}: {batch.id}")

print(f"\nSubmitted {len(batch_info)} batch jobs successfully!")

## Step 3: Check Batch Status

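Monitor the progress of submitted batch jobs. Re-run this cell until every batch reports `status=completed`: Step 4 reads the output file ID from the batch objects refreshed here, and that ID is only available once a batch has finished.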

In [None]:
# Fetch the current state of every submitted batch, store the refreshed
# object back into batch_info, and print one progress line per language.
for lang in list(batch_info):
    latest = client.batches.retrieve(batch_info[lang].id)
    batch_info[lang] = latest
    tally = latest.request_counts
    print(
        f"{lang}: status={latest.status}, completed={tally.completed}, "
        f"failed={tally.failed}, total={tally.total}"
    )

th: status=completed, completed=12, failed=0, total=12
et: status=completed, completed=105, failed=0, total=105


## Step 4: Retrieve Results and Create Parallel Data

Download completed translations and create parallel corpus files.

In [None]:
# Retrieve batch responses.
# Maps language code -> list of parsed result records (one per output line).
batch_responses = {}
for key, batch in batch_info.items():
    # A batch that has not finished (or produced no successful result) has no
    # output file; fetching it would fail, so report the batch and move on.
    if not batch.output_file_id:
        print(
            f"Skipping {key}: no output file available (status={batch.status}); "
            "re-run the status check once the batch has completed"
        )
        continue
    file_response = client.files.content(batch.output_file_id)
    batch_responses[key] = [
        json.loads(res) for res in file_response.text.split("\n") if res.strip()
    ]

# Create parallel data files
PARALLEL_DATA_DIR.mkdir(parents=True, exist_ok=True)

for lang_code in config["LANGUAGES"]:
    # Only languages with downloaded results can be processed; checking this
    # first avoids reading sentence files that are not needed.
    if lang_code not in batch_responses:
        continue

    lang_sents_file = SENTENCES_DIR / f"{lang_code}_sentences.jsonl"
    parallel_sents_file = PARALLEL_DATA_DIR / f"{lang_code}-en_data.jsonl"

    with open(lang_sents_file, "r", encoding="utf-8") as file:
        # Blank lines (e.g. a trailing newline) carry no record.
        lang_sents = [json.loads(line) for line in file if line.strip()]

    if not lang_sents:
        continue

    # Index translations by custom_id. The order of lines in a batch output
    # file is not guaranteed to match the order of the input file, so pairing
    # results with sentences by position could silently misalign the corpus.
    translations = {}
    for res in batch_responses[lang_code]:
        if res.get("error"):
            continue
        body = (res.get("response") or {}).get("body") or {}
        choices = body.get("choices")
        if not choices:
            # Request-level failure: no completion was produced.
            continue
        translations[res["custom_id"]] = choices[0]["message"]["content"]

    # custom_id was generated as "<lang_code>_<index>" from the sentence's
    # position in the sentence file, so the expected ids can be rebuilt here.
    expected_ids = [f"{lang_code}_{idx}" for idx in range(len(lang_sents))]
    missing_ids = [cid for cid in expected_ids if cid not in translations]

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if missing_ids or len(translations) != len(lang_sents):
        raise ValueError(
            f"Mismatch for {lang_code}: {len(translations)} translations vs "
            f"{len(lang_sents)} source sentences; "
            f"{len(missing_ids)} sentences without a translation "
            f"(first missing ids: {missing_ids[:5]})"
        )

    # Write parallel data: the original sentence is the target side and its
    # English back-translation is the source side.
    with open(parallel_sents_file, "w", encoding="utf-8") as outfile:
        for custom_id, target in zip(expected_ids, lang_sents):
            record = {
                "target_text": target["text"],
                "target_lang": lang_code,
                "source_text": translations[custom_id],
                "source_lang": "en",
                "doc_id": target["doc_id"],
                "sent_id": target["sent_id"],
            }
            outfile.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"Created parallel data for {lang_code}: {len(lang_sents)} sentence pairs")

print("\nParallel data creation complete!")