# Dataset Translation Example (using HuggingFace Datasets as a template)

## Initialize TranslationDataGenerator

In [None]:
# Standard-library typing helper used by the Settings annotations
from typing import Optional

# (Optional) Import BaseSettings from pydantic for configuration management
from pydantic.v1 import BaseSettings

# (Optional) Define a Settings class to store model and API configuration
class Settings(BaseSettings):
    """Model and endpoint configuration used throughout this example.

    Edit the defaults below to match your environment.

    NOTE(review): because this derives from pydantic's ``BaseSettings``, the
    values can presumably also be supplied through environment variables
    (e.g. ``HF_TOKEN``) instead of editing the notebook -- confirm against the
    pydantic settings documentation for the installed version.
    """

    # (Change this) HuggingFace token for authentication; None means "no token".
    # Annotated Optional because the default is None, not a string.
    hf_token: Optional[str] = None
    # (Change this) HuggingFace model whose tokenizer is used
    hf_model: str = "openai/gpt-oss-20b"
    # (Change this) Local model name
    model_name: str = "gpt-oss:latest"
    # (Change this) Base URL for local API (P.S: Ollama supports the OpenAI API format.)
    base_url: str = "http://localhost:11434/v1"

# Instantiate the Settings object to access configuration
setting = Settings()

# Import the TranslationDataGenerator for synthetic translation tasks
from nemo_curator.synthetic.translate import TranslationDataGenerator

# Build the TranslationDataGenerator; one local model serves every pipeline stage
generator = TranslationDataGenerator(
    # --- Endpoint ---
    base_url=setting.base_url,               # API endpoint
    api_key="",                              # API key (empty if not required)
    # --- Models used for each stage ---
    init_translate_model=setting.model_name, # Initial translation
    reflection_model=setting.model_name,     # Reflection on the first draft
    improvement_model=setting.model_name,    # Improved translation
    # --- Tokenizer ---
    hf_tokenizer=setting.hf_model,           # Tokenizer model from HuggingFace
    hf_token=setting.hf_token or None,       # Empty/unset token is passed as None
    # --- Sampling ---
    temperature=1.0,                         # Sampling temperature
    top_p=1.0,                               # Nucleus sampling parameter
    max_tokens=24576,                        # Token limit handed to the generator
    stop=["<|return|>","<|endoftext|>", "<|call|>"],  # Stop token sequences
    # --- Translation task ---
    max_token_per_chunk=5000,                # Max tokens per chunk for translation
    source_lang="English",                   # Source language
    target_lang="Traditional Chinese",       # Target language
    country="Taiwan",                        # (Optional) Country context
)

## Load & Translate Sample Dataset

In [2]:
# Import required libraries for data processing and translation
import pandas as pd
from datasets import load_dataset
import re
import json

# Load the sample dataset from HuggingFace (an empty token is treated as "no token")
ds = load_dataset(
    "TsukiOwO/Translation-Sample-Dataset",
    "default",
    token=setting.hf_token or None,
)["train"]
messages_zh_all = []    # One list of translated messages per processed example
max_examples = 1        # (Optional) Limit the number of examples processed
total_examples = min(max_examples, len(ds))  # Denominator for the progress message


def _translate(text, **generate_kwargs):
    """Translate ``text`` with the notebook-level ``generator``.

    Args:
        text: Source text to translate.
        **generate_kwargs: Extra keyword arguments forwarded to
            ``generator.generate`` (e.g. ``debug=True``).

    Returns:
        The value of ``generator.parse_response`` for the generated output, or
        ``""`` when ``text`` is empty/whitespace-only (the model is not called).
    """
    if not text.strip():
        return ""
    return generator.parse_response(generator.generate(text, **generate_kwargs))


# Iterate through the examples and translate their messages
for idx, example in enumerate(ds):
    # (Optional) Stop once the configured number of examples has been processed
    if idx >= max_examples:
        break

    print(f"Processing example {idx + 1}/{total_examples}...")

    messages_zh = []  # Translated messages for this example, in original order

    for msg in example['messages']:
        role = msg.get('role')              # 'user' or 'assistant'; other roles are skipped
        content = msg.get('content') or ''  # Treat a missing or null content as empty text

        if role == 'user':
            # (Optional) debug=True: Can get the `extract_content` step information
            translated = _translate(content, debug=True)
            messages_zh.append({"role": role, "content": translated})

        elif role == 'assistant':
            # Translate the reasoning inside <think>...</think>, if present
            think_match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
            think_translated = _translate(think_match.group(1).strip()) if think_match else ""

            # Translate the remaining content with every <think> block removed
            result_text = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()
            result_translated = _translate(result_text)

            # Merge translated <think> and non-<think> content
            combined_content = f"<think>\n{think_translated}</think>\n\n{result_translated}"
            messages_zh.append({"role": "assistant", "content": combined_content})

    # Record the conversation exactly once, after all of its messages were handled.
    # Appending inside the assistant branch would duplicate multi-turn conversations
    # and drop conversations that contain no assistant message.
    messages_zh_all.append(messages_zh)

# Create a DataFrame to store all translated message sets (one row per example)
df = pd.DataFrame({"messages_zh": messages_zh_all})

Processing example 1/1...
EXTRACTED: 3. (6 分) 一家建設公司正在建造隧道。當隧道完成了三分之一時，他們開始使用新設備，該設備將施工速度提升 20%，並將工作時間削減至原來的 80%。最終完成隧道共花費 185 天。若不使用新設備，繼續以原速施工，完成隧道仍需 ______ 天。


## Show The Result

In [3]:
# Pretty-print the first example's translated messages as JSON.
# ensure_ascii=False keeps the Chinese text readable instead of \u escapes.
first_example_messages = df['messages_zh'].iloc[0]
formatted_messages = json.dumps(first_example_messages, ensure_ascii=False, indent=2)
print(formatted_messages)

[
  {
    "role": "user",
    "content": "3. (6 分) 一家建設公司正在建造隧道。當隧道完成了三分之一時，他們開始使用新設備，該設備將施工速度提升 20%，並將工作時間削減至原來的 80%。最終完成隧道共花費 185 天。若不使用新設備，繼續以原速施工，完成隧道仍需 ______ 天。"
  },
  {
    "role": "assistant",
    "content": "<think>\n先來分析這個問題。\n\n一家建設公司正在建造一條隧道。完成隧道的三分之一後，他們改用新設備。該設備使施工速度提高 20%，但每日工作時數被削減至原來的 80%。使用新設備後，總共耗時 185 天。請問若不使用新設備，改以原始速度繼續建造，總共會耗時多少天？\n\n設全長為 1 個隧道。  \n原始施工速率為 \\(R\\)（隧道/天），若不變則需完成整條隧道的時間為  \n\\[\nT=\\frac{1}{R}\\quad(\\text{天}) .\n\\]\n\n**第一階段**  \n以原速率完成三分之一，所需時間  \n\\[\nt_1=\\frac{\\tfrac13}{R}=\\frac{1}{3R}\\quad(\\text{天}) .\n\\]\n\n**第二階段**  \n剩餘三分之二按新設備施工。新設備使速率提升 20%，即從 \\(R\\) 變為 \\(1.2R\\)。  \n原本每日工作 \\(H\\) 小時，提升後速率為每小時 \\(\\tfrac{R}{H}\\)。提高 20% 之後為 \\(1.2\\tfrac{R}{H}\\)。  \n但每日工作時數下降至 \\(0.8H\\)，因此每日新速率為  \n\\[\n1.2\\tfrac{R}{H}\\times 0.8H = 0.96R \\quad(\\text{隧道/天}) .\n\\]\n此階段所需時間  \n\\[\nt_2=\\frac{\\tfrac23}{0.96R}\\quad(\\text{天}) .\n\\]\n\n已知總時間為 185 天，所以  \n\\[\nt_1+t_2 = \\frac{1}{3R}+\\frac{\\tfrac23}{0.96R}=185 .\n\\]\n\n化簡分子：  \n\\[\n\\fr