# Translating monastery and building labels and descriptions

The cells below only define settings, such as how many times a batch is retried after an exception and whether debugging output is printed.

Apart from entering your API key in `api_key` (it is empty by default), it's not necessary to change anything here, so you can also just execute these cells and continue further down to the code cells doing the actual translating (**but don't execute the translation cells before reading through the notes there!**)

In [None]:
import polars as pl
from openai import OpenAI
import os
import json
import re
import time
import traceback
from ratelimit import limits, sleep_and_retry

# --- API configuration ---
# OpenAI-compatible endpoint, model identifier and access key used for every request.
base_url = "https://chat-ai.academiccloud.de/v1"
model = "openai-gpt-oss-120b"
api_key = ""

# --- run settings ---
ONE_MINUTE = 60  # length of the rate-limit window, in seconds
STEP_SIZE = 100  # number of rows translated per batch
MAX_BATCH_ATTEMPTS = 10  # how often a batch is tried before moving on to the next one
MAX_ROW_ATTEMPTS = 5  # how often a single row is sent to the model before giving up
DEBUG = False  # set to True to also print retries and tracebacks

client = OpenAI(base_url=base_url, api_key=api_key)

# glossary of fixed translations: semicolon-separated, German term in the first
# column and its English translation in the second
provided_translations = pl.read_csv(
    "data/translation/translation_dictionary.csv",
    separator=";",
)

# System prompt sent with every request. It ends inside an opened code fence so
# that the glossary entries can be appended below before the fence is closed.
SYSTEM_PROMPT = """**Role:** You are a professional translator specializing in historical and religious terminology, with expertise in German–English translation. You will receive a *label* and a *description* each for a monastery and for the building complex of the monastery. Your task is to return the most accurate and context-appropriate English translation, following the rules below.

**Instructions:**
1. **Primary task:** Translate both *labels* and *descriptions* from German to English.  
2. **Format:** Return the translated labels and descriptions in the format: 'translated monastery label###translated monastery description###translated building label###translated building description'
3. **Translation list priority:** Use the provided glossary exactly when a German term matches an entry in the form `German term#English translation`. Never deviate from these glossary translations.  
4. **Do not translate**:
   - Proper names of locations (e.g., *Aura an der Saale*), unless an established English version exists.
   - Street names (e.g., *Brotstraße*).
5. **Grammar:** Insert 'of' in front of proper names if this is grammatically correct. 
6. **Dates:** Copy dates exactly without modification.
7. **Other terms:** Translate as you normally would, while maintaining the style and tone suitable for historical/religious texts.
8. **Building Complex labels and descriptions:** Always start your translations of the label and description of the building complex with 'Building complex' 
9. **Glossary:**  
   ```"""

# One `German#English` line per glossary row (first column = German term,
# second column = English translation), built in a single pass.
SYSTEM_PROMPT += "".join(
    f"\n   {entry[0]}#{entry[1]}" for entry in provided_translations.iter_rows()
)

# close the code fence opened at the end of the prompt template
SYSTEM_PROMPT += "\n   ```"

In [7]:
@sleep_and_retry
@limits(calls=20, period=ONE_MINUTE)
def get_translation(user_prompt: str):
    """Request one translation from the chat model and return the raw reply.

    The request consists of ``SYSTEM_PROMPT`` as the system message and
    ``user_prompt`` as the user message, sent with ``temperature=0``.
    The decorators restrict this function to 20 calls per ``ONE_MINUTE``
    seconds.

    Args:
        user_prompt: the text describing the labels and descriptions to translate.

    Returns:
        The chat completion converted with ``model_dump()``; the translated
        text is read by the caller from ``["choices"][0]["message"]["content"]``.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0,
    )
    return response.model_dump()

In [8]:
def translate_batch(labels_and_descriptions: pl.DataFrame, start: int, end: int):
    """Translate the rows ``start`` (inclusive) to ``end`` (exclusive) of a dataframe.

    For every row, the German monastery/building labels and descriptions are
    sent to ``get_translation``. A reply is accepted only if it has the form
    ``a###b###c###d`` and both building fields start with 'Building complex';
    otherwise the request is repeated, up to ``MAX_ROW_ATTEMPTS`` times.

    A row for which no attempt produced an acceptable reply keeps its German
    texts and gets null translations, and a warning is printed. This keeps all
    columns the same length, so one bad row no longer makes the dataframe
    construction fail and force a retry of the entire batch.

    Args:
        labels_and_descriptions: dataframe with the columns ``monastery_Lde``,
            ``monastery_Dde``, ``building_Lde`` and ``building_Dde``.
        start: index of the first row to translate.
        end: index one past the last row to translate.

    Returns:
        A dataframe with one row per translated input row and the string
        columns ``monastery_Lde``, ``monastery_Len``, ``monastery_Dde``,
        ``monastery_Den``, ``building_Lde``, ``building_Len``,
        ``building_Dde`` and ``building_Den``.
    """
    # compiled once instead of being looked up again for every reply
    reply_pattern = re.compile(r"([^#]+)###([^#]+)###([^#]+)###([^#]+)")

    column_names = (
        "monastery_Lde",
        "monastery_Len",
        "monastery_Dde",
        "monastery_Den",
        "building_Lde",
        "building_Len",
        "building_Dde",
        "building_Den",
    )
    columns = {name: [] for name in column_names}

    # `index` is the position inside this batch, `row_index` the position in
    # the whole dataframe
    for index, row_index in enumerate(range(start, end)):
        row = labels_and_descriptions.row(index=row_index, named=True)

        building_label = row["building_Lde"]
        building_description = row["building_Dde"]
        monastery_label = row["monastery_Lde"]
        monastery_description = row["monastery_Dde"]

        # NOTE(review): the quote opened before the monastery description is
        # never closed and the building fields are not quoted at all. The text
        # is left unchanged because it is sent to the model as-is; confirm
        # whether this is intended before touching it.
        user_prompt = f"monastery label: '{monastery_label}'\nmonastery description: '{monastery_description}\nbuilding label: {building_label}\nbuilding description: {building_description}"

        # stays all-null if every attempt is rejected
        translation = (None, None, None, None)
        for attempt in range(1, MAX_ROW_ATTEMPTS + 1):
            if attempt > 1 and DEBUG:
                print(f"attempt #{attempt} at row #{index}", flush=True)

            dump = get_translation(user_prompt)
            output = dump["choices"][0]["message"]["content"]

            # a reply without text (None or "") is handled like a malformed one
            matches = reply_pattern.search(output) if output else None
            if matches is None:
                if DEBUG:
                    print(
                        f"retrying because the output does not follow the specified format: '{output}'"
                    )
                continue

            monastery_len, monastery_den, building_len, building_den = matches.groups()
            if not (
                building_len.startswith("Building complex")
                and building_den.startswith("Building complex")
            ):
                if DEBUG:
                    print(
                        f"retrying because the label and/or description of the building complex does not start with 'Building complex'. Output: '{output}'"
                    )
                continue

            # the literal reply 'None' for the monastery description is stored as null
            if monastery_den == "None":
                monastery_den = None
            translation = (monastery_len, monastery_den, building_len, building_den)
            break
        else:
            # only reached when no attempt ended in `break`, i.e. none was accepted
            print(
                f"WARNING: row {row_index} could not be translated after {MAX_ROW_ATTEMPTS} attempts; its translations are left empty",
                flush=True,
            )

        columns["monastery_Lde"].append(monastery_label)
        columns["monastery_Dde"].append(monastery_description)
        columns["building_Lde"].append(building_label)
        columns["building_Dde"].append(building_description)
        columns["monastery_Len"].append(translation[0])
        columns["monastery_Den"].append(translation[1])
        columns["building_Len"].append(translation[2])
        columns["building_Den"].append(translation[3])

        if DEBUG:
            print(f"row {index} done", flush=True)

    batch_output = pl.DataFrame(
        columns,
        # NOTE: necessary because otherwise the concatenation might throw an
        # error (when the first dataframe has a null value for the monastery
        # description columns)
        schema={name: pl.String for name in column_names},
    )

    print(f"batch of rows {start} to {end} done!", flush=True)

    return batch_output


## actual translation part
The code cell below does the actual translating by calling the functions defined above.

### things to note
1. the model currently being used (`OpenAI GPT OSS 120B`) seems to take on average 13 seconds per translation of one row containing four items (the labels and descriptions of the monastery and of its building complex)
2. should you first/just want to **test the translating**, you can use the line below, where n is the size of the sample to test:
    ```
    to_translate = to_translate.sample(n=3)
    ```
3. the function below translates in batches, to make sure that should an error occur, not that much progress is lost.
4. the function reports whenever a batch is done <!--(and if the DEBUG constant is set [you can find it in the first code cell] a lot more is reported)-->
5. the size of a batch is set to 100 (the `STEP_SIZE` constant in the first code cell), so expect an output from the function around every 21 minutes
6. if you want to translate 6000 rows, expect this code cell to run for 21 hours
    - should you not wish to leave the code cell running overnight, but have a large number of rows to translate, you can also create slices of the dataframe (e.g. slice the dataframe into four) and translate these one after another with the line below. You can then later easily paste the outputs together into one file.
        ```
        to_translate = to_translate.slice(offset=0, length=1500)
        ```
    - should you decide to do that: do not forget to move or rename the result of the previous slice (`data/translation/translated.csv`) before the next slice is finished, because otherwise it will be overwritten by the next slice's result

In [9]:
# read data to translate
# (translate_batch reads the columns monastery_Lde, monastery_Dde, building_Lde and building_Dde)
to_translate = pl.read_csv("data/translation/to_translate.csv")

# NOTE: if you just want to test everything works and not translate the whole dataset uncomment this line
#to_translate = to_translate.sample(n=10)

# NOTE: if you want to translate the entire dataset, but it's huge and you don't want to leave the code cell running overnight use this line
# with a dataframe of 6000 rows split into four slices of 1500 rows, the parameters would need to be (offset=0, length=1500) for the first slice,
# (offset=1500, length=1500) for the second, (offset=3000, length=1500) for the third and (offset=4500, length=1500) for the fourth
#to_translate = to_translate.slice(offset=0, length=1500)

In [10]:
# Translate `to_translate` in batches of STEP_SIZE rows. A batch that raises an
# exception is retried up to MAX_BATCH_ATTEMPTS times; a batch that never
# succeeds is reported and skipped so that the remaining batches still run.
batch_outputs = []
failed_batches = []  # (start, end) of every batch that was given up on

for start in range(0, to_translate.height, STEP_SIZE):
    # the last batch may be shorter than STEP_SIZE
    end = min(start + STEP_SIZE, to_translate.height)

    for attempt in range(1, MAX_BATCH_ATTEMPTS + 1):
        if attempt > 1:
            print(f"attempt #{attempt} for batch of rows {start} to {end}", flush=True)

        try:
            batch_output = translate_batch(to_translate, start, end)
        except Exception as error:
            # always say that (and why) the attempt failed; the full traceback
            # is only printed in DEBUG mode
            print(f"batch of rows {start} to {end} failed: {error!r}", flush=True)
            if DEBUG:
                print(traceback.format_exc())
        else:
            batch_outputs.append(batch_output)
            break
    else:
        # only reached when no attempt ended in `break`, i.e. all of them failed
        failed_batches.append((start, end))
        print(
            f"giving up on batch of rows {start} to {end} after {MAX_BATCH_ATTEMPTS} attempts",
            flush=True,
        )

print("all batches done")

if failed_batches:
    print(
        f"WARNING: these batches (start, end) are missing from the output: {failed_batches}",
        flush=True,
    )

if batch_outputs:
    final_output = pl.concat(batch_outputs)
    final_output.write_csv("data/translation/translated.csv")
else:
    # pl.concat cannot combine an empty list, so there is nothing to write
    print("no batch was translated successfully; no output file was written", flush=True)


batch of rows 0 to 66 done!
all batches done
