In [1]:
import asyncio
from typing import Any, Dict, List, TypedDict
import pandas as pd
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, START, END
from langchain.text_splitter import RecursiveCharacterTextSplitter

from prompt_templates import text_translation_template
from utilities import batch_list

load_dotenv() # Loading the API key from a .env file.

True

In [2]:
# gpt-5 was chosen because cost and latency matter little for a text of this size.
llm = ChatOpenAI(model="gpt-5")

### Text Processing

- The book is split into ca. 40 chunks using recursive splitting.
- The paragraphs of the text are relatively short and the chunk size chosen preserves them.

In [3]:
# Decode the source explicitly as UTF-8 (matching how the translation is written
# out later) so the result does not depend on the platform's default encoding.
with open("text_clean.txt", "r", encoding="utf-8") as file:
    text = file.read()

text_splitter = RecursiveCharacterTextSplitter(
    # Separators are listed from coarsest (blank lines) to finest (sentence end).
    # With this setup the text splits consistently into 1-2 paragraphs per chunk,
    # always after a completed sentence.
    separators=["\n\n\n", "\n\n", "\n", "."],
    chunk_size=2_500,      # upper bound per chunk, measured by length_function
    chunk_overlap=0,       # chunks must not repeat text, each is translated once
    length_function=len,   # chunk size is counted in characters
)

chunks = text_splitter.split_text(text)
print(len(chunks))

39


### Constructing Names Dictionary

- After the name translations have been audited, they are compiled into a dictionary.
- Each entry is structured as \<name in English\> → \<name in Bulgarian\>.
- The goal is to inject this information into the text_translation_template prompt to aid the translation.
- The format aims to be human readable.

In [4]:
# Audited spreadsheet of proper names and their agreed translations.
names_df = pd.read_excel("names_translation_audited.xlsx")

# Map every English name to its final translation, in spreadsheet row order.
names_dict = dict(zip(names_df["Name in English"], names_df["Final Translation"]))

# Render the mapping as one "<English> → <translation>" entry per line; this
# human-readable string is what gets injected into the translation prompt.
names_dictionary = "".join(
    f"{english} → {translated}\n" for english, translated in names_dict.items()
)

print(f"Dictionary examples:\n\n{names_dictionary[:500]}")

Dictionary examples:

_Alcis_ → Алки
_Aurinia_ → Ауриния
_Boiemum_ → Боемум
_Castum_ → Кастум
_Germans_ → Германи
_Isis_ → Изида
_Mannus_ → Манус
_Tuisto_ → Туисто
_Veleda_ → Веледа
Abnoba → Абноба
AEstyan nations → естийски народи
Africa → Африка
Agricola → Агрикола
Agrippinensians → агрипи
Alcis → Алкис
Angles → англи
Angrivarians → ангриварии
Araviscans → аравискани
Arians → ари
Arsacides → Аршакиди
Asciburgium → Аскибургий
Asia → Азия
Augustus → Август
Aurinia → Ауриния
Aviones → авиони
Basstarnians → бастарни
Ba


### Chunk Translation

 - Each chunk passes through chunk_translation_app to be translated into Bulgarian.
 - global_translation_app is used for parallel execution.

In [5]:
class ChunkTranslation(TypedDict):
    """State carried through the per-chunk translation graph."""

    # Source text of the chunk; read by translate_chunk.
    original_text: str
    # Model output for the chunk; written by translate_chunk.
    translated_text: str

async def translate_chunk(state: ChunkTranslation) -> Dict[str, str]:
    """Translate a single chunk with the LLM.

    Fills ``text_translation_template`` with the module-level
    ``names_dictionary`` and the chunk's ``original_text``, sends the
    resulting messages to ``llm`` and returns the reply's content.

    Args:
        state: Graph state; only ``original_text`` is read.

    Returns:
        A partial state update of the form ``{"translated_text": <reply content>}``.
    """
    messages = text_translation_template.format_messages(
        names_dictionary=names_dictionary,
        text_chunk=state["original_text"],
    )
    response = await llm.ainvoke(messages)
    return {"translated_text": response.content}

# Single-node graph over ChunkTranslation state: START -> translate_chunk -> END.
chunk_translation_graph = StateGraph(ChunkTranslation)
chunk_translation_graph.add_node("translate_chunk", translate_chunk)

# Wire the only node between the graph's entry and exit points.
chunk_translation_graph.add_edge(START, "translate_chunk")
chunk_translation_graph.add_edge("translate_chunk", END)

# Compiled, invocable form of the graph; used by the batch pipeline below.
chunk_translation_app = chunk_translation_graph.compile()

In [6]:
class GlobalTranslation(TypedDict):
    """State carried through the graph that translates all chunks."""

    # Chunk texts to translate; read by run_chunk_pipeline.
    texts: List[str]
    # Final per-chunk states returned by chunk_translation_app, one per input text.
    chunk_results: List[ChunkTranslation]

async def run_chunk_pipeline(state: GlobalTranslation, batch_size: int = 10) -> Dict[str, list]:
    """Translate every text in ``state["texts"]``, one batch at a time.

    The texts are grouped with ``batch_list`` (presumably into groups of at
    most ``batch_size`` items -- verify against ``utilities.batch_list``).
    All chunks of a batch are translated concurrently; batches themselves run
    one after another, which caps the number of in-flight LLM requests.

    Args:
        state: Graph state; only ``texts`` is read.
        batch_size: Group size handed to ``batch_list``.

    Returns:
        A partial state update ``{"chunk_results": [...]}`` holding the final
        ``ChunkTranslation`` state of every chunk, in processing order.

    Raises:
        Any exception raised by a chunk translation propagates from
        ``asyncio.gather`` and aborts the whole run.
    """
    all_results = []
    for batch in batch_list(state["texts"], batch_size):
        # Only `original_text` is passed: it is the sole input key declared by
        # ChunkTranslation, and translate_chunk reads the names dictionary
        # from module scope rather than from the graph state.
        tasks = [chunk_translation_app.ainvoke({"original_text": t}) for t in batch]
        # gather returns results in the same order as `tasks`.
        results = await asyncio.gather(*tasks)
        all_results.extend(results)
    return {"chunk_results": all_results}

async def process_chunks(state: GlobalTranslation) -> Dict[str, list]:
    """Graph node that runs the chunk pipeline with a fixed batch size of 10.

    Args:
        state: Graph state, forwarded unchanged to ``run_chunk_pipeline``.

    Returns:
        The state update produced by ``run_chunk_pipeline``.
    """
    chunk_update = await run_chunk_pipeline(state, batch_size=10)
    return chunk_update

# Build the outer graph over GlobalTranslation state: START -> process_chunks -> END.
global_translation_graph = StateGraph(GlobalTranslation)
global_translation_graph.add_node("process_chunks", process_chunks)

# Wire the only node between the graph's entry and exit points.
global_translation_graph.add_edge(START, "process_chunks")
global_translation_graph.add_edge("process_chunks", END)

# Compiled, invocable form of the graph.
global_translation_app = global_translation_graph.compile()

In [7]:
# Run the full pipeline over all chunks. The resulting state is read below via
# its "chunk_results" key. Top-level `await` works in IPython/Jupyter, but not
# in a plain Python script.
translation = await global_translation_app.ainvoke({"texts":chunks})

In [8]:
# Each translated chunk is stored next to its source text so that the pair can
# be reviewed and edited side by side.

def _render_paragraph(index: int, chunk: dict) -> str:
    """Format one chunk result as a numbered original/translation section."""
    return f"""Paragraph #{index}

Original:

{chunk["original_text"]}

Translation:

{chunk["translated_text"]}

==================================================
"""

paragraphs = [
    _render_paragraph(index, chunk)
    for index, chunk in enumerate(translation["chunk_results"])
]

# Sections are separated by one extra newline in the output file.
text = "\n".join(paragraphs)

with open("raw_translation.txt", "w", encoding="utf-8") as file:
    file.write(text)