## This notebook contains work for the ecoSPECS challenge at European Summer of Code 2025.

### Using MarianMTModel, an NLP model from the Hugging Face model hub, I translated the trial datasets, which consist of nine (9) .docx files, into English. While I know this does not pertain to the given task, I did it in order to grasp the larger context of the project, and I found it to be a valuable step in understanding the intent behind each document. This deeper comprehension helped me appreciate the structure, language, and goals of the specification and documentation process, which in turn informed my approach to the actual challenge.

In [None]:
from transformers import MarianMTModel, MarianTokenizer
from docx.oxml.text.paragraph import CT_P
from docx.text.paragraph import Paragraph
from docx import Document
import spacy
import os
from copy import deepcopy
from docx.oxml.table import CT_Tbl
from docx.table import Table


# German spaCy pipeline; used below to split text into sentences.
nlp = spacy.load("de_core_news_sm")

# Pretrained German-to-English MarianMT checkpoint and its matching tokenizer.
model_name = 'Helsinki-NLP/opus-mt-de-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_german_to_english(text):
    """Translate a German text snippet to English with the MarianMT model.

    Args:
        text: German text to translate. Input longer than the tokenizer's
            maximum length is truncated before translation
            (``truncation=True``).

    Returns:
        The English translation as a string, or ``""`` when ``text`` is
        empty or contains only whitespace.
    """
    # Nothing to translate: skip the model call entirely instead of letting
    # it generate text from an input that holds no real content.
    if not text or (isinstance(text, str) and not text.strip()):
        return ""

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    # Only the first generated sequence is decoded and returned.
    return tokenizer.decode(translated[0], skip_special_tokens=True)

def translate_text_sentence_based(text):
    """Translate German text to English one sentence at a time.

    The text is segmented into sentences with the spaCy pipeline, every
    non-empty (stripped) sentence is translated on its own, and the
    translations are joined with single spaces.

    Args:
        text: German text to translate.

    Returns:
        The translated sentences joined into one string.
    """
    stripped_sentences = (span.text.strip() for span in nlp(text).sents)
    english_parts = [
        translate_german_to_english(piece)
        for piece in stripped_sentences
        if piece
    ]
    return " ".join(english_parts)

def translate_paragraph_with_formatting(paragraph, translated_doc):
    """Append a translated copy of ``paragraph`` to ``translated_doc``.

    Each run is translated separately so that its bold/italic/underline
    flags and its font name and size can be copied onto the matching run
    of the new paragraph.

    Args:
        paragraph: Source paragraph whose runs hold the German text.
        translated_doc: Document that receives the new paragraph.
    """
    new_para = translated_doc.add_paragraph()
    new_para.style = paragraph.style

    for run in paragraph.runs:
        original_text = run.text
        if not original_text:
            continue

        core_text = original_text.strip()
        if core_text:
            # Keep the whitespace at the edges of the run. Sentences are
            # often split across runs ("Das ist " + "wichtig"), so dropping
            # it would glue the translated pieces together.
            lead_len = len(original_text) - len(original_text.lstrip())
            leading = original_text[:lead_len]
            trailing = original_text[len(original_text.rstrip()):]
            new_text = leading + translate_text_sentence_based(core_text) + trailing
        else:
            # Whitespace-only run: nothing to translate, but it still
            # separates the neighbouring runs, so copy it unchanged.
            new_text = original_text

        new_run = new_para.add_run(new_text)

        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.name = run.font.name
        new_run.font.size = run.font.size

def translate_docx_file_preserve_styles(input_path, output_path):
    """Translate one .docx file into a new document and save it.

    The body of the source document is walked in order. Paragraphs that
    contain text are translated run by run, blank paragraphs are carried
    over as empty paragraphs, and tables are rebuilt with the same number
    of rows and columns with each cell's text translated. Any other body
    element is skipped.

    Args:
        input_path: Path of the source .docx file.
        output_path: Path the translated document is written to.
    """
    source_doc = Document(input_path)
    translated_doc = Document()

    # Remove the first paragraph of the freshly created document, if it has
    # one, so the output starts with translated content.
    existing_paragraphs = translated_doc.paragraphs
    if existing_paragraphs:
        translated_doc._element.body.remove(existing_paragraphs[0]._element)

    for node in source_doc.element.body:
        if isinstance(node, CT_P):
            source_para = Paragraph(node, source_doc)
            if not source_para.text.strip():
                translated_doc.add_paragraph("")
                continue
            translate_paragraph_with_formatting(source_para, translated_doc)
            continue

        if not isinstance(node, CT_Tbl):
            continue

        source_table = Table(node, source_doc)
        copy_table = translated_doc.add_table(
            rows=len(source_table.rows),
            cols=len(source_table.columns),
        )
        copy_table.style = source_table.style

        for row_idx, source_row in enumerate(source_table.rows):
            for col_idx, source_cell in enumerate(source_row.cells):
                cell_text = source_cell.text.strip()
                if cell_text:
                    english = translate_text_sentence_based(cell_text)
                else:
                    english = ""
                destination = copy_table.cell(row_idx, col_idx)

                # Empty the cell's existing paragraphs before writing the text.
                for cell_para in destination.paragraphs:
                    cell_para.clear()
                destination.text = english

    translated_doc.save(output_path)
    print(f"Translation completed! Saved to: {output_path}")


def translate_multiple_docx_files(input_folder, output_folder):
    """Translate every .docx file in ``input_folder`` into ``output_folder``.

    Each result is saved as ``<original name>_translated.docx``. The output
    folder is created when it does not exist yet. Only the top level of
    ``input_folder`` is scanned, and files are processed in sorted order.

    Args:
        input_folder: Directory holding the source .docx files.
        output_folder: Directory that receives the translated files.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for filename in sorted(os.listdir(input_folder)):
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".docx":
            continue
        # "~$name.docx" is the lock file Word keeps next to an open document;
        # it is not a real document and would abort the whole batch.
        if filename.startswith("~$"):
            continue

        input_path = os.path.join(input_folder, filename)
        # Skip directories (or other non-files) that merely end in ".docx".
        if not os.path.isfile(input_path):
            continue

        output_path = os.path.join(output_folder, f"{stem}_translated.docx")
        print(f"Translating: {filename}")
        translate_docx_file_preserve_styles(input_path, output_path)



In [None]:

# Translate every .docx file found in "data" and save the results under
# "data/translated_docs".
translate_multiple_docx_files("data", "data/translated_docs")
