# Imports

In [43]:
import os
import dotenv
dotenv.load_dotenv()
from typing import List, Tuple


#pptx transformation imports
import zipfile
import shutil
import warnings

# translator import
import xml.etree.ElementTree as ET
from openai import OpenAI
from pydantic import BaseModel
import json


# Transform PPTX to XML

In [44]:
class PPTXTransformer:
    """Unpack a PPTX archive into a directory of XML parts and zip it back up.

    Attributes:
        extract_path: Directory the archive is extracted into.
        namespaces: Prefix -> URI mapping read from the first slide; ``None``
            until :meth:`extract_pptx` has been called.
    """

    def __init__(self, extract_path: str):
        self.extract_path = extract_path
        self.namespaces = None

    def extract_pptx(self, pptx_path: str) -> str:
        """Extract a PPTX file into its XML components.

        Args:
            pptx_path: Path of the ``.pptx`` (zip) archive to unpack.

        Returns:
            The directory the archive was extracted into (``extract_path``).
        """
        os.makedirs(self.extract_path, exist_ok=True)

        with zipfile.ZipFile(pptx_path, 'r') as pptx:
            pptx.extractall(self.extract_path)

        # Get namespaces right after extraction
        self.namespaces = self.get_namespace()
        return self.extract_path

    def get_namespace(self) -> dict:
        """Get the namespaces declared on the root element of the first slide.

        The declarations are read with the XML parser's ``start-ns`` events
        rather than by scanning the raw text, so single-quoted attributes and
        a root prefix other than ``p`` are handled as well.

        Returns:
            A prefix -> URI dict. A default (prefix-less) namespace is stored
            under the key ``'default'``. An empty dict is returned when the
            slide cannot be read or parsed.
        """
        slide_path = os.path.join(self.extract_path, 'ppt/slides/slide1.xml')
        namespaces = {}

        try:
            with open(slide_path, 'rb') as file:
                events = ET.iterparse(file, events=('start-ns', 'start'))
                for event, payload in events:
                    if event == 'start':
                        # All root-level declarations are reported before the
                        # root's own start event; stop there.
                        break
                    prefix, uri = payload
                    namespaces[prefix if prefix else 'default'] = uri
        except (OSError, ET.ParseError) as e:
            print(f"Error extracting namespaces: {e}")
            return {}

        print("Extracted namespaces:", namespaces)
        return namespaces

    def compose_pptx(self, source_path: str, output_pptx: str):
        """Compose a PPTX file from a directory containing the XML structure.

        Args:
            source_path: Directory holding the extracted package parts.
            output_pptx: Path of the archive to create; its parent directory
                is created when missing.
        """
        output_dir = os.path.dirname(output_pptx)
        # A bare file name has no directory part, and os.makedirs('') raises.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        output_abs = os.path.abspath(output_pptx)
        with zipfile.ZipFile(output_pptx, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(source_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Never pack the archive into itself when the output
                    # lives inside the source directory.
                    if os.path.abspath(file_path) == output_abs:
                        continue
                    arcname = os.path.relpath(file_path, source_path)
                    zf.write(file_path, arcname)

# Manipulate xml - Set Parent Class

In [45]:
class PowerpointPipeline:
    """Shared configuration and XML helpers for the PPTX processing steps.

    On construction the settings are read from ``config.json`` in the current
    working directory (keys ``root_folder``, ``pptx_name`` and
    ``openai_api_key``) and the input/output paths are derived from them.
    """

    def __init__(self,
                 model: str="gpt-4",
                 pydentic_model: str="gpt-4-turbo-preview",
                 client:str="OpenAI",
                 namespaces: dict=None
                 ):
        """Load the configuration and set up the LLM client.

        Args:
            model: Chat model used for the main requests.
            pydentic_model: Model used for requests that need JSON output.
            client: Name of the LLM provider; only ``"OpenAI"`` is supported.
            namespaces: Prefix -> URI map used for XPath lookups. Defaults to
                the DrawingML namespace bound to the prefix ``a``.
        """
        #load config file
        with open("config.json", "r", encoding="utf-8") as f:
            config = json.load(f)

        self.root_folder = config["root_folder"]
        self.pptx_name = config["pptx_name"]
        self.openai_api_key = config["openai_api_key"]

        self.model = model
        self.pydentic_model = pydentic_model
        self.client = client
        # Build the default per instance so instances never share one dict.
        if namespaces is None:
            namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
        self.namespaces = namespaces

        self.pptx_path = os.path.join(self.root_folder, self.pptx_name)
        self.extract_path = os.path.join(self.root_folder, 'extracted_pptx')
        self.output_folder = os.path.join(self.root_folder, 'translated_pptx')
        self.output_pptx_name = f'translated_{self.pptx_name}'

        if client == "OpenAI":
            self.client = OpenAI(api_key=self.openai_api_key)
        else:
            # self.client keeps the provider name string in this case.
            print("Client not supported (So far only OpenAI is supported)")

    def find_slide_files(self, root_folder: str) -> List[str]:
        """Find all slide XML files in the folder structure.

        Only files named ``slide<number>.xml`` are returned (so layouts and
        masters are skipped), ordered by directory and then by slide number
        so that ``slide2.xml`` precedes ``slide10.xml``.
        """
        found = []
        for root, _, files in os.walk(root_folder):
            for file in files:
                if file.startswith('slide') and file.endswith('.xml'):
                    number_part = file[5:-4]
                    if number_part.isdecimal():
                        found.append((root, int(number_part), os.path.join(root, file)))
        found.sort()
        return [path for _, _, path in found]

    def extract_text_runs(self, xml_file: str) -> Tuple[List[ET.Element], set]:
        """Extract text elements that need translation.

        Args:
            xml_file: Path of the slide XML file to read.

        Returns:
            A tuple ``(text_elements, original_text_elements)``:
            ``text_elements`` holds one detached ``a:t`` element per
            non-empty paragraph whose text is the paragraph's text pieces
            joined with single spaces; ``original_text_elements`` is the set
            of stripped texts of the individual runs.
        """
        root = ET.parse(xml_file).getroot()
        text_elements = []
        original_text_elements = set()

        for paragraph in root.findall('.//a:p', self.namespaces):
            # Remember every run text on its own; these are the strings that
            # are later looked up in the slide.
            for run in paragraph.findall('.//a:r', self.namespaces):
                for run_text in run.findall('.//a:t', self.namespaces):
                    if run_text.text and run_text.text.strip():
                        original_text_elements.add(run_text.text.strip())

            # Merge the paragraph into one string so it is handled in context.
            text_parts = [
                node.text.strip()
                for node in paragraph.findall('.//a:t', self.namespaces)
                if node.text and node.text.strip()
            ]
            if text_parts:
                merged = ET.Element('a:t')
                merged.text = ' '.join(text_parts)
                text_elements.append(merged)

        print("Text elements found:")
        for element in text_elements:
            print(f"- {element.text.strip()}")
        return text_elements, original_text_elements

# Manipulate xml - Spellcheck

In [46]:
class SpellCheckResponse(BaseModel):
    """Schema of a spell-check reply: an object with one ``spellcheck`` string."""
    spellcheck: str

class SlideSpellChecker(PowerpointPipeline):
    """Pipeline step holding the extra wording instructions for spell checking."""

    def __init__(self,
                 Further_SpellCheckInstructions: str = "None"):
        """Set up the pipeline and store the optional extra instructions.

        Args:
            Further_SpellCheckInstructions: Extra wording instructions added
                to the prompt. The string ``"None"`` (or an empty value)
                means there are none.
        """
        super().__init__()

        # Format the *argument*; the attribute does not exist yet at this
        # point, so reading it here would raise AttributeError.
        if Further_SpellCheckInstructions and Further_SpellCheckInstructions != "None":
            self.Further_SpellCheckInstructions = f" Here are some further wording style instructions: {Further_SpellCheckInstructions}"
        else:
            self.Further_SpellCheckInstructions = ""

# Manipulate xml - Translation

In [47]:
class TranslationResponse(BaseModel):
    """Schema of a translation reply: an object with one ``translation`` string."""
    translation: str

class SlideTranslator(PowerpointPipeline):
    """Translate the text of every slide XML file in an extracted PPTX."""

    def __init__(self,
                 target_language: str,
                 Further_StyleInstructions: str = "None"):
        """Set up the pipeline and store the translation settings.

        Args:
            target_language: Language the slide text is translated into.
            Further_StyleInstructions: Extra wording instructions added to
                the prompt. The string ``"None"`` (or an empty value) means
                there are none.
        """
        super().__init__()

        self.target_language = target_language

        # Format the *argument*; the attribute does not exist yet at this
        # point, so reading it here would raise AttributeError.
        if Further_StyleInstructions and Further_StyleInstructions != "None":
            self.Further_StyleInstructions = f" Here are some further wording style instructions: {Further_StyleInstructions}"
        else:
            self.Further_StyleInstructions = ""

    def translate_text(self, text: str) -> str:
        """Translate text while preserving approximate length and formatting.

        For ``self.model == "gpt-4"`` the plain reply is used; for any other
        model a JSON reply is requested and validated against
        ``TranslationResponse``.

        Returns:
            The stripped translation, or ``text`` unchanged when the request
            or the validation fails.
        """
        prompt = f"""Translate following this instructions: Maintain similar total character length and preserve any special formatting or technical terms. For the translation do not return any other text than the pure translation.
        Translate the text to {self.target_language}.{self.Further_StyleInstructions} Text to translate: {text}
        """

        pydentic_prompt_addition = f"Respond with a JSON object containing only a 'translation' field with the {self.target_language} translation of this text"

        wants_json = self.model != "gpt-4"  # "gpt-4" is the non pydentic model
        request = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": prompt + pydentic_prompt_addition if wants_json else prompt},
            ],
            "temperature": 0.3,
        }
        if wants_json:
            request["response_format"] = {"type": "json_object"}

        try:
            response = self.client.chat.completions.create(**request)
            content = response.choices[0].message.content
            if wants_json:
                return TranslationResponse.model_validate_json(content).translation.strip()
            return content.strip()

        except Exception as e:
            # Best effort: report the problem and keep the original text.
            if wants_json and "Error code: 400" in str(e):
                print(f"ERROR We use Pydentic, therefore the model must support json output (e.g. gpt-4-turbo-preview)| Translation error: {e}")
            else:
                print(f"Translation error: {e}")
            return text

    def create_translation_map(self, text_elements: List[ET.Element], original_text_elements: set) -> dict:
        """Create a mapping between original text and their translations.

        Each paragraph is translated as a whole; a second request then aligns
        the individual original segments with their part of the translation.

        Returns:
            A dict keyed by every original segment. Segments for which no
            string translation was obtained keep the value ``""``.
        """
        translation_map = {text: "" for text in original_text_elements}

        for element in text_elements:
            original_text = element.text.strip()
            print(f"LLM fed text: {original_text}")
            if original_text:
                translated_text = self.translate_text(original_text)
                print(f"Original paragraph: {original_text}")
                print(f"Translated paragraph: {translated_text}\n")

                prompt = f"""Match each original text segment with its corresponding part from the translation.
                Original segments: {list(original_text_elements)}
                Full original text: {original_text}
                Full translation: {translated_text}
                
                Return a JSON object where keys are the original segments and values are their corresponding translations.
                Only include segments that appear in the original text."""

                try:
                    response = self.client.chat.completions.create(
                        model=self.pydentic_model,
                        messages=[
                            {"role": "system", "content": "You are a professional text alignment expert."},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.3,
                        response_format={"type": "json_object"}
                    )

                    segment_mappings = json.loads(response.choices[0].message.content)

                    for orig_text, trans_text in segment_mappings.items():
                        # Only plain strings can be written into a text node.
                        if orig_text in translation_map and isinstance(trans_text, str):
                            translation_map[orig_text] = trans_text

                except Exception as e:
                    print(f"Error matching segments: {e}")

        print(f"Translation map: {translation_map}")
        return translation_map

    @staticmethod
    def _collect_namespaces(xml_file: str) -> dict:
        """Return the prefix -> URI declarations found in ``xml_file``."""
        declared = {}
        with open(xml_file, 'rb') as source:
            for _, (prefix, uri) in ET.iterparse(source, events=('start-ns',)):
                # The parsed tree does not expose xmlns attributes, so the
                # declarations have to be captured while parsing. The first
                # declaration of a prefix wins; the default namespace is skipped.
                if prefix and prefix not in declared:
                    declared[prefix] = uri
        return declared

    def process_slides(self, folder_path: str):
        """Main function to process all slides in the presentation.

        Every slide file below ``folder_path`` is translated and rewritten in
        place. Text for which no translation was obtained is left untouched.
        """
        slide_files = self.find_slide_files(folder_path)

        for slide_file in slide_files:
            print(f"\nProcessing {os.path.basename(slide_file)}...")

            # Parse XML while preserving structure
            tree = ET.parse(slide_file)
            root = tree.getroot()

            # Prefixes used by the document itself, needed when writing back
            namespaces = self._collect_namespaces(slide_file)

            # Extract and create translation mapping
            text_elements, original_text_elements = self.extract_text_runs(slide_file)
            translation_map = self.create_translation_map(text_elements, original_text_elements)

            # One pass over the text nodes: each node is replaced at most
            # once, so a translation that equals another original segment is
            # never translated a second time.
            for element in root.findall('.//a:t', self.namespaces):
                if not element.text:
                    continue
                translation = translation_map.get(element.text.strip())
                # A missing or empty translation must not wipe the slide text.
                if not isinstance(translation, str) or not translation.strip():
                    continue
                # Preserve any leading/trailing whitespace from the original
                leading_space = ' ' if element.text.startswith(' ') else ''
                trailing_space = ' ' if element.text.endswith(' ') else ''
                element.text = leading_space + translation.strip() + trailing_space

            # Register our known namespaces first, then the document's own
            # declarations so the prefixes found in the file take precedence.
            for prefix, uri in {**self.namespaces, **namespaces}.items():
                try:
                    ET.register_namespace(prefix, uri)
                except ValueError:
                    # ElementTree reserves prefixes of the form "ns<number>".
                    continue

            # Write back XML while preserving declaration and namespaces
            with open(slide_file, 'wb') as f:
                tree.write(f, encoding='UTF-8', xml_declaration=True)

# Pipeline

In [48]:
class PowerPointTranslator(PowerpointPipeline):
    """End-to-end pipeline: extract the PPTX, translate its slides, repack it."""

    def __init__(self, target_language:str, Further_StyleInstructions:str="None", Further_SpellCheckInstructions:str="None"):
        """Create the transformer, spell checker and translator steps.

        Args:
            target_language: Language the slides are translated into.
            Further_StyleInstructions: Extra wording instructions for the
                translator (``"None"`` for none).
            Further_SpellCheckInstructions: Extra wording instructions for
                the spell checker (``"None"`` for none).
        """
        super().__init__()

        # Initialize transformer and translator
        self.transformer = PPTXTransformer(self.extract_path)
        self.spellchecker = SlideSpellChecker(Further_SpellCheckInstructions)
        self.translator = SlideTranslator(target_language, Further_StyleInstructions)

    def translate_presentation(self):
        """Main method to handle the full translation process.

        Returns:
            ``True`` when the translated PPTX was written, ``False`` when any
            step raised (the error is printed).
        """
        try:
            # Extract PPTX; this also reads the first slide's namespaces
            self.transformer.extract_pptx(self.pptx_path)

            # Reuse the namespaces found during extraction instead of parsing
            # the slide again. Keep the translator's default when the 'a'
            # prefix is missing, otherwise every 'a:' lookup would fail.
            namespaces = self.transformer.namespaces
            if namespaces and 'a' in namespaces:
                self.translator.namespaces = namespaces

            # Process slides
            self.translator.process_slides(self.extract_path)

            # Compose final PPTX
            output_path = os.path.join(self.output_folder, self.output_pptx_name)
            self.transformer.compose_pptx(self.extract_path, output_path)

            return True

        except Exception as e:
            print(f"Error translating presentation: {e}")
            return False


# Main


In [49]:
# Example usage:
if __name__ == "__main__":
    root_folder = "/Users/jwh/Code/Translator"
    pptx_name = "2024-10-23_ASML_Regulation_and_Governance_GenAI.pptx"
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        # Fail early with a clear message instead of deep inside the client.
        raise SystemExit("OPENAI_API_KEY is not set")

    # Write root folder, pptx name and OpenAI API key into the config file
    # that the pipeline classes read on construction.
    # NOTE(review): this stores the API key in plain text on disk; keep
    # config.json out of version control.
    with open("config.json", "w") as f:
        json.dump({"root_folder": root_folder, "pptx_name": pptx_name, "openai_api_key": openai_api_key}, f)

    translator = PowerPointTranslator(
        target_language="German",
        Further_StyleInstructions="None"
    )
    if translator.translate_presentation():
        print(f"Translation completed successfully. File saved to: {translator.output_folder}/{translator.output_pptx_name}")
    else:
        print("Translation failed")


Extracted namespaces: {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships', 'p': 'http://schemas.openxmlformats.org/presentationml/2006/main'}
Extracted namespaces: {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships', 'p': 'http://schemas.openxmlformats.org/presentationml/2006/main'}

Processing slide1.xml...
Text elements found:
- Delivering tangible results for ASML with Data & AI
- ASML x Eraneos Analytics
- October 2024
- Regulation and Governance AI at ASML
LLM fed text: Delivering tangible results for ASML with Data & AI
Original paragraph: Delivering tangible results for ASML with Data & AI
Translated paragraph: Erzielung greifbarer Ergebnisse für ASML mit Daten & KI

LLM fed text: ASML x Eraneos Analytics
Original paragraph: ASML x Eraneos Analytics
Translated paragraph: ASML x Eraneos Analytik

LLM fed text: 

KeyboardInterrupt: 