# Imports

In [315]:
import os
import dotenv
dotenv.load_dotenv()
from typing import List, Tuple


#pptx transformation imports
import zipfile
import shutil
import warnings

#spellcheck imports
from spellchecker import SpellChecker
import xml.etree.ElementTree as ET
import re

# translator import
import xml.etree.ElementTree as ET
from openai import OpenAI
from pydantic import BaseModel
import json
from langdetect import detect


# Set Parent Class

In [316]:
class PowerpointPipeline:
    """Base class holding shared configuration and slide-XML helpers.

    Constructing an instance reads ``config.json`` from the current working
    directory, derives the PPTX, extraction and output paths from it, and
    creates an OpenAI client when ``client == "OpenAI"``.
    """

    # Prefix -> URI map used for XPath lookups when the caller supplies none.
    DEFAULT_NAMESPACES = {
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        'p': 'http://schemas.openxmlformats.org/presentationml/2006/main',
        'p14': "http://schemas.microsoft.com/office/powerpoint/2010/main",
        'a16': "http://schemas.microsoft.com/office/drawing/2014/main",
        'mc': "http://schemas.openxmlformats.org/markup-compatibility/2006",
        'v': "urn:schemas-microsoft-com:vml",
    }

    def __init__(self,
                 model: str="gpt-4",
                 pydentic_model: str="gpt-4-turbo-preview",
                 client:str="OpenAI",
                 verbose: bool=False,
                 extract_namespaces: bool=False,
                 namespaces: dict=None,
                 ):
        """Load ``config.json`` and set up paths, model names and the client.

        Args:
            model: Chat model used for translation requests.
            pydentic_model: Chat model used for requests that need JSON output.
            client: Name of the API client; only ``"OpenAI"`` is supported.
            verbose: When true, print the derived paths.
            extract_namespaces: Whether subclasses should read namespaces from
                the extracted slide instead of using the configured map.
            namespaces: Prefix -> URI map for XPath lookups. ``None`` (the
                default) gives each instance its own copy of
                ``DEFAULT_NAMESPACES`` so instances never share one dict.

        Raises:
            OSError: ``config.json`` cannot be opened.
            KeyError: a required key is missing from ``config.json``.
        """
        # Load config file
        with open("config.json", "r", encoding="utf-8") as f:
            config = json.load(f)

        self.root_folder = config["root_folder"]
        self.pptx_folder = config["pptx_folder"]
        self.pptx_name = config["pptx_name"]
        self.extract_folder = config["extract_folder"]

        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.model = model
        self.pydentic_model = pydentic_model
        self.client = client
        self.extract_namespaces = extract_namespaces
        if namespaces is None:
            namespaces = dict(self.DEFAULT_NAMESPACES)
        self.namespaces = namespaces

        self.pptx_path = os.path.join(self.root_folder, self.pptx_folder, self.pptx_name)
        if verbose: print(f"\tPPTX path: {self.pptx_path}")
        self.extract_path = os.path.join(self.root_folder, self.extract_folder)
        if verbose: print(f"\tExtract path: {self.extract_path}")
        self.output_folder = os.path.join(self.root_folder, config["output_folder"])
        if verbose: print(f"\tOutput folder: {self.output_folder}")
        self.output_pptx_name = f'translated_{self.pptx_name}'
        if verbose: print(f"\tOutput PPTX name: {self.output_pptx_name}")

        if client == "OpenAI":
            self.client = OpenAI(api_key=self.openai_api_key)
        else:
            # self.client keeps the unsupported name that was passed in.
            print("\tClient not supported (So far only OpenAI is supported)")

    def find_slide_files(self, root_folder: str) -> List[str]:
        """Find all ``slide<N>.xml`` files below ``root_folder``.

        Files such as ``slideLayout1.xml`` are skipped because the part
        between ``slide`` and ``.xml`` must be purely numeric. Results are
        ordered by directory and then by slide number, so ``slide2.xml``
        precedes ``slide10.xml``.
        """
        numbered = []
        for root, _, files in os.walk(root_folder):
            for file in files:
                if file.startswith('slide') and file.endswith('.xml'):
                    number_part = file[5:-4]
                    if number_part.isdecimal():
                        numbered.append((root, int(number_part), os.path.join(root, file)))
        return [path for _, _, path in sorted(numbered)]

    def extract_paragraphs(self, xml_file: str) -> List[ET.Element]:
        """Return every ``a:p`` paragraph element found in the XML file."""
        tree = ET.parse(xml_file)
        root = tree.getroot()
        return root.findall('.//a:p', self.namespaces)

    def extract_text_runs(self, xml_file: str) -> Tuple[List[ET.Element], set]:
        """Extract the text that needs translation from one slide file.

        Returns:
            A pair ``(text_elements, original_text_elements)``:

            * ``text_elements`` holds one synthetic element per non-empty
              paragraph. Its ``text`` is the paragraph's stripped text pieces
              joined by single spaces and its ``lang`` attribute is the
              language of the last run properties seen in that paragraph
              (``'en-GB'`` when none declares one).
            * ``original_text_elements`` is the set of stripped, non-empty
              texts of the individual ``a:t`` nodes inside runs.
        """
        tree = ET.parse(xml_file)
        root = tree.getroot()
        paragraphs = root.findall('.//a:p', self.namespaces)

        # Keep the individual run texts so translations can be mapped back.
        original_text_elements = set()
        for paragraph in paragraphs:
            for run in paragraph.findall('.//a:r', self.namespaces):
                for node in run.findall('.//a:t', self.namespaces):
                    if node.text and node.text.strip():
                        original_text_elements.add(node.text.strip())

        # Process paragraphs while preserving structure
        text_elements = []
        for paragraph in paragraphs:
            # a:rPr is a sibling of a:t (both are children of the run), so the
            # language has to be looked up on the element that contains both.
            lang_by_text_node = {}
            for parent in paragraph.iter():
                run_props = parent.find('a:rPr', self.namespaces)
                if run_props is None:
                    continue
                for node in parent.findall('a:t', self.namespaces):
                    lang_by_text_node[node] = run_props.get('lang', 'en-GB')

            text_parts = []
            lang = None
            for node in paragraph.findall('.//a:t', self.namespaces):
                if node in lang_by_text_node:
                    lang = lang_by_text_node[node]
                if node.text and node.text.strip():
                    text_parts.append(node.text.strip())

            if text_parts:
                text_element = ET.Element('a:t')
                text_element.text = ' '.join(text_parts)
                text_element.set('lang', lang or 'en-GB')
                text_elements.append(text_element)

        print("Text elements found:")
        for element in text_elements:
            print(f"- {element.text.strip()} | lang: {element.get('lang')}")
        return text_elements, original_text_elements

# Transform pptx to xml

In [317]:
class PPTXTransformer(PowerpointPipeline):
    """Unpack a PPTX archive into its XML parts and zip a folder back up."""

    def __init__(self, extract_path: str):
        """Create a transformer that extracts into ``extract_path``."""
        super().__init__()
        # Assigned after the base initialiser: it derives extract_path from
        # config.json and would otherwise overwrite the caller's value.
        self.extract_path = extract_path

    def extract_pptx(self, pptx_path: str) -> str:
        """Extract a PPTX file into its XML components.

        Returns:
            The directory the archive was extracted into.
        """
        os.makedirs(self.extract_path, exist_ok=True)

        with zipfile.ZipFile(pptx_path, 'r') as pptx:
            pptx.extractall(self.extract_path)

        # Get namespaces right after extraction. An empty result means the
        # lookup failed, so the configured map is kept in that case.
        if self.extract_namespaces:
            extracted = self.get_namespace()
            if extracted:
                self.namespaces = extracted
        return self.extract_path

    def get_namespace(self) -> dict:
        """Get the namespaces from the first slide XML using text processing.

        Returns:
            A prefix -> URI dict built from the ``xmlns`` declarations on the
            ``<p:sld`` opening tag of ``ppt/slides/slide1.xml``. A default
            namespace is stored under the key ``'default'``. An empty dict is
            returned when the file cannot be read or the tag is not found.
        """
        slide_path = os.path.join(self.extract_path, 'ppt/slides/slide1.xml')

        try:
            with open(slide_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"\tError extracting namespaces: {e}")
            return {}

        # Find the root element opening tag
        start_idx = content.find('<p:sld')
        end_idx = content.find('>', start_idx) if start_idx != -1 else -1
        if start_idx == -1 or end_idx == -1:
            print("Could not find root element")
            return {}

        root_declaration = content[start_idx:end_idx]

        # Pattern to match xmlns:prefix="uri" or xmlns="uri"
        pattern = r'xmlns(?::([^=]+))?="([^"]+)"'
        namespaces = {}
        for match in re.finditer(pattern, root_declaration):
            prefix = match.group(1)  # None for the default namespace
            namespaces[prefix or 'default'] = match.group(2)

        print("\tExtracted namespaces:", namespaces)
        return namespaces

    def compose_pptx(self, source_path: str, output_pptx: str):
        """Compose a PPTX file from a directory containing the XML structure.

        Every file below ``source_path`` is stored under its path relative to
        ``source_path``.
        """
        output_dir = os.path.dirname(output_pptx)
        if output_dir:  # a bare filename has no directory to create
            os.makedirs(output_dir, exist_ok=True)

        output_abs = os.path.abspath(output_pptx)
        with zipfile.ZipFile(output_pptx, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(source_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Never add the archive being written to itself.
                    if os.path.abspath(file_path) == output_abs:
                        continue
                    arcname = os.path.relpath(file_path, source_path)
                    zf.write(file_path, arcname)

# Manipulate xml - Spellcheck

In [318]:
class SlideSpellChecker(PowerpointPipeline):
    """Correct misspelled text runs in slide XML while keeping run formatting."""

    def __init__(self,Further_SpellCheckInstructions):
        """Set up the spell checker and the namespaces used for lookups."""
        super().__init__()

        # Stored for callers; no method of this class reads it.
        self.Further_SpellCheckInstructions = Further_SpellCheckInstructions

        self.spell = SpellChecker()
        # Define namespaces used in PPTX XML
        self.namespaces = {
            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
            'p': 'http://schemas.openxmlformats.org/presentationml/2006/main'
        }
        ET.register_namespace('a', self.namespaces['a'])
        ET.register_namespace('p', self.namespaces['p'])

    def check_and_fix_slide(self, xml_content):
        """Spell-check every paragraph of a slide and return the new XML text."""
        root = ET.fromstring(xml_content)

        # Find all paragraphs
        for paragraph in root.findall('.//a:p', self.namespaces):
            self._process_paragraph(paragraph)

        return ET.tostring(root, encoding='unicode')

    def _process_paragraph(self, paragraph):
        """Fix flagged runs of one paragraph, then merge equal-looking runs."""
        runs = paragraph.findall('a:r', self.namespaces)

        # Default the language first so the property comparison used for
        # merging sees the same value on every run.
        for run in runs:
            self._check_language_consistency(run)

        found_flagged_run = False
        i = 0
        while i < len(runs):
            run_props = runs[i].find('a:rPr', self.namespaces)
            if self._is_flagged(runs[i], run_props):
                found_flagged_run = True
                span_end = self._adjacent_span_end(runs, i)
                combined_text = self._collect_adjacent_text(runs, i)

                corrected_text = self._fix_spelling(combined_text)
                if corrected_text != combined_text:
                    self._update_runs_with_correction(
                        runs, i, corrected_text, run_props,
                        paragraph=paragraph, end_index=span_end,
                    )
                    # The span collapsed into one run that now sits at index i.
                    runs = paragraph.findall('a:r', self.namespaces)
            i += 1

        if found_flagged_run:
            self._merge_identical_runs(paragraph)

    def _is_flagged(self, run, run_props):
        """Return True when the run or its properties carry ``err="1"``."""
        # DrawingML defines `err` on the run properties (a:rPr); the check on
        # the run itself is kept for compatibility with the earlier behaviour.
        if run.get('err') == '1':
            return True
        return run_props is not None and run_props.get('err') == '1'

    def _adjacent_span_end(self, runs, start_index):
        """Return the exclusive end index of the span starting at start_index.

        The span ends after the first run whose text ends in punctuation or
        whitespace, or at the end of ``runs``.
        """
        i = start_index
        while i < len(runs):
            text_elem = runs[i].find('a:t', self.namespaces)
            i += 1
            # Stop if we hit punctuation or clear word boundary
            if text_elem is not None and re.search(r'[.!?,\s]$', text_elem.text or ''):
                break
        return i

    def _collect_adjacent_text(self, runs, start_index):
        """Collects text from adjacent runs that might be part of the same word"""
        span_end = self._adjacent_span_end(runs, start_index)
        text_parts = []
        for run in runs[start_index:span_end]:
            text_elem = run.find('a:t', self.namespaces)
            if text_elem is not None:
                text_parts.append(text_elem.text or '')
        return ''.join(text_parts)

    def _fix_spelling(self, text):
        """Return ``text`` with each whitespace-separated token corrected.

        Whitespace is kept exactly as it was, so a text without misspellings
        comes back unchanged.
        """
        if not text:
            return text

        pieces = []
        # The capturing group keeps the whitespace separators in the result.
        for piece in re.split(r'(\s+)', text):
            if not piece or piece.isspace():
                pieces.append(piece)
                continue
            suggestion = self.spell.correction(piece)
            # NOTE(review): newer pyspellchecker releases appear to return
            # None when there is no candidate - keep the word in that case.
            pieces.append(suggestion if suggestion else piece)
        return ''.join(pieces)

    def _update_runs_with_correction(self, runs, start_index, corrected_text, original_props,
                                     paragraph=None, end_index=None):
        """Updates the runs with corrected text while maintaining formatting

        Replaces ``runs[start_index:end_index]`` inside ``paragraph`` by one
        new run that holds ``corrected_text`` and, when given, the properties
        of the first replaced run.

        Args:
            runs: Direct ``a:r`` children of ``paragraph``.
            start_index: Index in ``runs`` of the first run to replace.
            corrected_text: Text for the new run.
            original_props: ``a:rPr`` element to reuse, or ``None``.
            paragraph: Parent of the runs. Required because ElementTree
                elements hold no reference to their parent.
            end_index: Exclusive end of the replaced span; defaults to a
                single run.

        Returns:
            The new run, or ``None`` when the span is empty.

        Raises:
            ValueError: ``paragraph`` was not supplied.
        """
        if paragraph is None:
            raise ValueError("paragraph is required to replace runs")
        if end_index is None:
            end_index = start_index + 1

        old_runs = runs[start_index:end_index]
        if not old_runs:
            return None

        # Build properly namespaced elements so later a:r / a:t lookups match.
        a_uri = self.namespaces['a']
        new_run = ET.Element(f'{{{a_uri}}}r')
        if original_props is not None:
            new_run.append(original_props)
        text_elem = ET.SubElement(new_run, f'{{{a_uri}}}t')
        text_elem.text = corrected_text

        # Index among all children: a:pPr or other nodes may precede the runs.
        position = list(paragraph).index(old_runs[0])
        for old_run in old_runs:
            paragraph.remove(old_run)
        paragraph.insert(position, new_run)
        return new_run

    def _merge_identical_runs(self, paragraph):
        """Merges adjacent runs with identical properties"""
        # NOTE(review): runs are neighbours in the a:r list only; a line break
        # or field between them is not taken into account - confirm intended.
        runs = paragraph.findall('a:r', self.namespaces)
        i = 0

        while i < len(runs) - 1:
            current_run = runs[i]
            next_run = runs[i + 1]
            current_text = current_run.find('a:t', self.namespaces)
            next_text = next_run.find('a:t', self.namespaces)

            if (current_text is not None and next_text is not None
                    and self._runs_have_identical_props(current_run, next_run)):
                # Merge text content
                current_text.text = (current_text.text or '') + (next_text.text or '')

                # Remove the merged run and stay on i to try the new neighbour
                paragraph.remove(next_run)
                del runs[i + 1]
            else:
                i += 1

    def _runs_have_identical_props(self, run1, run2):
        """Checks if two runs have identical properties"""
        props1 = run1.find('a:rPr', self.namespaces)
        props2 = run2.find('a:rPr', self.namespaces)

        if props1 is None or props2 is None:
            return False

        # Compare relevant attributes
        attrs_to_compare = ['lang', 'sz', 'b', 'i', 'u']
        return all(props1.get(attr) == props2.get(attr) for attr in attrs_to_compare)

    def _check_language_consistency(self, run):
        """Checks and fixes language consistency within a run"""
        run_props = run.find('a:rPr', self.namespaces)
        if run_props is not None and run_props.get('lang') is None:
            # Set default language if missing
            run_props.set('lang', 'en-US')

# Manipulate xml - Translation

In [319]:
class TranslationResponse(BaseModel):
    """Schema used to validate the JSON reply of a translation request."""
    # The translated text; the only field read from the validated reply.
    translation: str

class SlideTranslator(PowerpointPipeline):
    """Translate the text of extracted slide XML files through the chat API."""

    def __init__(self, 
                 target_language: str,
                 Further_StyleInstructions: str = "None"): 
        """Store the target language and load the language-code mapping.

        Args:
            target_language: Language name inserted into the prompts.
            Further_StyleInstructions: Extra wording instructions appended to
                the translation prompt; the string ``"None"`` disables them.

        Raises:
            OSError: ``config_languages.json`` cannot be opened.
        """
        super().__init__()

        self.target_language = target_language
        # Load language codes mapping
        with open("config_languages.json", "r", encoding="utf-8") as f:
            self.language_codes = json.load(f)

        if Further_StyleInstructions != "None":
            # Built from the argument: the attribute does not exist yet here.
            self.Further_StyleInstructions = f" Here are some further wording style instructions: {Further_StyleInstructions}"
        else:
            self.Further_StyleInstructions = ""

    def translate_text(self, text: str) -> str:
        """Translate text while preserving approximate length and formatting.

        For the ``"gpt-4"`` model the plain reply is used. Any other model is
        asked for a JSON object, which is validated with
        ``TranslationResponse``.

        Returns:
            The stripped translation, or ``text`` unchanged when the request
            or the validation fails (the error is printed).
        """
        prompt = f"""Translate following this instructions: 
        Maintain similar total character length and preserve any special formatting or technical terms. 
        IMPORTANT:For the translation you must not return any other text than the pure translation.
        Keep technical terms in the translation. 
        Keep role names in the translation (e.g., DataScientist, CEO, etc.).
        Keep names of companies in the translation (e.g., Apple, Microsoft, etc.).
        Keep names of products in the translation (e.g., iPhone, Windows, LegalAI, etc.).
        Make the translation sharp, concise and business-like.
        Translate the text to {self.target_language}.
        {self.Further_StyleInstructions}
        IMPORTANT:For the translation you must not return any other text than the pure translation.
        Text to translate: {text}
        """

        pydentic_prompt_addition = f"Respond with a JSON object containing only a 'translation' field with the {self.target_language} translation of this text"

        wants_json = self.model != "gpt-4"
        request = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": prompt + pydentic_prompt_addition if wants_json else prompt},
            ],
            "temperature": 0.3,
        }
        if wants_json:
            # No `tools` argument is sent: the earlier empty list is, as far
            # as the API reference indicates, rejected as an invalid request.
            request["response_format"] = {"type": "json_object"}

        try:
            response = self.client.chat.completions.create(**request)
            content = response.choices[0].message.content
            if wants_json:
                return TranslationResponse.model_validate_json(content).translation.strip()
            return content.strip()

        except Exception as e:
            # Best effort: report the failure and fall back to the source text.
            if wants_json and "Error code: 400" in str(e):
                print(f"ERROR We use Pydentic, therefore the model must support json output (e.g. gpt-4-turbo-preview)| Translation error: {e}")
            else:
                print(f"Translation error: {e}")
            return text

    def create_translation_map(self, text_elements: List[ET.Element], original_text_elements: set) -> dict:
        """Create a mapping between original text and their translations.

        Each paragraph text is translated as a whole, then a second request
        asks the model to align the original segments with the translation.

        Returns:
            A dict with every entry of ``original_text_elements`` as key. The
            value is the aligned translation, or ``""`` when none was
            obtained. Only string values from the reply are accepted.
        """
        translation_map = {text: "" for text in original_text_elements}

        for element in text_elements:
            original_text = (element.text or "").strip()
            print(f"\tLLM fed text: {original_text}")
            if original_text:
                translated_text = self.translate_text(original_text)
                print(f"\tOriginal paragraph: {original_text}")
                print(f"\tTranslated paragraph: {translated_text}\n")

                prompt = f"""Match each original text segment with its corresponding part from the translation.
                Original segments: {[text for text in original_text_elements]}
                Full original text: {original_text}
                Full translation: {translated_text}
                
                Return a JSON object where keys are the original segments and values are their corresponding translations.
                Only include segments that appear in the original text."""

                try:
                    response = self.client.chat.completions.create(
                        model=self.pydentic_model,
                        messages=[
                            {"role": "system", "content": "You are a professional text alignment expert."},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.3,
                        response_format={"type": "json_object"}
                    )
                    segment_mappings = json.loads(response.choices[0].message.content)
                except Exception as e:
                    print(f"\tError matching segments: {e}")
                    continue

                if not isinstance(segment_mappings, dict):
                    print(f"\tError matching segments: expected a JSON object, got {type(segment_mappings).__name__}")
                    continue

                for orig_text, trans_text in segment_mappings.items():
                    # Non-string values would break the later .strip() calls.
                    if orig_text in translation_map and isinstance(trans_text, str):
                        translation_map[orig_text] = trans_text

        print(f"\tTranslation map: {translation_map}")
        return translation_map

    def detect_pptx_language(self, text: str) -> str:
        """Detect language and return PowerPoint language code."""
        try:
            # Detect language
            detected_lang = detect(text)
            # Convert to PowerPoint language code
            pptx_lang = self.language_codes.get(detected_lang, "en-US")  # default to en-US if not found
            return pptx_lang
        except Exception as e:
            print(f"\tLanguage detection error: {e}")
            return "en-US"  # default to en-US on error    

    def process_slides(self, folder_path: str):
        """Main function to process all slides in the presentation.

        For every slide file below ``folder_path`` the text is translated,
        written into the matching ``a:t`` nodes, the ``lang`` attribute of the
        runs is refreshed, and the file is rewritten in place.
        """
        slide_files = self.find_slide_files(folder_path)
        total = len(slide_files)

        for position, slide_file in enumerate(slide_files, start=1):
            print(f"\nProcessing {os.path.basename(slide_file)}...")
            print(f"Processing slide {position} of {total}...")

            # The parser drops xmlns declarations from the attributes, so the
            # prefixes are collected separately before parsing the tree.
            declared_namespaces = self._read_declared_namespaces(slide_file)

            tree = ET.parse(slide_file)
            root = tree.getroot()

            # Extract and create translation mapping
            text_elements, original_text_elements = self.extract_text_runs(slide_file)
            translation_map = self.create_translation_map(text_elements, original_text_elements)

            # Update text, then refresh the run languages once per slide.
            if self._apply_translations(root, translation_map):
                self._update_run_languages(root)

            # Register our known namespaces first and the file's own prefixes
            # last, so the prefixes the file declared are reused on write.
            for prefix, uri in self.namespaces.items():
                self._register_namespace(prefix, uri)
            for prefix, uri in declared_namespaces.items():
                self._register_namespace(prefix, uri)

            # Write back XML with an XML declaration
            with open(slide_file, 'wb') as f:
                tree.write(f, encoding='UTF-8', xml_declaration=True)

    def _read_declared_namespaces(self, slide_file: str) -> dict:
        """Return the prefix -> URI declarations found in the file."""
        declared = {}
        for _, (prefix, uri) in ET.iterparse(slide_file, events=('start-ns',)):
            if prefix:
                # Keep the first declaration of a prefix.
                declared.setdefault(prefix, uri)
        return declared

    def _register_namespace(self, prefix: str, uri: str) -> None:
        """Register a prefix for serialisation, skipping reserved prefixes."""
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # ElementTree reserves prefixes of the form ns<number>.
            pass

    def _apply_translations(self, root: ET.Element, translation_map: dict) -> bool:
        """Write the translations into the matching ``a:t`` nodes.

        Each node is looked up once by its original stripped text, so a
        translated node is never matched again by another map entry. Leading
        and trailing whitespace of the node is preserved.

        Returns:
            True when the map holds at least one non-empty translation.
        """
        replacements = {
            original: translation.strip()
            for original, translation in translation_map.items()
            if isinstance(translation, str) and translation.strip()  # Skip empty translations
        }
        if not replacements:
            return False

        for element in root.findall('.//a:t', self.namespaces):
            text = element.text
            if not text:
                continue
            core = text.strip()
            if not core or core not in replacements:
                continue
            leading = text[:len(text) - len(text.lstrip())]
            trailing = text[len(text.rstrip()):]
            element.text = leading + replacements[core] + trailing
        return True

    def _update_run_languages(self, root: ET.Element) -> None:
        """Set ``lang`` on the properties of every run with detectable text."""
        for run in root.findall('.//a:r', self.namespaces):
            text_elem = run.find('a:t', self.namespaces)
            # The run is the parent of both a:t and a:rPr.
            rPr = run.find('a:rPr', self.namespaces)
            if text_elem is None or rPr is None:
                continue
            run_text = (text_elem.text or '').strip()
            if not run_text:
                continue
            detected_lang = self.detect_pptx_language(run_text)
            rPr.set('lang', detected_lang)
            print(f"\tUpdated language for '{run_text}' to {detected_lang}")

# Pipelines

## Pipeline - Ppt to xml

In [320]:
class XMLcreator(PowerpointPipeline):
    """Pipeline step that unpacks the configured PPTX into its XML parts."""

    def __init__(self, verbose: bool=False):
        """Load the configuration and create the transformer that extracts."""
        super().__init__(verbose=verbose)
        # Initialize transformer
        self.transformer = PPTXTransformer(self.extract_path)

    def extract_pptx(self):
        """Extract the configured PPTX file.

        Returns:
            True when the extraction succeeded, False when it raised (the
            error is printed).
        """
        try:
            # Extract PPTX
            self.transformer.extract_pptx(self.pptx_path)
        except Exception as e:
            # This step only extracts, so the message says so.
            print(f"Error extracting presentation: {e}")
            return False
        return True


## Pipeline - Translator

In [321]:
class PowerPointTranslator(PowerpointPipeline):
    """Full pipeline: extract the PPTX, translate its slides, re-compose it."""

    def __init__(self, target_language:str, Further_StyleInstructions:str="None", Further_SpellCheckInstructions:str="None"):
        """Create the transformer, spell checker and translator components.

        Args:
            target_language: Language the slides are translated into.
            Further_StyleInstructions: Passed on to ``SlideTranslator``.
            Further_SpellCheckInstructions: Passed on to ``SlideSpellChecker``.
        """
        super().__init__()

        # Initialize transformer, spell checker and translator
        self.transformer = PPTXTransformer(self.extract_path)
        self.spellchecker = SlideSpellChecker(Further_SpellCheckInstructions)
        self.translator = SlideTranslator(target_language, Further_StyleInstructions)

    def translate_presentation(self):
        """Main method to handle the full translation process

        Returns:
            True when the translated PPTX was written, False when any step
            raised (the error is printed).
        """
        try:
            # Extract PPTX
            self.transformer.extract_pptx(self.pptx_path)

            # Get namespaces. The result is merged into the translator's map
            # rather than replacing it: an empty result (failed lookup) would
            # otherwise leave the translator without the 'a' prefix it needs.
            namespaces = self.transformer.get_namespace()
            if namespaces:
                self.translator.namespaces = {**self.translator.namespaces, **namespaces}

            # Process slides
            self.translator.process_slides(self.extract_path)

            # Compose final PPTX
            output_path = os.path.join(self.output_folder, self.output_pptx_name)
            self.transformer.compose_pptx(self.extract_path, output_path)

            return True

        except Exception as e:
            print(f"Error translating presentation: {e}")
            return False


# Main


In [322]:
if __name__ == "__main__":
    # Locations written to config.json, which every pipeline class reads.
    root_folder = "/Users/jwh/Code/SlideMob"
    pptx_folder = "Testpptx"
    pptx_name = "presentation1.pptx"
    extract_folder = "extracted_pptx"
    outputfolder = "output"

    print("... writing config")
    with open("config.json", "w") as config_file:
        json.dump(
            dict(
                root_folder=root_folder,
                pptx_folder=pptx_folder,
                pptx_name=pptx_name,
                extract_folder=extract_folder,
                output_folder=outputfolder,
            ),
            config_file,
        )

    print("... Creating PowerpointPipeline instance ")
    PowerpointPipeline_instance = PowerpointPipeline()

    # Step switches; only `extract` and `translate` are consulted below.
    extract, translate = True, True
    spellcheck = improve = align = consistency = False

    # Step 1: unpack the presentation into its XML parts.
    if extract:
        print("... Creating XMLcreator instance")
        XMLcreator_instance = XMLcreator(verbose=True)
        print("... Running extract_pptx on:")
        success = XMLcreator_instance.extract_pptx()

    # Step 2: translate the slides and write the new presentation.
    if translate:
        print("... Creating PowerPointTranslator instance")
        translator = PowerPointTranslator(target_language="German", Further_StyleInstructions="None")
        success = translator.translate_presentation()

        if success:
            print(f"Translation completed successfully. File saved to: {translator.output_folder}/{translator.output_pptx_name}")
        else:
            print("Translation failed")


... writing config
... Creating PowerpointPipeline instance 
... Creating XMLcreator instance
	PPTX path: /Users/jwh/Code/SlideMob/Testpptx/presentation1.pptx
	Extract path: /Users/jwh/Code/SlideMob/extracted_pptx
	Output folder: /Users/jwh/Code/SlideMob/output
	Output PPTX name: translated_presentation1.pptx
... Running extract_pptx on:
... Creating PowerPointTranslator instance
	Extracted namespaces: {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships', 'p': 'http://schemas.openxmlformats.org/presentationml/2006/main'}

Processing slide1.xml...
Processing slide 1 of 3...
Text elements found:
- This is the presentation title | lang: en-GB
- Second textblock | lang: en-GB
	LLM fed text: This is the presentation title
	Original paragraph: This is the presentation title
	Translated paragraph: Dies ist der Präsentationstitel

	LLM fed text: Second textblock
	Original paragraph: Second textblock
	Translate