In [10]:
import os
import dotenv
dotenv.load_dotenv()
from typing import List, Tuple


#pptx transformation imports
import zipfile
import shutil
import warnings

#spellcheck imports
import xml.etree.ElementTree as ET
import re

# translator import
import xml.etree.ElementTree as ET
from openai import OpenAI
from pydantic import BaseModel
import json


In [11]:
def extract_pptx(pptx_path: str, extract_path: str) -> str:
    """Extract a PPTX file into its XML components."""
    os.makedirs(extract_path, exist_ok=True)
    
    with zipfile.ZipFile(pptx_path, 'r') as pptx:
        pptx.extractall(extract_path)
    
    # Get namespaces right after extraction
    return extract_path


def compose_pptx(self, source_path: str, output_pptx: str):
    """Compose a PPTX file from a directory containing the XML structure."""
    os.makedirs(os.path.dirname(output_pptx), exist_ok=True)
    
    with zipfile.ZipFile(output_pptx, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(source_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, source_path)
                zf.write(file_path, arcname)

In [12]:
def extract_paragraph_elements_with_attributes(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    default_namespace = {
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        'p': 'http://schemas.openxmlformats.org/presentationml/2006/main',   
    }
    paragraphs = root.findall('.//a:p', default_namespace)
    all_paragraphs = []
    
    for paragraph in paragraphs:
        paragraph_data = {
            "paragraph_attributes": paragraph.attrib,
            "paragraph_properties": paragraph.find('.//a:pPr', default_namespace).attrib if paragraph.find('.//a:pPr', default_namespace) is not None else {},
            "runs": [],
            "full_text": "",
            "text_elements": []
        }
        
        # Look for runs within the paragraph
        runs = paragraph.findall('.//a:r', default_namespace)
        paragraph_text = []
        
        for run in runs:
            text_elements = run.findall('.//a:t', default_namespace)
            run_texts = [elem.text if elem is not None and elem.text is not None else "" for elem in text_elements]
            
            run_data = {
                "run_attributes": run.attrib,
                "run_properties": run.find('.//a:rPr', default_namespace).attrib if run.find('.//a:rPr', default_namespace) is not None else {},
                "text": run_texts,
                "text_as_string": f"[{' '.join(run_texts)}]"
            }
            paragraph_data["runs"].append(run_data)
            paragraph_text.extend(run_texts)
            paragraph_data["text_elements"].extend(run_texts)
        
        # Combine all text from the paragraph
        paragraph_data["full_text"] = " ".join(paragraph_text)
        all_paragraphs.append(paragraph_data)
    
    return all_paragraphs


In [13]:
#

In [14]:
def find_slide_files(root_folder: str) -> List[str]:
            """Find all slide XML files in the folder structure."""
            slide_files = []
            for root, _, files in os.walk(root_folder):
                for file in files:
                    if file.startswith('slide') and file.endswith('.xml'):
                        number_part = file[5:-4]
                        if number_part.isdigit():
                            slide_files.append(os.path.join(root, file))
            return sorted(slide_files)

def extract_paragraphs(xml_file: str, namespaces: dict) -> List[ET.Element]:
    """Extract everything inside paragraphs from the XML file."""
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root.findall('.//a:p', namespaces)

#function to extract attributes from paragraph elements into readable text
def extract_information_from_paragraphs(paragraphs: List[ET.Element]):
    for paragraph in paragraphs:
        print(paragraph.attrib)



In [15]:
    def printing(paragraphs_data):
        for i, paragraph in enumerate(paragraphs_data, start=1):
            print(f"\nParagraph {i}:")
            print(f"Paragraph Attributes: {paragraph['paragraph_attributes']}")
            print(f"Paragraph Properties: {paragraph['paragraph_properties']}")
            print(f"Full Text: {paragraph['full_text']}")
            print(f"Individual Text Elements: {paragraph['text_elements']}")  # New print statement

            print("\nRuns in this paragraph:")
            for j, run in enumerate(paragraph['runs'], start=1):
                print(f"\n  Run {j}:")
                print(f"    Run Attributes: {run['run_attributes']}")
                print(f"    Run Properties: {run['run_properties']}")
                print(f"    Text Content: {run['text']}")
            print("\n" + "-"*50)

In [16]:
def printingXML(xml_file):
    paragraphs_data = extract_paragraph_elements_with_attributes(xml_file)
    for i, paragraph in enumerate(paragraphs_data, start=1):
        print(f"\nParagraph {i}:")
        print(f"Paragraph Attributes: {paragraph['paragraph_attributes']}")
        print(f"Paragraph Properties: {paragraph['paragraph_properties']}")
        print(f"Full Text: {paragraph['full_text']}")
        print(f"Individual Text Elements: {paragraph['text_elements']}")  # New print statement

        print("\nRuns in this paragraph:")
        for j, run in enumerate(paragraph['runs'], start=1):
            print(f"\n  Run {j}:")
            print(f"    Run Attributes: {run['run_attributes']}")
            print(f"    Run Properties: {run['run_properties']}")
            print(f"    Text Content: {run['text']}")
        print("\n" + "-"*50)

In [None]:
    paragraphs_data = extract_paragraph_elements_with_attributes(xml_file)


In [None]:
import xml.etree.ElementTree as ET
import json
import re

class PowerPointTextCorrector:
    def __init__(self, xml_slide, config_languages_path):
        self.slide = ET.fromstring(xml_slide) if isinstance(xml_slide, str) else xml_slide
        self.language_config = self.load_language_config(config_languages_path)

    @staticmethod
    def load_language_config(path):
        with open(path, 'r') as file:
            return json.load(file)

    def correct_slide_text(self):
        for run in self.slide.findall(".//a:r"):
            rPr = run.find("a:rPr")
            if rPr is not None and rPr.get("err") == "1":
                word = run.find("a:t").text
                lang = rPr.get("lang", "en-US")  # Assume default language if lang is missing
                
                # Rule #2: Language Mismatch Check
                if not self.is_language_correct(word, lang):
                    self.update_language(rPr, word)
                    rPr.attrib.pop("err", None)
                else:
                    # Rule #3: Handle Misspelled or Split/Merged Words
                    self.handle_misspelled_or_split_word(run)
        
        # Rule #4: Final Merging of Neighboring Runs
        self.merge_neighboring_runs()

    def is_language_correct(self, word, lang):
        # Check if the word's language matches the lang attribute in the XML
        word_lang = self.detect_language(word)
        return word_lang == lang

    def detect_language(self, word):
        # Detect language code for the word (placeholder for actual language detection logic)
        # Returns a language code string, e.g., 'en-US'
        # Here, assume we just return 'en-US' for simplicity
        return 'en-US'

    def update_language(self, rPr, word):
        correct_lang = self.detect_language(word)
        rPr.set("lang", correct_lang)

    def handle_misspelled_or_split_word(self, run):
        word = run.find("a:t").text
        prev_run, next_run = self.get_neighbor_runs(run)
        
        # Rule #3.2: Accidental Split Check
        if self.check_for_accidental_split(word, prev_run, next_run):
            # Rule #3.3: Property Consistency Check
            if self.properties_match(run, prev_run) or self.properties_match(run, next_run):
                # Rule #3.6: Merge the Runs
                self.merge_runs(run, prev_run, next_run)
            else:
                # Rule #3.5: Keep separate due to special formatting
                run.find("a:rPr").attrib.pop("err", None)
        else:
            # Rule #3.4: Correct the word if truly misspelled
            corrected_word = self.correct_spelling(word)
            run.find("a:t").text = corrected_word
            run.find("a:rPr").attrib.pop("err", None)

    def check_for_accidental_split(self, word, prev_run, next_run):
        # Check if merging with neighboring runs would correct the word
        if prev_run is not None and self.is_valid_merge(word, prev_run.find("a:t").text):
            return True
        if next_run is not None and self.is_valid_merge(word, next_run.find("a:t").text):
            return True
        return False

    def is_valid_merge(self, word, neighbor_word):
        # Combine words and check if the combined word is correctly spelled
        combined_word = word + neighbor_word
        return self.is_correct_spelling(combined_word)

    def properties_match(self, run1, run2):
        if run1 is None or run2 is None:
            return False
        rPr1, rPr2 = run1.find("a:rPr"), run2.find("a:rPr")
        if rPr1 is None or rPr2 is None:
            return False
        
        # Attributes to check
        required_attributes = ["sz", "b", "i", "u", "strike", "highlight", "latin"]
        optional_attributes = ["noProof", "dirty", "err"]

        for attr in required_attributes:
            if rPr1.get(attr) != rPr2.get(attr):
                return False

        for attr in optional_attributes:
            # If both are missing or both have the same value, treat as match
            if rPr1.get(attr, "0") != rPr2.get(attr, "0"):
                return False
        return True

    def merge_runs(self, run, prev_run, next_run):
        combined_text = (prev_run.find("a:t").text if prev_run else '') + run.find("a:t").text + (next_run.find("a:t").text if next_run else '')
        run.find("a:t").text = combined_text
        
        if prev_run is not None:
            run.getparent().remove(prev_run)
        if next_run is not None:
            run.getparent().remove(next_run)
        
        run.find("a:rPr").attrib.pop("err", None)

    def correct_spelling(self, word):
        # Correct the spelling of a word (placeholder for actual spell-check logic)
        return word  # Replace with a spell-checked version of the word

    def get_neighbor_runs(self, run):
        parent = run.getparent()
        run_index = list(parent).index(run)
        prev_run = parent[run_index - 1] if run_index > 0 else None
        next_run = parent[run_index + 1] if run_index < len(parent) - 1 else None
        return prev_run, next_run

    def merge_neighboring_runs(self):
        for paragraph in self.slide.findall(".//a:p"):
            runs = list(paragraph.findall("a:r"))
            for i in range(len(runs) - 1, 0, -1):
                if self.properties_match(runs[i], runs[i - 1]):
                    combined_text = runs[i - 1].find("a:t").text + runs[i].find("a:t").text
                    runs[i - 1].find("a:t").text = combined_text
                    paragraph.remove(runs[i])

    def is_correct_spelling(self, word):
        # Placeholder function for checking spelling correctness
        return True  # Assume all words are spelled correctly for simplicity

In [None]:
#1 check if error is in run makred by a:rPr err="1"
#2 check if the error is due to a missmatch in the a:rPr properties lang to the language of the word. 
#2.2 If True, adjust the lang to language of the word and remove the property error err="1" completely from that run. The language code can be found in the config_languages.json file.
#3 If 2.2 is false, the language is correct, but the word is misspelled. 
#3.2 (merging would make it correkt))check if the word would be correct if either the previouse or the following, or both runs texts would be merge into the word. This would mean, that the word was maybe splitt up by accident 
#3.3 (check if same properties) check if the runs which would make the word correct have the same porperties in a:rPr except for err="1" (consider #4)). Otherwise, maybe the word has a bold letter on pourpose, separating the text into several runs
#3.4 if 3.2 and 3.3 are false, then the word is misspelled and not due to accidental splitting. correct the word and remove the err="1" property from the run. do #4
#3.5 if 3.2 is true, but 3.3 is false, the word is spelled correctly, but has a e.g., bold letter on purpose, separating the text into several runs. 
#3.6 if 3.2 and 3.3 are true, the word was maybe splitted by accident, so merge the runs into one run with the correct word
#3.7 if 3.2 is false and 3.3 is true, then the word is spelled false. correct it and merge like #4

#4 # After correkting check if the neigbouring runs in the same paragraph have the same properties in a:rPr. Thereby, consider noProof="0" and dirty="0" if both are missing completely, as these are the default values. 
#4.2 If so, merge the runs into one run with the correct word or sentence
#4.3 If not, do nothing




In [19]:
if __name__ == "__main__":
    root_folder = "/Users/jwh/Code/Translator"
    pptx_name = "2024-10-23_ASML_Regulation_and_Governance_GenAI.pptx"
    openai_api_key = os.getenv("OPENAI_API_KEY")
    namespaces: dict={'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}

    extract_pptx(root_folder+"/"+pptx_name, root_folder+"/extracted_pptx")
    slide_files = find_slide_files(root_folder+"/extracted_pptx")
  # Usage
    xml_file = slide_files[2]
    printingXML(xml_file)
    
    paragraphs_data = extract_paragraph_elements_with_attributes(xml_file)
    



Paragraph 1:
Paragraph Attributes: {}
Paragraph Properties: {'algn': 'l'}
Full Text: 
Individual Text Elements: []

Runs in this paragraph:

--------------------------------------------------

Paragraph 2:
Paragraph Attributes: {}
Paragraph Properties: {'algn': 'l'}
Full Text: 
Individual Text Elements: []

Runs in this paragraph:

--------------------------------------------------

Paragraph 3:
Paragraph Attributes: {}
Paragraph Properties: {'algn': 'ctr'}
Full Text: Regulatory, … 
Individual Text Elements: ['Regulatory, … ']

Runs in this paragraph:

  Run 1:
    Run Attributes: {}
    Run Properties: {'lang': 'en-GB', 'sz': '900'}
    Text Content: ['Regulatory, … ']

--------------------------------------------------

Paragraph 4:
Paragraph Attributes: {}
Paragraph Properties: {'algn': 'ctr'}
Full Text:  documents
Individual Text Elements: [' documents']

Runs in this paragraph:

  Run 1:
    Run Attributes: {}
    Run Properties: {'lang': 'en-GB', 'sz': '900'}
    Text Content: [

In [18]:
print(slide_files)

['/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide1.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide10.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide11.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide12.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide13.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide14.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide15.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide16.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide17.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide18.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide19.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide2.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide20.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/slides/slide21.xml', '/Users/jwh/Code/Translator/extracted_pptx/ppt/sl