In [126]:
import os
import dotenv
dotenv.load_dotenv()
from typing import List, Tuple


#pptx transformation imports
import zipfile
import shutil
import warnings

#spellcheck imports
import xml.etree.ElementTree as ET
import re

# translator import
import xml.etree.ElementTree as ET
from openai import OpenAI
from pydantic import BaseModel
import json


In [127]:
def extract_pptx(pptx_path: str, extract_path: str) -> str:
    """Unpack a PPTX archive into a directory.

    Args:
        pptx_path: Path of the .pptx (zip) file to read.
        extract_path: Directory that receives the archive members. It is
            created, including missing parents, before the archive is opened;
            an existing directory is reused.

    Returns:
        ``extract_path``, unchanged, so the call can be chained.
    """
    # Target directory first: it exists even if opening the archive fails.
    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(pptx_path, mode='r') as archive:
        archive.extractall(path=extract_path)

    return extract_path


def compose_pptx(self, source_path: str, output_pptx: str):
    """Compose a PPTX file from a directory containing the XML structure.

    Every file found under ``source_path`` is written to a new
    deflate-compressed zip archive at ``output_pptx``, stored under its path
    relative to ``source_path``.

    Args:
        self: Not used by the function body. Kept so the existing positional
            call signature does not change.
        source_path: Root directory of the extracted PPTX structure.
        output_pptx: Path of the archive to create. The file is opened in
            ``'w'`` mode, so an existing file is replaced.
    """
    # dirname is '' for a bare file name and os.makedirs('') raises, so only
    # create the parent directory when there actually is one.
    output_dir = os.path.dirname(output_pptx)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Resolved once so the archive being written can be recognised when it
    # lives inside source_path.
    output_abs = os.path.abspath(output_pptx)

    with zipfile.ZipFile(output_pptx, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(source_path):
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == output_abs:
                    # Never add the half-written output archive to itself.
                    continue
                arcname = os.path.relpath(file_path, source_path)
                zf.write(file_path, arcname)

In [128]:
def extract_paragraph_elements_with_attributes(xml_file):
    """Collect attributes and text for every DrawingML paragraph in an XML file.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A list with one dict per ``a:p`` element found anywhere in the
        document, in document order. Each dict has the keys:

        - ``paragraph_attributes``: attributes of the ``a:p`` element.
        - ``paragraph_properties``: attributes of its first ``a:pPr``
          descendant, or ``{}`` when there is none.
        - ``runs``: one dict per ``a:r`` descendant with ``run_attributes``,
          ``run_properties`` (first ``a:rPr`` descendant or ``{}``), ``text``
          (list of ``a:t`` strings) and ``text_as_string``.
        - ``full_text``: the run texts concatenated in order.
        - ``text_elements``: flat list of every ``a:t`` string in the paragraph.

        Attribute dicts are copies, so changing them does not alter the
        parsed tree.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    default_namespace = {
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        'p': 'http://schemas.openxmlformats.org/presentationml/2006/main',
    }

    def attributes_of(parent, tag):
        """Return a copy of the first matching descendant's attributes, or {}."""
        element = parent.find(tag, default_namespace)
        return dict(element.attrib) if element is not None else {}

    all_paragraphs = []

    for paragraph in root.findall('.//a:p', default_namespace):
        runs_data = []
        text_elements = []

        for run in paragraph.findall('.//a:r', default_namespace):
            # An empty <a:t/> has text None; normalise it to "".
            run_texts = [elem.text or "" for elem in run.findall('.//a:t', default_namespace)]

            runs_data.append({
                "run_attributes": dict(run.attrib),
                "run_properties": attributes_of(run, './/a:rPr'),
                "text": run_texts,
                "text_as_string": f"[{' '.join(run_texts)}]"
            })
            text_elements.extend(run_texts)

        all_paragraphs.append({
            "paragraph_attributes": dict(paragraph.attrib),
            "paragraph_properties": attributes_of(paragraph, './/a:pPr'),
            "runs": runs_data,
            # Runs already carry their own whitespace, so they are joined
            # without a separator; adding one would break words that are
            # split across runs (e.g. by a formatting change mid-word).
            "full_text": "".join(text_elements),
            "text_elements": text_elements
        })

    return all_paragraphs


In [129]:
def find_slide_files(root_folder: str) -> List[str]:
    """Find all slide XML files in the folder structure.

    A file counts as a slide when its name is ``slide<number>.xml``; names
    such as ``slideLayout1.xml`` or ``slide.xml`` are ignored.

    Args:
        root_folder: Directory to search recursively.

    Returns:
        Full paths of the matching files, ordered by containing directory and
        then by slide number, so ``slide2.xml`` comes before ``slide10.xml``.
        The list is empty when nothing matches.
    """
    numbered_slides = []
    for root, _, files in os.walk(root_folder):
        for file in files:
            if file.startswith('slide') and file.endswith('.xml'):
                number_part = file[5:-4]
                # isdecimal() rather than isdigit(): every decimal string is
                # accepted by int(), while isdigit() also admits characters
                # such as superscripts that int() rejects.
                if number_part.isdecimal():
                    numbered_slides.append(
                        (root, int(number_part), os.path.join(root, file))
                    )
    # Sorting on the numeric slide index avoids the lexicographic order
    # slide1, slide10, slide2 that a plain sort of the paths produces.
    numbered_slides.sort()
    return [path for _, _, path in numbered_slides]

def extract_paragraphs(xml_file: str, namespaces: dict) -> List[ET.Element]:
    """Return every ``a:p`` element found in an XML file.

    Args:
        xml_file: Path of the XML document to parse.
        namespaces: Prefix-to-URI mapping used to resolve the ``a:`` prefix.

    Returns:
        The matching elements at any depth below the root, in document order.
    """
    document_root = ET.parse(xml_file).getroot()
    matches = document_root.findall('.//a:p', namespaces)
    return matches

# Print the attribute mapping of each paragraph element, one per line.
def extract_information_from_paragraphs(paragraphs: List[ET.Element]):
    """Print ``attrib`` of every element in ``paragraphs``, in order.

    Args:
        paragraphs: Elements whose attribute dicts are written to stdout.
    """
    for attributes in (element.attrib for element in paragraphs):
        print(attributes)



In [130]:
if __name__ == "__main__":
    # Run configuration. root_folder is an absolute path on the author's
    # machine; the API key is read from the environment.
    root_folder = "/Users/jwh/Code/Translator"
    pptx_name = "2024-10-23_ASML_Regulation_and_Governance_GenAI.pptx"
    openai_api_key = os.getenv("OPENAI_API_KEY")
    namespaces: dict = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}

    # Unpack the deck next to the source file, then list its slide XML files.
    extracted_dir = extract_pptx(
        f"{root_folder}/{pptx_name}",
        f"{root_folder}/extracted_pptx",
    )
    slide_files = find_slide_files(extracted_dir)

    # Usage: dump the third entry of the slide list paragraph by paragraph.
    xml_file = slide_files[2]
    paragraphs_data = extract_paragraph_elements_with_attributes(xml_file)
    for i, paragraph in enumerate(paragraphs_data, start=1):
        # One print call per section; sep="\n" yields the same lines as
        # separate print statements would.
        print(
            f"\nParagraph {i}:",
            f"Paragraph Attributes: {paragraph['paragraph_attributes']}",
            f"Paragraph Properties: {paragraph['paragraph_properties']}",
            f"Full Text: {paragraph['full_text']}",
            f"Individual Text Elements: {paragraph['text_elements']}",
            "\nRuns in this paragraph:",
            sep="\n",
        )
        for j, run in enumerate(paragraph['runs'], start=1):
            print(
                f"\n  Run {j}:",
                f"    Run Attributes: {run['run_attributes']}",
                f"    Run Properties: {run['run_properties']}",
                f"    Text Content: {run['text']}",
                sep="\n",
            )
        # Separator line between paragraphs.
        print("\n" + "-"*50)


Paragraph 1:
Paragraph Attributes: {}
Paragraph Properties: {'algn': 'l'}
Full Text: 
Individual Text Elements: []

Runs in this paragraph:

--------------------------------------------------

Paragraph 2:
Paragraph Attributes: {}
Paragraph Properties: {'algn': 'l'}
Full Text: 
Individual Text Elements: []

Runs in this paragraph:

--------------------------------------------------

Paragraph 3:
Paragraph Attributes: {}
Paragraph Properties: {'algn': 'ctr'}
Full Text: Regulatory, … 
Individual Text Elements: ['Regulatory, … ']

Runs in this paragraph:

  Run 1:
    Run Attributes: {}
    Run Properties: {'lang': 'en-GB', 'sz': '900'}
    Text Content: ['Regulatory, … ']

--------------------------------------------------

Paragraph 4:
Paragraph Attributes: {}
Paragraph Properties: {'algn': 'ctr'}
Full Text:  documents
Individual Text Elements: [' documents']

Runs in this paragraph:

  Run 1:
    Run Attributes: {}
    Run Properties: {'lang': 'en-GB', 'sz': '900'}
    Text Content: [