In [1]:
import os
import xml.etree.ElementTree as ET
from collections import defaultdict

def parse_story_files(story_folder):
    story_text_map = {}
    for file in os.listdir(story_folder):
        if file.startswith("Story_") and file.endswith(".xml"):
            tree = ET.parse(os.path.join(story_folder, file))
            root = tree.getroot()
            story_id = root.attrib.get("Self")
            texts = []
            for content in root.iter():
                if content.tag.endswith("Content"):
                    texts.append(content.text)
            if texts:
                story_text_map[story_id] = "\n".join(filter(None, texts))
    return story_text_map

def parse_spread_files(spread_folder):
    frames = []
    for file in os.listdir(spread_folder):
        if file.startswith("Spread_") and file.endswith(".xml"):
            tree = ET.parse(os.path.join(spread_folder, file))
            root = tree.getroot()

            for textframe in root.findall(".//TextFrame"):
                frame_info = {
                    "type": "text",
                    "id": textframe.attrib.get("Self"),
                    "story_id": textframe.attrib.get("ParentStory"),
                }
                frames.append(frame_info)

            for image in root.findall(".//Image"):
                image_id = image.attrib.get("Self")
                link = image.find(".//Link")
                link_uri = link.attrib.get("LinkResourceURI") if link is not None else "unknown"
                frame_info = {
                    "type": "image",
                    "id": image_id,
                    "link_uri": link_uri
                }
                frames.append(frame_info)
    return frames

def extract_idml_content(idml_root):
    story_folder = os.path.join(idml_root, "Stories")
    spread_folder = os.path.join(idml_root, "Spreads")

    story_map = parse_story_files(story_folder)
    frames = parse_spread_files(spread_folder)

    output = []
    for frame in frames:
        if frame["type"] == "text":
            text = story_map.get(frame["story_id"], "[No text found]")
            output.append({
                "type": "text",
                "story_id": frame["story_id"],
                "frame_id": frame["id"],
                "content": text
            })
        elif frame["type"] == "image":
            output.append({
                "type": "image",
                "frame_id": frame["id"],
                "link_uri": frame["link_uri"]
            })

    return output

# === CONFIGURATION ===
idml_folder = "mc2410_068069ukCAa1a"  # Change this to your folder path

# === EXECUTION ===
result = extract_idml_content(idml_folder)

# === DISPLAY ===
for item in result:
    if item["type"] == "text":
        print(f"\n📝 Text Frame [{item['frame_id']}] (Story {item['story_id']}):\n{item['content']}")
    elif item["type"] == "image":
        print(f"\n🖼️ Image Frame [{item['frame_id']}]: linked to {item['link_uri']}")



📝 Text Frame [u1e1b] (Story u1e1e):
[No text found]

📝 Text Frame [u1be0] (Story u1be3):
[No text found]

📝 Text Frame [u1bf8] (Story u1bfb):
[No text found]

📝 Text Frame [u1db7] (Story u1dba):
[No text found]

📝 Text Frame [u1dcf] (Story u1dd2):
[No text found]

📝 Text Frame [u1de7] (Story u1dea):
[No text found]

📝 Text Frame [u1dff] (Story u1e03):
[No text found]

📝 Text Frame [u1e33] (Story u1e36):
[No text found]

📝 Text Frame [u1e4b] (Story u1e36):
[No text found]

📝 Text Frame [u1e4f] (Story u1e54):
[No text found]

📝 Text Frame [u1ea7] (Story u1eaa):
[No text found]

📝 Text Frame [u1ebf] (Story u1ec2):
[No text found]

📝 Text Frame [u1ed7] (Story u1eda):
[No text found]

📝 Text Frame [u1efb] (Story u1efe):
[No text found]

📝 Text Frame [u1f13] (Story u1f16):
[No text found]

📝 Text Frame [u1f2b] (Story u1f2e):
[No text found]

📝 Text Frame [u1f46] (Story u1f49):
[No text found]

📝 Text Frame [u1f5e] (Story u1f63):
[No text found]

📝 Text Frame [u1f79] (Story u1f7c):
[No text 

In [None]:
import zipfile
import xml.etree.ElementTree as ET
import os
import argparse
import shutil
import tempfile
import re

def extract_idml(idml_path, extract_to):
    """Extracts the contents of an IDML file (which is a zip archive)."""
    try:
        with zipfile.ZipFile(idml_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        # print(f"Successfully extracted {idml_path} to {extract_to}")
        return True
    except zipfile.BadZipFile:
        print(f"Error: {idml_path} is not a valid zip file or is corrupted.")
        return False
    except FileNotFoundError:
        print(f"Error: IDML file not found at {idml_path}")
        return False
    except Exception as e:
        print(f"An error occurred during extraction: {e}")
        return False

def get_story_text(story_path):
    """Parses a Story XML file and extracts all text content."""
    text_content = []
    try:
        tree = ET.parse(story_path)
        root = tree.getroot()
        # Find all Content tags anywhere in the story file
        # Stories can have complex structures (ParagraphStyleRange, CharacterStyleRange, etc.)
        for content_element in root.findall('.//{*}Content'):
            if content_element.text:
                text_content.append(content_element.text.strip())
        # Also find Br tags for line breaks
        for br_element in root.findall('.//{*}Br'):
             text_content.append('\n') # Represent break as newline

        # Join the extracted text pieces. Handle potential None values just in case.
        full_text = "".join(filter(None, text_content))
        # Clean up multiple consecutive newlines that might result from <Br/> tags
        full_text = re.sub(r'\n+', '\n', full_text).strip()
        return full_text
    except ET.ParseError:
        print(f"Warning: Could not parse Story file: {story_path}")
        return ""
    except FileNotFoundError:
        print(f"Warning: Story file not found: {story_path}")
        return ""
    except Exception as e:
        print(f"An error occurred parsing story {story_path}: {e}")
        return ""

def get_spread_content(spread_path, stories_dir):
    """Parses a Spread XML file to find image links and story references."""
    image_links = set()
    story_texts = {} # Dictionary to store {story_id: text}
    referenced_story_ids = set()

    try:
        tree = ET.parse(spread_path)
        root = tree.getroot()

        # Find all Link elements with a LinkResourceURI (likely images)
        # These can be nested within Image, Rectangle, Group etc.
        for link_element in root.findall('.//{*}Link[@LinkResourceURI]'):
            uri = link_element.get('LinkResourceURI')
            if uri:
                image_links.add(uri)

        # Find all TextFrame elements with a ParentStory attribute
        for text_frame in root.findall('.//{*}TextFrame[@ParentStory]'):
            story_id = text_frame.get('ParentStory')
            if story_id:
                referenced_story_ids.add(story_id)

        # Extract text from referenced stories
        for story_id in referenced_story_ids:
            story_filename = f"Story_{story_id}.xml"
            story_path = os.path.join(stories_dir, story_filename)
            story_texts[story_id] = get_story_text(story_path)

        return image_links, story_texts

    except ET.ParseError:
        print(f"Error: Could not parse Spread file: {spread_path}")
        return set(), {}
    except FileNotFoundError:
        print(f"Error: Spread file not found: {spread_path}")
        return set(), {}
    except Exception as e:
        print(f"An error occurred parsing spread {spread_path}: {e}")
        return set(), {}

def find_spread_files(spreads_dir):
    """Finds all Spread XML files in the Spreads directory."""
    spread_files = []
    if not os.path.isdir(spreads_dir):
        print(f"Error: Spreads directory not found: {spreads_dir}")
        return []
    for filename in os.listdir(spreads_dir):
        if filename.startswith("Spread_") and filename.endswith(".xml"):
            spread_files.append(os.path.join(spreads_dir, filename))
    return spread_files

def main():
    parser = argparse.ArgumentParser(description="Extract text and image links from an Adobe IDML file.")
    parser.add_argument("idml_file", help="Path to the .idml file.")
    parser.add_argument("-s", "--spread", help="Specific spread file name to process (e.g., Spread_u160a.xml). If not provided, process all spreads.", default=None)

    args = parser.parse_args()

    # Create a temporary directory for extraction
    temp_dir = tempfile.mkdtemp(prefix="idml_extract_")
    print(f"Extracting IDML to temporary directory: {temp_dir}")

    if not extract_idml(args.idml_file, temp_dir):
        shutil.rmtree(temp_dir) # Clean up temp dir on extraction failure
        return
    
    spreads_dir = os.path.join(temp_dir, "Spreads")
    stories_dir = os.path.join(temp_dir, "Stories")

    if not os.path.isdir(stories_dir):
         print(f"Error: Stories directory not found in the extracted IDML structure: {stories_dir}")
         shutil.rmtree(temp_dir)
         return

    spreads_to_process = []
    if args.spread:
        # Process only the specified spread
        specific_spread_path = os.path.join(spreads_dir, args.spread)
        if os.path.exists(specific_spread_path):
            spreads_to_process.append(specific_spread_path)
        else:
            print(f"Error: Specified spread file not found: {specific_spread_path}")
    else:
        # Process all spreads found
        spreads_to_process = find_spread_files(spreads_dir)
        if not spreads_to_process:
             print("No spread files found to process.")


    all_images = set()
    all_texts = {} # Using spread filename as key: {spread_filename: {story_id: text}}

    # Process the selected spread files
    for spread_path in spreads_to_process:
        print(f"\n--- Processing Spread: {os.path.basename(spread_path)} ---")
        image_links, story_texts = get_spread_content(spread_path, stories_dir)

        if image_links:
            print("\nImage Links Found:")
            for link in sorted(list(image_links)):
                print(f"- {link}")
            all_images.update(image_links) # Add to overall set

        if story_texts:
            print("\nText Content Found:")
            spread_filename = os.path.basename(spread_path)
            all_texts[spread_filename] = {}
            for story_id, text in story_texts.items():
                 if text: # Only print if text was actually extracted
                    print(f"\n[From Story_{story_id}.xml]:")
                    print(text)
                    all_texts[spread_filename][story_id] = text
                 else:
                     print(f"\n[No text content extracted from Story_{story_id}.xml]")
        else:
             print("\nNo text frames referencing stories found in this spread.")


    # Optional: Print summary at the end
    print("\n--- Extraction Summary ---")
    if all_images:
        print(f"\nTotal Unique Image Links Found Across Processed Spreads ({len(all_images)}):")
        for link in sorted(list(all_images)):
            print(f"- {link}")
    else:
        print("\nNo image links found in the processed spreads.")

    if all_texts:
        print(f"\nTotal Text Stories Found Across Processed Spreads:")
        for spread_file, stories in all_texts.items():
             print(f"  Spread: {spread_file} ({len(stories)} stories with content)")
    else:
        print("\nNo text content found in the processed spreads.")


    # Clean up the temporary directory
    try:
        print(f"\nCleaning up temporary directory: {temp_dir}")
        shutil.rmtree(temp_dir)
    except Exception as e:
        print(f"Warning: Could not remove temporary directory {temp_dir}: {e}")

if __name__ == "__main__":
    main()
