In [1]:
from ebooklib import epub
import ebooklib
from bs4 import BeautifulSoup  # For parsing HTML content

# Load the EPUB file
fileIn = "Leadership_Kissinger.epub"
book = epub.read_epub(fileIn)

# Name -> value lookup for every ITEM_* constant that ebooklib exposes.
# Not needed for the filtering below any more; kept so that any code relying on it keeps working.
item_label_number = {name: getattr(ebooklib, name) for name in dir(ebooklib) if name.startswith('ITEM_')}

# Keep only the items whose type is ITEM_DOCUMENT, in the order the book lists them.
chapters = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))

# Parse every document and keep the parsed tree only when it contains
# something other than whitespace once the markup is stripped.
chapters_html = []
for chapter in chapters:
    parsed_content = BeautifulSoup(chapter.get_content(), 'html.parser')
    if parsed_content.get_text().strip():
        chapters_html.append(parsed_content)

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


In [2]:
def extract_elements(chapter_soup):
    """Classify the tags of interest in a parsed chapter.

    The tags are visited in the order `find_all` yields them. Returns a list
    of (element_type, element) tuples where element_type is 'title' (h1/h2),
    'paragraph' (p), 'note' (an <a> with at least one class containing
    'Reference'), 'media' (img) or 'quote' (blockquote). Any other tag,
    including an <a> without such a class, is left out.
    """
    # Tags whose category is decided by the tag name alone
    kind_by_tag = {
        'h1': 'title',
        'h2': 'title',
        'p': 'paragraph',
        'img': 'media',
        'blockquote': 'quote',
    }

    classified = []
    for node in chapter_soup.find_all(['h1', 'h2', 'p', 'a', 'img', 'blockquote']):
        if node.name == 'a':
            # Links only count when one of their classes mentions 'Reference'
            if not (node.get('class') and any('Reference' in css for css in node['class'])):
                continue
            kind = 'note'
        elif node.name in kind_by_tag:
            kind = kind_by_tag[node.name]
        else:
            continue  # not one of our categories

        classified.append((kind, node))

    return classified

# Run the extractor over every parsed chapter, keeping the chapter order
parsed_chapters = [extract_elements(chapter_soup) for chapter_soup in chapters_html]

In [3]:
# Display the (type, element) list stored at position 17 of parsed_chapters
parsed_chapters[17]

[('paragraph',
  <p class="xFootnote-Text"><span class="Footnote-Reference"><a href="08_Introduction.xhtml#footnote-001-backlink" role="doc-backlink" title="footnote reference">*</a></span> Established in the seventeenth century after the Thirty Years’ War, the Westphalian system grouped the survivor states of that conflict on the basis of national interest and sovereignty to replace the religious or dynastic foundation of the preceding medieval period.</p>)]

In [4]:
# Positions inside parsed_chapters (presumably picked by inspecting this EPUB - verify if the file changes).
# Introduction: becomes the first entry of chapter_indexes; its title is read from its first element.
intro_index = 5
# Inclusive start and end of the contiguous range added to chapter_indexes after the introduction.
first_chapter_index = 6
last_chapter_index = 12
# Document whose elements are parsed into notes_by_chapter.
notes_chapter_index = 14
# Starting value of the footnote cursor handed to fill_notes.
first_footnote_index = 17

In [5]:
# Parse Notes by Chapter Title
#
# Walk the notes document once. A 'title' entry closes the group collected so
# far (only if that group saw at least one entry); every other entry belongs
# to the running group. Within a group only the 1st, 3rd, 5th, ... entries
# are stored, keyed "[1]", "[2]", "[3]", ...; the entries in between are skipped.

notes_by_chapter = {}

chapter_notes = {}
entries_seen = 0  # non-title entries consumed since the last flush
chapter_index = 0
for kind, element in parsed_chapters[notes_chapter_index]:
    if kind == 'title':
        if entries_seen:
            notes_by_chapter[chapter_index] = chapter_notes
            chapter_notes = {}
            entries_seen = 0
            chapter_index += 1
        continue
    if not kind:
        continue
    if entries_seen % 2 == 0:
        # even slot: this entry carries the note text
        chapter_notes[f"[{entries_seen // 2 + 1}]"] = element.get_text()
    entries_seen += 1

# Flush the trailing group, which has no heading after it
if entries_seen:
    notes_by_chapter[chapter_index] = chapter_notes
    entries_seen = 0


In [6]:
# intro_index first, then every position from first_chapter_index through last_chapter_index
chapter_indexes = [intro_index, *range(first_chapter_index, last_chapter_index + 1)]
# Keys of notes_by_chapter in insertion order
notes_indexes = list(notes_by_chapter)


In [7]:
def _inline_marker(tag, marker, body):
    """Rewrite `tag` so every `marker` in its text becomes " (marker : ' body ') "."""
    # .string is falsy when the tag has no single text child; use the flattened text then.
    text = tag.string if tag.string else tag.get_text()
    # For a BeautifulSoup Tag, assigning .string replaces the tag's contents with this one string.
    tag.string = text.replace(marker, f" ({marker} : ' {body} ') ")


def fill_notes(index, current_footnote_index, updated_chapters, notes_by_chapter,
               chapter_positions=None, note_keys=None):
    """Inline endnotes and footnotes into the paragraphs of one chapter.

    Args:
        index: Position of the chapter in `chapter_positions` (and of its notes
            in `note_keys`).
        current_footnote_index: Position in `updated_chapters` of the next
            unused footnote document; the text of its first element is used as
            the footnote body.
        updated_chapters: List of chapters, each a list of (type, tag) tuples.
            The processed chapter is written back into this list in place and
            its paragraph tags are modified in place.
        notes_by_chapter: Mapping of notes key -> {"[n]": note text}.
        chapter_positions: Positions of the chapters inside `updated_chapters`.
            Defaults to the module-level `chapter_indexes`.
        note_keys: Keys of `notes_by_chapter`, aligned with `chapter_positions`.
            Defaults to the module-level `notes_indexes`.

    Returns:
        (current_footnote_index, updated_chapters): the footnote cursor advanced
        by the number of footnotes consumed, and the same list object that was
        passed in.

    How elements are interpreted:
        * A 'note' element whose expected id "[n]" (n counts up from 1) exists
          in the chapter's notes is an endnote; any other 'note' element is
          treated as a footnote belonging to the most recent paragraph whose
          text contains "[*]".
        * A paragraph is linked to every consecutive endnote id, starting at
          the currently expected one, that occurs in its text. This lets
          several markers in the same paragraph all get inlined.
        * 'note' elements are dropped from the chapter at the end.
    """
    if chapter_positions is None:
        chapter_positions = chapter_indexes
    if note_keys is None:
        note_keys = notes_indexes

    chapter = updated_chapters[chapter_positions[index]]
    chapter_notes = notes_by_chapter[note_keys[index]]

    next_note_number = 1
    last_paragraph_with_footnote = None
    footnote_paragraphs = []   # one paragraph position (or None) per footnote found
    note_paragraph_map = []    # (paragraph position, "[n]") pairs
    footnote_contents = {}     # paragraph position -> formatted footnote text
    note_contents = {}         # "[n]" -> note text, filled when the note element is met

    # First pass: collect footnotes and notes
    for position, (kind, tag) in enumerate(chapter):
        if kind == 'paragraph':
            text = tag.get_text()
            if '[*]' in text:
                last_paragraph_with_footnote = position
            # Claim the expected id and every directly following id present in this text.
            probe = next_note_number
            while f"[{probe}]" in text:
                note_paragraph_map.append((position, f"[{probe}]"))
                probe += 1
        elif kind == "note":
            note_id = f"[{next_note_number}]"
            if note_id in chapter_notes:
                note_contents[note_id] = chapter_notes[note_id]
                next_note_number += 1
            else:
                footnote_text = updated_chapters[current_footnote_index][0][1].get_text()
                # NOTE(review): the opening "(" below is never closed; left as-is
                # because the exact output format may be relied upon downstream.
                footnote_contents[last_paragraph_with_footnote] = f"([*] -> '''<i>{footnote_text}</i>'''"
                footnote_paragraphs.append(last_paragraph_with_footnote)
                current_footnote_index += 1

    # Process footnotes (a footnote met before any "[*]" paragraph has no target and is skipped)
    for para_index in footnote_paragraphs:
        if para_index is not None:
            _inline_marker(chapter[para_index][1], "[*]", footnote_contents[para_index])

    # Process notes; ids claimed by a paragraph but never matched by a note element are left alone
    for para_index, note_id in note_paragraph_map:
        if note_id in note_contents:
            _inline_marker(chapter[para_index][1], note_id, note_contents[note_id])

    # Remove original note items
    updated_chapters[chapter_positions[index]] = [
        (type_, content) for type_, content in chapter if type_ != 'note'
    ]
    return current_footnote_index, updated_chapters


In [8]:

# Thread the footnote cursor through every chapter, in chapter_indexes order.
current_footnote_index = first_footnote_index
# Same list object as parsed_chapters (no copy): fill_notes writes its results into it.
updated_chapters_with_notes = parsed_chapters
for i in range(len(chapter_indexes)):
    current_footnote_index, updated_chapters_with_notes = fill_notes(
        i,
        current_footnote_index,
        updated_chapters_with_notes,
        notes_by_chapter,
    )

In [9]:

# Chapter title -> list of (type, tag) elements for that chapter
book_by_chapters = {}

for chapter_index in chapter_indexes:
    elements = updated_chapters_with_notes[chapter_index]
    if chapter_index in (intro_index, last_chapter_index):
        # Title is the first element; the stored content starts right after it.
        title_position, content_start = 0, 1
    else:
        # The proper title is only the third element; the stored content starts AT it.
        # NOTE(review): unlike the branch above, the title element stays in the content here - confirm this is intended.
        title_position, content_start = 2, 2
    chapter_title = elements[title_position][1].get_text()
    book_by_chapters[chapter_title] = elements[content_start:]

In [10]:
from bs4 import BeautifulSoup, Tag
import json

def tag_to_dict(tag):
    """Turn a Tag into nested plain dictionaries.

    A Tag becomes {'name', 'attrs', 'contents'}, with each child converted the
    same way. Anything that is not a Tag is returned as str(value).
    """
    if isinstance(tag, Tag):
        node = {'name': tag.name, 'attrs': dict(tag.attrs)}
        children = []
        for child in tag.contents:
            children.append(tag_to_dict(child))
        node['contents'] = children
        return node

    return str(tag)

def dict_to_tag(d):
    """Rebuild a tag from a {'name', 'attrs', 'contents'} dictionary.

    Values that are not dictionaries are handed back untouched, which is also
    how plain-text children end up appended as they are.
    """
    if not isinstance(d, dict):
        return d

    # A throw-away empty document serves as the factory for the new tag
    tag = BeautifulSoup("", 'html.parser').new_tag(d['name'])

    # Copy the attributes over one by one
    attributes = d['attrs']
    for attr_name in attributes:
        tag[attr_name] = attributes[attr_name]

    # Children: nested dictionaries are rebuilt recursively, the rest pass through
    for child in d['contents']:
        tag.append(dict_to_tag(child))

    return tag

def save_chapters_to_json(book_by_chapters, filename):
    """Write the chapter mapping to `filename` as UTF-8 JSON (indent 2, non-ASCII kept).

    Each chapter is a sequence of (type, payload) pairs. Payloads that are Tag
    instances are converted with tag_to_dict first; any other payload is
    written unchanged.
    """
    def _encode(entry):
        kind, payload = entry
        if isinstance(payload, Tag):
            return (kind, tag_to_dict(payload))
        return (kind, payload)

    encoded = {}
    for title, entries in book_by_chapters.items():
        encoded[title] = list(map(_encode, entries))

    with open(filename, 'w', encoding='utf-8') as fp:
        json.dump(encoded, fp, ensure_ascii=False, indent=2)

def load_chapters_from_json(filename):
    """Read a chapter mapping from a UTF-8 JSON file.

    Returns {title: [(type, payload), ...]}. Payloads stored as dictionaries
    are passed through dict_to_tag; every other payload is kept as loaded.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        raw_chapters = json.load(fp)

    chapters = {}
    for title, entries in raw_chapters.items():
        rebuilt = []
        for kind, payload in entries:
            if isinstance(payload, dict):
                payload = dict_to_tag(payload)
            rebuilt.append((kind, payload))
        chapters[title] = rebuilt

    return chapters

In [11]:
# Round-trip the book through JSON: write it out, then read it straight back
json_path = 'Leadership_Kissinger.json'
save_chapters_to_json(book_by_chapters, json_path)
restored_book = load_chapters_from_json(json_path)

In [12]:
# Page skeleton for str.format: {title} and {content} are the only placeholders,
# every literal CSS brace is doubled.
_CHAPTER_HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{title}</title>
    <style>
        body {{
            max-width: 800px;
            margin: 40px auto;
            padding: 0 20px;
            font-family: Georgia, serif;
            line-height: 1.6;
            color: #333;
        }}
        h1 {{
            font-size: 2.5em;
            margin-bottom: 1em;
            text-align: center;
            color: #2c3e50;
        }}
        h2 {{
            font-size: 1.8em;
            margin-top: 1.5em;
            color: #34495e;
        }}
        .x04-Body-Text, .x04-Body-Text-FL {{
            margin-bottom: 1.2em;
        }}
        .note {{
            font-size: 0.9em;
            color: #666;
            margin-left: 2em;
            padding-left: 1em;
            border-left: 3px solid #ddd;
        }}
        i {{
            font-style: italic;
        }}
        .Endnote-Reference {{
            vertical-align: super;
            font-size: 0.8em;
            color: #666;
            text-decoration: none;
        }}
        @media (max-width: 600px) {{
            body {{
                margin: 20px auto;
                padding: 0 10px;
            }}
        }}
    </style>
</head>
<body>
    <h1>{title}</h1>
    <main>
        {content}
    </main>
</body>
</html>"""


def chapter_to_html(chapter_title, chapter_content):
    """
    Convert a chapter's content to a complete HTML file.

    The title is plain text and is HTML-escaped (&, <, >) before it is placed
    in <title> and <h1>. Content items are already markup and are inserted
    verbatim, one per line.

    Args:
        chapter_title (str): The title of the chapter
        chapter_content (list): List of tuples containing (type, BeautifulSoup Tag);
            the second member may also be a plain string. Items of type 'note'
            are wrapped as "(<i>...</i>)".

    Returns:
        str: Complete HTML document as a string
    """
    from html import escape  # local import: only this function needs it

    rendered = []
    for content_type, content in chapter_content:
        if content_type == 'note':
            rendered.append(f"(<i>{content}</i>)")
        else:
            # str() serialises a Tag to markup and leaves strings as they are
            rendered.append(str(content))

    return _CHAPTER_HTML_TEMPLATE.format(
        title=escape(str(chapter_title), quote=False),
        content='\n'.join(rendered),
    )

def save_chapter_as_html(chapter_title, chapter_content, filename):
    """
    Render one chapter with chapter_to_html and write the result to disk.

    Args:
        chapter_title (str): The title of the chapter
        chapter_content (list): List of tuples containing (type, BeautifulSoup Tag)
        filename (str): Output filename; written as UTF-8, replacing any existing file
    """
    # Render before opening the file, so a rendering error leaves the file untouched
    document = chapter_to_html(chapter_title, chapter_content)

    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.write(document)

# Export the 'Introduction' chapter of the restored book as a standalone page
save_chapter_as_html(
    chapter_title='Introduction',
    chapter_content=restored_book['Introduction'],
    filename='introduction.html',
)

In [13]:
# Prints a single empty line and nothing else
print()


