## Project: Create epub from html file.
#### Author: Samuel Moreno

****

We are going to create an EPUB document from a Project Gutenberg HTML file (found at https://www.gutenberg.org). This code produces a basic EPUB file, with a cover image and a table of contents, that you can transfer to your Kindle or any other e-reader.

This project is not intended for commercial use; instead, it provides good practice for us as Python programmers.

Let's import everything we need:

In [179]:
import os
import re
from html import escape

import requests
from bs4 import BeautifulSoup
from bs4 import Comment
from ebooklib import epub

Now we create functions to clean title and file contents:

In [180]:
def sanitize_html(content):
    """Strip markup that tends to break EPUB XHTML rendering.

    Removes, in this order: HTML comments, the XML declaration, the
    DOCTYPE, namespace-prefixed tags (``<o:p>``, ``</v:shape>`` ...),
    ``xmlns:*`` attributes and every self-closing tag.

    Args:
        content: HTML fragment as a string.

    Returns:
        The cleaned HTML string.
    """
    # Comments go first: otherwise the self-closing-tag pattern below can
    # match from a "<!--" up to a "/>" inside the comment, eating the comment
    # opener and leaving a dangling "-->" in the output.
    content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
    content = re.sub(r'<\?xml[^>]*\?>', '', content)
    content = re.sub(r'<!DOCTYPE[^>]*>', '', content)
    # Opening AND closing namespaced tags (o:, v:, w: ...), so that no
    # orphaned closing tag is left behind.
    content = re.sub(r'</?\w+:[^>]*>', '', content)
    content = re.sub(r'xmlns:.*?=["\'](.*?)["\']', '', content)
    # NOTE: this drops every self-closing tag, including <br/>, <hr/> and
    # <img .../>, not only tags without content.
    content = re.sub(r'<[^>]*?/\s*>', '', content)
    return content

def clean_title(title):
    """Turn a raw HTML ``<title>`` into a clean book title / file stem.

    Punctuation is removed (word characters, whitespace and hyphens are
    kept), runs of whitespace - including the newlines that often appear
    inside ``<title>`` - collapse to single spaces, and the
    "The Project Gutenberg eBook of" prefix is dropped regardless of case.

    Args:
        title: Raw title text; may be empty or ``None``.

    Returns:
        The cleaned title, or ``"Default_Title"`` when nothing is left.
    """
    if not title:
        return "Default_Title"
    title = re.sub(r'[^\w\s-]', '', title)
    # Collapse whitespace first so a line break inside the prefix cannot
    # prevent it from matching and no newline ends up in the file name.
    title = ' '.join(title.split())
    title = re.sub(
        r'The Project Gutenberg eBook of\b\s*', '', title, flags=re.IGNORECASE
    ).strip()
    return title if title else "Default_Title"

def clean_toc(soup):
    """Prune the table of contents and remove duplicated front matter.

    Three clean-up passes are applied in place:

    1. After the first ``<h2>CONTENTS</h2>``, every sibling ``<p>`` that
       contains no link is removed, up to the ``LIST OF ILLUSTRATIONS``
       ``<h2>``. NOTE: if no such header exists among the siblings, the walk
       continues to the last sibling.
    2. Every further ``CONTENTS`` ``<h2>`` is removed together with all
       siblings that follow it, up to (not including) the next ``<h2>``.
    3. Only the first ``<h3>`` whose text contains "VOLUME II" is kept.

    Args:
        soup: Parsed BeautifulSoup document; modified in place.

    Returns:
        The same ``soup`` object.
    """
    def _is_contents_header(tag):
        return tag.get_text().strip().upper() == 'CONTENTS'

    def _is_illustrations_header(node):
        # Text nodes have name None, so they never match.
        return (node.name == 'h2'
                and 'LIST OF ILLUSTRATIONS' in node.get_text().upper())

    h2_headers = soup.find_all('h2')
    first_toc = next((h for h in h2_headers if _is_contents_header(h)), None)

    if first_toc is not None:
        # Collect first, delete afterwards: decomposing a node while walking
        # the sibling chain would cut the chain.
        toc_paragraphs = []
        current = first_toc.next_sibling
        while current is not None and not _is_illustrations_header(current):
            if current.name == 'p':
                toc_paragraphs.append(current)
            current = current.next_sibling

        # Keep only the entries that actually link somewhere.
        for paragraph in toc_paragraphs:
            if not paragraph.find('a'):
                paragraph.decompose()

    # Remove any additional CONTENTS sections.
    for header in h2_headers:
        # Identity check on purpose: bs4 tags compare equal when their markup
        # is identical, so `!=` would treat every duplicate header as the
        # first one and skip it.
        if header is first_toc or not _is_contents_header(header):
            continue
        # Start from the node AFTER the header; starting on the header itself
        # (an <h2>) would satisfy the stop condition immediately.
        current = header.next_sibling
        header.decompose()
        while current is not None and current.name != 'h2':
            following = current.next_sibling
            current.extract()  # works for tags and text nodes alike
            current = following

    # Clean up duplicate Volume II headers (word boundary keeps "VOLUME III"
    # from being mistaken for a duplicate).
    volume_headers = soup.find_all(
        'h3',
        string=lambda text: bool(text)
        and re.search(r'\bVOLUME II\b', text.upper()) is not None,
    )
    for header in volume_headers[1:]:
        header.decompose()

    return soup

def create_chapter(title, content, idx, style_path="style/nav.css"):
    """Create an EPUB chapter item.

    Args:
        title: Chapter title; used for the item title and the ``<h2>`` heading
            (HTML-escaped in the markup).
        content: Chapter body as an HTML string; inserted as-is.
        idx: Chapter number, used to build the ``chapter_NNN.xhtml`` file name.
        style_path: Href of the stylesheet to link; a falsy value skips the
            link registration.

    Returns:
        The populated ``epub.EpubHtml`` item (not yet added to a book).
    """
    chapter = epub.EpubHtml(
        title=title,
        file_name=f'chapter_{idx:03d}.xhtml',
        lang='en'
    )
    
    chapter.content = f'''
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <title>{escape(title)}</title>
        <link rel="stylesheet" type="text/css" href="{style_path}"/>
    </head>
    <body>
        <h2>{escape(title)}</h2>
        {content}
    </body>
    </html>
    '''

    # ebooklib regenerates <head> when it serialises the chapter and copies
    # only the <body> children of `content`, so the <link> above is dropped.
    # Registering the stylesheet on the item is what actually links it.
    if style_path:
        chapter.add_link(href=style_path, rel='stylesheet', type='text/css')

    return chapter

def remove_empty_pages(content):
    """Remove whitespace-only top-level children of ``<body>``.

    Args:
        content: Either an HTML string (parsed with ``html.parser``) or an
            already parsed BeautifulSoup document, which is modified in place.

    Returns:
        The BeautifulSoup document. It is returned unchanged when it has no
        ``<body>``.
    """
    # Use the argument instead of whatever global `soup` happens to exist.
    soup = BeautifulSoup(content, 'html.parser') if isinstance(content, str) else content

    body = soup.body
    if body is None:
        return soup

    # Iterate over a snapshot: extract() mutates body.contents, and removing
    # items from the list being iterated would skip the following sibling.
    for element in list(body.contents):
        # `.string` is None for tags with zero or several children; those are
        # kept rather than crashing on `None.strip()`.
        text = element.string
        if text is not None and not text.strip():
            element.extract()

    return soup

1 - Download the html file:

In [181]:
url = "https://www.gutenberg.org/files/6941/6941-h/6941-h.htm"
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of building an EPUB out of an error page.
response.raise_for_status()
# When the server sends no charset, requests falls back to ISO-8859-1 for
# text responses, which garbles UTF-8 books; detect it from the body instead.
if 'charset' not in response.headers.get('Content-Type', '').lower():
    response.encoding = response.apparent_encoding
html_content = response.text

2 - Parse the HTML content and clean up the table of contents (the empty-page cleanup is currently disabled).

In [182]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(html_content,'html.parser')

# Empty-page cleanup is currently disabled:
# clean_content = remove_empty_pages(soup)
# Prune the table of contents and drop duplicated CONTENTS / Volume II headers.
soup = clean_toc(soup)

3 - Extract Title from html 

In [183]:
# Use the text of the document's <title> when present; otherwise fall back
# to a placeholder name.
title = soup.title.string if soup.title and soup.title.string else "Libro de Ash"
# Strip punctuation and the "The Project Gutenberg eBook of " prefix.
title = clean_title(title)
# NOTE(review): the author is hard-coded for this particular book and must be
# changed by hand when another URL is used.
author = "Sir Walter Scott"

4 - Create the EPUB file

In [184]:
book = epub.EpubBook()
# Custom, hard-coded identifier; every book built with this notebook shares it.
book.set_identifier('id123456')
book.set_title(title)
book.add_metadata('DC', 'language', 'en')
# Dublin Core metadata describing the file
book.add_metadata('DC', 'creator', author)
book.add_metadata('DC', 'publisher', 'Project Gutenberg')
# Fixed the garbled rights statement ('Publi    c Domain').
book.add_metadata('DC', 'rights', 'Public Domain')


5 - We add some style to the file

In [185]:
# Stylesheet shared by all chapters: sans-serif body text with a 5% page
# margin, centred headings, and paragraphs with a first-line indent.
style = '''
@namespace epub "http://www.idpf.org/2007/ops";
body {
    font-family: "Helvetica", "Arial", sans-serif;
    line-height: 1.5;
    margin: 5%;
}
h1, h2, h3 { 
    text-align: center;
    margin: 1em 0;
}
p { 
    text-indent: 1em;
    margin: 0.5em 0;
}
'''
# Package the CSS as a book resource. The file name must match the
# `style_path` default of create_chapter ("style/nav.css").
nav_css = epub.EpubItem(
    uid="style_nav",
    file_name="style/nav.css",
    media_type="text/css",
    content=style
)
book.add_item(nav_css)

<ebooklib.epub.EpubItem at 0x1f60b43dc50>

6 - Now we add the Cover Image

In [186]:
# Keep the try body minimal: only reading the file can raise FileNotFoundError.
try:
    with open("OM.jpg", "rb") as file:
        cover_content = file.read()
except FileNotFoundError:
    print("Cover image not found, continuing without cover...")
else:
    # set_cover() itself adds the image item (plus a cover page) to the book.
    # Adding a separate EpubItem with the same file name as well would put
    # "images/cover.jpg" into the package twice.
    book.set_cover("images/cover.jpg", cover_content)

7 - Add chapters to the book

In [187]:
chapters = []
main_content = soup.find('body')
if main_content is not None:
    # Every <h2>/<h3> starts a new chapter.
    chapter_markers = main_content.find_all(['h2', 'h3'])

    for idx, chapter_heading in enumerate(chapter_markers, 1):
        chapter_title = chapter_heading.get_text().strip()
        if not chapter_title:
            continue

        # Collect everything between this heading and the next one.
        chapter_content = []
        current = chapter_heading.next_sibling
        # `is not None` rather than truthiness: an empty text node is falsy
        # and would otherwise end the chapter early.
        while current is not None and current.name not in ('h2', 'h3'):
            if current.name is not None:
                # Tags serialise to properly escaped markup.
                chapter_content.append(str(current))
            elif not isinstance(current, Comment):
                # str() of a text node is the raw, unescaped text, so escape
                # it here. Comment nodes are skipped: str() would turn them
                # into visible chapter text.
                chapter_content.append(escape(current, quote=False))
            current = current.next_sibling

        content_html = ''.join(chapter_content)
        content_html = sanitize_html(content_html)

        # Create chapter
        chapter = create_chapter(chapter_title, content_html, idx)
        book.add_item(chapter)
        chapters.append(chapter)

8 - Generate Chapter List and Spine

In [188]:
# Table of contents: one flat entry per chapter.
book.toc = chapters
# Reading order: the navigation page first, then the chapters.
book.spine = ['nav'] + chapters
# The NCX is the EPUB 2 table of contents that older readers rely on;
# EpubNav is the EPUB 3 navigation document. Add both.
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

<ebooklib.epub.EpubNav at 0x1f60d5a6990>

9 - Write the EPUB file

In [189]:
# Derive the file name from the already cleaned title.
output_filename = f"{clean_title(title)}.epub"
# The options dict is handed to ebooklib's writer unchanged.
# NOTE(review): confirm the exact effect of 'epub3_pages' and
# 'spine_direction' against the ebooklib documentation.
epub.write_epub(output_filename, book, {
    'epub3_pages': False,
    'spine_direction': 'ltr'
})

# Printed unconditionally after write_epub() returns.
print(f"EPUB file '{output_filename}' has been created successfully!")

EPUB file 'Old Mortality by Sir Walter Scott.epub' has been created successfully!
