In [118]:
from ebooklib import epub
import ebooklib
from bs4 import BeautifulSoup
import json
from typing import Dict, List, Tuple, Optional, Any

class EPUBProcessor:
    """Process EPUB files into structured content with proper handling of chapters,
    footnotes, and references.

    Chapter content is represented as a list of element tuples. Tuples produced
    by ``extract_elements`` have four fields, ``(element_type, element,
    classes, element_id)``; the methods that consume chapters only rely on the
    first two fields, so shorter ``(element_type, element)`` tuples are
    accepted as well.
    """

    # Tags scanned in each chapter, in the form passed to ``find_all``.
    _SCANNED_TAGS = ['h1', 'h2', 'p', 'a', 'img', 'blockquote']

    # Element type reported for each tag. ``a`` is absent on purpose: anchors
    # are only reported (as 'note') when one of their classes contains
    # 'Reference'.
    _TAG_TYPES = {
        'h1': 'title',
        'h2': 'title',
        'p': 'paragraph',
        'img': 'media',
        'blockquote': 'quote',
    }

    def __init__(self, epub_path: str):
        """Initialize the processor with an EPUB file path.

        Nothing is read from disk here; call ``load_epub`` to do that.

        Args:
            epub_path (str): Path to the EPUB file
        """
        self.epub_path = epub_path
        self.book: Optional[Any] = None
        self.chapters: List[Any] = []
        self.chapters_html: List[Any] = []
        self.book_by_chapters: Dict[str, List[Tuple[Any, ...]]] = {}
        self.notes_by_chapter: Dict[int, Dict[str, str]] = {}

    def load_epub(self) -> None:
        """Load and parse the EPUB file.

        Raises:
            Exception: If reading or parsing fails; the original error is
                attached as ``__cause__``.
        """
        try:
            self.book = epub.read_epub(self.epub_path)
            self._extract_chapters()
            self._parse_chapters()
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise Exception(f"Error loading EPUB file: {str(e)}") from e

    def _extract_chapters(self) -> None:
        """Collect the book items whose type is ``ebooklib.ITEM_DOCUMENT``."""
        self.chapters = list(self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT))

    def _parse_chapters(self) -> None:
        """Parse chapters into BeautifulSoup objects.

        Documents whose text is empty or whitespace-only are dropped, so
        indexes into ``chapters_html`` need not match indexes into
        ``chapters``.
        """
        self.chapters_html = []
        for chapter in self.chapters:
            parsed_content = BeautifulSoup(chapter.get_content(), 'html.parser')
            if parsed_content.get_text().strip():
                self.chapters_html.append(parsed_content)

    def extract_elements(
        self, chapter_soup: "BeautifulSoup"
    ) -> List[Tuple[str, Any, List[str], Optional[str]]]:
        """Extract elements from a chapter in sequential order, including classes and IDs.

        Args:
            chapter_soup (BeautifulSoup): BeautifulSoup object of chapter content

        Returns:
            List of ``(element_type, element, classes, element_id)`` tuples in
            the order ``find_all`` yields them. ``element_type`` is one of
            'title', 'paragraph', 'note', 'media' or 'quote'; ``classes`` is
            the element's ``class`` attribute (``[]`` when absent) and
            ``element_id`` its ``id`` attribute (``None`` when absent).
        """
        elements = []
        for element in chapter_soup.find_all(self._SCANNED_TAGS):
            if element.name == 'a':
                # Only anchors with a class containing 'Reference' are notes;
                # every other anchor is skipped.
                link_classes = element.get('class')
                if not (link_classes and any('Reference' in c for c in link_classes)):
                    continue
                element_type = 'note'
            else:
                element_type = self._TAG_TYPES.get(element.name)
                if element_type is None:
                    continue

            elements.append(
                (element_type, element, element.get('class', []), element.get('id', None))
            )

        return elements

    def process_chapters(self) -> List[List[Tuple[str, Any]]]:
        """Process all chapters and extract their elements.

        Returns:
            List[List[Tuple[str, Any]]]: One ``extract_elements`` result per
            entry of ``chapters_html``, computed afresh on every call.
        """
        return [self.extract_elements(chapter_soup) for chapter_soup in self.chapters_html]

    def parse_notes(self, notes_chapter_index: int) -> Dict[int, Dict[str, str]]:
        """Parse notes from the notes chapter.

        A title element closes the current group of notes (when it has any)
        and starts the next one. Within a group, every other non-title element
        (the 1st, 3rd, 5th, ...) is recorded as a note, keyed "[1]", "[2]",
        ...; the elements in between are skipped. The result is also stored on
        ``self.notes_by_chapter``.

        Args:
            notes_chapter_index (int): Index of the notes chapter

        Returns:
            Dict[int, Dict[str, str]]: Dictionary of notes by chapter
        """
        notes_by_chapter = {}
        chapter_notes = {}
        seen = 0  # non-title elements seen in the current group
        chapter_index = 0

        for entry in self.process_chapters()[notes_chapter_index]:
            if entry[0] == 'title':
                if seen:
                    notes_by_chapter[chapter_index] = chapter_notes
                    chapter_notes = {}
                    seen = 0
                    chapter_index += 1
                continue

            if seen % 2 == 0:
                chapter_notes[f"[{seen // 2 + 1}]"] = entry[1].get_text()
            seen += 1

        if seen:
            notes_by_chapter[chapter_index] = chapter_notes

        self.notes_by_chapter = notes_by_chapter
        return notes_by_chapter

    @staticmethod
    def _inline_marker(chapter: List[Tuple[Any, ...]], para_index: int,
                       marker: str, replacement: str) -> None:
        """Replace ``marker`` in a paragraph's text and store the entry back."""
        entry = chapter[para_index]
        tag = entry[1]
        text = tag.string if tag.string else tag.get_text()
        tag.string = text.replace(marker, replacement)
        # Keep any trailing fields (classes, id) so the tuple shape survives.
        chapter[para_index] = ('paragraph', tag, *entry[2:])

    def fill_notes(self, chapter_index: int, current_footnote_index: int,
                  updated_chapters: List[List[Tuple[str, Any]]], notes_by_chapter: Dict[int, Dict[str, str]],
                  chapter_indexes: List[int], notes_indexes: List[int]) -> Tuple[int, List[List[Tuple[str, Any]]]]:
        """Fill in notes for a chapter.

        Numbered markers ("[1]", "[2]", ...) found in paragraphs are expanded
        with the matching text from ``notes_by_chapter``. A 'note' element
        whose number is not in the chapter's notes is treated as a "[*]"
        footnote: its text is taken from the first element of
        ``updated_chapters[current_footnote_index]`` and inlined into the most
        recent paragraph containing "[*]". 'note' elements are then removed.
        Element tuples of any length >= 2 are accepted; fields after the
        second are carried through unchanged.

        Args:
            chapter_index (int): Position into chapter_indexes / notes_indexes
            current_footnote_index (int): Current footnote index
            updated_chapters (List[List[Tuple[str, Any]]]): Chapters with updates
            notes_by_chapter (Dict[int, Dict[str, str]]): Notes organized by chapter
            chapter_indexes (List[int]): List of chapter indexes
            notes_indexes (List[int]): List of notes indexes

        Returns:
            Tuple[int, List[List[Tuple[str, Any]]]]: Updated footnote index and chapters
        """
        target_index = chapter_indexes[chapter_index]
        chapter = updated_chapters[target_index]
        chapter_notes = notes_by_chapter[notes_indexes[chapter_index]]

        note_id_cnt = 1
        last_paragraph_with_footnote = None
        footnote_paragraph_map = []
        note_paragraph_map = []
        footnote_contents = {}
        note_contents = {}

        # Collect footnotes and notes.
        # NOTE: a paragraph is only checked against the note id that is
        # current when the paragraph is visited; the counter advances on
        # 'note' elements.
        for i, item in enumerate(chapter):
            note_id = f"[{note_id_cnt}]"
            if item[0] == 'paragraph':
                paragraph_text = item[1].get_text()
                if '[*]' in paragraph_text:
                    last_paragraph_with_footnote = i
                if note_id in paragraph_text:
                    note_paragraph_map.append((i, note_id))
            elif item[0] == "note":
                if note_id in chapter_notes:
                    note_contents[note_id] = chapter_notes[note_id]
                    note_id_cnt += 1
                else:
                    footnote_content = f"([*] -> '''{updated_chapters[current_footnote_index][0][1].get_text()}'''"
                    footnote_contents[last_paragraph_with_footnote] = footnote_content
                    footnote_paragraph_map.append((current_footnote_index, last_paragraph_with_footnote))
                    current_footnote_index += 1

        # Process footnotes and notes
        for (_, para_index) in footnote_paragraph_map:
            if para_index is not None and para_index < len(chapter):
                footnote_content = footnote_contents[para_index]
                self._inline_marker(chapter, para_index, '[*]',
                                    f" ([*] : ' {footnote_content} ') ")

        for (para_index, note_id) in note_paragraph_map:
            if note_id in note_contents:
                note_content = note_contents[note_id]
                self._inline_marker(chapter, para_index, note_id,
                                    f" ({note_id} : ' {note_content} ') ")

        # Remove original note items. Filtering by the first field (instead of
        # unpacking pairs) keeps this working for 4-field element tuples.
        chapter = [tuple(item) for item in chapter if item[0] != 'note']

        updated_chapters[target_index] = chapter
        return current_footnote_index, updated_chapters

    def organize_chapters(self, intro_index: int, first_chapter_index: int,
                        last_chapter_index: int, processed_chapters: List[List[Tuple[str, Any]]]) -> Dict[str, List[Tuple[str, Any]]]:
        """Organize chapters into a dictionary with proper titles.

        The result is also stored on ``self.book_by_chapters`` so that
        ``export_json`` has something to export.

        Args:
            intro_index (int): Index of introduction
            first_chapter_index (int): Index of first chapter
            last_chapter_index (int): Index of last chapter
            processed_chapters (List[List[Tuple[str, Any]]]): Processed chapter content

        Returns:
            Dict[str, List[Tuple[str, Any]]]: Organized chapters by title
        """
        book_by_chapters = {}
        chapter_indexes = [intro_index, *range(first_chapter_index, last_chapter_index + 1)]

        for chapter_index in chapter_indexes:
            elements = processed_chapters[chapter_index]
            if chapter_index in (intro_index, last_chapter_index):
                # Title is the first element; the content is everything after it.
                book_by_chapters[elements[0][1].get_text()] = elements[1:]
            else:
                # Title is the third element; the content starts at the title.
                book_by_chapters[elements[2][1].get_text()] = elements[2:]

        self.book_by_chapters = book_by_chapters
        return book_by_chapters

    def export_json(self, filename: str) -> None:
        """Export the processed book to JSON.

        Writes ``self.book_by_chapters`` as ``{title: [[type, content], ...]}``.
        Tag contents are converted to nested dicts; any other content value is
        written as-is. Only the first two fields of each element tuple are
        exported.

        Args:
            filename (str): Output JSON filename
        """
        # ``Tag`` lives in bs4.element; it is not an attribute of the
        # BeautifulSoup class. Imported here so the fix is self-contained.
        from bs4.element import Tag

        def tag_to_dict(node):
            """Convert a BeautifulSoup Tag to a dictionary."""
            if not isinstance(node, Tag):
                return str(node)

            return {
                'name': node.name,
                'attrs': dict(node.attrs),
                'contents': [tag_to_dict(child) for child in node.contents]
            }

        serializable_dict = {}
        for title, content in self.book_by_chapters.items():
            serializable_dict[title] = [
                (entry[0], tag_to_dict(entry[1]) if isinstance(entry[1], Tag) else entry[1])
                for entry in content
            ]

        with open(filename, 'w', encoding='utf-8') as fp:
            json.dump(serializable_dict, fp, ensure_ascii=False, indent=2)

In [119]:
# Build a processor for the EPUB at the relative path "aron.epub" and load it;
# load_epub() reads the file, then extracts and parses its document items.
processor = EPUBProcessor("aron.epub")
processor.load_epub()

# Process chapters: one list of element tuples per parsed chapter. Later cells
# read and overwrite entries of this list in place.
processed_chapters = processor.process_chapters()

In [120]:
import requests
import json

# Send a chat-completion request to the model server listening on localhost
def helpful_aid(messages):
    """Post ``messages`` to the local chat-completions endpoint and return the reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, sent
            unchanged as the ``messages`` field of the request payload.

    Returns:
        str: The stripped ``content`` of the first choice's message, or ``""``
        when the response carries no choices or no content.

    Raises:
        Exception: If the server answers with a status code other than 200.
            The message includes the status code and the start of the body.
    """
    payload = {
        "model": "phi-4@q6_k",
        "messages": messages,
    }

    response = requests.post(
        "http://localhost:24236/v1/chat/completions",
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=15
    )

    if response.status_code != 200:
        raise Exception(
            f"Chat completion request failed with HTTP {response.status_code}: "
            f"{response.text[:200]}"
        )

    result = response.json()
    # ``or [{}]`` also covers an empty ``choices`` list, which would otherwise
    # raise IndexError on the ``[0]`` below.
    choices = result.get("choices") or [{}]
    content = choices[0].get("message", {}).get("content", "")
    return (content or "").strip()


    

In [122]:
import json

print(f"{len(processed_chapters)} Chapters")

# One "[index] (type) -> element" fragment per chapter, built from each
# chapter's first element and concatenated without separators.
chapters = "".join(
    f"[{idx}] ({chapter[0][0]}) -> {chapter[0][1]}"
    for idx, chapter in enumerate(processed_chapters)
)

SYSTEM_INSTRUCTION = "You map jsons and only return the json structure without any tags or text beside it. dont use ```json and ``` around the result."
messages = [
    {"role": "system", "content": SYSTEM_INSTRUCTION},
    {"role": "user", "content": f"Only return me a json without any tags around it mapping the index [i] to the chapter / section of the book. Here is the listing: {chapters}. Dont return ```json or ```, nor any \n"},
]

# Strip any markdown code fences from the reply before decoding it as JSON.
json_str = helpful_aid(messages)
for fence in ("```json", "```"):
    json_str = json_str.replace(fence, "")

chapter_dict = json.loads(json_str)

22 Chapters


In [123]:
chapter_dict

{'0': 'The Opium of the Intellectuals',
 '1': 'FOREWORD TO THE TRANSACTION EDITION',
 '2': 'INTRODUCTION TO THE TRANSACTION EDITION',
 '3': 'FOREWORD',
 '4': 'PART ONE',
 '5': 'CHAPTER I',
 '6': 'CHAPTER II',
 '7': 'CHAPTER III',
 '8': 'CONCERNING POLITICAL OPTIMISM',
 '9': 'PART II',
 '10': 'CHAPTER IV',
 '11': 'CHAPTER V',
 '12': 'CHAPTER VI',
 '13': 'THE CONTROL OF HISTORY',
 '14': 'PART THREE',
 '15': 'CHAPTER VII',
 '16': 'CHAPTER VIII',
 '17': 'CHAPTER IX',
 '18': 'THE DESTINY OF THE INTELLECTUALS',
 '19': 'CONCLUSION',
 '20': 'APPENDIX',
 '21': 'INDEX'}

In [124]:
# Map every CSS class to the (chapter index, element index) positions where it
# occurs, then print how many elements carry each class.
classes = {}

for chapter_idx, chapter in enumerate(processed_chapters):
    for section_idx, section in enumerate(chapter):
        for css_class in section[2]:
            classes.setdefault(css_class, []).append((chapter_idx, section_idx))

for css_class, locations in classes.items():
    print(f"{css_class} -> {len(locations)}")

gtxt_body -> 1188
gtxt_lineated -> 3
gtxt_quote -> 5
gtxt_heading -> 35
gtxt_h1_heading -> 41
gtxt_footnote -> 52
gtxt_list_entry -> 6


In [125]:
# Replace every element tuple with a plain (type, text) pair, using the
# element's CSS classes to refine the type or decorate the text.
for chapter_idx, chapter in enumerate(processed_chapters):
    for section_idx, section in enumerate(chapter):
        kind, css_classes = section[0], section[2]
        text = section[1].get_text()

        if 'gtxt_heading' in css_classes or 'gtxt_h1_heading' in css_classes:
            kind = 'title'
        elif 'gtxt_footnote' in css_classes:
            kind = 'footnote'
        elif 'gtxt_quote' in css_classes:
            # Quotes keep their type and get wrapped in marker lines.
            text = "\n<quote>\n" + text + "\n<quote>\n"
        elif 'gtxt_list_entry' in css_classes:
            # List entries keep their type and get a leading tab.
            text = "\t" + text

        chapter[section_idx] = (kind, text)

In [126]:
# Look up the second recorded (chapter index, element index) position of the
# 'gtxt_list_entry' class and print the text now stored at that position.
lin = classes['gtxt_list_entry'][1]
print(f"Chapter {lin[0]} Section {lin[1]} -> {processed_chapters[lin[0]][lin[1]][1]}")

Chapter 20 Section 97 -> 	2. This doctrine, which is axiomatic for the author of  et le néant, cannot be attributed without reservation to the author of La Phénoménologie de perception. 


In [127]:
# Distinct element types present in `sole_chapter`.
# NOTE(review): `sole_chapter` is not assigned in any cell visible here; confirm where it comes from.
{entry[0] for entry in sole_chapter}

{'footnote', 'paragraph', 'title'}

In [None]:
import requests

SYSTEM_INSTRUCTION = "You are a helpful assistant that summarizes paragraphs from books. Return markdown formatting without any tags around it."
# Function to request a summary from the chat-completions endpoint on localhost
def summarize_text_LMSTUDIO(text, messages, tries=3):
    """Post ``messages`` to the local chat-completions endpoint, retrying on failure.

    Args:
        text: Not used by this function; kept so existing callers keep working.
        messages: List of ``{"role": ..., "content": ...}`` dicts, sent
            unchanged as the ``messages`` field of the request payload.
        tries: Maximum number of attempts.

    Returns:
        tuple: ``(summary, messages)`` where ``summary`` is the stripped
        content of the first choice and ``messages`` is the argument itself.

    Raises:
        Exception: ``"Tries ran out"`` once every attempt has failed; the last
            failure is attached as ``__cause__``.
    """
    payload = {
        "model": "phi-4@q6_k",
        "messages": messages,
    }

    last_error = None
    while tries > 0:
        try:
            response = requests.post(
                "http://localhost:24236/v1/chat/completions",
                headers={"Content-Type": "application/json"},
                json=payload,
                timeout=15
            )
            if response.status_code == 200:
                result = response.json()
                summary = result.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
                return summary, messages
            last_error = Exception(f"HTTP {response.status_code}")
        except Exception as exc:
            # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
            # and SystemExit still stop the loop. Any other failure (network
            # error, bad JSON, unexpected payload) uses up one try.
            last_error = exc
        tries -= 1

    raise Exception("Tries ran out") from last_error


# Fallback conversation, captured once when this definition runs (as the
# original default argument was). Kept as a tuple and copied per call so that
# callers never receive a shared, mutable default.
_DEFAULT_SUMMARY_MESSAGES = (
    {"role": "system", "content": SYSTEM_INSTRUCTION},
    {"role": "system", "content": "Tell the user he forgot to set the messages!"},
)


def summarize_text(text, messages=None):
    """Summarize ``text`` through ``summarize_text_LMSTUDIO``.

    Args:
        text: Forwarded unchanged to ``summarize_text_LMSTUDIO``.
        messages: Conversation to send. When omitted (or ``None``), a fresh
            copy of the fallback conversation is used.

    Returns:
        Whatever ``summarize_text_LMSTUDIO`` returns for these arguments.
    """
    if messages is None:
        messages = [dict(message) for message in _DEFAULT_SUMMARY_MESSAGES]
    return summarize_text_LMSTUDIO(text, messages)

In [6]:
# NOTE(review): no visible cell reads `title` before the export loop rebinds it as its loop variable — confirm intended use.
title = "Opium"

# FILL NOTES (considering paragraph)
# Look for *number put second in the first

In [5]:

# Export to JSON and HTML
# The JSON export writes processor.book_by_chapters to "book_processed.json".
processor.export_json("book_processed.json")
# One HTML file per chapter, named after the lower-cased title with spaces
# replaced by underscores.
# NOTE(review): `export_html` is not defined on the EPUBProcessor class shown in this notebook — confirm it exists before running this cell.
for title, content in processor.book_by_chapters.items():
    processor.export_html(title, content, f"{title.lower().replace(' ', '_')}.html")