In [154]:
import pandas as pd
import os
import pymupdf  
import numpy as np
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from typing import List
from dotenv import load_dotenv
import matplotlib.pyplot as plt

from dotenv import load_dotenv

In [155]:
load_dotenv()

True

In [156]:
from mistralai import Mistral
from pathlib import Path

from mistralai.models import OCRResponse
from IPython.display import Markdown, display
import json

## Overview

### Preprocessing
- Extracting the text from the document via direct parsing or converting it into md using Mistral OCR then process
- The reason behind this failsafe this because sometimes the document is scanned in and PyMuPDF doesn't recognize texts

In [157]:


def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings

    Returns:
        Markdown text with images replaced by base64 data
    """
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """
    Combine OCR text and images into a single markdown document.

    Args:
        ocr_response: Response from OCR processing containing text and images

    Returns:
        Combined markdown string with embedded images
    """
    markdowns: list[str] = []
    # Extract images from page
    for page in ocr_response.pages:
        image_data = {}
        for img in page.images:
            image_data[img.id] = img.image_base64
        # Replace image placeholders with actual images
        markdowns.append(replace_images_in_markdown(page.markdown, image_data))

    return "\n\n".join(markdowns)


In [158]:
def extract_text_with_Mistral_OCR(pdf_path):
    api_key = os.environ["MISTRAL_API_KEY"]
    if not api_key:
        raise ValueError("MISTRAL_API_KEY not found in environment variables")
    
    client = Mistral(api_key=api_key)
    uploaded_file = client.files.upload(
    file={
            "file_name": pdf_path,
            "content": open(pdf_path, "rb"),
        },
        purpose="ocr",
    )
    print(client.files.retrieve(file_id=uploaded_file.id))
    # Process the PDF with Mistral OCR
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    pdf_response = client.ocr.process(document={"type": "document_url","document_url": signed_url.url,}, model="mistral-ocr-latest", include_image_base64=True)
    
    # Display combined markdowns and images
    # display(Markdown(get_combined_markdown(blocks)))
    # We process it and save the combined markdown to a file
    with open("combined_markdown.md", "w", encoding="utf-8") as f:
        f.write(get_combined_markdown(pdf_response))
    #SAVE THE RESPONSE TO A JSON FILE
    all_pages = []
    page_dimensions = []
    ## TODO: This will need more work TO better process
    for page_num, i in enumerate(pdf_response.pages):
        all_pages.append({"page": page_num, "text":i.markdown, "page_dimensions": i.dimensions, "images": i.images})
    return (all_pages, 'Mistral OCR')

In [159]:
# extract_text_with_Mistral_OCR(r'D:\DATA300\AudioBookSum\pdf\Schoonover.pdf')

In [160]:
from sklearn.cluster import DBSCAN
import numpy as np

def cluster_blocks_dbscan(blocks, eps=20, min_samples=2):
    """Cluster blocks using DBSCAN algorithm."""
    # Extract centroids of all blocks
    centroids = np.array([[
        (block['bbox'][0] + block['bbox'][2])/2,  # x center
        (block['bbox'][1] + block['bbox'][3])/2   # y center
    ] for block in blocks])
    
    # Run DBSCAN
    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(centroids)
    
    # Group blocks by cluster
    clusters = {}
    for i, label in enumerate(clustering.labels_):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(blocks[i])
    
    return list(clusters.values())

In [161]:
def process_text_blocks(blocks, eps=20, min_samples=1):
    """
    Cluster text blocks, sort blocks within each cluster, and merge them.
    
    Args:
        blocks: List of text block dictionaries with bbox and text
        eps: DBSCAN epsilon parameter (clustering distance threshold)
        min_samples: DBSCAN min_samples parameter
        
    Returns:
        List of merged text blocks
    """
    # Skip processing if no blocks
    if not blocks:
        return []
    
    # 1. Cluster blocks using DBSCAN
    clusters = cluster_blocks_dbscan(blocks, eps, min_samples)
    
    # 2. Process each cluster
    merged_blocks = []
    for cluster in clusters:
        # Sort blocks in reading order (top-to-bottom)
        sorted_blocks = sorted(cluster, key=lambda b: (b['bbox'][2], b['bbox'][0]), reverse=False)
        
        if not sorted_blocks:
            continue
            
        # 3. Merge blocks in the cluster
        merged_text = ""
        first_block = sorted_blocks[0]
        x0, y0 = first_block['bbox'][0], first_block['bbox'][1]
        x1, y1 = first_block['bbox'][2], first_block['bbox'][3]
        block_no = first_block['block_no']
        block_type = first_block['block_type']
        page = first_block['page']
        
        prev_y1 = y0  # Track previous block's bottom coordinate
        
        for block in sorted_blocks:
            # Update bounding box to encompass all blocks
            x0 = min(x0, block['bbox'][0])
            y0 = min(y0, block['bbox'][1])
            x1 = max(x1, block['bbox'][2])
            y1 = max(y1, block['bbox'][3])
            
            # Add line break if there's significant vertical gap
            if merged_text and (block['bbox'][1] - prev_y1) > 0.5 * (block['text_height']):
                merged_text += "\n"
                
            # Add space only if needed (avoid double spaces)
            if merged_text and not merged_text.endswith("\n"):
                merged_text += " "
                
            merged_text += block['text']
            prev_y1 = block['bbox'][3]
        
        # 4. Create a new merged block
        merged_block = {
            "page": page,
            "bbox": (x0, y0, x1, y1),
            "text": merged_text,
            "block_no": block_no,
            "block_type": block_type,
            "text_height": block['text_height'],
            "is_merged_cluster": True
        }
        
        merged_blocks.append(merged_block)
    
    return merged_blocks

In [162]:
def extract_text(pdf_path:str)-> tuple[List[dict], str]:
    """PyMuPDF-based function to extract text with bounding boxes from a PDF file."""
    try:
        doc = pymupdf.open(pdf_path, filetype="pdf")
        prev_block = None
        all_blocks = []
        for page_num, page in enumerate(doc):
            words = page.get_text("words")
            page_block = []
            # Take threshold based on page_width and page_height
            WIDTH_threshold = (0.02 if page.rect.width > page.rect.height else 0.0092625) * page.rect.width
            HEIGHT_threshold = (0.01 if page.rect.width > page.rect.height else 0.05) * page.rect.height
            for curr_word in words:
                # Each block is (x0, y0, x1, y1, text, block_no, block_type)
                x0, y0, x1, y1, text, block_no, line_no, block_type = curr_word
                text_height = y1 - y0
                if not text.strip():
                    continue
                is_mergable = False
                
                """
                Check if the current block is close to the previous block.
                The conditions are:
                (1. The x-coordinates of the current block are within WIDTH_threshold of the previous block.
                2. The y-coordinates of the current block are within HEIGHT_threshold of the previous block.
                (3. The current block is not completely to the left of the previous block.
                4. The y-coordinates of the current block are within 4 pixels of the previous block.
                5. The current block is not completely to the right of the previous block.
                """
                if prev_block and abs(text_height - prev_block[-1]) <= 0 and \
                    (\
                        (abs(x0 - prev_block[2]) <= WIDTH_threshold and (y0 -prev_block[3]) <= HEIGHT_threshold) \
                        # or ((not (prev_block[2] < x0 and x1 < prev_block[0])) and (y0 -prev_block[3]) <= 8)\
                    ):
                    prev_block[2] = max(prev_block[2], x1) 
                    prev_block[3] = max(prev_block[3], y1) 
                    prev_block[4] += " " + text.strip()
                    is_mergable = True

                if is_mergable and page_block:
                    page_block.pop()
                    page_block.append(
                        {
                            "page": page_num,
                            "bbox": (prev_block[0], prev_block[1], prev_block[2], prev_block[3]),
                            "text": prev_block[4],
                            "block_no": block_no,
                            "block_type": block_type,
                            "text_height": prev_block[-1]
                        }
                    )    
                    prev_block = [prev_block[0], prev_block[1], prev_block[2], prev_block[3], prev_block[4], block_no, block_type, prev_block[-1]]
                else:
                    page_block.append({
                        "page": page_num,
                        "bbox": (x0, y0, x1, y1),
                        "text": text.strip(),
                        "block_no": block_no,
                        "block_type": block_type,
                        "text_height": text_height
                    })
                    # Update the previous block
                    prev_block = [x0, y0, x1, y1, text.strip(), block_no, block_type, text_height]
            merged_blocks = process_text_blocks(page_block, eps=25, min_samples=1)
            all_blocks.extend(merged_blocks)
        doc.close()
        # Remove empty blocks
        if not all_blocks:
            raise Exception("PyMuPDF Failed or No text found in the PDF.")
        return (all_blocks, 'pyMuPDF')
    except Exception as e:
        print("The Error is", e.with_traceback())
        # return extract_text_with_Mistral_OCR(pdf_path)



In [163]:
pdf_path = r'D:\DATA300\AudioBookSum\pdf\Gilman.pdf'
words, extraction_method = extract_text(pdf_path)
# Dataframe box
df = pd.DataFrame(words)
df[:30]

Unnamed: 0,page,bbox,text,block_no,block_type,text_height,is_merged_cluster
0,0,"(36.0, 161.03799438476562, 402.4320068359375, ...","The Mad Man as Artist: Medicine, History and D...",1,9,15.216003,True
1,0,"(36.0, 177.03799438476562, 184.3200225830078, ...",Author(s): Sander Gilman,1,2,15.216003,True
2,0,"(36.0, 193.03799438476562, 557.1978759765625, ...","Source: Journal of Contemporary History, Vol. ...",1,12,15.216003,True
3,0,"(36.0, 209.03799438476562, 176.35198974609375,...","(Oct., 1985), pp. 575-597",1,3,15.216003,True
4,0,"(36.0, 225.03799438476562, 249.88803100585938,...","Published by: Sage Publications, Ltd.",1,4,15.216003,True
5,0,"(36.0, 241.03799438476562, 326.6040344238281, ...",Stable URL: https://www.jstor.org/stable/260398,1,2,15.216003,True
6,0,"(36.0, 257.0379943847656, 218.5919952392578, 2...",Accessed: 16-12-2024 15:09 UTC,1,3,15.216003,True
7,0,"(36.0, 295.96600341796875, 553.5723266601562, ...",range of content in a trusted digital archive....,4,17,11.411987,True
8,0,"(36.0, 327.96600341796875, 508.68017578125, 33...",facilitate new forms of scholarship. For more ...,5,12,11.411987,True
9,0,"(36.0, 359.96600341796875, 495.72930908203125,...",Your use of the JSTOR archive indicates your a...,7,17,11.411987,True


In [164]:

input_pdf_path = r'D:\DATA300\AudioBookSum\pdf\Gilman.pdf'
pdf_file = Path(input_pdf_path)
assert pdf_file.is_file()
base_name = os.path.splitext(input_pdf_path)[0]
output = f"{base_name}_highlighted.pdf"
print(f"Processing PDF: {input_pdf_path}")

# Extract text and bounding boxes
words, extract_method = extract_text(input_pdf_path)
print(extract_method)
# Keep track of the median word height

print(f"Extracted {len(words)} text blocks")
data = []
for curr_word in words:

    text = curr_word["text"].strip() if isinstance(curr_word.get("text"), str) else ""
    
    # Improved footnote detection
    footnote_markers = ["*", "†", "‡", "§", "¶", "⁂", "⁎", "⁑", "⁕"]
    # Text density in space
    # How to consistently identify the threshold
    is_foot_note_marker = False
    # Calculate the distance between the text and the space it occupies
    # x0,y0,x1,y1
    space = (curr_word["bbox"][2] - curr_word["bbox"][0]) * (curr_word["bbox"][3] - curr_word["bbox"][1])
    text_count = len(text.split(" "))
    data.append(
        {
            "text": text,
            "page": curr_word["page"],
            "space": space,
            "text_count": text_count,
            "text_height": curr_word["text_height"],
            "ratio": text_count / space if space > 0 else 0,
        }
    )
    is_footnote_marker = False
    # Classification logic
    if is_footnote_marker:
        classification = "footnote"
    else:
        classification = "main"

    curr_word["category"] = classification
    
df = pd.DataFrame(data)
df['ratio'].median()
lower_q = df['ratio'].quantile(0.0)
upper_q = df['ratio'].quantile(0.9)

middle_df = df[(df['ratio'] >= lower_q) & (df['ratio'] <= upper_q) & (df['text_count'] > 0)]
middle_df.sort_values(by='text_count', ascending=False, inplace=True)
middle_df
#Plot out the frequency of text height
# plt.figure(figsize=(10, 6))
# plt.hist(df["text_height"], bins=30, color='blue', alpha=0.7)
# plt.xlabel("Text Height")
# plt.ylabel("Frequency")
# plt.title("Histogram of Text Height")
# plt.grid(axis='y', alpha=0.75)
# plt.show()
# df
# plt.figure(figsize=(10, 6))
# plt.hist(df["ratio"], bins=30, color='blue', alpha=0.7)
# plt.xlabel("Text Count to Space Ratio")
# plt.ylabel("Frequency")
# plt.title("Histogram of Text Count to Space Ratio")
# plt.grid(axis='y', alpha=0.75)
# plt.show()

Processing PDF: D:\DATA300\AudioBookSum\pdf\Gilman.pdf
pyMuPDF
Extracted 274 text blocks


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  middle_df.sort_values(by='text_count', ascending=False, inplace=True)


Unnamed: 0,text,page,space,text_count,text_height,ratio
78,creations contain nightmares and cause one's h...,6,137434.595171,475,10.144012,0.003456
95,sideration of madness; what diagnostic criteri...,8,137138.954347,470,10.144012,0.003427
157,"unknown tongue, nor were they romantic artists...",14,136640.317536,463,10.144012,0.003388
86,"As early as Lombroso's work, it was evident th...",7,137231.284321,453,10.143997,0.003301
199,support may have been unwelcome to them in the...,18,135310.900887,453,10.143997,0.003348
...,...,...,...,...,...,...
262,Gilman:,23,578.883634,1,13.947998,0.001727
264,History,23,573.207044,1,13.947998,0.001745
263,"Medicine,",23,708.837044,1,13.947998,0.001411
265,and,23,259.599920,1,13.947998,0.003852


In [165]:
def classify_text_blocks(blocks, parsing_method: str):
    """Classify text blocks into main content, footnotes, and extra information."""
    if not blocks:
        print("No blocks to analyze")
        return None

    classified_blocks = []

    if parsing_method == 'Mistral OCR':
        # Implement classification for Mistral OCR if needed
        for block in blocks:
            text = block["text"].strip() if isinstance(block.get("text"), str) else ""
            
            # A placeholder for footnote detection (to be refined)
            is_footnote_marker = False  
            classification = "footnote" if is_footnote_marker else "main"

            block["category"] = classification
            classified_blocks.append(block)

    elif parsing_method == 'pyMuPDF':
        # First pass: Compute a ratio for each block to later determine the middle range
        ratios = []
        for block in blocks:
            text = block["text"].strip() if isinstance(block.get("text"), str) else ""
            bbox = block.get("bbox", [0, 0, 0, 0])
            text_height = block.get("text_height", 0)
            # Calculate area of the bounding box (space)
            space = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
            text_count = len(text.split())
            ratio = (text_count / space) if space > 0 else 0
            ratios.append(ratio)

        # Calculate the lower and upper quantiles for the middle 25% - adjust as needed.
        # Here, we use 15% and 40% quantiles from your earlier example.
        lower_q = pd.Series(ratios).quantile(0.25)
        upper_q = pd.Series(ratios).quantile(0.75)

        # Second pass: Classify each block using the ratio thresholds and footnote checks.
        for idx, block in enumerate(blocks):
            text = block["text"].strip() if isinstance(block.get("text"), str) else ""
            bbox = block.get("bbox", [0, 0, 0, 0])
            space = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
            text_count = len(text.split())
            ratio = (text_count / space) if space > 0 else 0

            # Improved footnote detection
            footnote_markers = ["*", "†", "‡", "§", "¶", "⁂", "⁎", "⁑", "⁕"]
            is_footnote_marker = (
                any(text.startswith(prefix) for prefix in footnote_markers) or
                any(text.startswith(str(i)) for i in range(1, 10))
            )

            # Classification logic:
            # - If it's a footnote marker, mark as "footnote".
            # - Otherwise, if the block falls within the middle thresholds and has sufficient words, it is "main".
            # - Else, it is "non-main"
            if is_footnote_marker:
                classification = "footnote"
            elif lower_q <= ratio <= upper_q and text_count > 20 and text_height > 0:
                classification = "main"
            else:
                classification = "extra"

            block["category"] = classification
            classified_blocks.append(block)

    return classified_blocks

Finding the Threshold

In [166]:
def chunk_text(blocks, chunk_size:int=1000, chunk_overlap:int=100):
    """Chunk text blocks into smaller pieces for processing.
        Args:
            chunk_size (int): Size of each chunk. Default is 1000 characters.
            chunk_overlap (int): Number of overlapping characters between chunks. Default is 200.
    """
    # Separate blocks by category
    categories = {"main": [], "footnote": [], "extra": []}
    for block in blocks:
        categories[block["category"]].append(block)
    
    chunked_data = {}
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    
    for category, category_blocks in categories.items():
        # Extract text from blocks
        texts = [block["text"] for block in category_blocks]
        full_text = "\n\n".join(texts)
        
        # Chunk the text
        chunks = text_splitter.split_text(full_text)
        chunked_data[category] = chunks
    
    return chunked_data


In [167]:
def identify_interesting_points(chunked_data, llm, blocks, file_name):
    """Use an LLM to identify interesting or important points in the text."""
    interesting_sections = []
    
    # Only process the 'main' category
    if 'main' not in chunked_data:
        print("No main content identified in the document")
        return interesting_sections
        
    prompt_template = PromptTemplate(
        input_variables=["text"],
        template="""
        You are a reasoning summarizer.
        Summarize the provided text and support your summary using at least 20 verbatim snippets from the original text.
        Remember:
        The reasoning section must ONLY contain verbatim text from the document
        Every sentence in the reasoning must be supporting sentences in the summary section
        Do not add any information that isn't directly from the documentFormat:
        ```segment 1```
        ```segment 2```
        Below is text from the main content of a document in English:
        {text}
        """
    )
    
    # for chunk in chunked_data['main']:
    chunk = "\n".join(chunked_data['main'])
    
    prompt = prompt_template.format(text=chunk)
    interesting_points = []
    
    if os.path.exists(file_name):
        with open(file_name, "r", encoding="utf-8") as f:
            for line in f.readlines():
                if not len(line.strip()):
                    continue
                interesting_points.append(line)
            for segment in interesting_points:
                threshold = threshold = len(segment.split(" "))
                segment = segment.strip()
                if len(segment.split()) < threshold:
                    continue
                for block in blocks:
                    intersect_more_more_than_threshold = False
                    if len(segment.split()) >= threshold:
                        count = 0
                        for chip in segment.split(" "):
                            if chip in block["text"]:
                                count += 1
                        # print("count:", count, "len:", len(segment.split()), "threshold:", len(segment.split())*(1/threshold), "seg:", segment)
                        if count >= len(segment.split(" "))*(threshold/(threshold + 1)):
                            intersect_more_more_than_threshold = True
                            
                                
                    if (segment in block["text"] or intersect_more_more_than_threshold):
                        interesting_sections.append({
                            "page": block["page"],
                            "text": segment if segment in block["text"] else block["text"],
                            "category": "main",
                            "bbox": block["bbox"]
                        })
                        # print(segment)
    if not interesting_points:
        try:
            response = llm.invoke(prompt)
            # Extract segments between triple backticks
            import re
            segments = re.findall(r'```(.*?)```', response.content, re.DOTALL)
            
            for segment in segments:
                segment = segment.strip()
                # print(segment)
                for block in blocks:
                    if segment in block["text"] and block["category"] == "main":
                        # print(block["page"])
                        interesting_sections.append({
                            "page": block["page"],
                            "text": segment,
                            "category": "main",
                            "bbox": block["bbox"]
                        })
                        break
            with open(file_name, "w", encoding="utf-8") as f:
                f.write("\n\n".join(segments))
        except Exception as e:
            print(f"Error during interesting point extraction: {e}")
        
    return interesting_sections


In [None]:
def highlight_interesting_points(pdf_path, interesting_points, output_path):
    """Add highlights to the interesting points in the PDF with improved fuzzy matching."""
    doc = pymupdf.open(pdf_path)
    
    # Using cyan highlight color for main content
    highlight_color = (0, 1, 1)  # RGB for cyan
    fail_count = 0
    success_count = 0
    
    for point in interesting_points:
        page = doc[point["page"]]
        text = point["text"]
        if not text:
            print(f"Empty text for page {point['page']}")
            continue
            
        # Try exact match first
        text_instances = page.search_for(text)
        
        # If exact match fails, try with these fallback methods:
        if not text_instances:
            # Method 1: Try normalized text (remove extra whitespace)
            normalized_text = ' '.join(text.split())
            text_instances = page.search_for(normalized_text)
            
            # Method 2: Try with key phrases (for longer text segments)
            if not text_instances and len(normalized_text.split()) > 10:
                # Extract significant phrases (5-8 words)
                words = normalized_text.split()
                for i in range(len(words) - 5):
                    phrase = ' '.join(words[i:i+min(8, len(words)-i)])
                    if len(phrase) > 15:  # Only phrases with enough content
                        phrase_instances = page.search_for(phrase)
                        if phrase_instances:
                            text_instances = phrase_instances
                            break
            
            # Method 3: Use key sentences if text contains multiple sentences
            if not text_instances and '.' in normalized_text:
                sentences = [s.strip() for s in normalized_text.split('.') if len(s.strip()) > 15]
                for sentence in sentences:
                    sentence_instances = page.search_for(sentence)
                    if sentence_instances:
                        text_instances = sentence_instances
                        break
        
        # Highlight found instances or use bbox as fallback
        if text_instances:
            for inst in text_instances:
                highlight = page.add_highlight_annot(inst)
                highlight.set_colors(stroke=highlight_color)
                highlight.update()
            success_count += 1
        elif "bbox" in point:
            r = page.add_highlight_annot(point["bbox"])
            r.set_colors(stroke=highlight_color)    
            r.update()
            success_count += 1
        else:
            fail_count += 1
    
    print(f"Successfully highlighted {success_count} segments")
    print(f"Failed to highlight {fail_count} segments")
    
    # Save the highlighted PDF
    doc.save(output_path)
    doc.close()

In [173]:

input_pdf_path = r'D:\DATA300\AudioBookSum\pdf\Prinzhorn.pdf'
pdf_file = Path(input_pdf_path)
assert pdf_file.is_file()
base_name = os.path.splitext(input_pdf_path)[0]
output = f"{base_name}_highlighted.pdf"


# Initialize LLM
try:
    llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', temperature=0.7)
except Exception as e:
    print(f"Error initializing Gemini LLM: {e}")
    print("Make sure you have set GOOGLE_API_KEY in your environment or .env file")
    exit(1)

print(f"Processing PDF: {input_pdf_path}")

# Extract text and bounding boxes
words, extract_method = extract_text(input_pdf_path)
print(extract_method)

print(f"Extracted {len(words)} text blocks")

# Classify text blocks
classified_blocks = classify_text_blocks(words, extract_method)
print("Classified text blocks")
# Considering Chunking before classification
# with open('content_extra.txt', "w", encoding="utf-8") as f:
#     f.write(" ".join([block["text"] for block in classified_blocks if block["category"] == "extra"]))
# with open('content.txt', "w", encoding="utf-8") as f:
#     f.write(" ".join([block["text"] for block in classified_blocks if block["category"] == "main"]))
# with open('content_footnote.txt', "w", encoding="utf-8") as f:
#     f.write(" ".join([block["text"] for block in classified_blocks if block["category"] == "footnote"]))
# # Chunk text
chunked_data = chunk_text(classified_blocks)
print("Chunked text for processing")

# # Identify interesting points (main content only)
interesting_points = identify_interesting_points(chunked_data, llm, classified_blocks, f"{base_name}_interesting_points.txt")  # Save to file
# load from memory
print(f"Identified {len(interesting_points)} interesting points in main content")
print(interesting_points)
# # Highlight interesting points in the PDF
highlight_interesting_points(input_pdf_path, interesting_points, output)
print(f"Created highlighted PDF: {output}")

print("Processing complete!")

Processing PDF: D:\DATA300\AudioBookSum\pdf\Prinzhorn.pdf
pyMuPDF
Extracted 120 text blocks
Classified text blocks
Chunked text for processing
Identified 9 interesting points in main content
[{'page': 1, 'text': 'Some months ago, when I was visiting\nmemoire du Docteur P. F. Gachet-1828- cent Van Gogh, passed almost all of the Saint Paul, in the environs of the little town The asylum to which I refer is that of\ncent Van Gogh," with a dedication "a la the medical and artistic documents of the last year of his life, 1889-1890. The pre-\nof his colleague, V. Doiteau, had collected an insane asylum of sorrowful memories.\nceding winter, the physician in charge of English friend, it occurred to me that not\nthe asylum, Dr. Edgar Leroy, with the aid far from the place where I was staying was\ndistinguished forerunner of modern art, Vin- of Saint-Remy-de-Provence, where that most southern France as guest on the estate of an\na serious, splendid book, "La Folie de Vin- period of Van Gogh\'s r