In [16]:
import networkx as nx
from pyvis.network import Network
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
import os
import re
import sys
import json 


def ocr_force(doc):
    """
    Process a PDF document and apply OCR to pages with no text blocks.
    For pages that need OCR, creates a structured text dictionary.
    For pages with existing text, keeps the original page.
    Returns a list containing processed pages.
    """
    final_doc = []
    page_num = 0
    for page in doc:
        page_num += 1
        full_ocr_text = ""
        # Check if the page has text blocks
        need_ocr = False
        blocks = page.get_text("dict")["blocks"]
        if blocks == []:
            need_ocr = True
        if need_ocr:
            pixmap = page.get_pixmap()
            img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            try:
                page_text = pytesseract.image_to_string(img)
                full_ocr_text += f"==Start of OCR for page {page_num + 1}==\n"
                full_ocr_text += page_text
                full_ocr_text += f"\n==End of OCR for page {page_num + 1}==\n\n"
                # print(f"Page {page_num + 1} OCR text: {page_text}")
                
                # Create a structured representation of the OCR text
                ocr_result = {
                    "text": page_text,
                    "font": "OCR-detected",
                    "font_size": 10.0,  # Default font size
                    "is_bold": False,   # We can't detect bold with basic OCR
                    "is_underlined": False,  # We can't detect underlines with basic OCR
                    "page": page_num,
                    "source": "ocr"     # Mark as OCR source for reference
                }
                final_doc.append(ocr_result)
            except Exception as e:
                print(f"Error during OCR on page {page_num + 1}: {e}")
                # Add an empty result to keep page ordering consistent
                final_doc.append({
                    "text": f"[OCR ERROR on page {page_num + 1}]",
                    "font": "OCR-error",
                    "font_size": 10.0,
                    "is_bold": False,
                    "is_underlined": False,
                    "page": page_num,
                    "source": "ocr-error"
                })
        else:
            final_doc.append(page)
    
    return final_doc

def extract_structured_text_from_pdf(pdf_path):
    """
    Extract text from PDF with formatting information using PyMuPDF.
    Uses geometric analysis to detect underlined text.
    Returns a list of dictionaries with text and its formatting properties.
    """
    try:
        # print(f"Opening PDF file: {pdf_path}")
        doc = fitz.open(pdf_path)
        # print(f"PDF has {len(doc)} pages")
        
        # Process document with OCR for pages that need it
        processed_doc = ocr_force(doc)
        
        structured_text = []
        
        # First pass: determine maximum font size for reference
        max_font_size = 0
        for page in doc:
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line["spans"]:
                            max_font_size = max(max_font_size, span["size"])
        # print(f"Maximum font size detected: {max_font_size}")
        
        # Process each page: extract text spans and detect underlines via drawing objects.
        for page_num, item in enumerate(processed_doc):
            # Check if this is an OCR result (dictionary) or a regular page
            if isinstance(item, dict) and "source" in item and item["source"] in ["ocr", "ocr-error"]:
                # This is an OCR result, add it directly to structured text
                structured_text.append(item)
                continue            # Check if this is an OCR result (dictionary) or a regular page
            if isinstance(item, dict) and "source" in item and item["source"] in ["ocr", "ocr-error"]:
                # This is an OCR result, add it directly to structured text
                structured_text.append(item)
                continue
                
            # This is a regular page, process it normally
            page = item
            
            # Get drawing objects and filter for those that appear to be underlines.
            drawings = page.get_drawings()
            underline_rects = []
            for d in drawings:
                # Look for filled rectangles (type 'f') which might be drawn as underlines.
                if d.get("type") == "f":
                    for item in d.get("items", []):
                        if item[0] == "re":
                            rect = item[1]
                            # Heuristic: if the rectangle is very short in height, consider it an underline.
                            if rect.height < 5:
                                underline_rects.append(rect)
            
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if "lines" in block:
                    for line in block["lines"]:
                        line_text = ""
                        is_bold = False
                        is_underlined = False
                        font_size = 0
                        font_used = ""
                        span_bboxes = []
                        
                        # Process each span within the line.
                        for span in line["spans"]:
                            if "bold" in span["font"].lower():
                                is_bold = True
                            line_text += span["text"] + " "
                            font_size = max(font_size, span["size"])
                            font_used = span["font"]
                            span_bboxes.append(span["bbox"])
                        
                        # Compute the union of the bounding boxes for the whole line.
                        if span_bboxes:
                            x0 = min(b[0] for b in span_bboxes)
                            y0 = min(b[1] for b in span_bboxes)
                            x1 = max(b[2] for b in span_bboxes)
                            y1 = max(b[3] for b in span_bboxes)
                            line_bbox = (x0, y0, x1, y1)
                        else:
                            line_bbox = None
                        
                        # Heuristic: if any underline rectangle overlaps horizontally
                        # and its top is within 5 units of the text bbox bottom, mark as underlined.
                        if line_bbox:
                            for rect in underline_rects:
                                overlap = min(line_bbox[2], rect.x1) - max(line_bbox[0], rect.x0)
                                if overlap > 0 and abs(rect.y0 - line_bbox[3]) < 5:
                                    is_underlined = True
                                    break
                        
                        line_text = line_text.strip()
                        if line_text:
                            structured_text.append({
                                "text": line_text,
                                "font": font_used,
                                "font_size": font_size,
                                "is_bold": is_bold,
                                "is_underlined": is_underlined,
                                "page": page_num + 1,
                                "source": "text"  # Mark as normal text extraction
                            })
          # print(f"Extracted {len(structured_text)} text elements")
        return structured_text
    except Exception as e:
        print(f"Error extracting PDF text: {e}")
        raise e

In [17]:
extract_structured_text_from_pdf(r'C:\Users\bilas\OneDrive\Documents\GENAI\my_web\Pathway_MidEval_Report.pdf')

[{'text': 'Dynamic Agentic RAG with Pathway',
  'font': 'NimbusRomNo9L-Medi',
  'font_size': 14.346199989318848,
  'is_bold': False,
  'is_underlined': False,
  'page': 1,
  'source': 'text'},
 {'text': 'Team 67',
  'font': 'NimbusRomNo9L-Medi',
  'font_size': 9.962599754333496,
  'is_bold': False,
  'is_underlined': False,
  'page': 1,
  'source': 'text'},
 {'text': 'Abstract',
  'font': 'NimbusRomNo9L-Medi',
  'font_size': 11.9552001953125,
  'is_bold': False,
  'is_underlined': False,
  'page': 1,
  'source': 'text'},
 {'text': 'In this work, we aim to develop an End-to-End',
  'font': 'NimbusRomNo9L-Regu',
  'font_size': 10.061732292175293,
  'is_bold': False,
  'is_underlined': False,
  'page': 1,
  'source': 'text'},
 {'text': 'Agentic RAG system utilizing Pathway’s real-',
  'font': 'NimbusRomNo9L-Regu',
  'font_size': 10.061732292175293,
  'is_bold': False,
  'is_underlined': False,
  'page': 1,
  'source': 'text'},
 {'text': 'time data processing capabilities. We identify',
  

In [12]:
!pip install PyMuPDF Pillow pytesseract



In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import os

# !! IMPORTANT !!
# You might need to set the path to the Tesseract executable
# if it's not in your system's PATH.
# For example (Windows):
# pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# For example (Linux):
# pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
# Check your Tesseract installation path and uncomment/modify the line below if needed.
# pytesseract.tesseract_cmd = r'YOUR_TESSERACT_PATH\tesseract.exe' # <--- Modify this if needed

def ocr_pdf(pdf_path, txt_path):
    """
    Performs OCR on each page of a PDF and saves the extracted text.

    Args:
        pdf_path (str): The path to the input PDF file.
        txt_path (str): The path to the output text file.
    """
    doc = None # Initialize doc to None
    try:
        # Open the PDF document
        doc = fitz.open(pdf_path)
        full_ocr_text = ""

        print(f"Starting OCR for '{pdf_path}'...")

        # Iterate through each page
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)  # Load the current page

            # Convert the page to an image (pixmap)
            # You can adjust dpi for higher resolution, e.g., scale=2 for 144 dpi
            pixmap = page.get_pixmap()

            # Convert the pixmap to a PIL Image
            img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

            # Perform OCR on the image using pytesseract
            try:
                page_text = pytesseract.image_to_string(img)
                print(f"Processed page {page_num + 1}/{doc.page_count}")

                # Add markers like you provided
                full_ocr_text += f"==Start of OCR for page {page_num + 1}==\n"
                full_ocr_text += page_text
                full_ocr_text += f"\n==End of OCR for page {page_num + 1}==\n\n"

            except pytesseract.TesseractNotFoundError:
                 print("Error: Tesseract is not installed or not in your PATH.")
                 print("Please install Tesseract OCR engine and/or set the pytesseract.tesseract_cmd path.")
                 return
            except Exception as e:
                 print(f"An error occurred during OCR on page {page_num + 1}: {e}")
                 # Optionally, skip this page or stop

        # Write the extracted text to a .txt file
        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(full_ocr_text)

        print(f"Successfully extracted text from '{pdf_path}' to '{txt_path}' using OCR.")

    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'")
        print("Please check the input PDF path.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Ensure the document is closed
        if doc:
             doc.close()

# --- Configuration ---
# !! IMPORTANT !!
# Replace 'your_input_file.pdf' with the actual path to your PDF file.
# Make sure the PDF file is accessible from where you run the script.
input_pdf_file = 'sdg.pdf'

# Replace 'output_ocr_text_file.txt' with the desired name for the output text file.
output_text_file = 'output_ocr_text_file.txt'

# --- Run the OCR conversion ---
if __name__ == "__main__":
    if os.path.exists(input_pdf_file):
        ocr_pdf(input_pdf_file, output_text_file)
    else:
        print(f"Error: Input PDF file not found at '{input_pdf_file}'.")
        print("Please update the 'input_pdf_file' variable with the correct path.")

Starting OCR for 'sdg.pdf'...
Processed page 1/20
Processed page 2/20
Processed page 3/20
Processed page 4/20
Processed page 5/20
Processed page 6/20
Processed page 7/20
Processed page 8/20
Processed page 9/20
Processed page 10/20
Processed page 11/20
Processed page 12/20
Processed page 13/20
Processed page 14/20
Processed page 15/20
Processed page 16/20
Processed page 17/20
Processed page 18/20
Processed page 19/20
Processed page 20/20
Successfully extracted text from 'sdg.pdf' to 'output_ocr_text_file.txt' using OCR.
