In [11]:
import os
from dotenv import load_dotenv

import pymupdf
from langchain_community.document_loaders.parsers import RapidOCRBlobParser
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

load_dotenv()

PDF_PATH = "data/sample.pdf"
IMAGES_DIR = "data/images"

# Create images directory if it doesn't exist
os.makedirs(IMAGES_DIR, exist_ok=True)

In [12]:
# Extract and save images from the PDF
doc = pymupdf.open(PDF_PATH)
for page_num, page in enumerate(doc):
    images = page.get_images()
    for img_idx, img in enumerate(images):
        xref = img[0]
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]
        
        image_path = f"{IMAGES_DIR}/page{page_num}_img{img_idx}.{image_ext}"
        with open(image_path, "wb") as f:
            f.write(image_bytes)
        print(f"Saved: {image_path}")
doc.close()

Saved: data/images/page0_img0.jpeg
Saved: data/images/page1_img0.png
Saved: data/images/page2_img0.jpeg
Saved: data/images/page2_img1.jpeg
Saved: data/images/page3_img0.png
Saved: data/images/page3_img1.png
Saved: data/images/page4_img0.png
Saved: data/images/page4_img1.png
Saved: data/images/page4_img2.png


In [13]:
# Load documents with OCR for images
parser = PyMuPDF4LLMLoader(
    file_path=PDF_PATH,
    mode="page" 
)
docs = parser.lazy_load()
docs = list(docs)
print(f"\nTotal pages: {len(docs)}")


Total pages: 10


In [16]:
docs[1].page_content

"Pitch to Raw Fury 2\n\n\n**What is this game?**\n\n_My Work Is Not Yet Done_ is a narrative-driven investigative horror game, combining\nelements of the survival/simulation genres with a dense, nonlinear plot exploring the\nimbrication and dissolution of human identities/meanings within uncanny wilderness.\n\n_Work_ follows the final days of the inquisitor Avery, the last remaining survivor of a\ndoomed excursionary mission into a remote and unaccountable stretch of old\ncountry. Tasked with tracking down the source of an inscrutable signal, she must\nnavigate, survey and survive an eerie, ever-shifting landscape reclaimed by nature\n— and perhaps something stranger.\n\nIt is currently scheduled for release in 2021/2022 for Windows, across Steam ($20),\nitch.io ($20 recommended, pay-what-you-can), and GitHub (open-source). It is being\ndeveloped in GameMaker Studio 2.\n\n[You can view a trailer here, and a monthly development log here.](https://www.youtube.com/watch?v=KmwlUFxTwc0)\n\n