In [5]:
import fitz 
import io
from PIL import Image
from pathlib import Path

# Extract every image embedded in a PDF and write each one to its own file
# under data/output/<filename>/images/.

# file path you want to extract images from
filename = "2307.09288"
file = Path(f"data/raw/{filename}/{filename}.pdf")

# create output directory (parents included; no error if it already exists)
output_dir = Path("data/output") / filename / "images"
output_dir.mkdir(parents=True, exist_ok=True)

# Open the document as a context manager so it is closed even when
# extraction or writing raises part-way through the loop.
with fitz.open(file) as pdf_file:
    # iterate over PDF pages; page_index is 0-based
    for page_index, page in enumerate(pdf_file):
        image_list = page.get_images()

        # Report how many images this page references.
        # NOTE: these messages print the 0-based page index, whereas the
        # saved filenames below use the 1-based page number.
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
        else:
            print("[!] No images found on page", page_index)

        for image_index, img in enumerate(image_list, start=1):
            # the first item of each image entry is the image's XREF
            xref = img[0]

            # extract the raw image bytes and the file extension to use
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # save the image as page<1-based page>_img<1-based index>.<ext>
            image_filename = f"page{page_index+1}_img{image_index}.{image_ext}"
            image_path = output_dir / image_filename
            image_path.write_bytes(image_bytes)

            print(f"    Saved image {image_filename}")


[!] No images found on page 0
[!] No images found on page 1
[+] Found a total of 2 images in page 2
    Saved image page3_img1.jpeg
    Saved image page3_img2.png
[+] Found a total of 1 images in page 3
    Saved image page4_img1.png
[+] Found a total of 1 images in page 4
    Saved image page5_img1.jpeg
[!] No images found on page 5
[!] No images found on page 6
[!] No images found on page 7
[!] No images found on page 8
[!] No images found on page 9
[!] No images found on page 10
[!] No images found on page 11
[!] No images found on page 12
[!] No images found on page 13
[!] No images found on page 14
[+] Found a total of 2 images in page 15
    Saved image page16_img1.png
    Saved image page16_img2.png
[!] No images found on page 16
[!] No images found on page 17
[+] Found a total of 1 images in page 18
    Saved image page19_img1.png
[!] No images found on page 19
[+] Found a total of 1 images in page 20
    Saved image page21_img1.png
[!] No images found on page 21
[!] No images 