In [1]:
pip install --upgrade pymupdf

Requirement already up-to-date: pymupdf in /home/lennaert/.local/lib/python3.8/site-packages (1.23.6)
Note: you may need to restart the kernel to use updated packages.


In [2]:
import fitz

In [3]:
doc = fitz.open("../resources/testpdf/marx.pdf", filetype="pdf")

In [4]:
"""
Read a document page by page
"""
pages = doc.pages()

# This variable would be used if we had a method of identifying what 
# we wanted to redact for each page. 
redaction_annotations = {
    # page: [an1, an2...]
    1: [...],
    2: [...]
}

for page in pages:
    # search for annotations
    print(page)

page 0 of ../resources/testpdf/marx.pdf


In [120]:
"""
Function that extract the xref of an object in case of 
only one reference by object.
"""
def getObjectNum(xref):
    temp = xref[1].strip('[]').split()
    temp = [item.strip(",") for item in temp]
    return int(temp[0])

In [121]:
"""
Get Root object of doc (start point of reading a document)
"""
root = {}

# recurring structure of getting key-value pairs of a dictonary of 
# an object.
root_xref = doc.pdf_catalog()
for key in doc.xref_get_keys(root_xref):
    root[key] = doc.xref_get_key(root_xref, key)

print(root)

{'StructTreeRoot': ('xref', '3 0 R'), 'MarkInfo': ('dict', '<</Marked true>>'), 'Pages': ('xref', '4 0 R'), 'Type': ('name', '/Catalog'), 'PageLabels': ('dict', '<</Nums[0<</S/D>>]>>'), 'OutputIntents': ('array', '[5 0 R]'), 'PageMode': ('name', '/UseNone'), 'Lang': ('string', 'NL'), 'OpenAction': ('array', '[6 0 R/XYZ 0 841.9 0]'), 'Metadata': ('xref', '1 0 R')}


In [122]:
"""
Get Pages object of doc
"""
xref = root['Pages']

pages = {}
xref = getObjectNum(xref)
for key in doc.xref_get_keys(xref):
    pages[key] = doc.xref_get_key(xref, key)

print(pages)

{'Type': ('name', '/Pages'), 'ITXT': ('string', '4.1.6'), 'Kids': ('array', '[6 0 R]'), 'Count': ('int', '1')}


In [123]:
"""
Extract object array function
"""

def extractObjectArray(array):
    temp = array.strip('[]').split()
    temp = [item.strip(",") for item in temp]
    pdf_pages_xref = temp[::3]
    pdf_pages_xref = [int(x) for x in pdf_pages_xref]
    return pdf_pages_xref

In [124]:
"""
Get all xref of pdf pages
"""
import ast

page_object = pages['Kids']

pdf_pages_xref = []

if page_object[0] == 'array':
    pdf_pages_xref = extractObjectArray(page_object[1])
    
print(pdf_pages_xref)

[6]


In [125]:
"""
Extract info about the pages in the PDF
"""

pdf_pages = {}

for page_xref in pdf_pages_xref:
    print(page_xref)
    fields = {}
    for key in doc.xref_get_keys(page_xref):
        fields[key] = doc.xref_get_key(page_xref, key)
    pdf_pages[page_xref] = fields
    
print(pdf_pages)

6
{6: {'StructParents': ('int', '0'), 'Type': ('name', '/Page'), 'BleedBox': ('array', '[0 0 595.276 841.89]'), 'Contents': ('array', '[16 0 R 17 0 R]'), 'CropBox': ('array', '[0 0 595.276 841.89]'), 'MediaBox': ('array', '[0 0 595.276 841.89]'), 'Parent': ('xref', '4 0 R'), 'Resources': ('xref', '18 0 R'), 'TrimBox': ('array', '[0 0 595.276 841.89]')}}


In [126]:
"""
Extract info about resources used by page
"""
# in this case: the first page of the 'pdf_pages' list.
page = pdf_pages[6]

resources_xref = []

resources = page['Resources']
if resources[0] == 'xref':
    resources_xref = [getObjectNum(resources)]

page_resources = {}
if len(resources_xref) == 1:
    fields = {}
    for key in doc.xref_get_keys(resources_xref[0]):
        fields[key] = doc.xref_get_key(resources_xref[0], key)
        
    page_resources[resources_xref[0]] = fields
else:
    pass

print(page_resources)


{18: {'Font': ('dict', '<</F3 19 0 R/F2 20 0 R/F1 21 0 R/F0 22 0 R>>'), 'ProcSet': ('array', '[/PDF/ImageC/Text]'), 'XObject': ('dict', '<</I0 23 0 R>>')}}


In [180]:
"""
Extract fonts used by page
"""

page_fonts = {}

resource = page_resources[18]
fonts = resource['Font']
if fonts[0] == 'dict':
    fonts_ref = fonts[1].strip("<<>>").split("/")[1:]
    for font_ref in fonts_ref:
        tokens = font_ref.split(" ")
        page_fonts[tokens[0]] = int(tokens[1])
else:
    print("ERROR")

print(page_fonts, "\n")

# iterate every font
page_fonts_info = {}
for font_xref in page_fonts.items():
    fields = {}
    for key in doc.xref_get_keys(font_xref[1]):
        fields[key] = doc.xref_get_key(font_xref[1], key)
    
    page_fonts_info[font_xref[0]] = fields
    
print(page_fonts_info, "\n")

# take for example font 'F3'
font_f3 = page_fonts_info['F1']
f3_toUnicode = {}
f3_descendant = {}

# get to unicode
toUnicode_xref = font_f3['ToUnicode']
if toUnicode_xref[0] == 'xref':
    toUnicode_xref = getObjectNum(toUnicode_xref)
    fields = {}
    for key in doc.xref_get_keys(toUnicode_xref):
        fields[key] = doc.xref_get_key(toUnicode_xref, key)
    if stream := doc.xref_stream(toUnicode_xref):
        fields["Stream"] = stream.decode("latin-1")
    f3_toUnicode['F3'] = fields

print(f3_toUnicode, "\n")

# get font descendent
descendent_xref = font_f3['DescendantFonts']
if descendent_xref[0] == 'array':
    descendent_xref = extractObjectArray(descendent_xref[1])[0]
    fields = {}
    for key in doc.xref_get_keys(descendent_xref):
        fields[key] = doc.xref_get_key(descendent_xref, key)
    if stream := doc.xref_stream(descendent_xref):
        fields["Stream"] = stream.decode("Identity-H")
    f3_descendant['F3'] = fields

print(f3_descendant, "\n")

{'F3': 19, 'F2': 20, 'F1': 21, 'F0': 22} 

{'F3': {'Encoding': ('name', '/Identity-H'), 'Type': ('name', '/Font'), 'BaseFont': ('name', '/UniversLT-Condensed'), 'Subtype': ('name', '/Type0'), 'DescendantFonts': ('array', '[34 0 R]'), 'ToUnicode': ('xref', '35 0 R')}, 'F2': {'Encoding': ('name', '/Identity-H'), 'Type': ('name', '/Font'), 'BaseFont': ('name', '/UniversLT-CondensedBold'), 'Subtype': ('name', '/Type0'), 'DescendantFonts': ('array', '[32 0 R]'), 'ToUnicode': ('xref', '33 0 R')}, 'F1': {'Encoding': ('name', '/Identity-H'), 'Type': ('name', '/Font'), 'BaseFont': ('name', '/UniversLT'), 'Subtype': ('name', '/Type0'), 'DescendantFonts': ('array', '[30 0 R]'), 'ToUnicode': ('xref', '31 0 R')}, 'F0': {'Encoding': ('name', '/Identity-H'), 'Type': ('name', '/Font'), 'BaseFont': ('name', '/UniversLT-Bold'), 'Subtype': ('name', '/Type0'), 'DescendantFonts': ('array', '[26 0 R]'), 'ToUnicode': ('xref', '27 0 R')}} 

{'F3': {'Length': ('int', '415'), 'Filter': ('name', '/FlateDecode'),

In [None]:
"""
Parse mapping of 
"""

In [172]:
"""
Extract contents for given page
"""
# in this case: the first page of the 'pdf_pages' list.

page = pdf_pages[6]

contents = page['Contents']

# Store the xrefs of contents
contents_xref = []

if contents[0] == 'array':
    contents_xref = extractObjectArray(contents[1])

print(contents_xref)

# Get info about the contents of this page
page_contents = {}
for content_xref in contents_xref:
    fields = {}
    for key in doc.xref_get_keys(content_xref):
        fields[key] = doc.xref_get_key(content_xref, key)
    if stream := doc.xref_stream(content_xref):
        fields["Stream"] = stream.decode("latin")
    page_contents[content_xref] = fields
    
print(page_contents)

[16, 17]
{16: {'Length': ('int', '44'), 'Filter': ('name', '/FlateDecode'), 'Stream': 'q\n0 0 m\n595.276 0 l\n595.276 841.89 l\n0 841.89 l\nh\nW n\n'}, 17: {'Length': ('int', '2545'), 'Filter': ('name', '/FlateDecode'), 'Stream': '0 Tw\n0 Tc\n0 G\n0 g\n/Block <</MCID 0>>\nBDC\nBT\n/F0 12 Tf\n1 0 0 1 119.055 721.212 Tm\n(\\000$\\000D\\000Q)Tj\n1 0 0 1 141.483 721.212 Tm\n(\\000Y\\000U)Tj\n1 0 0 1 152.583 721.212 Tm\n(\\000D\\000D\\000J)Tj\n1 0 0 1 173.259 721.212 Tm\n(\\000\\003)Tj\n1 0 0 1 176.595 721.212 Tm\n(\\000Z)Tj\n1 0 0 1 187.023 721.212 Tm\n(\\000D\\000W)Tj\n1 0 0 1 198.123 721.212 Tm\n(\\000H\\000U\\000Y)Tj\n1 0 0 1 215.895 721.212 Tm\n(\\000H\\000U)Tj\n1 0 0 1 226.935 721.212 Tm\n(\\000J\\000X\\000Q\\000Q\\000L\\000Q\\000J)Tj\n1 0 0 1 274.263 721.212 Tm\n(\\000\\003)Tj\n1 0 0 1 277.599 721.212 Tm\n(\\000Y)Tj\n1 0 0 1 284.031 721.212 Tm\n(\\000R\\000R\\000U)Tj\n1 0 0 1 303.363 721.212 Tm\n(\\000\\003)Tj\n1 0 0 1 306.699 721.212 Tm\n(\\000K\\000H\\000W)Tj\n1 0 0 1 325.371 721.2

In [196]:
"""
Extract text elements of the content stream
"""
import re
import codecs

# lets use the second content stream of 'page_contents'

content_stream = page_contents[17]["Stream"]

matches = re.findall(r'BT(.*?)ET',content_stream, re.DOTALL)
for match in matches:
    decoded_text = codecs.escape_decode(match.strip())[0].decode('latin-1', errors='replace')
    print(decoded_text)
    # do something with decoded text: make it a dictonary?


/F0 12 Tf
1 0 0 1 119.055 721.212 Tm
( $ D Q)Tj
1 0 0 1 141.483 721.212 Tm
( Y U)Tj
1 0 0 1 152.583 721.212 Tm
( D D J)Tj
1 0 0 1 173.259 721.212 Tm
( )Tj
1 0 0 1 176.595 721.212 Tm
( Z)Tj
1 0 0 1 187.023 721.212 Tm
( D W)Tj
1 0 0 1 198.123 721.212 Tm
( H U Y)Tj
1 0 0 1 215.895 721.212 Tm
( H U)Tj
1 0 0 1 226.935 721.212 Tm
( J X Q Q L Q J)Tj
1 0 0 1 274.263 721.212 Tm
( )Tj
1 0 0 1 277.599 721.212 Tm
( Y)Tj
1 0 0 1 284.031 721.212 Tm
( R R U)Tj
1 0 0 1 303.363 721.212 Tm
( )Tj
1 0 0 1 306.699 721.212 Tm
( K H W)Tj
1 0 0 1 325.371 721.212 Tm
( )Tj
1 0 0 1 328.707 721.212 Tm
( X L W Y)Tj
1 0 0 1 350.475 721.212 Tm
( R H U)Tj
1 0 0 1 368.787 721.212 Tm
( H Q)Tj
1 0 0 1 382.791 721.212 Tm
( )Tj
1 0 0 1 386.127 721.212 Tm
( Y)Tj
1 0 0 1 392.319 721.212 Tm
( D Q)Tj
1 0 0 1 406.323 721.212 Tm
( )Tj
1 0 0 1 409.659 721.212 Tm
( Z)Tj
1 0 0 1 420.087 721.212 Tm
( D W)Tj
1 0 0 1 431.187 721.212 Tm
( H U)Tj
1 0 0 1 442.047 721.212 Tm
( K X L V K R X G N X Q G L J)Tj
1 0 0 1 534.471 721.212 

  decoded_text = codecs.escape_decode(match.strip())[0].decode('latin-1', errors='replace')


In [214]:
pages = doc.pages()
for page in pages:
    res = page.search_for('0652568389')
    
    # translate from PyMuPDF system to pdf system
    m = page.transformation_matrix
    print(m)
    print(res[0], res[0] * m)
    
    # check if previously extracted content stream contains
    # item contained within this rect.

Matrix(1.0, 0.0, 0.0, -1.0, 0.0, 841.8900146484375)
Rect(409.5469970703125, 149.40919494628906, 459.9205322265625, 160.25401306152344) Rect(409.5469970703125, 681.635986328125, 459.9205322265625, 692.4808349609375)
