In [1]:
pip install --upgrade pymupdf

Requirement already up-to-date: pymupdf in /home/lennaert/.local/lib/python3.8/site-packages (1.23.6)
Note: you may need to restart the kernel to use updated packages.


In [86]:
import fitz
import re

In [140]:
doc = fitz.open("../resources/testpdf/marx.pdf", filetype="pdf")

In [141]:
"""
Read a document page by page
"""
pages = doc.pages()

# This variable would be used if we had a method of identifying what 
# we wanted to redact for each page. 
redaction_annotations = {
    # page: [an1, an2...]
    1: [...],
    2: [...]
}

for page in pages:
    # search for annotations
    print(page)

page 0 of ../resources/testpdf/marx.pdf


In [142]:
"""
Function that extract the xref of an object in case of 
only one reference by object.
"""
def getObjectNum(xref):
    temp = xref[1].strip('[]').split()
    temp = [item.strip(",") for item in temp]
    return int(temp[0])

In [143]:
"""
Get Root object of doc (start point of reading a document)
"""
root = {}

# Recurring structure of getting key-value pairs of a dictonary of 
# an object. 
# TODO: make function
root_xref = doc.pdf_catalog()
for key in doc.xref_get_keys(root_xref):
    root[key] = doc.xref_get_key(root_xref, key)

print(root)

{'StructTreeRoot': ('xref', '3 0 R'), 'MarkInfo': ('dict', '<</Marked true>>'), 'Pages': ('xref', '4 0 R'), 'Type': ('name', '/Catalog'), 'PageLabels': ('dict', '<</Nums[0<</S/D>>]>>'), 'OutputIntents': ('array', '[5 0 R]'), 'PageMode': ('name', '/UseNone'), 'Lang': ('string', 'NL'), 'OpenAction': ('array', '[6 0 R/XYZ 0 841.9 0]'), 'Metadata': ('xref', '1 0 R')}


In [144]:
"""
Get Pages object of doc which contains the actual pages of the PDF. 
"""
xref = root['Pages']

pages = {}
xref = getObjectNum(xref)
for key in doc.xref_get_keys(xref):
    pages[key] = doc.xref_get_key(xref, key)

print(pages)

{'Type': ('name', '/Pages'), 'ITXT': ('string', '4.1.6'), 'Kids': ('array', '[6 0 R]'), 'Count': ('int', '1')}


In [145]:
"""
Extract object array function. Handle arrays with multiple references
to other objects. Returns a list with integer object xrefs. 
"""

def extractObjectArray(array):
    temp = array.strip('[]').split()
    temp = [item.strip(",") for item in temp]
    pdf_pages_xref = temp[::3]
    pdf_pages_xref = [int(x) for x in pdf_pages_xref]
    return pdf_pages_xref

In [146]:
"""
Get all xref of pdf pages
"""
import ast

page_object = pages['Kids']

pdf_pages_xref = []

if page_object[0] == 'array':
    pdf_pages_xref = extractObjectArray(page_object[1])
    
print(pdf_pages_xref)

[6]


In [147]:
"""
Extract info about the pages in the PDF
"""

pdf_pages = {}

for page_xref in pdf_pages_xref:
    print(page_xref)
    fields = {}
    for key in doc.xref_get_keys(page_xref):
        fields[key] = doc.xref_get_key(page_xref, key)
    pdf_pages[page_xref] = fields
    
print(pdf_pages)

6
{6: {'StructParents': ('int', '0'), 'Type': ('name', '/Page'), 'BleedBox': ('array', '[0 0 595.276 841.89]'), 'Contents': ('array', '[16 0 R 17 0 R]'), 'CropBox': ('array', '[0 0 595.276 841.89]'), 'MediaBox': ('array', '[0 0 595.276 841.89]'), 'Parent': ('xref', '4 0 R'), 'Resources': ('xref', '18 0 R'), 'TrimBox': ('array', '[0 0 595.276 841.89]')}}


In [148]:
"""
Extract info about resources used by page
"""
# in this case: the first page of the 'pdf_pages' list.
page = pdf_pages[6]

resources_xref = []

resources = page['Resources']
if resources[0] == 'xref':
    resources_xref = [getObjectNum(resources)]

page_resources = {}
if len(resources_xref) == 1:
    fields = {}
    for key in doc.xref_get_keys(resources_xref[0]):
        fields[key] = doc.xref_get_key(resources_xref[0], key)
        
    page_resources[resources_xref[0]] = fields
else:
    pass

print(page_resources)


{18: {'Font': ('dict', '<</F3 19 0 R/F2 20 0 R/F1 21 0 R/F0 22 0 R>>'), 'ProcSet': ('array', '[/PDF/ImageC/Text]'), 'XObject': ('dict', '<</I0 23 0 R>>')}}


In [252]:
"""
Extract fonts used by page
"""

page_fonts = {}

resource = page_resources[18]
fonts = resource['Font']
if fonts[0] == 'dict':
    fonts_ref = fonts[1].strip("<<>>").split("/")[1:]
    for font_ref in fonts_ref:
        tokens = font_ref.split(" ")
        page_fonts[tokens[0]] = int(tokens[1])
else:
    print("ERROR")

#print(page_fonts, "\n")

# iterate every font
page_fonts_info = {}
for font_xref in page_fonts.items():
    fields = {}
    for key in doc.xref_get_keys(font_xref[1]):
        fields[key] = doc.xref_get_key(font_xref[1], key)
    
    page_fonts_info[font_xref[0]] = fields
    
#print(page_fonts_info, "\n")

fonts = {}

# get to unicode
for font in page_fonts_info.items():
    toUnicode = {}
    descendant = {}
    toUnicode_xref = font[1]['ToUnicode']
    descendent_xref = font[1]['DescendantFonts']

    if toUnicode_xref[0] == 'xref':
        toUnicode_xref = getObjectNum(toUnicode_xref)
        fields = {}
        for key in doc.xref_get_keys(toUnicode_xref):
            fields[key] = doc.xref_get_key(toUnicode_xref, key)
        if stream := doc.xref_stream(toUnicode_xref):
            fields["Stream"] = stream.decode("latin-1")
        toUnicode = fields

    if descendent_xref[0] == 'array':
        descendent_xref = extractObjectArray(descendent_xref[1])[0]
        fields = {}
        for key in doc.xref_get_keys(descendent_xref):
            fields[key] = doc.xref_get_key(descendent_xref, key)
        if stream := doc.xref_stream(descendent_xref):
            fields["Stream"] = stream.decode("latin-1")
        descendant = fields
                              
    fonts[font[0]] = {'toUnicode': toUnicode, 'descendentFonts': descendant}

print(fonts)

{'F3': {'toUnicode': {'Length': ('int', '357'), 'Filter': ('name', '/FlateDecode'), 'Stream': '%!PS-Adobe-3.0 Resource-CMap\n/CIDInit /ProcSet findresource begin\n12 dict begin\nbegincmap\n/CIDSystemInfo\n<< /Registry (Adobe)\n/Ordering (UCS)\n/Supplement 0\n>> def\n/CMapName /AdHoc-UCS def\n/CMapType 2 def\n1 begincodespacerange\n<0000> <FFFF>\nendcodespacerange\n28 beginbfchar\n<0010> <0030>\n<0012> <0032>\n<0022> <0042>\n<0024> <0044>\n<002f> <004f>\n<0037> <0057>\n<0041> <0061>\n<0042> <0062>\n<0043> <0063>\n<0044> <0064>\n<0045> <0065>\n<0046> <0066>\n<0047> <0067>\n<0048> <0068>\n<0049> <0069>\n<004a> <006a>\n<004b> <006b>\n<004c> <006c>\n<004d> <006d>\n<004e> <006e>\n<0050> <0070>\n<0052> <0072>\n<0053> <0073>\n<0054> <0074>\n<0055> <0075>\n<0056> <0076>\n<00d4> <00eb>\n<00e5> <0020>\nendbfchar\nendcmap\nCMapName currentdict /CMap defineresource pop\nend\nend\n'}, 'descendentFonts': {'FontDescriptor': ('xref', '50 0 R'), 'W': ('array', '[16[444]18[444]34[611]36[611]47[611]55[889

In [273]:
"""
Parse character mapping of fonts
"""
fonts_mapping = {}
pattern = re.compile(r'beginbfchar\s*\n(.*?)(?=\nendbfchar)', re.DOTALL)
for font in fonts.items():
    cmap_stream = font[1]['toUnicode']['Stream']
    matches = pattern.findall(cmap_stream)
    if len(matches) > 1:
        print(error)
        break
    
    matches = matches[0].split("\n")
    mapping = {}
    for match in matches:
        item = match.split(" ")
        mapping[bytes.fromhex(item[0][1:-1]).decode('latin-1')] = chr(int(item[1][1:-1], 16))
        
    fonts_mapping[font[0]] = mapping

print(fonts_mapping)

{'F3': {'\x00\x10': '0', '\x00\x12': '2', '\x00"': 'B', '\x00$': 'D', '\x00/': 'O', '\x007': 'W', '\x00A': 'a', '\x00B': 'b', '\x00C': 'c', '\x00D': 'd', '\x00E': 'e', '\x00F': 'f', '\x00G': 'g', '\x00H': 'h', '\x00I': 'i', '\x00J': 'j', '\x00K': 'k', '\x00L': 'l', '\x00M': 'm', '\x00N': 'n', '\x00P': 'p', '\x00R': 'r', '\x00S': 's', '\x00T': 't', '\x00U': 'u', '\x00V': 'v', '\x00Ô': 'ë', '\x00å': ' '}, 'F2': {'\x00\r': '.', '\x00\x0f': '0', '\x00\x10': '1', '\x00\x11': '2', '\x00\x15': '6', '\x00\x18': '9', '\x00 ': 'A', '\x00!': 'B', '\x00"': 'C', '\x00#': 'D', '\x00$': 'E', "\x00'": 'H', '\x00+': 'L', '\x00-': 'N', '\x00/': 'P', '\x001': 'R', '\x002': 'S', '\x003': 'T', '\x006': 'W', '\x00@': 'a', '\x00A': 'b', '\x00B': 'c', '\x00C': 'd', '\x00D': 'e', '\x00G': 'h', '\x00K': 'l', '\x00L': 'm', '\x00M': 'n', '\x00O': 'p', '\x00Q': 'r', '\x00R': 's', '\x00S': 't', '\x00ä': ' '}, 'F1': {'\x00\x01': ' ', '\x00\x0c': ',', '\x00\r': '-', '\x00\x0e': '.', '\x00\x10': '0', '\x00\x11': '1', 

In [274]:
"""
Extract contents for given page
"""
# in this case: the first page of the 'pdf_pages' list.

page = pdf_pages[6]

contents = page['Contents']

# Store the xrefs of contents
contents_xref = []

if contents[0] == 'array':
    contents_xref = extractObjectArray(contents[1])

print(contents_xref)

# Get info about the contents of this page
page_contents = {}
for content_xref in contents_xref:
    fields = {}
    for key in doc.xref_get_keys(content_xref):
        fields[key] = doc.xref_get_key(content_xref, key)
    if stream := doc.xref_stream(content_xref):
        fields["Stream"] = stream.decode("latin")
    page_contents[content_xref] = fields
    
print(page_contents)

[16, 17]
{16: {'Length': ('int', '44'), 'Filter': ('name', '/FlateDecode'), 'Stream': 'q\n0 0 m\n595.276 0 l\n595.276 841.89 l\n0 841.89 l\nh\nW n\n'}, 17: {'Length': ('int', '2545'), 'Filter': ('name', '/FlateDecode'), 'Stream': '0 Tw\n0 Tc\n0 G\n0 g\n/Block <</MCID 0>>\nBDC\nBT\n/F0 12 Tf\n1 0 0 1 119.055 721.212 Tm\n(\\000$\\000D\\000Q)Tj\n1 0 0 1 141.483 721.212 Tm\n(\\000Y\\000U)Tj\n1 0 0 1 152.583 721.212 Tm\n(\\000D\\000D\\000J)Tj\n1 0 0 1 173.259 721.212 Tm\n(\\000\\003)Tj\n1 0 0 1 176.595 721.212 Tm\n(\\000Z)Tj\n1 0 0 1 187.023 721.212 Tm\n(\\000D\\000W)Tj\n1 0 0 1 198.123 721.212 Tm\n(\\000H\\000U\\000Y)Tj\n1 0 0 1 215.895 721.212 Tm\n(\\000H\\000U)Tj\n1 0 0 1 226.935 721.212 Tm\n(\\000J\\000X\\000Q\\000Q\\000L\\000Q\\000J)Tj\n1 0 0 1 274.263 721.212 Tm\n(\\000\\003)Tj\n1 0 0 1 277.599 721.212 Tm\n(\\000Y)Tj\n1 0 0 1 284.031 721.212 Tm\n(\\000R\\000R\\000U)Tj\n1 0 0 1 303.363 721.212 Tm\n(\\000\\003)Tj\n1 0 0 1 306.699 721.212 Tm\n(\\000K\\000H\\000W)Tj\n1 0 0 1 325.371 721.2

In [282]:
"""
Extract text elements of the content stream
"""
import re
import codecs

# lets use the second content stream of 'page_contents'

content_stream = page_contents[17]["Stream"]

matches = re.findall(r'BT(.*?)ET',content_stream, re.DOTALL)
for match in matches:
    decoded_text = codecs.escape_decode(match.strip())[0].decode('utf-8', errors='replace').replace('\\)', ')')
    print(decoded_text)
    # do something with decoded text: make it a dictonary?


/F0 12 Tf
1 0 0 1 119.055 721.212 Tm
( $ D Q)Tj
1 0 0 1 141.483 721.212 Tm
( Y U)Tj
1 0 0 1 152.583 721.212 Tm
( D D J)Tj
1 0 0 1 173.259 721.212 Tm
( )Tj
1 0 0 1 176.595 721.212 Tm
( Z)Tj
1 0 0 1 187.023 721.212 Tm
( D W)Tj
1 0 0 1 198.123 721.212 Tm
( H U Y)Tj
1 0 0 1 215.895 721.212 Tm
( H U)Tj
1 0 0 1 226.935 721.212 Tm
( J X Q Q L Q J)Tj
1 0 0 1 274.263 721.212 Tm
( )Tj
1 0 0 1 277.599 721.212 Tm
( Y)Tj
1 0 0 1 284.031 721.212 Tm
( R R U)Tj
1 0 0 1 303.363 721.212 Tm
( )Tj
1 0 0 1 306.699 721.212 Tm
( K H W)Tj
1 0 0 1 325.371 721.212 Tm
( )Tj
1 0 0 1 328.707 721.212 Tm
( X L W Y)Tj
1 0 0 1 350.475 721.212 Tm
( R H U)Tj
1 0 0 1 368.787 721.212 Tm
( H Q)Tj
1 0 0 1 382.791 721.212 Tm
( )Tj
1 0 0 1 386.127 721.212 Tm
( Y)Tj
1 0 0 1 392.319 721.212 Tm
( D Q)Tj
1 0 0 1 406.323 721.212 Tm
( )Tj
1 0 0 1 409.659 721.212 Tm
( Z)Tj
1 0 0 1 420.087 721.212 Tm
( D W)Tj
1 0 0 1 431.187 721.212 Tm
( H U)Tj
1 0 0 1 442.047 721.212 Tm
( K X L V K R X G N X Q G L J)Tj
1 0 0 1 534.471 721.212 

  decoded_text = codecs.escape_decode(match.strip())[0].decode('utf-8', errors='replace').replace('\\)', ')')


In [297]:
"""
Save text elements in datastructure

pdf_cords: {text: value, font: font_ref, rawstring: orginal_raw_string}

rawstring used for replacing in orginal string?

"""
text_elements = {}
for match in matches:
    font_info = {'reference': '/F0', 'fontsize': '11', 'match': ""}
    raw_string = ""
    split = match.strip().split("\n")
    for item in split:
        tokenized = item.split(" ")
        if tokenized[-1] == 'Tf':
            font_info['reference'] = tokenized[0]
            font_info['fontsize'] = tokenized[1]
            font_info['match'] = item
            continue 
        elif tokenized[-1] == 'Tm':
            x = float(tokenized[4])
            y = float(tokenized[5])
            raw_string = item
        elif len(tokenized) == 1:
            try: 
                pk = codecs.escape_decode(tokenized[0].strip())[0].decode('utf-8', errors='replace').replace('\\)', ')')
                pk = pk.split("(")[1].split(")")
                # decode string here

                # check if pk[1] == 'Tj'
                text = pk[0]
                raw_string = raw_string + "\n" + item
                text_elements[(x, y)] = {'font': font_info, 'string': text, 'match': raw_string}
            except ValueError:
                pass
                
print(text_elements)

{(119.055, 721.212): {'font': {'reference': '/F0', 'fontsize': '12', 'match': '/F0 12 Tf'}, 'string': '\x00$\x00D\x00Q', 'match': '1 0 0 1 119.055 721.212 Tm\n(\\000$\\000D\\000Q)Tj'}, (141.483, 721.212): {'font': {'reference': '/F0', 'fontsize': '12', 'match': '/F0 12 Tf'}, 'string': '\x00Y\x00U', 'match': '1 0 0 1 141.483 721.212 Tm\n(\\000Y\\000U)Tj'}, (152.583, 721.212): {'font': {'reference': '/F0', 'fontsize': '12', 'match': '/F0 12 Tf'}, 'string': '\x00D\x00D\x00J', 'match': '1 0 0 1 152.583 721.212 Tm\n(\\000D\\000D\\000J)Tj'}, (173.259, 721.212): {'font': {'reference': '/F0', 'fontsize': '12', 'match': '/F0 12 Tf'}, 'string': '\x00\x03', 'match': '1 0 0 1 173.259 721.212 Tm\n(\\000\\003)Tj'}, (176.595, 721.212): {'font': {'reference': '/F0', 'fontsize': '12', 'match': '/F0 12 Tf'}, 'string': '\x00Z', 'match': '1 0 0 1 176.595 721.212 Tm\n(\\000Z)Tj'}, (187.023, 721.212): {'font': {'reference': '/F0', 'fontsize': '12', 'match': '/F0 12 Tf'}, 'string': '\x00D\x00W', 'match': '1 

  pk = codecs.escape_decode(tokenized[0].strip())[0].decode('utf-8', errors='replace').replace('\\)', ')')


In [294]:
pages = doc.pages()
for page in pages:
    res = page.search_for('0652568389')
    
    # translate from PyMuPDF system to pdf system
    m = page.transformation_matrix
    print(m)
    print(res[0], res[0] * m)
    
    # check if previously extracted content stream contains
    # item contained within this rect.

Matrix(1.0, 0.0, 0.0, -1.0, 0.0, 841.8900146484375)
Rect(409.5469970703125, 149.40919494628906, 459.9205322265625, 160.25401306152344) Rect(409.5469970703125, 681.635986328125, 459.9205322265625, 692.4808349609375)


In [302]:
"""
Translate string of 'text_elements' using 'fonts_mapping'
"""
for text in text_elements.values():
    #print(text)
    font_reference = text['font']['reference']
    value = text['string']
    for item in value:
        # empty/space charachter
        if item == "\x00":
            continue
        print(fonts_mapping[font_reference.split("/")[1]]['\x00' + item], end="")
    
    

Aanvraag watervergunning voor het uitvoeren van waterhuishoudkundige werkzaamheden ter hoogte van de Biestraat 3 te Gilze. Aanvraag ontvangen op 14 september 2022 met registratienummer 0652568389 voor het planten van een haag en 2 bomen op de beschermingszone van een a-water ter hoogte van Biestraat 3 te Gilze.    u meer informatie wenst over de aanvraag kunt u contact opnemen via het telefoonnummer 076 564 13 45.   Breda, 22 september 2022 Nr. 10629 W22 september 2022 Officiële uitgave van het dagelijks bestuur van het Waterschap Brabantse Delta Waterschapsblad 2022 nr. 1062922 september 2022 1 