In [66]:
import fitz
import re

In [114]:
PDF_PATH = 'hp1.pdf'
doc = fitz.open(PDF_PATH)
print(doc)

Document('hp1.pdf')


In [3]:
# bbox format: [x1, y1, x2, y2]
# origin format: [x, y]

In [113]:
UNCERTAINTY_END = 10
UNCERTAINTY_SIZE = 2
UNCERTAINTY_LINE_HEIGTH = 2

REMOVE_PAGE_NUMBER = True

PAGE_NUM_PATTERN = r'[0-9]+'
#PAGE_NUM_PATTERN = r'\- [0-9]+ \-'

In [5]:
def flags_decomposer(flags):
    """Make font flags human readable."""
    l = []
    if flags & 2 ** 0:
        l.append("superscript")
    if flags & 2 ** 1:
        l.append("italic")
    if flags & 2 ** 2:
        l.append("serifed")
    else:
        l.append("sans")
    if flags & 2 ** 3:
        l.append("monospaced")
    else:
        l.append("proportional")
    if flags & 2 ** 4:
        l.append("bold")
    return l

In [101]:
def get_spans(book, starting_page=0):
    spans = []
    for page_nr in range(starting_page, book.page_count):
        blocks = book[page_nr].get_text('dict', flags=11)['blocks']
        for block in blocks:
            if block['type'] == 0: # Text Block
                # Remove the page number if needed
                if REMOVE_PAGE_NUMBER and len(block['lines']) == 1 and len(block['lines'][0]['spans']) == 1 and bool(re.match(PAGE_NUM_PATTERN, block['lines'][0]['spans'][0]['text'].strip())):
                   continue
                for line in block['lines']:
                    for span in line['spans']:
                        if span['text'].strip() != '':
                            spans.append({
                                'text': span['text'],
                                'size': round(span['size']),
                                'y_org': round(span['origin'][1]),
                                'x_end': round(span['bbox'][2]),
                                'flags': span['flags']
                            })
            if block['type'] == 1: # Image Block
                pass # TODO
    return spans

In [115]:
spans = get_spans(doc, starting_page=6)
spans

[{'text': 'Ein Junge Überlebt',
  'size': 22,
  'y_org': 63,
  'x_end': 283,
  'flags': 4},
 {'text': 'Mr. und Mrs. Dursley im Ligusterweg Nummer 4 waren stolz',
  'size': 11,
  'y_org': 105,
  'x_end': 354,
  'flags': 4},
 {'text': 'darauf, ganz und gar normal zu sein, sehr stolz sogar. Niemand wäre',
  'size': 11,
  'y_org': 118,
  'x_end': 354,
  'flags': 4},
 {'text': 'auf die Idee gekommen, sie könnten sich in eine merkwürdige und',
  'size': 11,
  'y_org': 131,
  'x_end': 354,
  'flags': 4},
 {'text': 'geheimnisvolle Geschichte verstricken, denn mit solchem Unsinn',
  'size': 11,
  'y_org': 144,
  'x_end': 354,
  'flags': 4},
 {'text': 'wollten sie nichts zu tun haben.',
  'size': 11,
  'y_org': 157,
  'x_end': 179,
  'flags': 4},
 {'text': 'Mr. Dursley war Direktor einer Firma namens Grunnings, die',
  'size': 12,
  'y_org': 171,
  'x_end': 353,
  'flags': 4},
 {'text': 'Bohrmaschinen herstellte. Er war groß und bullig und hatte fast',
  'size': 12,
  'y_org': 186,
  'x_end': 35

In [77]:
def same(a, b, uncertainty):
    return a <= b + uncertainty and a >= b - uncertainty

In [78]:
def group_spans_in_lines(spans):
    lines = []
    current_line_y = spans[0]['y_org']
    current_span_list = []
    for span in spans:
        if same(span['y_org'], current_line_y, UNCERTAINTY_LINE_HEIGTH):
            current_span_list.append(span)
        else:
            lines.append(current_span_list)
            current_line_y = span['y_org']
            current_span_list = [span]
    lines.append(current_span_list)
    return lines

In [103]:
lines = group_spans_in_lines(spans)
lines

[[{'text': 'Dudley umnachtet ',
   'size': 20,
   'y_org': 72,
   'x_end': 297,
   'flags': 20}],
 [{'text': 'Der bislang heißeste Tag des Sommers neigte sich dem Ende zu ',
   'size': 11,
   'y_org': 105,
   'x_end': 370,
   'flags': 4}],
 [{'text': 'und eine schläfrige Stille lag über den großen wuchtigen Häusern des ',
   'size': 11,
   'y_org': 118,
   'x_end': 371,
   'flags': 4}],
 [{'text': 'Ligusterwegs. Autos, die normalerweise glänzten, standen staubig in ',
   'size': 11,
   'y_org': 130,
   'x_end': 371,
   'flags': 4}],
 [{'text': 'den Einfahrten, und Rasenflächen, die einst smaragdgrün waren, lagen ',
   'size': 11,
   'y_org': 142,
   'x_end': 372,
   'flags': 4}],
 [{'text': 'verdorrt und gelbstichig da – wegen der Dürre war es verboten ',
   'size': 11,
   'y_org': 155,
   'x_end': 372,
   'flags': 4}],
 [{'text': 'worden, sie mit Gartenschläuchen zu wässern. Die Bewohner des ',
   'size': 11,
   'y_org': 168,
   'x_end': 372,
   'flags': 4}],
 [{'text': 'Ligusterwegs,

In [80]:
def get_text_constants(lines):
    text_sizes = {}
    text_ends = {}
    for line in lines:
        span = line[-1]
        text_len, text_size, text_end = len(span['text']), span['size'], span['x_end']
        if not (text_size in text_sizes):
            text_sizes[text_size] = 0
        text_sizes[text_size] += text_len
        if not (text_end in text_ends):
            text_ends[text_end] = 0
        text_ends[text_end] += 1
    #return text_sizes, text_ends
    return max(text_sizes, key=text_sizes.get), max(text_ends, key=text_ends.get)

In [104]:
text_size, end_pos = get_text_constants(lines)
print(f'{text_size=}, {end_pos=}')

text_size=11, end_pos=372


In [105]:
lines

[[{'text': 'Dudley umnachtet ',
   'size': 20,
   'y_org': 72,
   'x_end': 297,
   'flags': 20}],
 [{'text': 'Der bislang heißeste Tag des Sommers neigte sich dem Ende zu ',
   'size': 11,
   'y_org': 105,
   'x_end': 370,
   'flags': 4}],
 [{'text': 'und eine schläfrige Stille lag über den großen wuchtigen Häusern des ',
   'size': 11,
   'y_org': 118,
   'x_end': 371,
   'flags': 4}],
 [{'text': 'Ligusterwegs. Autos, die normalerweise glänzten, standen staubig in ',
   'size': 11,
   'y_org': 130,
   'x_end': 371,
   'flags': 4}],
 [{'text': 'den Einfahrten, und Rasenflächen, die einst smaragdgrün waren, lagen ',
   'size': 11,
   'y_org': 142,
   'x_end': 372,
   'flags': 4}],
 [{'text': 'verdorrt und gelbstichig da – wegen der Dürre war es verboten ',
   'size': 11,
   'y_org': 155,
   'x_end': 372,
   'flags': 4}],
 [{'text': 'worden, sie mit Gartenschläuchen zu wässern. Die Bewohner des ',
   'size': 11,
   'y_org': 168,
   'x_end': 372,
   'flags': 4}],
 [{'text': 'Ligusterwegs,

In [83]:
def lines_to_paragraphs(lines, text_size, end_pos):
    paragraphs = []
    current_paragraph_spans = []
    current_paragraph_size = lines[0][0]['size']
    for line in lines:

        # If a new size comes, create a new paragraph
        if not same(line[0]['size'], current_paragraph_size, UNCERTAINTY_SIZE):
            paragraphs.append({
                'size': current_paragraph_size,
                'spans': current_paragraph_spans
            })
            current_paragraph_spans = []
            current_paragraph_size = line[0]['size']
        
        # Add the spans to the current paragraph
        for span in line:
            current_paragraph_spans.append({
                'text': span['text'],
                'size': span['size'],
                'flags': span['flags']
            })

        # Remove '-' from line break if a word got cut
        if current_paragraph_spans[-1]['text'][-1] == '-':
            current_paragraph_spans[-1]['text'] = current_paragraph_spans[-1]['text'][:-1]
        elif current_paragraph_spans[-1]['text'][-1] != ' ':
            current_paragraph_spans[-1]['text'] = current_paragraph_spans[-1]['text'] + ' '

        # End the paragraph if the lines doesn't go until the end
        if line[-1]['x_end'] <= end_pos - UNCERTAINTY_END and line[0]['size'] <= text_size + UNCERTAINTY_SIZE:
            paragraphs.append({
                'size': current_paragraph_size,
                'spans': current_paragraph_spans
            })
            current_paragraph_spans = []
            current_paragraph_size = line[0]['size']
    
    # Add the last paragraph
    paragraphs.append({
        'size': current_paragraph_size,
        'spans': current_paragraph_spans
    })

    return paragraphs

In [106]:
ps = lines_to_paragraphs(lines, text_size, end_pos)
ps

[{'size': 20,
  'spans': [{'text': 'Dudley umnachtet ', 'size': 20, 'flags': 20}]},
 {'size': 11,
  'spans': [{'text': 'Der bislang heißeste Tag des Sommers neigte sich dem Ende zu ',
    'size': 11,
    'flags': 4},
   {'text': 'und eine schläfrige Stille lag über den großen wuchtigen Häusern des ',
    'size': 11,
    'flags': 4},
   {'text': 'Ligusterwegs. Autos, die normalerweise glänzten, standen staubig in ',
    'size': 11,
    'flags': 4},
   {'text': 'den Einfahrten, und Rasenflächen, die einst smaragdgrün waren, lagen ',
    'size': 11,
    'flags': 4},
   {'text': 'verdorrt und gelbstichig da – wegen der Dürre war es verboten ',
    'size': 11,
    'flags': 4},
   {'text': 'worden, sie mit Gartenschläuchen zu wässern. Die Bewohner des ',
    'size': 11,
    'flags': 4},
   {'text': 'Ligusterwegs, die sich nun nicht mehr wie üblich mit Autowaschen ',
    'size': 11,
    'flags': 4},
   {'text': 'und Rasenmähen die Zeit vertreiben konnten, hatten sich in die ',
    'size': 11,

In [85]:
def combine_spans(paragraphs, text_size):

    def inner_combine(paragraph):
        paragraph_text = ''
        for span in paragraph['spans']:
            text = span['text']
            flags = flags_decomposer(span['flags'])
            if 'italic' in flags:
                text = '<i>' + text + '</i>'
            if 'bold' in flags:
                text = '<b>' + text + '</b>'
            if 'monospaced' in flags:
                text = '<tt>' + text + '</tt>'
            if 'superscript' in flags:
                text = '<sup>' + text + '</sup>'
            if span['size'] < text_size:
                text = '<small>' + text + '</small>'
            paragraph_text += text
        return {
            'size': paragraph['size'],
            'text': paragraph_text
        }
    
    return list(map(inner_combine, paragraphs))

In [116]:
ps = combine_spans(ps, text_size)
ps

KeyError: 'spans'

In [87]:
def paragraphs_to_chapters(paragraphs, text_size):
    chapters = []
    current_chapter_title = ''
    current_chapter_content = ''

    for paragraph in paragraphs:
        
        # If the text size is great, create a new chapter
        if paragraph['size'] >= text_size + UNCERTAINTY_SIZE:
            chapters.append({
                'title': current_chapter_title,
                'content': current_chapter_content
            })
            current_chapter_title = paragraph['text']
            current_chapter_content = ''

        # Else add the paragraph
        else:
            current_chapter_content += '<p>' + paragraph['text'] + '</p>'
        
    # Add final chapter
    chapters.append({
        'title': current_chapter_title,
        'content': current_chapter_content
    })
    
    return chapters[1:]

In [108]:
chapters = paragraphs_to_chapters(ps, text_size)
for (i, chapter) in enumerate(chapters):
    title = chapter['title']
    print(f'{i+1:02} :: {title}')

01 :: <b>Dudley umnachtet </b>
02 :: <b>Eulen über Eulen </b>
03 :: <b>Die Vorhut </b>
04 :: <b>Grimmauldplatz Nummer zwölf </b>
05 :: <b>Der Orden des Phönix </b>
06 :: <b>Das fürnehme und gar alte Haus </b><b>der Blacks </b>
07 :: <b>Das Zaubereiministerium </b>
08 :: <b>Die Anhörung </b>
09 :: <b>Mrs. Weasleys Wehklage </b>
10 :: <b>Luna Lovegood </b>
11 :: <b>Das neue Lied des Sprechenden </b><b>Huts </b>
12 :: <b>Professor Umbridge </b>
13 :: <b>Strafarbeit bei Dolores </b>
14 :: <b>Percy und Tatze </b>
15 :: <b>Die Großinquisitorin von Hogwarts </b>
16 :: <b>Im Eberkopf </b>
17 :: <b>Ausbildungserlass Nummer </b><b>vierundzwanzig </b>
18 :: <b>Dumbledores Armee </b>
19 :: <b>Der Löwe und die Schlange </b>
20 :: <b>Hagrids Geschichte </b>
21 :: <b>Das Auge der Schlange </b>
22 :: <b>St.-Mungo-Hospital für Magische </b><b>Krankheiten und Verletzungen </b>
23 :: <b>Weihnachten auf der geschlossenen </b><b>Station </b>
24 :: <b>Okklumentik </b>
25 :: <b>Der Käfer in der Klemme </b>
2

In [111]:
def parse_book(file_path, starting_page=0):
    book = fitz.open(file_path)
    
    spans = get_spans(book, starting_page=starting_page)
    lines = group_spans_in_lines(spans)

    text_size, end_pos = get_text_constants(lines)
    print(f'Found ({text_size=}, {end_pos=})')

    paragraphs = lines_to_paragraphs(lines, text_size, end_pos)
    paragraphs = combine_spans(paragraphs, text_size)
    chapters = paragraphs_to_chapters(paragraphs, text_size)
    
    return chapters

In [118]:
chs = parse_book('hp1.pdf', starting_page=6)

Found (text_size=12, end_pos=354)


In [None]:
# TODO:
#  - Ersetze HTML Chars