### Parsing and Preprocessing Index Pages

- filter tiny and huge text elements (i.e. those deviating from the average character/word width and height)
- extract page lines that are part of the main text body containing index entries
- insert and clean up repetition symbols in index entries
    - determine length of repetition symbol
    - identify and replace mis-recognized repetition symbols


In [2]:
from parse_hocr_files import make_hocr_page
import republic_column_parser as column_parser
from elasticsearch import Elasticsearch
import republic_page_parser as page_parser
import republic_paragraph_parser as paragraph_parser
import republic_file_parser as file_parser

import copy

# Year and inventory number selected for this notebook run.
# NOTE(review): `year` is not used in this cell — presumably the year the
# inventory covers; confirm.
year = 1725
inventory_num = 3767
# Base settings; set_config_inventory_num() derives a per-inventory copy from
# this dict and adds a "data_dir" entry to it.
base_config = {
    "inventory_num": inventory_num,
    # Root directory of the hOCR data; the inventory number is appended to it.
    "base_dir": "../../../Data/Projects/REPUBLIC/hocr/",
    # NOTE(review): presumably Elasticsearch index / document type names
    # (Elasticsearch is imported above) — confirm against the indexing code.
    "page_index": "republic_hocr_pages",
    "page_doc_type": "page",
    "scan_index": "republic_hocr_scans",
    "scan_doc_type": "scan",
    "tiny_word_width": 15, # pixel width
    # NOTE(review): presumably also in pixels — confirm.
    "avg_char_width": 20,
    "remove_tiny_words": True,
    "remove_line_numbers": False,
    # NOTE(review): presumably the expected scan width in pixels — confirm.
    "normal_scan_width": 4840
}


def set_config_inventory_num(base_config, inventory_num):
    """Return a copy of *base_config* specialised for one inventory.

    The copy has its ``"inventory_num"`` replaced and a ``"data_dir"`` entry
    added, formed as ``<base_dir>/<inventory_num>/``. ``base_config`` itself
    is left untouched because a deep copy is taken first.

    Args:
        base_config: configuration dict; must contain a ``"base_dir"`` key.
        inventory_num: inventory identifier, formatted into the directory name.

    Returns:
        The new configuration dict.

    Raises:
        KeyError: if ``base_config`` has no ``"base_dir"`` entry.
    """
    config = copy.deepcopy(base_config)
    config["inventory_num"] = inventory_num
    base_dir = config["base_dir"]
    # Plain concatenation would silently yield ".../hocr3767/" when base_dir
    # lacks its trailing slash, so add the separator when it is missing.
    # An empty base_dir is kept empty so the result stays a relative path.
    if base_dir and not base_dir.endswith("/"):
        base_dir += "/"
    config["data_dir"] = "{}{}/".format(base_dir, inventory_num)
    return config

# Build the configuration for the selected inventory and show it.
inventory_config = set_config_inventory_num(base_config, inventory_num)
print(inventory_config)
# Ask the file parser for the scan files under the inventory's data directory.
scan_files = file_parser.get_files(inventory_config["data_dir"])
print("Number of scan files:", len(scan_files))

# Pick the scan at index 10 as a sample; assumes at least 11 scan files exist.
scan_file = scan_files[10]



{'inventory_num': 3767, 'base_dir': '../../../Data/Projects/REPUBLIC/hocr/', 'page_index': 'republic_hocr_pages', 'page_doc_type': 'page', 'scan_index': 'republic_hocr_scans', 'scan_doc_type': 'scan', 'tiny_word_width': 15, 'avg_char_width': 20, 'remove_tiny_words': True, 'remove_line_numbers': False, 'normal_scan_width': 4840, 'data_dir': '../../../Data/Projects/REPUBLIC/hocr/3767/'}
Number of scan files: 774


In [None]:
#from republic_index_page_parser import index_lemmata
from collections import defaultdict
import republic_index_page_parser as index_parser
import republic_elasticsearch as rep_es

# NOTE(review): avg_left is not used anywhere in this cell.
avg_left = 0
# Passed to index_parser.index_lemmata() for every column; a later cell reads
# it as lemma -> list of entries with "page_refs" and "description".
lemma_index = defaultdict(list)
# Lemma returned for the previous column; handed back in for the next column.
# NOTE(review): presumably so entries can continue across columns — confirm.
curr_lemma = None

# NOTE(review): pages_info and year_config are not defined in the visible
# cells; they must exist before this cell is run.
for page_id in pages_info:
    page_doc = rep_es.retrieve_page_doc(page_id, year_config)
    print("\n\n", page_id)
    # Only pages typed as index pages are parsed.
    if page_doc["page_type"] != "index_page":
        print("skipping non-index page")
        continue
    page_doc["num_page_ref_lines"] = index_parser.count_page_ref_lines(page_doc)
    for column_info in page_doc["columns"]:
        print("\n\n", column_info["column_id"])
        column_hocr = column_info["column_hocr"]
        lines = index_parser.get_index_entry_lines(column_hocr)
        # The returned lemma becomes the starting lemma for the next column.
        curr_lemma = index_parser.index_lemmata(column_info["column_id"], lines, lemma_index, curr_lemma)
        print("returned lemma:", curr_lemma)




### Fuzzy Searching of Keywords in the Resolutions

Knowing which keywords should appear in the text, possibly with some spelling variation and OCR errors, we can use a fuzzy search algorithm to find candidate matches. 

Keywords that are similar to each other are registered as distractor terms, so that each match is assigned as a candidate to the nearest keyword within a set of similar keywords. 

In [1]:
from parse_republic_hocr_files import merge_text_lines, read_hocr_scan

# Candidate matches grouped by the keyword they matched (match["match_keyword"]).
lemma_matches = defaultdict(list)

def add_context(match, page_text):
    """Annotate *match* in place with its surrounding text on the page.

    Asks the module-level ``fuzzy_searcher`` for the context of ``match``
    within ``page_text`` (``context_size=40``) and copies three fields of the
    result onto ``match``: the term in context plus the start and end offsets
    of that context. Nothing is returned.
    """
    term_context = fuzzy_searcher.get_term_context(page_text, match, context_size=40)
    # (key written on the match, key read from the context), in write order.
    copied_fields = (
        ("match_term_in_context", "match_term_in_context"),
        ("context_start_offset", "start_offset"),
        ("context_end_offset", "end_offset"),
    )
    for match_key, context_key in copied_fields:
        match[match_key] = term_context[context_key]

# Run the fuzzy searcher over the text of every scan page after page 90 and
# collect the matches per keyword. Resolution page numbers are the scan page
# numbers shifted down by 90.
for scan_file in scan_files:
    scan_page_num = scan_file["scan_page_num"]
    resolution_page_num = scan_page_num - 90
    if scan_page_num <= 90:
        # Scan pages up to and including 90 are not searched.
        continue
    print(scan_page_num, resolution_page_num)
    hocr_page = read_hocr_scan(scan_file)
    page_text = merge_text_lines(hocr_page)
    matches = fuzzy_searcher.find_candidates(page_text)
    for match in matches:
        # The match is stored first and enriched afterwards; the stored
        # object is the same one, so the added fields show up in lemma_matches.
        lemma_matches[match["match_keyword"]].append(match)
        add_context(match, page_text)
        match["page_num"] = scan_page_num
        print(match["match_keyword"], "\t", match)
    # break  # enable to stop after the first searched page while debugging
    

ModuleNotFoundError: No module named 'parse_republic_hocr_files'

In [3]:
# Print, for every keyword in sorted order, the number of candidate matches
# followed by each candidate's matched string, page number and context.
for lemma, lemma_candidates in sorted(lemma_matches.items()):
    print("\n", lemma, "\tAantal kandidaten:", len(lemma_candidates), "\n")
    for match in lemma_candidates:
        # Drop the first and last five characters of the stored context.
        trimmed_context = match["match_term_in_context"][5:-5]
        print("\tKandidaat:", match["match_string"])
        print("\tPagina:", match["page_num"])
        print("\tContext:", trimmed_context)
        print()


# Print every index lemma with the page references and the first 70
# characters of the description of each of its entries.
for lemma, lemma_entries in lemma_index.items():
    print("\nTrefwoord:", lemma)
    # print(lemma_entries)  # enable to dump the raw entries
    for entry in lemma_entries:
        pages = ", ".join(map(str, entry["page_refs"]))
        description = entry["description"][:70]
        print("\tPagina:", pages, "\tBeschrijving:", description)

NameError: name 'lemma_matches' is not defined

In [None]:
# scan 45 uneven is first resolution page
# page num: 91

from fuzzy_context_searcher import FuzzyContextSearcher
import pandas as pd

# Settings handed to FuzzyContextSearcher below.
# NOTE(review): the exact meaning of each threshold and of ngram_size /
# skip_size is defined by fuzzy_context_searcher, which is not visible here —
# confirm there before tuning these values.
config = {
    "char_match_threshold": 0.8,
    "ngram_threshold": 0.6,
    "levenshtein_threshold": 0.8,
    "ignorecase": False,
    "ngram_size": 3,
    "skip_size": 0,
}

# Searcher instance used by add_context() and the page-search loop.
fuzzy_searcher = FuzzyContextSearcher(config)

# Keywords that resemble each other and therefore act as each other's
# distractors.
# NOTE(review): "Admiralteyt in Zeelandt" lacks the "i" that the other three
# entries have — confirm whether this spelling is intended.
admiralty_keywords = [
    "Admiraliteyt tot Amfterdam",
    "Admiraliteyt in het Noorder Quartier",
    "Admiraliteyt in Vrieslandt",
    "Admiralteyt in Zeelandt",
]

# All keywords to search for; "Varckens" has no distractor terms.
keywords = admiralty_keywords + ["Varckens"]

# For every admiralty keyword, the set of the other admiralty keywords.
distractor_terms = {
    keyword: {other for other in admiralty_keywords if other != keyword}
    for keyword in admiralty_keywords
}
# Register the keywords and their distractor sets with the searcher.
fuzzy_searcher.index_keywords(keywords)
fuzzy_searcher.index_distractor_terms(distractor_terms)

# NOTE(review): starts empty and is not filled in the visible cells.
hocr_resolution_pages = []

