### Parsing and Preprocessing Index Pages

- filter tiny and huge text elements (i.e. deviating from average character/word width and height
- extract page lines that are part of the main text body containing index entries
- insert and clean up repetition symbols in index entries
    - determine length of repetition symbol
    - identify and replace mis-recognized repetition symbols


In [1]:
# This reload library is just used for developing the REPUBLIC hOCR parser 
# and can be removed once this module is stable.
%reload_ext autoreload
%autoreload 2


# This is needed to add the repo dir to the path so jupyter
# can load the republic modules directly from the notebooks
import os
import sys
repo_dir = os.path.split(os.getcwd())[0]
if repo_dir not in sys.path:
    sys.path.append(repo_dir)

In [2]:
from collections import defaultdict

from elasticsearch import Elasticsearch
import republic.parser.republic_file_parser as file_parser
from republic.config.republic_config import base_config, set_config_inventory_num


es = Elasticsearch()

year = 1725
inventory_num = 3780
data_dir = "/Users/marijnkoolen/Data/Projects/REPUBLIC/hocr"


def get_pages_info(config):
    scan_files = file_parser.get_files(config["data_dir"])
    print("Number of scan files:", len(scan_files))
    return file_parser.gather_page_columns(scan_files)

inv_config = set_config_inventory_num(base_config, inventory_num, data_dir)
pages_info = get_pages_info(inv_config)



AssertionError: 

In [3]:
#from republic_index_page_parser import index_lemmata
from collections import defaultdict
import republic.parser.republic_index_page_parser as index_parser
import republic.elastic.republic_elasticsearch as rep_es

avg_left = 0
lemma_index = defaultdict(list)
curr_lemma = None
    
pages = rep_es.retrieve_pages_by_type(es, "index_page", inventory_num, inv_config)
for page_doc in sorted(pages, key = lambda x: x["page_num"]):
    #print("\n\n", page_doc["page_id"])
    if "index_page" not in page_doc["page_type"]:
        print("skipping non-index page")
        continue
    page_doc["num_page_ref_lines"] = index_parser.count_page_ref_lines(page_doc)
    for column_hocr in page_doc["columns"]:
        lines = index_parser.get_index_entry_lines(column_hocr)
        curr_lemma = index_parser.index_lemmata(lines, lemma_index, curr_lemma)
        #print("returned lemma:", curr_lemma)




ModuleNotFoundError: No module named 'republic.parser.republic_index_page_parser'

In [25]:
lemmas = [lemma for lemma in lemma_index if len(lemma) >= 5]

for lemma in lemma_index.keys():
    print(lemma)


van van Sommelsdyck
Admiraliteyten
Admiraliteyt tot Amfterdam
Admiraliteyt in het Noorder Quartier
Admiralteyt in Zeelandt
Admiraliteyt in Vrieslandt
Siet verder Zeefaken
Advocaten
van Affelen
van Afferden
Akerlaken
van Alblaflerdam
Altona
Amiot
Ardes
Arnols
van Afperen
van Aflendelft
Ayen
Aylua
Baake
Backer
Barbut
Baffecour
Baftie ‚
de Baudous
Beaufort aangefteldt tot Secretaris
Becker
Becker 07% afiftentie
Beelt{nyder
Bel
de Bel
Belvis de Seville hondert guldens toegeleght
van Bentheim-Steinfurt
Benthem
de Bere
Bergen op den Zoom
Beyeren
Beyer
Bilderbeeck
Binkhorft
Berckenfeldt
Bleskensgrave
Bleyfwyck
Bloemaarts
Blyendaal
Bocteman 0 approbatie van ftekere collatie
Boeye
van Bommel
Bonnel
Bodens
Boreel
Borgers
Breder
Brieven
Brincko
Broegh
Bronckhorft
de Brofe
Vi ie raakende de Ouitantie
Bruyninx
in Ooft-Vrieslandt
Collegie ter Admiraliteyt tot Amfterdam
Burmania
Bie
Buys
Buys declaratie
Buytenhem
Carrier
Cafenbroot
de Caffamajor executoriaal
Commercie in de Ooftenrykfe Nederlanden
Co

In [19]:
def show_page_as_spaced_text(es, page_num, inventory_config):
    page = rep_es.retrieve_page_by_page_number(es, page_num, inventory_config)
    for column in page["columns"]:
        print("Column:\n")
        for line in column["lines"]:
            print("{:5}".format(line["left"]), line["spaced_line_text"])
        print("\n\n")
    return True

for page_num in range(15,16):
    show_page_as_spaced_text(es, page_num, inv_config)




Column:

  691                                    N
  112       vincien  aangenoomen  haar  te  verklaaren.
  111       675.
  243             Gecommitteerden geadmitteert  op  de
  110       beloften als nu te doen in plaats van op den
  109      vierden Oftober 685.
  427                      wegens Vrieslandt voor den
  109      Heere Lyclama a Nyeholt.   787.
  231             bericht op de klaghten van de Admi-
  107      valiteyt op de Maze over den Convoymeefter
  109      tot Swolle , te examinceren.   810.
   63    — advis op het verfoeck van Beugholt,
  110       Ba(ccour  en  Vechoven ,  te
  111       815.
  351                   wegens Reglement voor de Yck-
  108      meefters, te examineeren.   816.
   34   me rapport wegens Bedienden van ’t kleyn
  109      Zegel en refolutie.   843.
   61    Siet verder Zeefaken.
   59    Advocaten  van het Landt  te advifteren in
  107      de fake van den Prince van Hornes tegens
  108      la Motte.   15.
   62    — advis en refolut

### Fuzzy Searching of Keywords in the Resolutions

Knowing which keywords should appear in the text, possibly with some spelling variation and OCR errors, we can use a fuzzy search algorithm to find candidate matches. 

Keywords that are similar to each other are registered as distractor terms, so matches are assigned as candidates to the nearest of sets of similar keywords. 

In [26]:
from republic.fuzzy.fuzzy_context_searcher import FuzzyContextSearcher, get_term_context

config = {
    "char_match_threshold": 0.8,
    "ngram_threshold": 0.6,
    "levenshtein_threshold": 0.8,
    "ignorecase": False,
    "ngram_size": 3,
    "skip_size": 0,
}

fuzzy_lemma_searcher = FuzzyContextSearcher(config)

fuzzy_lemma_searcher.index_keywords(lemmas)

In [37]:
from republic.parser.republic_base_page_parser import merge_text_lines
from republic.parser.generic_hocr_parser import make_hocr_doc
import json

lemma_matches = defaultdict(list)

def add_context(match, page_text):
    context = get_term_context(page_text, match, context_size=40)
    match["match_term_in_context"] = context["match_term_in_context"]
    match["context_start_offset"] = context["start_offset"]
    match["context_end_offset"] = context["end_offset"]

for page_doc in rep_es.retrieve_resolution_pages(es, inv_config["inventory_num"], inv_config):
    print("\n\n", page_id)
    if "index_page" in page_doc["page_type"]:
        print("skipping index page")
        continue
    for column_hocr in page_doc["columns"]:
        page_text = " ".join([line["line_text"] for line in column_hocr["lines"]])
        matches = fuzzy_lemma_searcher.find_candidates(page_text)
        for match in matches:
            lemma_matches[match["match_keyword"]] += [match]
            add_context(match, page_text)
            match["page_num"] = page_doc["page_num"]
            print(match["match_keyword"], "\n", json.dumps(match, indent=2))
    break




 year-1725-scan-49-even
van Haaren 
 {
  "match_keyword": "van Haaren",
  "match_term": "van Haaren",
  "match_string": "van haare",
  "match_offset": 147,
  "char_match": 0.8,
  "ngram_match": 0.6363636363636364,
  "levenshtein_distance": 0.8,
  "match_term_in_context": "ien foo defectueus zyn in het furneeren van haare contingenten in de renten en intereflen ",
  "context_start_offset": 107,
  "context_end_offset": 197,
  "page_num": 599
}
Wirtembergh 
 {
  "match_keyword": "Wirtembergh",
  "match_term": "Wirtembergh",
  "match_string": "Wirtembergh",
  "match_offset": 1077,
  "char_match": 1.0,
  "ngram_match": 1.0,
  "levenshtein_distance": 1.0,
  "match_term_in_context": "tfangen een van den Heere O Hertogh van Wirtembergh, getchreven te Deinach den negentienden ",
  "context_start_offset": 1037,
  "context_end_offset": 1129,
  "page_num": 599
}
Swabifche 
 {
  "match_keyword": "Swabifche",
  "match_term": "Swabifche",
  "match_string": "Swabilchen",
  "match_offset": 1239,
  "c

In [3]:
for lemma in sorted(lemma_matches):
    print("\n", lemma, "\tAantal kandidaten:", len(lemma_matches[lemma]), "\n")
    for match in lemma_matches[lemma]:
        print("\tKandidaat:", match["match_string"])
        print("\tPagina:", match["page_num"])
        print("\tContext:", match["match_term_in_context"][5:-5])
        print()


for lemma in lemma_index:
    print("\nTrefwoord:", lemma)
    #print(lemma_index[lemma])
    for entry in lemma_index[lemma]:
        pages = ", ".join([str(page_ref) for page_ref in entry["page_refs"]])
        description = entry["description"][:70]
        print("\tPagina:", pages, "\tBeschrijving:", description)

NameError: name 'lemma_matches' is not defined

In [None]:
# scan 45 uneven is first resolution page
# page num: 91

from fuzzy_context_searcher import FuzzyContextSearcher

config = {
    "char_match_threshold": 0.8,
    "ngram_threshold": 0.6,
    "levenshtein_threshold": 0.8,
    "ignorecase": False,
    "ngram_size": 3,
    "skip_size": 0,
}

fuzzy_searcher = FuzzyContextSearcher(config)

keywords = [
    "Admiraliteyt tot Amfterdam", 
    "Admiraliteyt in het Noorder Quartier", 
    "Admiraliteyt in Vrieslandt", 
    "Admiralteyt in Zeelandt",
    "Varckens"
]

distractor_terms = {
    "Admiraliteyt tot Amfterdam": {
        "Admiraliteyt in het Noorder Quartier", "Admiraliteyt in Vrieslandt", "Admiralteyt in Zeelandt"
    },
    "Admiraliteyt in het Noorder Quartier": {
        "Admiraliteyt tot Amfterdam", "Admiraliteyt in Vrieslandt", "Admiralteyt in Zeelandt"
    },
    "Admiraliteyt in Vrieslandt": {
        "Admiraliteyt tot Amfterdam", "Admiraliteyt in het Noorder Quartier", "Admiralteyt in Zeelandt"
    },
    "Admiralteyt in Zeelandt": {
        "Admiraliteyt tot Amfterdam", "Admiraliteyt in het Noorder Quartier", "Admiraliteyt in Vrieslandt"
    },
}
fuzzy_searcher.index_keywords(keywords)
fuzzy_searcher.index_distractor_terms(distractor_terms)

hocr_resolution_pages = []

