# Koppelen van Instellingen

Waar is het Rijkskamergericht gebleven?


In [None]:
%reload_ext autoreload
%autoreload 2
import csv, re, os, sys
from tqdm.notebook import tqdm
from datetime import datetime
from multiprocess import Pool
from typing import Optional, Union
from IPython.display import display, HTML

repo_name = 'republic-project'
# Cut the working directory off right after the repository name, so the
# result does not depend on which subdirectory the notebook runs in.
repo_dir = os.getcwd().partition(repo_name)[0] + repo_name
if repo_dir not in sys.path:
    # Make both the repository root and the shared enrichment code importable.
    sys.path.extend([repo_dir, f'{repo_dir}/enrichment/common'])

from entity_match import *
import entity_linking_helpers as elh

# Override a setting of the helper module; presumably the number of worker
# processes used by elh.search_in_parallel — TODO confirm.
elh.CPU_CORES = 6

In [None]:
SESSIONS_FILE = f'{repo_dir}/data/entities/annotations-unaggregated/session_dates.tsv'

# Maps a session id to the year of that session (int), or to None when the
# session date is empty.
session_years = { }

with open(SESSIONS_FILE) as file:
    for row in csv.DictReader(file, delimiter='\t'):
        # The first four characters of the date are read as the year.
        y = row['session_date'][0:4]
        session_years[row['session_id']] = int(y) if y != '' else None

In [None]:
class Reference:
    '''
    One entity reference read from the annotation file, together with the
    information this notebook adds to it (year, location, earlier provenance).
    '''
    def __init__(self, row):
        # The (improved) text is kept separately; note that it is removed from
        # `row`, so that the remaining columns can be passed on as keywords.
        self.text = row.pop('improved_tag_text')
        self.reference = EntityReference(**row)
        # The session id is the resolution id without its '-resolution...' part.
        session_id = re.sub(r'-resolution.*', '', self.reference.resolution_id)
        self.year = session_years[session_id]
        self.location = None
        self.prev_provenance = []

    def __repr__(self) -> str:
        ref = self.reference
        return f'({ref.inv}/{ref.paragraph_id}) {self.text}'

references = [ ]

In [None]:
DATA_FILE = 'annotations-layer_ORG'

# Provenance records from the provenance file, grouped under the value of
# their second column (looked up by row number when the references are read).
prev_prov = { }
with open(f'{DATA_FILE}.provenance.tsv') as file:
    for row in csv.reader(file, delimiter='\t'):
        prev_prov.setdefault(row[1], []).append(
            EntityProvenance(row[2], row[3], row[4], row[5]))

In [None]:
references = [ ]
with open(f'{DATA_FILE}.tsv') as file:
    # fnr numbers the rows: the header counts as row 1, so the first data row
    # gets number 2. It is the key under which earlier provenance is stored.
    fnr = 1
    for fnr, row in enumerate(csv.DictReader(file, delimiter='\t'), start=2):
        ref = Reference(row)
        ref.prev_provenance = prev_prov.get(str(fnr), [])
        references.append(ref)

print(f'Read {len(references)} references')

## Harmoniseren van de spelling

In [None]:
spelling_regexes = [ ]

# Notebook-specific rewrite rules as (regex, replacement) pairs; they are
# applied in this order, after the shared rules read from file.
spelling_extra = [
#    (r' *\bde[nrs]? +', ' '),
#    (r" *\b(het|'s|een) +", ' '),
    (r'[Hh]ee?ren [Gg]e[a-z]*den +', ''),
    (r' meerge(noemde|melde) ', ' '),
    (r' *\b(ee?nige|enkele) +', ' '),
    (r'\bt\'([A-Za-z]+)', r'het \1'),
    (r'Admiraal', 'Admiraliteit'),
    (r'G[a-z]+ Ree?c?kenk', 'Generaliteyts Rekenk'),
    (r'(^ +| +$)', ''),
    (r'^(het|de[nrs]?) ', ''),
    # The pattern consumes the space after the abbreviation, so the
    # replacement has to put it back; otherwise 'hrn Staten' would become
    # 'HeerenStaten'.
    (r'^hr[ny] ', 'Heeren '),
    (r'Raide van', 'Raad van'),
    (r'Groningen van Sweeden', 'Kronen van Sweeden')
]

# Shared rules first: every row holds a pattern, a replacement and a third
# column that is not used here.
with open('../criteria/common/spelling-regexes.tsv') as file:
    spelling_regexes.extend(
        (re.compile(pat), repl)
        for (pat, repl, _) in csv.reader(file, delimiter='\t'))

# The notebook-specific rules are applied after the shared ones.
spelling_regexes.extend(
    (re.compile(pat), repl) for (pat, repl) in spelling_extra)

def harmonise_spelling(s: str) -> str:
    """Return `s` after applying every rule in `spelling_regexes`, in order."""
    result = s
    for (pattern, replacement) in spelling_regexes:
        result = pattern.sub(replacement, result)
    return result

# Overwrite the text of every reference with its harmonised spelling.
for r in tqdm(references):
    harmonised = harmonise_spelling(r.text)
    r.text = harmonised

## Inlezen ingesloten plaatsnamen

This section is a placeholder for properly-recognised placenames.

In [None]:
# Rows of the locations file, grouped per paragraph id.
locations = { }

with open('locations.tsv') as file:
    for row in csv.DictReader(file, delimiter='\t', quoting=csv.QUOTE_NONE):
        locations.setdefault(row['paragraph_id'], []).append(row)

for r in tqdm(references):
    ref = r.reference
    r.location = None
    # Take the first location of the same paragraph whose offset lies within
    # the span of the reference (both ends included).
    for row in locations.get(ref.paragraph_id, []):
        offset = int(row['offset'])
        if int(ref.offset) <= offset <= int(ref.end):
            r.location = row['match']
            break

In [None]:
LOC_INFO = '../LOC/loc-entities-full.tsv'

# Rows of the location entity file, keyed by their "Id" column.
location_info = { }

with open(LOC_INFO) as file:
    for row in csv.DictReader(file, delimiter='\t', quoting=csv.QUOTE_NONE):
        # "Labels" is a '|'-separated list; store it as a set of labels.
        row.update(Labels=set(re.split(' *[|] *', row["Labels"])))
        location_info[row["Id"]] = row

In [None]:
# Inspection: all references for which an embedded location was found.
[(r.reference.paragraph_id, r.text, r.location) for r in references if r.location]

## Groeperen identieke verwijzingen

In [None]:
# All reference links, keyed by the (harmonised) reference text.
reference_links = {}

class ReferenceLink:
    '''
    Contains (partial) information for connecting an entity reference to a known entity.

    One link stands for all references that share the same text: `refs` holds
    those references, `categories` the categories assigned to the text,
    `matches` the entities it was linked to, and `flags` a set of markers.
    '''
    # The annotation is a string so that defining the class does not depend on
    # `Reference` already being defined.
    def __init__(self, text: str, refs: 'list[Reference]'):
        self.text = text
        self.refs = refs
        self.categories = { }
        self.matches = []
        self.flags = set()

    def __repr__(self):
        return self.text

    def __len__(self):
        # The size of a link is the number of references with this text.
        return len(self.refs)

    def add_match(self, match):
        '''
        Add `match`, replacing any earlier match that has the same label.
        '''
        self.matches = [ m for m in self.matches if m.label != match.label ]
        self.matches.append(match)

# Maps a keyword variant to its normalised form; filled where the category
# keyword file is read.
cat_keyword_normalisations = { }
def extract_category_keyword(m : PhraseMatch):
    '''
    Return the keyword behind match `m`: the phrase string of the matched
    variant, or its normalised form when one is registered.
    '''
    phrase = m.variant.phrase_string
    if phrase in cat_keyword_normalisations:
        return cat_keyword_normalisations[phrase]
    return phrase

class ReferenceCategory:
    '''
    First categorisation is keyword-based. For one category, multiple keywords 
    can be recognised; all are kept here because further, category-specific processing 
    may depend on the earlier-recognised keywords.

    `keywords` maps each recognised keyword to the provenance of its match.
    `provenance` is the explicitly given provenance or else that of the
    best-scoring match; it is None when neither is available.
    '''
    # Annotations are strings so that the class can be defined before the
    # project types are in scope.
    def __init__(self, reference, matches : 'list[PhraseMatch]', provenance = None):
        self.category = reference
        self.keywords = { }
        # Always define the attribute, so that reading it can not fail when
        # there are no matches and no provenance was passed in.
        self.provenance = None
        for m in matches:
            self.keywords[extract_category_keyword(m)] = \
                EntityProvenance.from_match(reference, m,
                    conclusion = f'Intermediate classification: {m.label}')
        if provenance is not None:
            self.provenance = provenance
        elif len(matches) > 0:
            best_match = max(matches, key = elh.score_match)
            self.provenance = self.keywords[extract_category_keyword(best_match)]

    def __repr__(self):
        return f'{self.category} [{self.keywords.keys()}]'

class ReferenceMatch:
    '''
    The established link to an entity, with provenance and optional score.

    `label` is the entity that was linked, `provenance` the list of steps that
    led to it, and `score` defaults to 0.
    '''
    # Annotations are strings so that the class can be defined before the
    # project types are in scope.
    def __init__(self, label: 'EntityLabel', provenance: 'list[EntityProvenance]', score=0):
        self.label = label
        self.provenance = provenance
        self.score = score

    def __repr__(self):
        return self.label.name

# First collect, per text, the list of references that share it ...
for r in references:
    reference_links.setdefault(r.text, []).append(r)

# ... then replace each of those lists by a ReferenceLink holding it.
for r in reference_links:
    reference_links[r] = ReferenceLink(r, reference_links[r])

print(f'{len(references)-len(reference_links)}/{len(references)} ontdubbeld; {len(reference_links)} over)')

In [None]:
# Inspection: the 40 most frequent reference texts that pass the filter below,
# each with its number of references.
[
    (r, len(reference_links[r])) for r in sorted(reference_links, reverse=True, key=lambda r: len(reference_links[r]))
    if 'Huis' in reference_links[r].text
    #if len(reference_links[r].matches) == 0
][0:40]
#len([r for r in reference_groups if 'hof' in r.lower()])

In [None]:
# Inspection: number of references and number of distinct reference texts.
f'{len(references)}, {len(reference_links)}'

## Indelen in categorieën

In [None]:
CATEGORY_FILE = '../criteria/ORG/org-groups.tsv'

# While reading: category label -> list of keywords. Afterwards rebuilt as a
# list of phrase models (one per category) for the fuzzy searcher.
cat_base = { }

with open(CATEGORY_FILE) as file:
    for row in csv.reader(file, delimiter='\t'):
        # Skip blank lines, rows with an empty first cell (which used to raise
        # an IndexError) and comment rows.
        if len(row) < 1 or row[0] == '' or row[0].startswith('#'):
            continue
        # Columns: keyword, normalised keyword (may be empty), category label.
        [kw, nrm, cls] = row
        # elh.harmonise_spelling returns a tuple; its first item is the text.
        kw, _ = elh.harmonise_spelling(kw)
        if cls not in cat_base:
            cat_base[cls] = []
        if nrm != '':
            cat_keyword_normalisations[kw] = elh.harmonise_spelling(nrm)[0]
        cat_base[cls].append(kw)

# The first keyword of a category serves as the phrase, the others as variants.
cat_base = [ { 'label': cls, 'phrase': keywords[0], 'variants': keywords[1:] }
                  for (cls, keywords) in cat_base.items() ]

category_list = [ bi['label'] for bi in cat_base ]

cat_fuzzy_config = {
    'ngram_size': 3,
    'skip_size': 1,
    'include_variants': True,
    'ignorecase': True,
    'char_match_threshold': 0.7,
    'ngram_threshold': 0.7,
    'levenshtein_threshold': 0.7,
}

cls_case_searcher = elh.make_searcher(cat_fuzzy_config, cat_base)

print(f'{len(cat_base)} categories with {sum([(len(l["variants"])+1) for l in cat_base])} keywords')

In [None]:
def make_category(text, match):
    '''
    Build a ReferenceCategory for reference `text` from one keyword `match`.

    ReferenceCategory expects a list of matches as its second argument and the
    provenance as its third; the provenance used to be passed in the place of
    the matches.
    '''
    prov = EntityProvenance.from_match(text, match, conclusion = f'Intermediate classification: {match.label}')
    return ReferenceCategory(text, [match], prov)

def classify_orgs(r):
    '''
    Classify reference text `r` by the category keywords it contains.

    Returns the pair (r, categories), where categories maps a category label
    to a ReferenceCategory. A text shorter than five characters only gets the
    category 'ongeldig'.
    '''
    # Special case: reference too short. Use the same shape as for all other
    # results (a dict of ReferenceCategory), so that category lookups such as
    # `'ongeldig' in rl.categories` work for these references too.
    if len(r) < 5:
        invalid = ReferenceCategory(r, [],
                EntityProvenance(r, 'Reference length', '< 5', 'Not a valid reference'))
        return (r, { 'ongeldig': invalid })
    # The classifier (note we do not choose the best matches: there will be doubles)
    matches = cls_case_searcher.find_matches(r, debug=0, use_word_boundaries=False)
    # Group the matches per category label ...
    per_label = { }
    for m in matches:
        per_label.setdefault(m.label, []).append(m)
    # ... and turn every group into a ReferenceCategory.
    categories = { cat: ReferenceCategory(r, found) for (cat, found) in per_label.items() }
    return (r, categories)

def save_classifications(res):
    """Store each classification result on the ReferenceLink of its text."""
    for (text, found) in res:
        reference_links[text].categories = found

In [None]:
# Classify all reference texts (the keys of reference_links) and store the results.
elh.search_in_parallel(reference_links, classify_orgs, save_classifications)

## Lussen over de categorieën

In [None]:
def select_items_in_category(cat):
    '''
    Return a new list of (text, ReferenceLink) pairs for category `cat`,
    largest links first. The pseudo-category 'overig' selects the links
    that have no category at all.
    '''
    if cat == 'overig':
        def belongs(link):
            return len(link.categories) == 0
    else:
        def belongs(link):
            return cat in link.categories
    selected = [ (text, link)
        for (text, link) in reference_links.items()
        if belongs(link)
    ]
    return sorted(selected, key=lambda pair: len(pair[1]), reverse=True)

def select_refs_in_category(cat):
    'Only the reference texts (for use in the fuzzy searchers).'
    texts = []
    for (text, _) in select_items_in_category(cat):
        texts.append(text)
    return texts

In [None]:
# First filter steps: remove category assignments that are known to be wrong.

nr_removed = 0

for (r,rl) in reference_links.items():
    if 'rvs' in rl.categories:
        # Councils of towns are not the Council of State
        if 'Stat ' in r or ' St ' in r:
            del rl.categories['rvs']
            nr_removed += 1
        # The Council of State is not some other council. This only holds when
        # the 'rvs' category was kept: a town council must remain a 'raad'.
        elif 'raad' in rl.categories:
            del rl.categories['raad']
            nr_removed += 1
    # Orphanages are not churches
    if 'kerk' in rl.categories and 'Weesh' in r:
        del rl.categories['kerk']
        nr_removed += 1
    # A battle ('battaille') is not a battalion
    if 'militair' in rl.categories and 'Battaille' in r:
        del rl.categories['militair']
        nr_removed += 1
    # 'Gecommitteerde raden' belong to the provincial administration
    if 'raad' in rl.categories and 'Gecommitteerde' in r:
        del rl.categories['raad']
        nr_removed += 1
    # Regiments are not sovereigns
    if 'vorst' in rl.categories and 'militair' in rl.categories:
        del rl.categories['vorst']
        nr_removed += 1

print(f'{nr_removed} category assignments removed')

## Statistiek over de categorieën

In [None]:
def get_statistics(cat=None):
    '''
    Return one dict of figures per non-empty category (all categories plus
    'overig', or only `cat` when given), sorted by total number of references,
    largest first.
    '''
    rows = []
    for cls in category_list + ['overig']:
        if cat is not None and cat != cls:
            continue
        items = select_items_in_category(cls)
        if len(items) == 0:
            continue
        # Number of references per link: for all links, and for the links
        # that have at least one match.
        sizes = [ len(rl) for (_, rl) in items ]
        linked = [ len(rl) for (_, rl) in items if len(rl.matches) > 0 ]
        rows.append({
            'Klasse': cls,
            'Uniek': len(sizes),
            'Totaal': sum(sizes),
            'Herkend': len(linked),
            'Totaal herkend': sum(linked),
            'Verdeling': sum(sizes)/len(sizes),
            'VanTotaal': sum(linked)/sum(sizes),
            'VanUniek': len(linked)/len(sizes),
        })
    rows.sort(reverse=True, key=lambda e: e['Totaal'])
    return rows

def statistics_table_html(stats):
    '''
    Render the rows produced by get_statistics() as an HTML table.

    Every row gets its own <tr>...</tr>, and the cells follow the order of the
    headers: the percentage of the total comes before that of the unique texts.
    '''
    header = ('<tr><th>Klasse</th><th>Uniek</th><th>Totaal</th><th>Verdeling</th>'
              '<th>Herkend</th><th>Totaal</th><th>%Totaal</th><th>%Uniek</th></tr>')
    rows = [ f"""<tr>
        <td>{r['Klasse']}</td>
        <td>{r['Uniek']}</td>
        <td>{r['Totaal']}</td>
        <td>{r['Verdeling']:.2f}</td>
        <td>{r['Herkend']}</td>
        <td>{r['Totaal herkend']}</td>
        <td>{r['VanTotaal']:.0%}</td>
        <td>{r['VanUniek']:.0%}</td>
    </tr>""" for r in stats ]
    return '\n'.join(['<table>', header] + rows + ['</table>'])

def show_statistics(cat=None):
    '''
    Display the statistics of all categories, or of `cat` only, as a table.
    '''
    display(HTML(statistics_table_html(get_statistics(cat))))

# Show the statistics of all categories.
show_statistics()

## Algemeen deel voor de categorieën

In [None]:
# Settings passed to elh.make_searcher for the per-category organisation
# searchers (same values as the configuration of the category searcher).
org_fuzzy_config = {
    'ngram_size': 3,
    'skip_size': 1,
    'include_variants': True,
    'ignorecase': True,
    'char_match_threshold': 0.7,
    'ngram_threshold': 0.7,
    'levenshtein_threshold': 0.7,
}

class Instelling:
    '''
    One organisation from a criteria file: its name, its entity label and the
    (spelling-harmonised) keywords by which it is searched.
    '''
    def __init__(self, row):
        # Columns: labels, comment, name, [keywords ...]
        self.name = row[2]
        labels = set(re.split(' +- +', row[0]))
        self.entity = EntityLabel(self.name, 'ORG', labels)
        # Remove duplicate keywords but keep the order of the file. Going
        # through a set made the order, and with it the keyword that serves as
        # the primary phrase, differ from run to run.
        self.keywords = list(dict.fromkeys(
            elh.harmonise_spelling(s)[0] for s in row[3:]))

    def __repr__(self):
        return self.name

    def as_phrase_model(self):
        '''
        Return the organisation as a phrase model for the fuzzy searcher: the
        first keyword is the phrase, the remaining ones are its variants.
        '''
        return { 
            'label': self.name,
            'phrase': self.keywords[0],
            'variants': self.keywords[1:] }

def make_orglist(filename):
    '''
    Read `filename` from the ORG criteria directory and return a dict that
    maps the name of each organisation to its Instelling.
    '''
    orglist = { }
    with open('../criteria/ORG/'+filename) as file:
        reader = csv.reader(file, delimiter='\t')
        next(reader) # skip header
        for row in reader:
            # A usable row has at least one keyword column; a row whose first
            # cell is just '#' is skipped.
            if len(row) <= 3 or row[0] == '#':
                continue
            c = Instelling(row)
            orglist[c.name] = c
    nr_keywords = sum(len(c.keywords) for c in orglist.values())
    print(f'{len(orglist)} instellingen met {nr_keywords} zoektermen')
    return orglist

def make_searcher(orglist):
    """Create a fuzzy searcher for the organisations in `orglist`."""
    phrase_models = [ org.as_phrase_model() for org in orglist.values() ]
    return elh.make_searcher(org_fuzzy_config, phrase_models)

## Categorie: overig

In [None]:
# Organisations for references without any category, and their searcher.
orgs_overig = make_orglist('org-overig.tsv')
overig_searcher = make_searcher(orgs_overig)

def find_overig(r):
    '''
    Return (r, best matches) for reference text `r`, or None without matches.
    '''
    found = overig_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_overig(res):
    '''
    Add the best matches of every (text, matches) result to the link of that text.
    '''
    for (r, matches) in res:
        for best in elh.choose_best_matches(matches):
            entity = orgs_overig[best.label].entity
            # No group, hence no group provenance
            provenance = [ EntityProvenance.from_match(r, best) ]
            reference_links[r].add_match(
                ReferenceMatch(entity, provenance, elh.score_match(best)))
        

In [None]:
# Search all uncategorised reference texts and store the matches found.
elh.search_in_parallel(select_refs_in_category('overig'), find_overig, save_overig)

In [None]:
# Statistics for 'overig', followed by the unmatched texts that pass the
# (ad hoc) filter below.
show_statistics('overig')
[ # Analysis
    f'{r} ({len(rl)})'
    for (r, rl) in select_items_in_category('overig')
    if len(rl.matches) == 0
    if 'Vroen' in rl.text
]

## Categorie: raad van state

In [None]:
# Organisations within the 'rvs' (Council of State) category, and their searcher.
orgs_rvs = make_orglist('org-rvs.tsv')
rvs_searcher = make_searcher(orgs_rvs)

# Fallback entity for references in this category without a specific match.
rvs_entity = EntityLabel('Raad van State', 'ORG', {'Republiek', 'Landelijk', 'Generaliteit', 'Bestuur'})

def find_rvs(r):
    '''
    Return (r, best matches) for reference text `r`; the list may be empty.
    '''
    return (r, elh.choose_best_matches(rvs_searcher.find_matches(r, debug=0)))

def save_rvs(res):
    '''
    Process the search results for the 'rvs' category.

    A match labelled 'ongeldig' replaces all categories of the text by
    'ongeldig'; a match labelled 'geen_rvs' removes the 'rvs' category.
    Otherwise the best matches are added, or the generic Raad van State
    entity when there are no matches.
    '''
    for (r, matches) in res:
        rl = reference_links[r]
        rvs_prov = rl.categories['rvs'].provenance
        labels = [ m.label for m in matches ]
        if 'ongeldig' in labels:
            invalid = next(m for m in matches if m.label == 'ongeldig')
            invalid_prov = EntityProvenance.from_match(r, invalid,
                    conclusion = f'Not a valid organisation: {invalid.label}')
            rl.categories = { 'ongeldig': ReferenceCategory(r, [], invalid_prov) }
            continue
        if 'geen_rvs' in labels:
            del rl.categories['rvs']
            continue
        if len(matches) == 0:
            rl.add_match(ReferenceMatch(rvs_entity, [ rvs_prov ], 0 ))
            continue
        for best in elh.choose_best_matches(matches):
            entity = orgs_rvs[best.label].entity
            provenance = [ rvs_prov, EntityProvenance.from_match(r, best) ]
            rl.add_match(ReferenceMatch(entity, provenance, elh.score_match(best)))

# TODO aan einde: state van eruit

In [None]:
# Search all reference texts of category 'rvs' and store the results.
elh.search_in_parallel(select_refs_in_category('rvs'), find_rvs, save_rvs)

In [None]:
# Statistics for 'rvs', followed by all its texts with their matches; the
# commented-out lines are optional filters.
show_statistics('rvs')

[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('rvs')
    #if len(rl.matches) == 1
    #if 'Raad van State' not in [ m.label for m in reference_links[r].matches ]
    #if len(reference_links[oc.text].matches) == 0
    #if 'Conseil' in r
]

## Categorie: rekenkamer

In [None]:
# Organisations within the 'rekenkamer' category, and their searcher.
orgs_reken = make_orglist('org-rekenkamer.tsv')
reken_searcher = make_searcher(orgs_reken)

def find_rekenkamers(r):
    '''
    Return (r, best matches) for reference text `r`; the list may be empty.
    '''
    return (r, elh.choose_best_matches(reken_searcher.find_matches(r, debug=0)))

def save_rekenkamers(res):
    '''
    Process the search results for the 'rekenkamer' category.

    A match labelled 'ongeldig' removes the category and nothing else. Without
    a match on 'Generaliteitsrekenkamer' the category is removed as well when
    the text has neither 'gen' nor 'G', or when it has 'Provinc'; in that case
    the best matches are still added.
    '''
    for (r, matches) in res:
        rl = reference_links[r]
        cat_prov = rl.categories['rekenkamer'].provenance
        labels = [ m.label for m in matches ]
        if 'ongeldig' in labels:
            del rl.categories['rekenkamer']
            continue
        no_general_hint = 'gen' not in r and 'G' not in r
        if 'Generaliteitsrekenkamer' not in labels and (no_general_hint or 'Provinc' in r):
            del rl.categories['rekenkamer']
        for best in elh.choose_best_matches(matches):
            entity = orgs_reken[best.label].entity
            provenance = [ cat_prov, EntityProvenance.from_match(r, best) ]
            rl.add_match(ReferenceMatch(entity, provenance, elh.score_match(best)))

In [None]:
# Search all reference texts of category 'rekenkamer' and store the results.
elh.search_in_parallel(select_refs_in_category('rekenkamer'), find_rekenkamers, save_rekenkamers)

In [None]:
# Statistics for 'rekenkamer', followed by its texts that have no match.
show_statistics('rekenkamer')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('rekenkamer')
    if len(rl.matches) == 0
    #if 'Generaliteitsrekenkamer' not in [m.label.name for m in rl.matches]
]

## Categorie: munt

In [None]:
# Organisations within the 'munt' (mint) category, and their searcher.
orgs_munt = make_orglist('org-munt.tsv')
munt_searcher = make_searcher(orgs_munt)

# Entity for references that combine the 'munt' and 'generaliteit' categories.
munt_entity = EntityLabel('Generaliteitsmunt', 'ORG', {'Republiek', 'Landelijk', 'Munt', 'Financiën', 'Generaliteit'})

def find_munt(r):
    '''
    Return (r, best matches) for reference text `r`; the list may be empty.
    '''
    return (r, elh.choose_best_matches(munt_searcher.find_matches(r, debug=0)))

def save_munt(res):
    '''
    Process the search results for the 'munt' category: add the best matches,
    or, without matches, link a text that is also in category 'generaliteit'
    to the Generaliteitsmunt (removing that other category).
    '''
    for (r, matches) in res:
        rl = reference_links[r]
        cat_prov = rl.categories['munt'].provenance
        if len(matches) > 0:
            for best in elh.choose_best_matches(matches):
                entity = orgs_munt[best.label].entity
                provenance = [ cat_prov, EntityProvenance.from_match(r, best) ]
                rl.add_match(ReferenceMatch(entity, provenance, elh.score_match(best)))
            continue
        if 'generaliteit' not in rl.categories:
            continue
        # The provenance of the removed category is reused, with a new conclusion.
        gen_prov = rl.categories.pop('generaliteit').provenance
        gen_prov.conclusion = 'Assign entity: Generaliteitsmunt'
        rl.add_match(ReferenceMatch(munt_entity, [ cat_prov, gen_prov ], 0 ))

In [None]:
# Search all reference texts of category 'munt' and store the results.
elh.search_in_parallel(select_refs_in_category('munt'), find_munt, save_munt)

In [None]:
# Statistics for 'munt', followed by its texts that pass the (ad hoc) filter below.
show_statistics('munt')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('munt')
    #if len(rl.matches) == 0
    if 'ortmu' in r
]

## Categorie: advocaat

In [None]:
# Organisations within the 'advocaat' category, and their searcher.
orgs_advocaat = make_orglist('org-advocaat.tsv')
advocaat_searcher = make_searcher(orgs_advocaat)

def find_advocaat(r):
    '''
    Return (r, best matches) for reference text `r`, or None without matches.
    '''
    found = advocaat_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_advocaat(res):
    '''
    Add the best matches of every result to the link of its text, with the
    provenance of the 'advocaat' category followed by that of the match.
    '''
    for (r, matches) in res:
        rl = reference_links[r]
        cat_prov = rl.categories['advocaat'].provenance
        for best in elh.choose_best_matches(matches):
            entity = orgs_advocaat[best.label].entity
            provenance = [ cat_prov, EntityProvenance.from_match(r, best) ]
            rl.add_match(ReferenceMatch(entity, provenance, elh.score_match(best)))

# Search all reference texts of category 'advocaat' and store the results.
elh.search_in_parallel(select_refs_in_category('advocaat'), find_advocaat, save_advocaat)

# Statistics for 'advocaat', followed by its texts that have no match.
show_statistics('advocaat')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('advocaat')
    if len(rl.matches) == 0
    #if 'slach' in r
]

## Categorie: parlement

In [None]:
# Organisations within the 'parlement' category, and their searcher.
orgs_parlement = make_orglist('org-parlement.tsv')
parlement_searcher = make_searcher(orgs_parlement)

def find_parlement(r):
    '''
    Return (r, best matches) for reference text `r`, or None without matches.
    '''
    found = parlement_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_parlement(res):
    '''
    Add the best matches of every result to the link of its text, with the
    provenance of the 'parlement' category followed by that of the match.
    '''
    for (r, matches) in res:
        rl = reference_links[r]
        cat_prov = rl.categories['parlement'].provenance
        for best in elh.choose_best_matches(matches):
            entity = orgs_parlement[best.label].entity
            provenance = [ cat_prov, EntityProvenance.from_match(r, best) ]
            rl.add_match(ReferenceMatch(entity, provenance, elh.score_match(best)))

# Search all reference texts of category 'parlement' and store the results.
elh.search_in_parallel(select_refs_in_category('parlement'), find_parlement, save_parlement)

# Statistics for 'parlement', followed by its texts that have no match.
show_statistics('parlement')

[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('parlement')
    if len(rl.matches) == 0
    #if 'Metz' in r
]

## Categorie: compagnie

In [None]:
# Organisations within the 'compagnie' category, and their searcher.
orgs_compagnie = make_orglist('org-compagnie.tsv')
compagnie_searcher = make_searcher(orgs_compagnie)

def find_compagnie(r):
    '''
    Return (r, best matches) for reference text `r`; the list may be empty.

    References without matches are reported too, because save_compagnie
    reclassifies them as military companies. They used to be left out, which
    made that step unreachable.
    '''
    matches = compagnie_searcher.find_matches(r, debug=0)
    return (r, elh.choose_best_matches(matches))

def save_compagnie(res):
    '''
    Add the best matches of every result to the link of its text. A text
    without any match that was recognised by the keyword 'Compagnie' is put in
    category 'militair' as well, unless it already is.
    '''
    for (r, matches) in res:
        rl = reference_links[r]
        cat_prov = rl.categories['compagnie'].provenance
        for m in elh.choose_best_matches(matches):
            rl.add_match(ReferenceMatch(
                orgs_compagnie[m.label].entity,
                [ cat_prov, EntityProvenance.from_match(r, m) ],
                elh.score_match(m)))
        # Without match, likely to be a military company
        keywords = rl.categories['compagnie'].keywords
        if len(matches) == 0 and 'Compagnie' in keywords and 'militair' not in rl.categories:
            # Store a real ReferenceCategory rather than a plain dict: the
            # processing of category 'militair' reads its `provenance`.
            keyword_prov = keywords['Compagnie']
            military = ReferenceCategory(r, [], keyword_prov)
            military.keywords['Compagnie'] = keyword_prov
            rl.categories['militair'] = military

# Search all reference texts of category 'compagnie' and store the results.
elh.search_in_parallel(select_refs_in_category('compagnie'), find_compagnie, save_compagnie)

In [None]:
# Statistics for 'compagnie', followed by its texts that have no match.
show_statistics('compagnie')

[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('compagnie')
    if len(rl.matches) == 0
    #if 'Compagnie' in rl.categories['compagnie'].keywords
    #if 'Surinam' in r
]

## Categorie: kerk

In [None]:
# Organisations within the 'kerk' (church) category, and their searcher.
orgs_kerk = make_orglist('org-kerk.tsv')
kerk_searcher = make_searcher(orgs_kerk)

def find_kerk(r):
    '''
    Return (r, best matches) for reference text `r`, or None without matches.
    '''
    found = kerk_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_kerk(res):
    '''
    Add the best matches of every result to the link of its text, with the
    provenance of the 'kerk' category followed by that of the match.
    '''
    for (r, matches) in res:
        rl = reference_links[r]
        cat_prov = rl.categories['kerk'].provenance
        for best in elh.choose_best_matches(matches):
            entity = orgs_kerk[best.label].entity
            provenance = [ cat_prov, EntityProvenance.from_match(r, best) ]
            rl.add_match(ReferenceMatch(entity, provenance, elh.score_match(best)))

# Search all reference texts of category 'kerk' and store the results.
elh.search_in_parallel(select_refs_in_category('kerk'), find_kerk, save_kerk)

In [None]:
# Statistics for 'kerk', followed by its texts that have no match.
show_statistics('kerk')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('kerk')
    if len(rl.matches) == 0
    #if 'Zas'  in r
]

## Categorie: militair

In [None]:
# First cleanup of a regiment reference, as (regex, replacement) pairs that
# are applied in order and without regard to case.
regiment_cleanup = [
    # Leading article only. The alternatives have to be grouped behind the
    # anchor: as a bare alternation ('^|de |...') the rule also removed 'de '
    # inside the text, e.g. the end of 'tweede ', so that the ordinals in
    # ignore_at_start could never match.
    (r'^(de |het |t\' ?)', ''),
    (r' in$', ''),
    (r'[sz]wit[sz](ch)?', 'swits'),
    (r'de[sz]en staat', 'den staat'),
    (r'\berust\b', 'ernst'),
    (r'casymir', 'casimir'),
    (r' te velde$', ''),
    (r'gu?ardes?', 'Guardes'),
    (r'infanter[yi]', 'Infantery'),
    (r'cavall?er[yi]', 'Cavallery'),
    (r'[sz]wit[sz]ers', 'Zwitsers'),
]
regiment_cleanup = [ (re.compile(pat, flags=re.I), repl)
        for (pat, repl) in regiment_cleanup ]

# Classes of regiments as (tag, regex, label) triples: a reference that
# matches the regex is linked to the entity 'Regimenten - <tag>'.
regiment_classes = [
    # By origin
    ('Zwitsers', 'zwitsers', 'Regimenten naar herkomst'),
    ('Schots', r'schot(s(ch)?e?|ten)', 'Regimenten naar herkomst'),
    ('Grisons', r'grisons', 'Regimenten naar herkomst'),
    ('Frans', r'frai?nch?oi?s(ch)?en?|frans(ch)?e?n?', 'Regimenten naar herkomst'),
    ('Waals', r'\bwaa?ls+(ch)?e?\b|walen', 'Regimenten naar herkomst'),
    ('Engels', r'engels|anglois', 'Regimenten naar herkomst'),
    ('Zweeds', r'swee[dt]t?s', 'Regimenten naar herkomst'),
    ('Deens', r'deens', 'Regimenten naar herkomst'),
    # By kind
    ('Dragonders', r'dragonder', 'Regimenten naar wapen'),
    ('Artillerie', r'Artiller', 'Regimenten naar wapen'),
    ('Mineurs', r'mineur|sappeur', 'Regimenten naar wapen'),
    ('Cavallerie', r'cavall?er[yi]|te peer[dt]|te paar[dt]|hu[sz]s?aa?ren|ruiter', 'Regimenten naar wapen'),
    ('Infanterie', r'infanter[yi]|te voet|voetknecht|carabiniers|jagers|grenadier', 'Regimenten naar wapen'),
    ('Garderegimenten', 'guardes', 'Regimenten naar wapen'),
]
regiment_classes = [ (n, re.compile(p, flags=re.I), t)
        for (n,p,t) in regiment_classes ]

# Words that are dropped (repeatedly) from the start of a reference.
ignore_at_start = [
    'compagnien? ',
    r'batt?ai?ll?ons? ',
    'van ', 'he[ts] ', 'de(se)?n? ',
    'een ', 'twee ', 'drie ', 'vier ',
    'eerste ', 'tweede ', 'derde ', 'vierde ', 'vyfde ',
    'nieuwe ', 'afgedankte ',
]
start_regex = re.compile(rf'^({"|".join(ignore_at_start)}|t(\' ?| ))+', flags=re.I)

# Ranks and titles that are dropped wherever they occur in a reference.
ranks = [
    r'generaal ', r'ma[jyi]oo?r ', r'brigadier ',
    r'co[lr][lr]?onn?els? ', r'o[uv]ersten? ', r'lieutenant ',
    r'wylen ',
    r'hee?ren? |van ',
    r'baron ', r'vr(y|ij)heere?n? ',
    r'gra[uv]e |graa?ff? ',
    r'herto(g|ch) ',
    r'keurvorst ',
    r'furst ', r'prin[sc](ch)?e? ',
    r'marquis ',
    r'ridders? ', r'welgeboo?ren',
]
rank_regex = re.compile(rf'\b(de[ns]? |{"|".join(ranks)})+', flags=re.I)

# Last simplifications, applied in order and without regard to case.
final_cleanup = [
    (r' (tot|te|binnen) ', ' tot '),
    (r' stat ', ' '),
    (r'regiment(van|leggende)', 'regiment'),
    (r'(troupp?es|regimenten|leger|arme[eé]|militie)(.*)staat', r'militie\2staat'),
    (r'guarnisoen tot ', 'guarnisoen '),
    (r'leggende ', ''),
    (r' +', ' '),
]
final_cleanup = [ (re.compile(pat, flags=re.I), repl) for (pat, repl) in final_cleanup ]

# A simplified reference that matches any of these is not a regiment.
not_a_regiment = [
    'regiment( van( den?)?)?$',
    'l[eé]gislatif',
    'diplomatique',
    'krygsraat',
    'krygsgevangen',
    'van het selve regiment'
]
not_a_regiment = re.compile(rf'{"|".join(not_a_regiment)}', flags=re.I)

# Simplified reference text -> list of the ReferenceLinks that reduce to it.
aggregated_regiments = { }

ent_hoge_krijgsraad = EntityLabel('Hoge Krijgsraad', 'ORG', set(['Republiek','Landelijk','Generaliteit','Rechtspraak','Oorlog']))
re_hoge_krijgsraad = re.compile('hoo?gen? kry[chg]+sraat', flags=re.I)

for (orig_ref, rl) in select_items_in_category('militair'):
    # Drop the matches with label 'Regimenten op naam' that are already there
    # (presumably left by an earlier run of the cells below — TODO confirm).
    rl.matches = [ m for m in rl.matches if 'Regimenten op naam' not in m.label.labels ]
    r = orig_ref
    cat_prov = rl.categories['militair'].provenance
    # First cleanup
    for (pat, repl) in regiment_cleanup:
        r = pat.sub(repl, r)
    # Regiments by nation / kind
    for (tag, pat, lbl) in regiment_classes:
        m = pat.search(r)
        if not m: continue
        org_name = re.sub(' +', ' ', f'Regimenten - {tag}')
        rl.add_match(ReferenceMatch(
            EntityLabel(org_name, 'ORG', set(['Oorlog', 'Regiment', 'Republiek', 'Landelijk', lbl])),
            [ cat_prov, EntityProvenance.from_match(r, m, conclusion = f'Assign {org_name}')], 1))
    # Further simplifications
    r = start_regex.sub('', r)
    r = rank_regex.sub('', r)
    for (pat, repl) in final_cleanup:
        r = pat.sub(repl, r)
    r = r.lower()
    # Drop known nonregiments
    if not_a_regiment.search(r):
        del rl.categories['militair']
        continue
    # Special case: the Hoge Krijgsraad
    if m := re_hoge_krijgsraad.search(r):
        rl.add_match(ReferenceMatch(ent_hoge_krijgsraad, [ cat_prov,
            EntityProvenance.from_match(r, m, conclusion = 'Assign entity: Hoge Krijgsraad') ], 1 ))
        continue
    # Aggregate
    if r not in aggregated_regiments:
        aggregated_regiments[r] = []
    aggregated_regiments[r].append(rl)
    # Special case: 'in guarnisoen'. The reference is also filed under the
    # part before and the part after ' in '. The prefix is matched lazily, so
    # that the optional comma and space end up outside it; with a greedy
    # prefix they stayed attached, and 'x, ' was never merged with 'x'.
    if m := re.search(r'(.*?),? ?\bin (guarnisoen .*)', r):
        # An empty prefix says nothing about the regiment: do not file all
        # such references together under the empty string.
        if m[1]:
            aggregated_regiments.setdefault(m[1], []).append(rl)
        aggregated_regiments.setdefault(m[2], []).append(rl)


In [None]:
# Per simplified text: total number of references, number of links, the text
# itself and its largest link; sorted by the total, largest first.
sorted([ # Analysis of the left-over regiments
    (sum([len(rl)for rl in l]), len(l), r, sorted(l,reverse=True,key=lambda rl: len(rl))[0])
    for (r,l) in aggregated_regiments.items()
    #if sum([len(rl)for rl in l])>=10
    #if re.search('^guarnisoen maastricht', r)
    if 'guarnisoen maastricht' in r
], reverse=True, key=lambda x: x[0])

In [None]:
# A simplified text only becomes an entity when it stands for more than this
# number of references.
REGIMENT_CUTOFF = 8

for (r, rlist) in aggregated_regiments.items():
    # Apply a cutoff
    if sum(len(rl) for rl in rlist) <= REGIMENT_CUTOFF:
        continue
    # Choose the most-popular name: the text of the largest link, without a
    # leading article and with runs of spaces collapsed.
    name = max(rlist, key=len).text
    name = re.sub(' +', ' ', re.sub('^(het|den?) ', '', name))
    entity = EntityLabel(name, 'ORG', {'Oorlog', 'Regiment', 'Republiek', 'Landelijk', 'Regimenten op naam'})
    for rl in rlist:
        cat_prov = rl.categories['militair'].provenance
        simplification_prov = EntityProvenance(
            rl.text, 'Compare after simplification', r, f'Simplification equal to: {name}')
        rl.add_match(ReferenceMatch(entity, [ cat_prov, simplification_prov ], 1))

In [None]:
# Statistics for 'militair', followed by its texts with more than two matches.
show_statistics('militair')

[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('militair')
    if len(rl.matches) > 2
    #if 'Halquet' in r
]

## Categorie: admiraliteit

In [None]:
# Organisations within the 'admiraliteit' category, and their searcher.
orgs_admiraliteit = make_orglist('org-admiraliteit.tsv')
admiraliteit_searcher = make_searcher(orgs_admiraliteit)

# Entity for the admiralty boards as a whole.
gez_adm_entity = EntityLabel('Gezamenlijke Admiraliteiten', 'ORG', {'Admiraliteit', 'Zeevaart', 'Oorlog', 'Generaliteit', 'Republiek', 'Landelijk'})

def find_admiraliteit(r):
    '''
    Return (r, best matches) for reference text `r`; the list may be empty.
    The search is done on a copy of the text in which a number of place names
    have been given one fixed spelling; the original text is returned.
    '''
    place_names = [
        ('Amste[a-z]+', 'Amsterdam'),
        ('Rotte[a-z]+', 'Rotterdam'),
        ('Midde[a-z]+', 'Middelburg'),
        ('En[ck][a-z]+', 'Enkhuizen'),
        ('[ZS]eel([a-z]*t|a[a-z]+)', 'Zeelant'),
        ('D[ou]+[ck]+[oue]+m', 'Dokkum'),
    ]
    normalised = r
    for (pattern, place) in place_names:
        normalised = re.sub(pattern, place, normalised)
    found = admiraliteit_searcher.find_matches(normalised, debug=0)
    return (r, elh.choose_best_matches(found))
        

def save_admiraliteit(res):
    '''
    Add the best matches of every result to the link of its text. A text
    without matches that contains 'Collegien ter Admiraliteyt' is linked to
    the entity for the joint admiralty boards.
    '''
    for (r, matches) in res:
        rl = reference_links[r]
        cat_prov = rl.categories['admiraliteit'].provenance
        for best in elh.choose_best_matches(matches):
            entity = orgs_admiraliteit[best.label].entity
            provenance = [ cat_prov, EntityProvenance.from_match(r, best) ]
            rl.add_match(ReferenceMatch(entity, provenance, elh.score_match(best)))
        if len(matches) != 0 or 'Collegien ter Admiraliteyt' not in r:
            continue
        joint_prov = EntityProvenance(r,
            'Contains ‘Collegien ter Admiraliteyt’ but no geographical indication',
            'true', f'Assign entity: {gez_adm_entity.name}')
        rl.add_match(ReferenceMatch(gez_adm_entity, [ cat_prov, joint_prov ]))

In [None]:
# Search all reference texts of category 'admiraliteit' and store the results.
elh.search_in_parallel(select_refs_in_category('admiraliteit'), find_admiraliteit, save_admiraliteit)

In [None]:
# Statistics for 'admiraliteit', followed by its texts that have no match.
show_statistics('admiraliteit')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('admiraliteit')
    if len(rl.matches) == 0
    #if 'eel' in r
]

## Categorie: congres

In [None]:
# Organisations within the 'congres' category, and their searcher.
orgs_congres = make_orglist('org-congres.tsv')
congres_searcher = make_searcher(orgs_congres)

def find_congres(r):
    '''
    Return (r, best matches) for reference text `r`, or None without matches.
    '''
    found = congres_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_congres(res):
    """Store congres matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_congres`; each best match is added with the category provenance, the
    match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['congres'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_congres[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'congres' category with find_congres;
# the results go to save_congres (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('congres'), find_congres, save_congres)

# Show this category's statistics, then list the references that still have no match.
show_statistics('congres')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('congres')
    if len(rl.matches) == 0
    #if 'Congreg' in r
]

## Categorie: diplomatiek

In [None]:
# Build the organisation list from org-diplomatiek.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_diplomatiek = make_orglist('org-diplomatiek.tsv')
diplomatiek_searcher = make_searcher(orgs_diplomatiek)

def find_diplomatiek(r):
    """Search reference text `r` with the diplomatiek searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = diplomatiek_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_diplomatiek(res):
    """Store diplomatiek matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_diplomatiek`; each best match is added with the category provenance,
    the match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['diplomatiek'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_diplomatiek[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'diplomatiek' category with find_diplomatiek;
# the results go to save_diplomatiek (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('diplomatiek'), find_diplomatiek, save_diplomatiek)

# Show this category's statistics, then list the references that still have no match.
show_statistics('diplomatiek')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('diplomatiek')
    if len(rl.matches) == 0
    #if 'Sweeden' in r
]

## Categorie: secretariaat

In [None]:
# Build the organisation list from org-secretariaat.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_secretariaat = make_orglist('org-secretariaat.tsv')
secretariaat_searcher = make_searcher(orgs_secretariaat)

def find_secretariaat(r):
    """Search reference text `r` with the secretariaat searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = secretariaat_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_secretariaat(res):
    """Store secretariaat matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_secretariaat`; each best match is added with the category provenance,
    the match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['secretariaat'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_secretariaat[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'secretariaat' category with find_secretariaat;
# the results go to save_secretariaat (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('secretariaat'), find_secretariaat, save_secretariaat)

# Show this category's statistics, then list the references that still have no match.
show_statistics('secretariaat')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('secretariaat')
    if len(rl.matches) == 0
    #if '' in r
]

## Categorie: comptoir

In [None]:
# Build the organisation list from org-comptoir.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_comptoir = make_orglist('org-comptoir.tsv')
comptoir_searcher = make_searcher(orgs_comptoir)

def find_comptoir(r):
    """Search reference text `r` with the comptoir searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = comptoir_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_comptoir(res):
    """Store comptoir matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_comptoir`; each best match is added with the category provenance, the
    match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['comptoir'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_comptoir[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'comptoir' category with find_comptoir;
# the results go to save_comptoir (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('comptoir'), find_comptoir, save_comptoir)

# Show this category's statistics, then list the references that still have no match.
show_statistics('comptoir')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('comptoir')
    if len(rl.matches) == 0
    #if '' in r
]

## Categorie: representanten

In [None]:
# Build the organisation list from org-representanten.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_representanten = make_orglist('org-representanten.tsv')
representanten_searcher = make_searcher(orgs_representanten)

def find_representanten(r):
    """Search reference text `r` with the representanten searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = representanten_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_representanten(res):
    """Store representanten matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_representanten`; each best match is added with the category
    provenance, the match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['representanten'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_representanten[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'representanten' category with find_representanten;
# the results go to save_representanten (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('representanten'), find_representanten, save_representanten)

# Show this category's statistics, then list the references that still have no match.
show_statistics('representanten')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('representanten')
    if len(rl.matches) == 0
    #if 'Ooster' in r
]

## Categorie: binnenland

In [None]:
# Hand-written entities for the 'binnenland' category. Each table pairs a
# keyword (or, for universities, a regular expression) with the entity to assign.

# NOTE(review): wees_alg is not referenced by find_binnenland — confirm whether
# it is still needed.
wees_alg = EntityLabel('Weeshuizen algemeen', 'ORG', {'binnenland', 'weeshuis'})

# Orphanages: keyword is matched case-sensitively against the reference text.
wees_entities = [
    ('Maastricht', EntityLabel(
        'Weeshuizen in Maastricht', 'ORG', {'Republiek', 'Plaatselijk', 'Weeshuis'})),
    ('Delft', EntityLabel(
        'Weeshuizen in Delft', 'ORG', {'Republiek', 'Plaatselijk', 'Weeshuis'})),
    ('Amsterdam', EntityLabel(
        'Weeshuizen in Amsterdam', 'ORG', {'Republiek', 'Plaatselijk', 'Weeshuis'})),
    ('Gravenhage', EntityLabel(
        'Weeshuizen in ’s Gravenhage', 'ORG', {'Republiek', 'Plaatselijk', 'Weeshuis'})),
    ('Suriname', EntityLabel(
        'Weesmeesteren van Suriname', 'ORG', {'Republiek', 'Plaatselijk', 'Weeshuis'})),
    ('Curacao', EntityLabel(
        'Weesmeesteren van Curaçao', 'ORG', {'Republiek', 'Wereld', 'Plaatselijk', 'Weeshuis'})),
]

# Latin schools: keyword is matched case-sensitively against the reference text.
school_entities = [
    ('Maastricht', EntityLabel(
        'Latijnse school te Maastricht', 'ORG', {'Republiek', 'Plaatselijk', 'Onderwijs'})),
    ('Hertogenbosch', EntityLabel(
        'Latijnse school te ’s Hertogenbosch', 'ORG', {'Republiek', 'Plaatselijk', 'Onderwijs'})),
]

# Universities: the pattern is searched in the lower-cased reference text.
univ_entities = [
    ('[lsz]e[iyn]den|lugd[ui]m', EntityLabel(
        'Universiteit Leiden', 'ORG', {'Republiek', 'Regionaal', 'Onderwijs'})),
    ('l[euo]+[uvw]en?', EntityLabel(
        'Universiteit Leuven', 'ORG', {'Europa', 'Zuidelijke Nederlanden', 'Regionaal', 'Onderwijs'})),
    ('utrecht', EntityLabel(
        'Universiteit Utrecht', 'ORG', {'Republiek', 'Regionaal', 'Onderwijs'})),
    ('gronin', EntityLabel(
        'Universiteit Groningen', 'ORG', {'Republiek', 'Regionaal', 'Onderwijs'})),
    ('douay', EntityLabel(
        'Universiteit Douay', 'ORG', {'Europa', 'Regionaal', 'Onderwijs'})),
    ('fran[ie]?[qk][ieu]+r|vrieslant', EntityLabel(
        'Universiteit Franeker', 'ORG', {'Republiek', 'Regionaal', 'Onderwijs'})),
    ('harderwyk', EntityLabel(
        'Universiteit Harderwijk', 'ORG', {'Republiek', 'Regionaal', 'Onderwijs'})),
]

gasthuizen = EntityLabel('Gasthuizen in de Republiek', 'ORG', {'Republiek', 'Plaatselijk'})

def find_binnenland(r):
    """Assign a hand-written 'binnenland' entity to reference text `r` by keyword.

    The rules are tried in order: hospitals ('gasthu'), orphanages
    ('weesh'/'weesm' plus a place keyword), Latin schools ('latyns' plus a place
    keyword) and universities (an institution word plus a place pattern). The
    institution words are tested against the lower-cased text; orphanage and
    school place keywords are tested case-sensitively against `r` itself.

    Returns `(r, entity, provenance)` for the first rule that applies, or None
    when no rule applies.
    """
    rlower = r.lower()
    # The provenance records the criterion that was actually tested, and the
    # conclusion uses the same 'Assign entity: <name>' wording as the other categories.
    if 'gasthu' in rlower:
        return (r, gasthuizen, EntityProvenance(
            rlower, 'Matches /gasthu/', 'true', f'Assign entity: {gasthuizen.name}'))
    if 'weesh' in rlower or 'weesm' in rlower:
        for (kw, lbl) in wees_entities:
            if kw in r:
                return (r, lbl, EntityProvenance(
                    rlower, f'Matches both /wees[hm]/ and /{kw}/', 'true',
                    f'Assign entity: {lbl.name}'))
    if 'latyns' in rlower:
        for (kw, lbl) in school_entities:
            if kw in r:
                return (r, lbl, EntityProvenance(
                    rlower, f'Matches both /latyns/ and /{kw}/', 'true',
                    f'Assign entity: {lbl.name}'))
    if 'cademi' in rlower or 'univer' in rlower or 'faculte' in rlower or 'hooge school' in rlower:
        for (kw, lbl) in univ_entities:
            if re.search(kw, rlower):
                return (r, lbl, EntityProvenance(
                    rlower, f'Matches both /cademi|univer|faculte|hooge school/ and /{kw}/', 'true',
                    f'Assign entity: {lbl.name}'))
    return None


def save_binnenland(res):
    """Store 'binnenland' keyword matches on their reference links.

    `res` is an iterable of `(reference text, entity, provenance)` triples as
    returned by `find_binnenland`; every match is stored with score 1.
    """
    for ref_text, entity, rule_provenance in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['binnenland'].provenance
        provenance_chain = [category_provenance, rule_provenance]
        links.add_match(ReferenceMatch(entity, provenance_chain, 1))

# Search every reference selected for the 'binnenland' category with find_binnenland;
# the results go to save_binnenland (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('binnenland'), find_binnenland, save_binnenland)

In [None]:
# Show this category's statistics, then list the references that still have no match.
# The commented-out conditions are alternative filters for manual inspection.
show_statistics('binnenland')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('binnenland')
    if len(rl.matches) == 0
    #if 'Domeinen' not in r
    #if 'ongeldig' not in rl.categories
    #if len(rl)>1
    #if 'base' in r.lower()
]

## Categorie: plaats
NB deze moet aan het einde omdat we eerder herkende verwijzingen eruit filteren

In [None]:
# For every still-unmatched reference in the 'plaats' category, record the
# location (after spelling harmonisation) that occurs most often among its
# references, or None when none of its references has a location.
plaatsen_per_locatie = {}

for (r, rl) in tqdm(select_items_in_category('plaats')):
    if len(rl.matches) != 0:
        continue  # usually: regents of orphanages etc.
    found_locations = [ref.location for ref in rl.refs if ref.location]
    if len(found_locations) == 0:
        plaatsen_per_locatie[r] = None
        continue
    tally = {}
    for raw_location in found_locations:
        harmonised = elh.harmonise_spelling(raw_location)[0]
        tally[harmonised] = tally.get(harmonised, 0) + 1
    # Ascending stable sort: the last item is the most frequent location
    # (on a tie, the one that was first seen latest).
    ranked = sorted(tally.items(), key=lambda item: item[1])
    plaatsen_per_locatie[r] = ranked[-1][0]
[
    (name, place)
    for (name, place) in plaatsen_per_locatie.items()
    #if len(place)==0
]

In [None]:
# Group the reference texts by place name, after stripping leading words such
# as 'Stat (van)', 'Ryks stat', 'Stede (van)' or 'Land (van)' from the location.
orgs_per_plaats = {}
for (r, l) in plaatsen_per_locatie.items():
    if l is not None:
        l = re.sub(r'^(([Rr]yks )?[Ss]tat|[Ss]tede|[Ll]and)( van)? ', '', l)
        l = re.sub(r'^[Ss]tede( van)? ', '', l)
        # A bare 'stat' carries no place name, so it is not grouped.
        if l.lower() != 'stat':
            orgs_per_plaats.setdefault(l, []).append(r)
len(orgs_per_plaats)

In [None]:
# Manual inspection: places ordered by number of distinct reference texts
# (largest first), showing positions 100 up to 700 of that ranking.
sorted(orgs_per_plaats.items(), reverse=True, key=lambda t: len(t[1]))[100:700]

In [None]:
# Create a 'local government' entity for every place whose reference texts
# together occur at least 8 times, and attach it to each of those references.
for (l, rs) in orgs_per_plaats.items():
    total_refs = sum(len(reference_links[r]) for r in rs)
    if total_refs < 8: # PARAMETER TO ADJUST
        continue
    for r in rs:
        rl = reference_links[r]
        info = location_info[l]
        # Determine rough entity location
        loclabels = ['Bestuur', 'Plaatselijk']
        country = info['ModernCountry']
        if country == 'Nederland':
            loclabels.append('Republiek')
        # NOTE(review): the chain below is independent of the 'Nederland' test, so a
        # Dutch place additionally receives 'Europa' or 'Wereld' — confirm this is intended.
        if country == 'België':
            loclabels.extend(['Europa', 'Zuidelijke Nederlanden'])
        elif info['Region'] == 'Europa':
            loclabels.append('Europa')
        else:
            loclabels.append('Wereld')
        place_tags = info['Labels']
        if 'Kolonie' in place_tags or 'Plantage' in place_tags:
            loclabels.append('Koloniaal')
        # Create the entity
        locname = info['Name']
        el = EntityLabel(f'Plaatselijke overheid van {locname}', 'ORG', set(loclabels))
        el.links = [{
            'type': 'republic_entity',
            'target': l,
            'target_category': 'LOC',
            'description': f'zie ook de locatie {locname}',
        }]
        cat_prov = rl.categories['plaats'].provenance
        prov = EntityProvenance(r, 'Recognise place name', l, 'Assign to local government')
        rl.add_match(ReferenceMatch(el, [cat_prov, prov], 1))


In [None]:
# Show this category's statistics, then list the *matched* references whose text
# contains 'stat ' but not 'der stat ' (case-insensitive), for manual checking.
show_statistics('plaats')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('plaats')
    if len(rl.matches) > 0
    if 'stat ' in r.lower()
    and 'der stat ' not in r.lower()
]

## Categorie: regio

In [None]:
# Build the organisation list from org-regio.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_regio = make_orglist('org-regio.tsv')
regio_searcher = make_searcher(orgs_regio)

def find_regio(r):
    """Search reference text `r` with the regio searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = regio_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_regio(res):
    """Store regio matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_regio`; each best match is added with the category provenance, the
    match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['regio'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_regio[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'regio' category with find_regio;
# the results go to save_regio (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('regio'), find_regio, save_regio)

In [None]:
# Show this category's statistics, then list the references that still have no match.
show_statistics('regio')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('regio')
    if len(rl.matches) == 0
    #if 'Holl' in r
]

## Categorie: generaliteit

In [None]:
# Build the organisation list from org-generaliteit.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_generaliteit = make_orglist('org-generaliteit.tsv')
generaliteit_searcher = make_searcher(orgs_generaliteit)

def find_generaliteit(r):
    """Search reference text `r` with the generaliteit searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = generaliteit_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_generaliteit(res):
    """Store generaliteit matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_generaliteit`; each best match is added with the category provenance,
    the match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['generaliteit'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_generaliteit[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'generaliteit' category with find_generaliteit;
# the results go to save_generaliteit (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('generaliteit'), find_generaliteit, save_generaliteit)

# Show this category's statistics, then list the references that still have no match.
show_statistics('generaliteit')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('generaliteit')
    if len(rl.matches) == 0
    #if 'Griffie' in r
]

## Categorie: vorst

In [None]:
# Build the organisation list from org-vorst.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_vorst = make_orglist('org-vorst.tsv')
vorst_searcher = make_searcher(orgs_vorst)

def find_vorst(r):
    """Search reference text `r` with the vorst searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = vorst_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_vorst(res):
    """Store vorst matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_vorst`; each best match is added with the category provenance, the
    match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['vorst'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_vorst[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'vorst' category with find_vorst;
# the results go to save_vorst (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('vorst'), find_vorst, save_vorst)

In [None]:
# Show this category's statistics, then list the references that received more
# than two matches (candidates for ambiguous linking).
show_statistics('vorst')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('vorst')
    if len(rl.matches) >2
    #if 'Swee' in r
]

## Categorie: land

In [None]:
# Build the organisation list from org-land.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_land = make_orglist('org-land.tsv')
land_searcher = make_searcher(orgs_land)

def find_land(r):
    """Search reference text `r` with the land searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = land_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_land(res):
    """Store land matches on their reference links.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_land`; each best match is added with the category provenance, the
    match provenance and its score.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['land'].provenance
        for match in elh.choose_best_matches(found):
            entity = orgs_land[match.label].entity
            provenance = [category_provenance, EntityProvenance.from_match(ref_text, match)]
            links.add_match(ReferenceMatch(entity, provenance, elh.score_match(match)))

# Search every reference selected for the 'land' category with find_land;
# the results go to save_land (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('land'), find_land, save_land)

In [None]:
# Show this category's statistics, then list the references that still have no match.
show_statistics('land')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('land')
    if len(rl.matches) == 0
    #if 'Algie' in r
]

## Categorie: raad
moet na plaats

In [None]:
# Build the organisation list from org-raad.tsv and a searcher over it
# (see make_orglist / make_searcher).
orgs_raad = make_orglist('org-raad.tsv')
raad_searcher = make_searcher(orgs_raad)

def find_raad(r):
    """Search reference text `r` with the raad searcher.

    Returns `(r, best_matches)` when at least one match is found, otherwise None.
    """
    found = raad_searcher.find_matches(r, debug=0)
    if len(found) == 0:
        return None
    return (r, elh.choose_best_matches(found))

def save_raad(res):
    """Store the single highest-scoring raad match on each reference link.

    `res` is an iterable of `(reference text, matches)` pairs as returned by
    `find_raad`. Unlike the other categories, only one match per reference is
    kept: the one with the highest `elh.score_match` value.
    """
    for ref_text, found in res:
        links = reference_links[ref_text]
        category_provenance = links.categories['raad'].provenance
        if len(found) > 0:
            # Ascending stable sort: the last element has the highest score.
            ranked = sorted(found, key=elh.score_match)
            best = ranked[-1]
            links.add_match(ReferenceMatch(
                orgs_raad[best.label].entity,
                [category_provenance, EntityProvenance.from_match(ref_text, best)],
                elh.score_match(best)))

# Search every reference selected for the 'raad' category with find_raad;
# the results go to save_raad (presumably in parallel batches — see elh.search_in_parallel).
elh.search_in_parallel(select_refs_in_category('raad'), find_raad, save_raad)

In [None]:
# Show this category's statistics, then list the unmatched references whose
# normalised text contains 'Cour', displaying the text of their first reference.
show_statistics('raad')
[ # Analysis
    f'{rl.refs[0].text} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('raad')
    if len(rl.matches) == 0
    if 'Cour' in r
]

## Categorie: ongeldig

In [None]:
# Show the statistics for the 'ongeldig' (invalid) category and list all of its
# references with their matches.
show_statistics('ongeldig')
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in select_items_in_category('ongeldig')
    #if len(rl.matches) > 0
]

In [None]:
# Anything that has been recognised somewhere is not invalid: matched
# references lose the 'ongeldig' category, while unmatched ones keep
# 'ongeldig' as their only category.
for (r, rl) in select_items_in_category('ongeldig'):
    if len(rl.matches) == 0:
        # Collect the keys first so the mapping is not modified while iterating it.
        for other in [c for c in rl.categories.keys() if c != 'ongeldig']:
            del rl.categories[other]
    else:
        del rl.categories['ongeldig']
show_statistics()

In [None]:
# List, most frequent first, the references that have more than one match of
# which at least one is an entity named 'ongeldig'.
[ # Analysis
    f'{r} ({len(rl)}): {rl.matches}'
    for (r, rl) in sorted(reference_links.items(), reverse=True, key=lambda t: len(t[1]))
    if 'ongeldig' in [m.label.name for m in rl.matches]
    if len(rl.matches) > 1
]

# Presentatie en uitvoer

## demo

In [None]:
# Index the results by entity name:
#   entities        entity name -> entity label (first label seen with that name)
#   refs_by_entity  entity name -> list of (reference, EntityMatch), ordered by year
#   entity_names    all entity names in alphabetical order
#   entity_ids      entity name -> position in entity_names (used as HTML anchor)
entities = {}
refs_by_entity = {}
for rl in reference_links.values():
    for m in rl.matches:
        el = m.label
        lbl = el.name
        if lbl not in entities:
            entities[lbl] = el
            refs_by_entity[lbl] = []
        for r in rl.refs:
            em = EntityMatch(r.reference, m.provenance, el)
            refs_by_entity[lbl].append((r, em))

# A reference's year is None when its session has no date; comparing None with
# an int raises TypeError, so undated references are sorted after all dated ones.
for lbl in refs_by_entity:
    refs_by_entity[lbl] = sorted(
        refs_by_entity[lbl],
        key=lambda t: (t[0].year is None, t[0].year or 0))

# The dictionary keys are the entity names themselves.
entity_names = sorted(entities)

entity_ids = {n: i for i, n in enumerate(entity_names)}

In [None]:
import matplotlib.pyplot as plt

# Per-entity event plots, saved as orgplots/<entity name>.png.
# NOTE: currently disabled — the unconditional `continue` at the start of the
# loop body skips everything after it (including the `data = False` on the same
# line), so no plot is produced until `data` is actually computed (see TODO).
for (n, refs) in tqdm(refs_by_entity.items()):
    continue; data = False # TODO
    if not data: continue
    plt.eventplot(data.keys(), data.values())
    plt.xticks(range(1576, 1796, 30))
    plt.gca().ticklabel_format(axis='y', style='plain') # TODO make this write integers
    plt.savefig(f'orgplots/{n}.png', dpi=150, format='png')
    plt.clf(); plt.cla(); plt.close()

In [None]:
# Output lines of the demo page. The first element is the static page head:
# style sheet, introduction, table of contents, the filter form with its
# script, and the opening tag of the filterable navigation (#filter-nav).
# The script shows or hides the <div> entries of #filter-nav according to
# their niv_*, waar_* and cat_* classes.
html = ['''<!DOCTYPE html>
<html lang="nl">
<head>
<meta charset="utf-8">
<title>Commissies van de Staten-Generaal</title>
<style>
body { max-width: 100em; margin: auto; }
nav ul, .scroll { padding: 2px; margin: 1px; 
    resize: vertical; border: 1px solid gray; }
nav ul { display: inline-block; 
    height: 30em; min-width: 40em; overflow-y:scroll; 
    list-style-type: none; }
img { float:right; max-width: 40em; }
textarea { display: inline-block; overflow-y:scroll;
    min-height: 40em; }
.ib { display: inline-block; }
.scroll { overflow:scroll; white-space: nowrap; min-width:50%;
    min-height: 30em; max-height: 45em; max-width: 50%;
    display:inline-block; }
.scroll p { margin: 0pt; }
.threecol { column-count: 3; }
</style>
<body>
<h1>Herkende organisaties en instellingen</h1>

<p>Dit is de eerste proefversie van de herkende organisaties. Een paar opmerkingen als toelichting:
<ol>
<li> Er zijn veel organisaties. In het filter hieronder kun je op bepaalde categorieën filteren;
    dat lijkt me de handigste manier van navigeren.
<li> Er ontbreken ook nog een paar categorieën, i.h.b. de vorstenhoven en buitenlandse overheden.
<li> De plaatselijke overheden zijn in afwezigheid van geïdentificeerde locaties nogal improvisatoir:
er zijn bijvoorbeeld nogal wat doublures en de plaatsen zijn niet op binnen/buitenland ingedeeld.
Ook de namen zelf zijn voorlopig.
<li> Bij elke entiteit staat een lijstje labels. Die labels worden gebruikt in de filters.
Het uitbreiden en verbeteren van deze labels is één van de dingen waar ik jullie feedback over wil.
<li>Houd je muis boven een herkende instellingsverwijzing om te zien waar de herkenning op is gebaseerd.
</ol>

<h2>Inhoud</h2>

<nav id=mainnav><ol>
<li><a href="#plaats">Plaatselijke overheden</a>
<li><a href="#reg-naam">Regimenten op naam</a>
<li><a href="#reg-ovg">Overige regimenten</a>
<li><a href="#overig">Overige organisaties</a>
</ol></nav>

<form id="nav-filter"><fieldset><legend>Filter</legend>
<p><label for="niveau-select">Niveau:</label>
<select name="Niveau" id="niveau-select">
    <option value="">--alle--</option>
    <option value="Plaatselijk">Plaatselijk</option>
    <option value="Regionaal">Regionaal</option>
    <option value="Landelijk">Landelijk</option>
    <option value="onbekend">Onbekend</option>
</select>
<p><label for="waar-select">Waar:</label>
<select name="Waar" id="waar-select">
    <option value="">--alle--</option>
    <option value="Republiek">Republiek</option>
    <option value="Europa">Europa</option>
    <option value="Wereld">Wereld</option>
    <option value="onbekend">Onbekend</option>
</select>
<p><label for="cat-select">Categorie:</label>
<select name="Categorie" id="cat-select">
    <option value="">--alle--</option>
    <option value="Generaliteit">Generaliteit</option>
    <option value="Regiment">Regiment</option>
    <option value="Rechtspraak">Rechtspraak</option>
    <option value="Bestuur">Bestuur</option>
    <option value="Godsdienst">Godsdienst</option>
    <option value="Zeevaart">Zeevaart</option>
    <option value="Oorlog">Oorlog</option>
    <option value="Handel">Handel</option>
    <option value="Financiën">Financiën</option>
    <option value="overig">Overig</option>
</select>
<p><input type="submit" value="Filter">
</fieldset></form><script>
const filter = document.querySelector("#nav-filter");
filter.addEventListener("submit", (event) => {
    event.preventDefault();
    var entries = document
        .getElementById("filter-nav")
        .getElementsByTagName("div");
    var niv = filter["Niveau"].value;
    var waar = filter["Waar"].value;
    var cat = filter["Categorie"].value;
    for (e of entries) {
        e.style.display = "block";
        if (niv && !e.classList.contains("niv_"+niv))
            e.style.display = "none";
        if (waar && !e.classList.contains("waar_"+waar))
            e.style.display = "none";
        if (cat && !e.classList.contains("cat_"+cat))
            e.style.display = "none";
    }
});
</script>
<nav class=threecol id=filter-nav>
''' ]

# Labels that map onto a cat_* filter class.
nav_cats = [ 'Generaliteit', 'Oorlog', 'Rechtspraak', 'Bestuur', 'Godsdienst', 'Zeevaart', 'Handel', 'Financiën', 'Regiment' ]

# One navigation entry per entity, carrying the classes the filter script tests:
# every matching cat_* class (or cat_overig), plus exactly one waar_* and one
# niv_* class, chosen as the first label present in the given priority order.
for n in entity_names:
    el = entities[n]
    classes = [f'cat_{cat}' for cat in nav_cats if cat in el.labels]
    if len(classes) == 0:
        classes.append('cat_overig')
    where = next(
        (area for area in ('Republiek', 'Europa', 'Wereld') if area in el.labels),
        'onbekend')
    classes.append(f'waar_{where}')
    level = next(
        (lvl for lvl in ('Plaatselijk', 'Regionaal', 'Landelijk') if lvl in el.labels),
        'onbekend')
    classes.append(f'niv_{level}')
    class_attr = " ".join(classes)
    html.append(f'<div class="{class_attr}"><a href="#{entity_ids[n]}">{n}</a></div>')

html.append('</nav>')

# `html` is the list of output lines in this notebook, so the escape function
# is imported under its own name instead of importing the module.
from html import escape as html_escape

# Entities already written; each entity appears only in the first partition it fits.
seen_entities = { }
# (anchor id, label an entity must carry, heading); 'overig' accepts every
# entity that has not been written yet.
entity_partitions = [
#    ('plaats', 'Plaatselijk', 'Plaatselijke overheden'),
    ('reg-naam', 'Regimenten op naam', 'Regimenten op naam'),
    ('reg-ovg', 'Oorlog', 'Overige krijgszaken'),
    ('overig', 'overig', 'Overige organisaties')
]

for (section_id, lbl, heading) in entity_partitions:
    html.append(f'<h2 id="{section_id}">{heading}</h2>')
    for n in entity_names:
        el = entities[n]
        if n in seen_entities or not (lbl in el.labels or lbl == 'overig'):
            continue
        seen_entities[n] = True
        # Names, labels and reference texts come from the source data and may
        # contain '&', '<' or quotes, so they are escaped before being embedded.
        entity_heading = html_escape(n)
        label_list = html_escape(" - ".join(el.labels))
        html.append(f'''<h3 id="{entity_ids[n]}">{entity_heading}</h3>
        <p><a href="#mainnav">(terug naar boven)</a>
        <p>Labels: {label_list}''')
        html.append('<div class=scroll>')
        for (r, m) in refs_by_entity[n]:
            # The tooltip explains what the recognition was based on.
            prov = "\n\n".join([
                f'source: {p.source}\nkeyword: {p.criterium}\nmatch: {p.outcome}'
                for p in m.provenance ])
            tooltip = html_escape(prov, quote=True)
            html.append(f'<p><span title="{tooltip}">')
            html.append(html_escape(f'{r.year}: {r.reference.tag_text}') + '</span>')
        html.append('</div>')

# The page declares charset utf-8 and contains non-ASCII text, so the file must
# not be written in the platform's default encoding.
with open('demo-organisaties.html', 'w', encoding='utf-8') as f:
    f.write('\n'.join(html))

## onherkend

In [None]:
# Output lines of the page with unrecognised references. The first element is
# the static page head: style sheet, title and a short explanation.
html = ['''<!DOCTYPE html>
<html lang="nl">
<head>
<meta charset="utf-8">
<title>Commissies van de Staten-Generaal</title>
<style>
body { max-width: 100em; margin: auto; }
nav ul, .scroll { padding: 2px; margin: 1px; 
    resize: vertical; border: 1px solid gray; }
nav ul { display: inline-block; 
    height: 30em; min-width: 40em; overflow-y:scroll; 
    list-style-type: none; }
img { float:right; max-width: 40em; }
textarea { display: inline-block; overflow-y:scroll;
    min-height: 40em; }
.ib { display: inline-block; }
.scroll { overflow:scroll; white-space: nowrap; min-width:50%;
    min-height: 30em; max-height: 45em; max-width: 50%;
    display:inline-block; }
.scroll p { margin: 0pt; }
.threecol { column-count: 3; }
</style>
<body>
<h1>Onherkende organisatieverwijzingen</h1>
<p>In het bestand met de <em>herkende</em> organisaties zie je de verwijzingen zoals ze in de bron staan;
hier zie je daarentegen de genormaliseerde versies.
Zweef je muis eroverheen om de oorspronkelijke varianten te zien.
''']

# TODO: actual references in tooltip
# TODO: statistics at the top

# `html` is the list of output lines in this notebook, so the escape function
# is imported under its own name instead of importing the module.
from html import escape as html_escape

# Table of contents: one link per category. The list sits inside the <nav>
# element and the tags are closed in the reverse order of opening.
html.append('<nav id=mainnav><ul>')
for cls in category_list + ['overig']:
    html.append(f'<li><a href="#{cls}">Categorie: {cls}</a>')
html.append('</ul></nav>')

html.append('<h2 id=onherkend>Onherkende verwijzingen</h2>')
for cls in category_list + ['overig']:
    stats = get_statistics(cls)[0]
    html.append(f'''<h3 id={cls}>Categorie ‘{cls}’</h3>
        <p><a href="#mainnav">(terug naar boven)</a>
        <p>Herkend: {stats['VanTotaal']:.0%} van het totaal, {stats['VanUniek']:.0%} van de unieke verwijzingen
        <div class=scroll>''')
    for r, rl in select_items_in_category(cls):
        if len(rl.matches) > 0:
            continue
        # Tooltip: the distinct original spellings, sorted so that the output is
        # the same on every run (set iteration order is not stable).
        real = '\n'.join(sorted({ref.reference.tag_text for ref in rl.refs}))
        # Reference texts come from the source data and may contain '&', '<' or
        # quotes, so they are escaped before being embedded in the markup.
        tooltip = html_escape(real, quote=True)
        html.append(f'<p><span title="{tooltip}">{len(rl)}x {html_escape(r)}</span>')
    html.append('</div>')

# The page declares charset utf-8 and contains non-ASCII text, so the file must
# not be written in the platform's default encoding.
with open('onherkende-organisaties.html', 'w', encoding='utf-8') as f:
    f.write('\n'.join(html))

## json-export

In [None]:
# Export one JSON object per (match, reference) pair, skipping matches to the
# entity named 'ongeldig'. `EntityMatch.json` is used as an already-serialised
# JSON string: the objects are joined with commas into one JSON array.
export = [
    EntityMatch(r.reference, m.provenance, m.label).json
    for rl in reference_links.values()
    for m in rl.matches
    if m.label.name != 'ongeldig'
    for r in rl.refs
]
# JSON is exchanged as UTF-8; do not rely on the platform's default encoding.
with open('ORG-annotations.joined.json', 'w', encoding='utf-8') as f:
    f.write('[' + ','.join(export) + ']')

## labels