# Koppelen van Commissies

## Inlezen commissieverwijzingen

In [None]:
%load_ext autoreload
%autoreload 2
import csv, re, os, sys
from tqdm.notebook import tqdm
from datetime import datetime
from multiprocess import Pool
from typing import Optional, Union
from IPython.display import display, HTML

repo_name = 'republic-project'
# Repository root: the working directory up to and including the first
# occurrence of the repository name.
repo_dir = os.getcwd().partition(repo_name)[0] + repo_name
# Make the repository and its shared enrichment helpers importable (once).
if repo_dir not in sys.path:
    sys.path.extend([repo_dir, f'{repo_dir}/enrichment/common'])

from entity_match import *
import entity_linking_helpers as elh

# Presumably the number of worker processes for parallel work.
# NOTE(review): the parallel search further down passes elh.CPU_CORES, not
# this constant -- confirm which of the two is intended.
CPU_CORES = 6

In [None]:
SESSIONS_FILE = f'{repo_dir}/data/entities/annotations-unaggregated/session_dates.tsv'

# Maps session id -> year of the session as int, or None when the session
# has no date.
session_years = { }

with open(SESSIONS_FILE) as file:
    for row in csv.DictReader(file, delimiter='\t'):
        # The year is the first four characters of the session date.
        year_digits = row['session_date'][:4]
        session_years[row['session_id']] = int(year_digits) if year_digits != '' else None

In [None]:
DATA_FILE = 'annotations-layer_COM'

class Reference:
    """One committee reference, built from a row of the annotation TSV.

    Attributes set here:
      text            -- the 'improved_tag_text' column of the row
      reference       -- EntityReference built from all remaining columns
      year            -- year looked up in session_years (may be None)
      preside         -- filled in later; None until then
      prev_provenance -- filled in later; empty list until then
    """
    def __init__(self, row):
        # pop() reads the text and removes the column in one go, so that the
        # rest of the row can be passed on as keyword arguments.
        self.text = row.pop('improved_tag_text')
        self.reference = EntityReference(**row)
        # The session id is the resolution id without its '-resolution...' tail.
        session_id = re.sub(r'-resolution.*', '', self.reference.resolution_id)
        self.year = session_years[session_id]
        self.preside = None
        self.prev_provenance = []

    def __repr__(self):
        ref = self.reference
        return f'({ref.inv}/{ref.paragraph_id}) {self.text}'

In [None]:
# Earlier provenance records, grouped by the value in the second column
# (looked up by line number when the references are read).
prev_prov = { }
with open(f'{DATA_FILE}.provenance.tsv') as file:
    for fields in csv.reader(file, delimiter='\t'):
        prev_prov.setdefault(fields[1], []).append(
            EntityProvenance(fields[2], fields[3], fields[4], fields[5]))

In [None]:
references = [ ]
with open(f'{DATA_FILE}.tsv') as file:
    # Rows are numbered like lines in the file: the header is line 1, so the
    # first reference is line 2. That number is the key into prev_prov.
    for line_no, row in enumerate(csv.DictReader(file, delimiter='\t'), start=2):
        ref = Reference(row)
        ref.prev_provenance = prev_prov.get(str(line_no), [])
        references.append(ref)

print(f'Read {len(references)} references')

In [None]:
# Check how many references did and did not get a year
with_year = sum(1 for r in references if r.year is not None)
without_year = sum(1 for r in references if r.year is None)
print(f'{with_year} verwijzingen met en {without_year} zonder jaartal')

## Harmoniseren van de spelling

In [None]:
# Ordered list of (compiled pattern, replacement) pairs used by
# harmonise_spelling: first the shared rules from the TSV file, then the
# extra rules listed here.
spelling_regexes = [ ]

spelling_extra = [
#    (r' de[nrs]? ', ' '),
#    (r" (het|'s|een) ", ' '),
    (r'\b[sz]a[ae]+[ck]+en?\b', 'zaken'),
    (r' (meer|wel)ge(noemde|melde) ', ' '),
    (r' (ee?nige|enkele) ', ' '),
    (r'(^ +| +$)', '') ]

with open('../criteria/common/spelling-regexes.tsv') as file:
    # Each row has three columns; the third one is not used here.
    spelling_regexes.extend(
        (re.compile(pat), repl)
        for (pat, repl, _) in csv.reader(file, delimiter='\t'))

spelling_regexes.extend(
    (re.compile(pat), repl) for (pat, repl) in spelling_extra)

def harmonise_spelling(s: str) -> str:
    """Return *s* after applying every rule in spelling_regexes, in order."""
    harmonised = s
    for (pattern, replacement) in spelling_regexes:
        harmonised = pattern.sub(replacement, harmonised)
    return harmonised

# Harmonise the text of every reference in place.
for ref in tqdm(references):
    ref.text = harmonise_spelling(ref.text)

## Splitsen dubbele verwijzingen

In [None]:
import copy

# The connective that joins two committees named in a single reference,
# e.g. "... en van de ...".
re_dubbel = re.compile(r'\ben (aan|van|uit) (het|de)\b')

new_refs = []
for r in references:
    # Only references from 1780 onwards are split. References without a known
    # year (year is None) are skipped too: comparing None with an int would
    # raise TypeError.
    if r.year is None or r.year < 1780: continue
    while m := re_dubbel.search(r.text):
        (a, _, b) = r.text.partition(m[0])
        # Do not split when the left part is very short or ends in
        # "tot de <word>": the connective then belongs to a single name.
        if len(a) < 25 or re.search(r'tot de [A-Za-z]+[ ,]*$', a):
            break # e.g. "zak en van de"
        print(a)
        print(b)
        # Shallow copy: the new reference shares everything but its text.
        new = copy.copy(r)
        r.text, new.text = b, a # NB the order: r keeps the remainder and is searched again
        new_refs.append(new)

references += new_refs
print(f'{len(new_refs)} references split in two')

## Groeperen identieke verwijzingen

In [None]:
# Matches everything up to and including the first "en andere" plus any
# following spaces. It is removed from a reference text before grouping, so
# references that differ only in that leading part share one key
# (presumably the part naming who presides -- confirm).
PRESIDE_REGEX = re.compile(r'^.*?en andere *')

# Maps the stripped reference text to its group of references.
reference_links = {}

class ReferenceLink:
    """A group of references that share the same text.

    text    -- the shared text of the group
    refs    -- the Reference objects in the group
    matches -- the matches found for the text; empty list when not given
    """
    def __init__(self, text: str, refs: 'list[Reference]', matches=None):
        self.text = text
        self.refs = refs
        # A fresh list per instance: a mutable default ([]) in the signature
        # would be one single list shared by every ReferenceLink.
        self.matches = [] if matches is None else matches
    def __repr__(self):
        return self.text
    def __len__(self):
        # The size of a group is its number of references.
        return len(self.refs)

class ReferenceMatch:
    """A candidate committee for a reference text.

    Holds the matched entity label, the provenance records for the match and
    a score (0 unless given).
    """
    def __init__(self, label: EntityLabel, provenance: [EntityProvenance], score=0):
        self.label, self.provenance, self.score = label, provenance, score

    def __repr__(self):
        # A match is displayed as the name of its label.
        return self.label.name

# Group the references by their text with the PRESIDE_REGEX part removed ...
for r in references:
    reference_links.setdefault(PRESIDE_REGEX.sub('', r.text), []).append(r)

# ... and wrap every group in a ReferenceLink (keys stay the same).
for (text, refs) in reference_links.items():
    reference_links[text] = ReferenceLink(text, refs)

print(f'{len(references)-len(reference_links)}/{len(references)} ontdubbeld; {len(reference_links)} over)')

## Inlezen commissiekenwoorden

In [None]:
# Maps committee name -> Commissie
commissies = { }

class Commissie:
    """A committee from the committee list, with its search keywords.

    row is a sequence of columns: name, labels separated by '-', comment
    (may be empty), followed by any number of keywords.
    """
    def __init__(self, row):
        self.name = row[0]
        labels = set(re.split(' *- *', row[1]))
        self.entity = EntityLabel(self.name, 'COM', labels)
        if row[2] != '': self.entity.comment = row[2]
        # De-duplicate the harmonised keywords while keeping the order of the
        # row. A set would make the order -- and with it the primary phrase
        # in as_phrase_model() -- differ from run to run.
        self.keywords = list(dict.fromkeys(harmonise_spelling(s) for s in row[3:]))
    def __repr__(self):
        return self.name
    def as_phrase_model(self):
        """Return the dict passed to the searcher: first keyword as phrase, the rest as variants.

        Raises IndexError when the committee has no keywords.
        """
        return {
            'label': self.name,
            'phrase': self.keywords[0],
            'variants': self.keywords[1:] }

with open('../criteria/COM/commissielijst.tsv') as file:
    # Skip the header line. This used to read `file:next(file)`, which is an
    # annotation statement rather than a call: it only skipped the header as
    # long as annotations are evaluated eagerly.
    next(file, None)
    for row in csv.reader(file, delimiter='\t'):
        # Rows with fewer than two columns (e.g. blank lines) are ignored.
        if len(row) > 1:
            c = Commissie(row)
            commissies[c.name] = c

print(f'{len(commissies)} commissies met {sum(len(c.keywords) for c in commissies.values())} zoektermen')

## Koppelen commissies

In [None]:
from fuzzy_search import PhraseModel, FuzzyPhraseSearcher

fuzzy_config = {
    'ngram_size': 3,
    'skip_size': 1,
    'include_variants': True,
    'ignorecase': True,
    'char_match_threshold': 0.77,
    'ngram_threshold': 0.77,
    'levenshtein_threshold': 0.77,
}

# The main searcher knows every committee that has at least one keyword.
main_phrases = [c.as_phrase_model() for c in commissies.values() if c.keywords]
main_searcher = elh.make_searcher(fuzzy_config, main_phrases)

# A separate model for the most frequent committees, with higher
# thresholds, as an optimisation.

# Same settings as fuzzy_config, only the three thresholds are raised.
strict_config = {
    **fuzzy_config,
    'char_match_threshold': 0.85,
    'ngram_threshold': 0.85,
    'levenshtein_threshold': 0.85,
}

strict_subset = [
    'Zaken van de Zee',
    'Militaire zaken',
    'Financiele zaken',
    'Buitenlandse zaken',
    'Zaken van Vlaanderen',
    'Zaken van de Landen van Overmaze' ]

strict_phrases = [c.as_phrase_model() for c in commissies.values() if c.name in strict_subset]
strict_searcher = elh.make_searcher(strict_config, strict_phrases)

In [None]:
def find_committees(text):
    """Return (text, matches) for one reference text.

    The strict searcher is tried first; the main searcher is only consulted
    when the strict one finds nothing.
    """
    found = strict_searcher.find_matches(text, debug=0)
    return (text, found or main_searcher.find_matches(text, debug=0))

def save_committees(res):
    """Store search results on the matching entries of reference_links.

    res is an iterable of (text, matches) pairs as returned by
    find_committees. Only the matches selected by elh.choose_best_matches
    are kept, each converted to a ReferenceMatch.
    """
    for (text, matches) in res:
        best = []
        for m in elh.choose_best_matches(matches):
            label = commissies[m.label].entity
            provenance = [EntityProvenance.from_match(text, m)]
            best.append(ReferenceMatch(label, provenance, elh.score_match(m)))
        reference_links[text].matches = best

# Presumably applies find_committees to every key of reference_links in
# worker processes and hands the results to save_committees -- see elh.
# NOTE(review): uses elh.CPU_CORES, not the CPU_CORES constant defined at the
# top of this notebook -- confirm which is intended.
elh.search_in_parallel(reference_links, find_committees, save_committees, cores=elh.CPU_CORES)

In [None]:
# Inspection: every reference group with its matches (shown as cell output).
[(r, r.matches) for r in reference_links.values()]

## Opschonen en speciale gevallen

In [None]:
# Special cases and manual corrections of look-alike search terms

# A matched string that ends in a dangling "van de", "van het",
# "raakende de Landen", "de zaken van de", ... stops before the actual
# committee name and is rejected. The space sits in front of the (de|het)
# group: the earlier form "( de|het)" only accepted "het" when glued to the
# preceding word, so "van het" was never recognised.
re_invalid = re.compile(r'(van|raakende) (de|het)( [Ll]anden)?$|de zaken( van( de)?)?$')

# Entity labels of the three specific "Octrooien" committees; the clean-up
# loop below assigns one of them when the reference text calls for it.
lbl_inventien = commissies['Het stuk van de Octrooien op nieuwe Inventien'].entity
lbl_bedijking = commissies['Het stuk van de Octrooien van Bedijking'].entity
lbl_vlaamse = commissies['Examinatie van de Vlaamse Octrooien'].entity

# Clean up the matches of every reference group: drop matches that end too
# early, pick the right "Octrooien" committee, and -- for groups that still
# have several matches -- remove matches that the text or the other matches
# rule out. The steps depend on each other's results, so their order matters.
for r in reference_links.values():

    # on the right-hand side the match can be too short:

    r.matches = [
        m for m in r.matches
        if not re_invalid.search(m.provenance[0].outcome)
    ]

    # All substring tests on the text below are done on the lower-cased text.
    text = r.text.lower()

    # Helpers working on the matches of the current group r. They are
    # redefined on every iteration and look up r when they are called.
    # in_coms: does any match have s in its label name?
    def in_coms(s):
        return any(s in m.label.name for m in r.matches)
    # drop_coms: remove all matches whose label name contains s.
    def drop_coms(s):
        r.matches = [m for m in r.matches if s not in m.label.name ]
    # drop_exact_com: remove all matches whose label name equals s.
    def drop_exact_com(s):
        r.matches = [m for m in r.matches if s != m.label.name ]

    # Split up the "Octrooien" committees: keep only the first Octrooien
    # match (moved to the front) plus all other matches, then relabel that
    # first match according to words found in the text.

    if in_coms('Octrooien'):
        r.matches = [ [ m for m in r.matches if 'Octrooien' in m.label.name ][0] ] \
                  + [ m for m in r.matches if 'Octrooien' not in m.label.name ]
        if 'invent' in text or 'nieuw' in text:
            r.matches[0].label = lbl_inventien
        elif 'bedyk' in text or 'dycag' in text:
            r.matches[0].label = lbl_bedijking
        elif 'vlaan' in text:
            r.matches[0].label = lbl_vlaamse

    # Below, only consider references with multiple matches
    
    if len(r.matches) <= 1:
        continue
    drop_coms('Ongeldig') # 'Ongeldig' (invalid) is probably wrong when there are several matches
    if len(r.matches) <= 1:
        continue

    # Incorrect assignments to the Meierij ('s-Hertogenbosch)

    if in_coms('Hertogenbosch') and ('goederen' in text or 'alimentatie' in text or 'geestel' in text):
        drop_coms('Hertogenbosch')
    
    # Separate East from West: a text that mentions only one of the two
    # loses the matches for the other; a text that mentions both keeps only
    # the combined 'Oost- en West-' matches.
    
    if 'oost' in text and 'west' not in text:
        drop_coms('West')
    elif 'west' in text and 'oost' not in text:
        drop_coms('Oost')
    elif 'oost' in text and 'west' in text:
        r.matches = [ m for m in r.matches if 'Oost- en West-' in m.label.name ]

    # Miscellaneous patents: a specific Octrooien committee wins over the
    # generic one
    if in_coms('Bedijking') or in_coms('Inventien'):
        drop_exact_com('Het stuk van de Octrooien')

    # Military affairs of the Barrier
    # NOTE(review): the test uses the spelling 'Barrière' with an accent --
    # confirm that the committee list spells the name the same way.

    if in_coms('Barrière'):
        drop_coms('Militaire')
    
    # "Zaken van A en B": when a combined committee matched, drop the
    # separate matches "Zaken van A" and "Zaken van B".
    # r.matches is rebound inside this loop; the for statement keeps
    # iterating over the list it started with.

    for m in r.matches:
        extra = []
        if mv := re.search(r'Zaken van ([A-Z][a-z]+) en ([A-Za-z ]+)', m.label.name):
            extra = [mv[1], mv[2]]
        elif mv := re.search(r'Zaken van ([A-Z][a-z]+), ([A-Z][a-z]+) en ([A-Za-z ]+)', m.label.name):
            extra = [mv[1], mv[2], mv[3]]
        # 'Stad en Lande' is skipped: presumably a single name, not a pair.
        if len(extra) < 1 or extra == ['Stad', 'Lande']:
            continue
        for e in extra:
            c = f'Zaken van {e}'
            # Only a warning: the filter below runs either way.
            if c not in commissies:
                print(f'Waarschuwing: {c} bestaan niet.')
            r.matches = [m for m in r.matches if m.label.name != c ]

    # NOTE(review): this check is the last statement of the loop body, so it
    # currently has no effect.
    if len(r.matches) <= 1:
        continue


In [None]:
# Prune unrecognised references

# Phrases that mark a reference as not naming a committee at all.
no_committee = [
    'Gedeputeerden in',
    'Gedeputeerden tot',
    'Gevolmachtichden in',
    'Gevolmachtichden tot',
    'Gevolmachtichden te Velde',
    'Gedeputeerden te Velde',
    'Gedeputeerden jonghst in',
]

lbl_buza = commissies['Buitenlandse zaken'].entity
lbl_mili = commissies['Militaire zaken'].entity
lbl_geen = commissies['Geen commissie'].entity

# (text fragment, label to assign, criterium/outcome, conclusion), tried in
# this order; the first fragment found in the text decides.
fragment_rules = [
    ('uitenlan', lbl_buza, 'Buitenlandsche', 'Assign entity: Buitenlandse zaken'),
    ('ilitair', lbl_mili, 'Militaire', 'Assign entity: Militaire zaken'),
]

res = 0
for r in reference_links.values():
    # Only unmatched references that do not mention 'zaken' are reclassified.
    if r.matches or 'zaken' in r.text or 'Zaken' in r.text:
        continue
    for (fragment, label, criterium, conclusion) in fragment_rules:
        if fragment in r.text:
            r.matches = [ ReferenceMatch(label, [ EntityProvenance(r.text, criterium, criterium, conclusion) ], 1) ]
            res += 1
            break
    else:
        # No fragment rule applied: check the "no committee" phrases. Every
        # phrase found is counted; the last one found ends up in the match.
        for c in no_committee:
            if c in r.text:
                res += 1
                r.matches = [ ReferenceMatch(lbl_geen, [ EntityProvenance(r.text, c, c, 'rejected') ], 1) ]
print(f'{res} unrecognised references reclassified')


In [None]:
# Inspection: text and matches of every reference group whose text
# contains 'Buiten' (shown as cell output).
[ (r.text, r.matches)
for r in reference_links.values()
if 'Buiten' in r.text]

In [None]:
# Now sort the matches by score, highest first -- later on the first match
# is chosen as the best one.

def _match_score(match):
    return match.score

for link in reference_links.values():
    link.matches = sorted(link.matches, key=_match_score, reverse=True)

In [None]:
# Prune westindische zaken on year

wi_zaken = commissies['West-Indische zaken'].entity

wi_prov = EntityProvenance('(resolution year)', 'year < 1660', 'true', f'Assign entity: {wi_zaken.name}')
# New groups are collected here first: reference_links cannot grow while it
# is being iterated.
new_reflinks = {}

for r in reference_links.values():
    if not r.matches or r.matches[0].label.name != 'Zaken van de West-Indische Kolonien':
        continue
    # References before 1660 move to a new 'West-Indische zaken' group, those
    # after 1780 stay in this group.
    # NOTE(review): references dated 1660 up to and including 1780 end up in
    # neither group, and a reference without a year raises TypeError here --
    # confirm that both are intended.
    old, new = [], []
    for ref in r.refs:
        if ref.year < 1660:
            old.append(ref)
        elif ref.year > 1780:
            new.append(ref)
    r.refs = new
    relabelled = ReferenceMatch(wi_zaken, r.matches[0].provenance + [wi_prov])
    new_reflinks['*'+r.text] = ReferenceLink(r.text, old, [ relabelled ])

reference_links.update(new_reflinks)


In [None]:
# Prune Fijnaart on year: groups whose first match is a Fijnaart committee
# only keep their references from before 1670.
# NOTE(review): a reference without a year raises TypeError here.

for r in reference_links.values():
    if r.matches and 'Fijnaart' in r.matches[0].label.name:
        r.refs = [ ref for ref in r.refs if ref.year < 1670 ]

In [None]:
# Drop the reference groups that have no references left.
reference_links = { k : v for (k,v) in reference_links.items() if len(v.refs) > 0 }


## Voorzitters

In [None]:
# Maps the cleaned-up name of a president to the number of references
# naming him.
voorzitters = {}

# Clean-up rules for the extracted name, as (pattern, replacement) pairs for
# re.sub, applied in this order.
vz_fixes = [
    (r'^[Hh][a-z][a-z] ', ' '),
    # NOTE(review): the next rule is listed twice; the second one is kept
    # because it may be meant to strip a repeated "Heeren" -- confirm.
    (r'^[Hh]ee+ren *', ''),
    (r'^[Hh]ee+ren *', ''),
    (r'^[ 0-9]+', ''),
    (r'\b[A-Z]?aee [A-Z]elt\b', 'Raesfelt'),
    (r'\boye\b', 'Oye'),
    # Reduce "oee"/"eee" to "oe"/"ee". The replacement must be a raw string
    # that refers to group 2 (the first vowel): the earlier non-raw '\1e' was
    # the control character \x01 followed by 'e'.
    (r'(([oe])ee)', r'\2e'),
    (r'H...uygens', 'Huygens'),
    (r'He[aeiou]u', 'Heu'),
    (r'\b.?uygens\b', 'Huygens'),
    # Capitalise lower-case words of four or more letters.
    (r'\b[a-z][a-z]{3,}', lambda m: m[0][0].upper() + m[0][1:]),
    (r'-', ' ')]

# Extract the president from every reference text. Both patterns skip an
# optional title (Heer, d'Heeren, Burger, ...) and capture the name in group
# 3: the first takes what precedes "en (d')ander", the second -- only tried
# when the first fails -- what precedes "haar"/"haer".
for r in references:
    r.preside = None
    has_pres = (
        re.search(r"(([Dd]'[HhB]ee+re?n?|[HhB]ee+[rn]e?n?|[Bb]urgers?)[ -])? *(.*) en (d')?ander", r.text)
        or re.search(r"(([Dd]'[HhB]ee+re?n?|[HhB]ee+[rn]e?n?|[Bb]urgers?)[ -])? *(.*) [Hh]a[ae]r\b", r.text))
    # No usable name: nothing found, an empty name, or a name containing 'Committ'.
    if not has_pres or has_pres[3] == '' or 'Committ' in has_pres[3]:
        continue
    pres = has_pres[3]
    for (pat, rpl) in vz_fixes:
        pres = re.sub(pat, rpl, pres)
    r.preside = pres
    voorzitters[pres] = voorzitters.get(pres, 0) + 1

without_president = len([r for r in references if not r.preside])
print(f'{without_president}/{len(references)} is zonder voorzitter;'
      +f' {len(voorzitters)} verschillende voorzitters')

# Presidents with their counts, most frequent first (shown as cell output).
sorted(voorzitters.items(), key=lambda item: item[1], reverse=True)
#[r.ref for r in references if not r.preside ]

## Analyse

In [None]:
# Analysis of search terms: how often was each committee chosen, and how
# often did each keyword contribute to the chosen match?

matched_entities = { 'West-Indische zaken':0 }
matched_keywords = { }

for c in commissies.values():
    matched_entities[c.name] = 0
    for kw in c.keywords:
        # A keyword that was already registered is listed by another
        # committee (or twice by this one).
        if kw in matched_keywords:
            print(f'<b>! dubbele zoekterm:</b> {kw}')
        matched_keywords[kw] = 0

for r in reference_links.values():
    if r.matches:
        best = r.matches[0] # the best match, the one that will be chosen
        matched_entities[best.label.name] += 1
        for p in best.provenance:
            # Provenance criteria that are not keywords are not counted.
            if p.criterium in matched_keywords:
                matched_keywords[p.criterium] += 1

for c in commissies.values():
    print(f'{c.name} ({matched_entities[c.name]}):')
    for kw in c.keywords:
        print(f'  {matched_keywords[kw]:4d} {kw}')

In [None]:
# Analysis of duplicate results: every reference group with more than one
# match, with each match shown together with its score (cell output).
[
    (r.text, [f'{m} ({m.score})' for m in r.matches])
    for r in reference_links.values()
    if len(r.matches) > 1
]

In [None]:
# Inspection of unrecognised references that do mention "zaken" or "stuk" in
# some spelling: for each group the text after that word, the number of
# references and the year of the first reference, ordered by that year.
# The commented-out parts are alternative orderings and filters.
#sorted(
[
    (re.sub(r'.*([sz]a[ae]*[ck]+|stu[ck]+)[^ ]* *', '', r.text), len(r.refs), r.refs[0].year) #, [f'{m} ({m.score})' for m in r.matches])
    for r in sorted(reference_links.values(), key=lambda x: x.refs[0].year)
    if (re.search(r'[sz]a[ae]*[ck]+', r.text)
    or re.search(r'stu[ck]+', r.text))
    and len(r.matches) == 0 # select unrecognised references
    #and re.search(r'rh?[ijy]+n', r.text)
]#, reverse=True, key=lambda x: x[1])

## Presentatie

In [None]:
# Committee names in presentation order: the two special entries of the
# committee list move to the end, followed by two categories that exist
# only in the presentation.
namen = list(commissies)
for special in ('Geen commissie', 'Ongeldig'):
    namen.remove(special)
    namen.append(special)
namen += ['Onvolledig', 'Onherkend']

START_YEAR = 1590
TRANCHE_WIDTH = 10

# Per committee name:
#   refs_by_com  -- list of (reference, provenance) pairs
#   pres_by_com  -- president -> (count, first year, last year)
#   presentatie  -- tranche (middle year of a decade) -> number of references
refs_by_com = {n: [] for n in namen}
pres_by_com = {n: {} for n in namen}
presentatie = {n: {} for n in namen}

for rgroup in reference_links.values():
    # The committee of a group is that of its first match; groups without a
    # match are either 'Onvolledig' (the text breaks off) or 'Onherkend'.
    if rgroup.matches:
        best = rgroup.matches[0]
        c, p = best.label.name, best.provenance[0]
    elif re.search(r'[Gg]edeputeerden( [Jj]ongh?st)?$|Hoog Mogende$|besoignes$|ter vergaderinge$|tot de zaken$', rgroup.text):
        c = 'Onvolledig'
        p = EntityProvenance(rgroup.text, 'none', 'none', 'possibly incomplete')
    else:
        c = 'Onherkend'
        p = EntityProvenance(rgroup.text, 'none', 'none', 'not recognised')
    for r in rgroup.refs:
        refs_by_com[c].append((r, p))
        if r.preside:
            seen = pres_by_com[c].get(r.preside)
            if seen is None:
                pres_by_com[c][r.preside] = (1, r.year, r.year)
            else:
                (count, first, last) = seen
                pres_by_com[c][r.preside] = (count+1, min(first, r.year), max(last, r.year))
        # NOTE(review): a reference without a year raises TypeError here.
        tranche = 5 + START_YEAR + ((r.year - START_YEAR)//TRANCHE_WIDTH)*TRANCHE_WIDTH
        presentatie[c][tranche] = presentatie[c].get(tranche, 0) + 1

# Order the references of every committee by year.
for (n, pairs) in refs_by_com.items():
    refs_by_com[n] = sorted(pairs, key=lambda rp: int(rp[0].year))

In [None]:
print('Meestvoorkomende commissies:\n')
# Committees ordered by their number of references, largest first.
by_size = sorted(refs_by_com.items(), key=lambda item: len(item[1]), reverse=True)
for (c, pairs) in by_size:
    #if len(refs_by_com[c]) < 100: break
    print(f'{len(pairs):4d} {c}')

In [None]:
import matplotlib.pyplot as plt
#import subprocess

#subprocess.Popen('mkdir -p plots/', shell=True)

# savefig() does not create missing directories, so make sure that the
# output directory exists before the first plot is written.
os.makedirs('plots', exist_ok=True)

# One bar chart per committee: number of references per tranche.
for n in tqdm(namen):
    data = presentatie[n]
    if not data: continue
    plt.bar(data.keys(), data.values(), width=8)
    plt.xticks(range(START_YEAR, 1801, 30))
    #plt.gca().invert_xaxis()
    plt.savefig(f'plots/{n}.png', dpi=300, format='png')
    # Start the next plot with a clean figure.
    plt.clf(); plt.cla(); plt.close()

# plot of all data: the tranche counts summed over all committees
totalen = {}
for n in presentatie:
    for d in presentatie[n]:
        if d not in totalen:
            totalen[d] = 0
        totalen[d] += presentatie[n][d]
plt.bar(totalen.keys(), totalen.values(), width=8)
plt.xticks(range(START_YEAR, 1801, 30))
#plt.gca().invert_xaxis()
plt.savefig('plots/Alle-resoluties.png', dpi=600, format='png')
plt.clf(); plt.cla(); plt.close()

#subprocess.Popen('cd plots/; mogrify -monochrome *; optipng *.png', shell=True)

In [None]:
# Number of different real committees referred to per year, counting every
# match of a reference group (not only the first one).
geen_echte_commissie = [ 'Ongeldig', 'Geen commissie', 'Onherkend', 'Onvolledig' ]
committees_by_year = { }
for rgroup in reference_links.values():
    if not rgroup.matches: continue
    real_names = {
        m.label.name for m in rgroup.matches
        if m.label.name not in geen_echte_commissie }
    for r in rgroup.refs:
        # A year gets an entry even when none of the matches is a real committee.
        committees_by_year.setdefault(r.year, set()).update(real_names)
# Sorted list of (year, number of committees) pairs.
different_by_year = sorted((y, len(names)) for (y, names) in committees_by_year.items())
plt.plot(*zip(*different_by_year))
plt.savefig('verschillende_commissies_per_jaar.png', dpi=600, format='png')
plt.show()
different_by_year

In [None]:
# The page is collected as a list of fragments; further fragments are
# appended to this list. The first fragment is the document head with the
# style sheet and the version notes, the second one opens the table of
# contents.
html = ['''<!DOCTYPE html>
<html lang="nl">
<head>
<meta charset="utf-8">
<title>Commissies van de Staten-Generaal</title>
<style>
body { max-width: 100em; margin: auto; }
nav ul, .scroll { padding: 2px; margin: 1px; 
    resize: vertical; border: 1px solid gray; }
nav ul { display: inline-block; 
    height: 30em; min-width: 40em; overflow-y:scroll; 
    list-style-type: none; }
img { float:right; max-width: 40em; }
textarea { display: inline-block; overflow-y:scroll;
    min-width:60em; max-width: 80em; min-height: 40em; }
.ib { display: inline-block; }
.scroll { overflow:scroll; white-space: nowrap;
    min-height: 30em; max-height: 45em; max-width: 50%;
    display:inline-block; }
.scroll p { margin: 0pt; }
</style>
<body>
<h1>Commissies van de Staten-Generaal</h1>
<p> Versie 1 gepresenteerd op 3 oktober 2023
<p> Versie 2 gemaakt op 9 november 2023
<p> Versie 3 gemaakt op 31 oktober 2024
''',
'''<nav><div class=ib><h3>Inhoud</h3>
<ul>''']

# Table of contents: one link per committee that has any references.
# NOTE(review): the committee name is used unescaped as link target.
for n in namen:
    if not presentatie[n]: continue
    html.append(f'<li><a href="#{n}">{n}</a>')
# The overall plot is referenced by relative path, not embedded.
html.append('</ul></div><img src="plots/Alle-resoluties.png"></nav><hr>')

import base64
def make_data_url(filename):
    """Return the contents of the file *filename* as a base64 ``data:`` URL.

    The MIME type is derived from the file name, so a ``.png`` file is
    labelled ``image/png``. When the type cannot be derived, ``image/jpeg``
    is used, which was the fixed label before.

    Raises OSError when the file cannot be read.
    """
    import mimetypes  # only needed in this function

    mime_type = mimetypes.guess_type(filename)[0] or 'image/jpeg'
    with open(filename, 'rb') as f:
        payload = base64.b64encode(f.read()).decode('ascii')
    return f'data:{mime_type};base64,{payload}'

# Imported under another name: `html` itself is the list of page fragments.
from html import escape as html_escape

# One section per committee that has any references: heading, embedded plot
# and a scrollable list of the references with their provenance as tooltip.
for n in namen:
    if not presentatie[n]: continue
    # Names and texts come from the data and may contain quotes, '<' or '&':
    # escape everything that is interpolated into the page.
    anchor = html_escape(n)
    plot_url = make_data_url(f'plots/{n}.png')
    html.append(f'<h2 id="{anchor}">{anchor} ({len(refs_by_com[n])})</h2>')
    html.append(f'<p><img src="{plot_url}">')
    html.append('<div class=scroll>')
    for (r, p) in refs_by_com[n]:
        title = html_escape(f'source: {p.source}\nkeyword: {p.criterium}\nmatch: {p.outcome}')
        html.append(f'<p><span title="{title}">')
        html.append(f'{r.year}: {html_escape(str(r.reference.tag_text))}</span>')
    html.append('</div>')
#    html.append('<p><textarea readonly wrap=off style="min-height:8em;">')
#    for vz in sorted(pres_by_com[n], key = lambda vz: pres_by_com[n][vz], reverse=True):
#        p = pres_by_com[n][vz]
#        html.append(f'{p[0]: 4d} ({p[1]}-{p[2]}): {vz}')
#    html.append('</textarea>')

# The page declares charset utf-8, so write it as UTF-8 whatever the
# platform default is.
with open('demo-commissies.html', 'w', encoding='utf-8') as f:
    f.write('\n'.join(html))

## Splitsen samengestelde commissies

In [None]:
# TODO: split the combined East and West India committee ("oost- en w-i cie")
# For now every reference group keeps only its first match.
for r in reference_links.values():
    r.matches = r.matches[:1]

## Export

In [None]:
# Labels that only exist for bookkeeping and are not exported.
not_exported = ['Ongeldig', 'Onherkend', 'Onvolledig', 'Geen commissie']

# One JSON fragment per (match, reference) pair; the provenance of earlier
# steps comes first, followed by the provenance of the match.
export = []
for rl in reference_links.values():
    for m in rl.matches:
        for r in rl.refs:
            if m.label.name not in not_exported:
                export.append(
                    EntityMatch(r.reference, r.prev_provenance + m.provenance, m.label).json)

# The fragments are joined into one JSON array.
with open('COM-annotations.joined.json', 'w') as f:
    f.write('[' + ','.join(export) + ']')