# Fuzzy Search and Petitioners' Roles and Professions

In [1]:
# This is needed to add the repo dir to the path so jupyter
# can load the republic modules directly from the notebooks
import os
import sys
repo_name = 'republic-project'


def find_repo_dir(cwd, repo_name):
    """Return the repository root directory for the working directory `cwd`.

    The root is everything in `cwd` up to and including the first occurrence
    of `repo_name`. The whole working directory is searched, so this also
    works when the notebook is started from the repository root itself
    (searching only the parent directory would miss the name in that case
    and glue it onto the parent path without a separator).

    If `repo_name` does not occur in `cwd`, the parent directory of `cwd`
    is returned as a best guess.
    NOTE(review): the fallback assumes notebooks live one level below the
    repository root -- confirm for checkouts under a different name.
    """
    head, marker, _ = cwd.partition(repo_name)
    if marker:
        return head + repo_name
    return os.path.dirname(cwd)


repo_dir = find_repo_dir(os.getcwd(), repo_name)
print(repo_dir)
# only add the repo dir once, so re-running this cell does not grow sys.path
if repo_dir not in sys.path:
    sys.path.append(repo_dir)



/Users/marijnkoolen/Code/Huygens/republic-project


In [2]:
# Set up the Republic Elasticsearch API object that is used for retrieval further on
from republic.elastic.republic_elasticsearch import initialize_es

# keyword arguments handed to initialize_es
es_settings = {'host_type': 'external', 'timeout': 60}
rep_es = initialize_es(**es_settings)


### Dictionary of resolution-specific terms

During the project we compile lists of terms that are relevant within the corpus of resolutions. The lists of terms are categorised, with lists for persons, organisations, objects, locations, topics, etc.

These terms can be used in fuzzy search to identify and classify different aspects of resolutions. For instance, the opening sentence of a resolution describes a proposition submitted to the States General. This proposition has a source type (often a specific type of document like a missive or a request) and details about who submitted it, from where and on what date. Categorising these aspects allows us to add metadata to the individual resolutions with which we can improve information access.

In [3]:
# Reader for the project's categorised lists of resolution-specific terms
from republic.model.resolution_phrase_model import read_republic_term_dict

# term_dict is keyed by term category; categories are used for lookup further on
term_dict = read_republic_term_dict()

# What different categories of terms are available?
term_dict.keys()

dict_keys(['action', 'object', 'unit', 'meeting', 'person_reference', 'organisation', 'geographical_name', 'other_name', 'person_name', 'adjective', 'location', 'topic', 'date', 'misc', 'function'])

Most categories have sub-categories. E.g. the `person_reference` category distinguishes between professions, family relationships, legal status and titles.

In [4]:
# Which sub-categories does the person_reference category contain?
term_dict['person_reference'].keys()

dict_keys(['person_legal_status', 'person_family', 'person_citizen', 'person_title', 'person_other', 'person_profession', 'person_meeting_role', 'person_nationality'])

In [5]:
# The number of profession terms in the dictionary, i.e. the number of
# entries stored under person_reference -> person_profession
len(term_dict['person_reference']['person_profession'])

1039

These person reference terms can be added as a lexicon to a fuzzy searcher, so you can search for occurrences of these terms.

In [6]:
from fuzzy_search.fuzzy_context_searcher import FuzzyContextSearcher
from fuzzy_search.fuzzy_phrase_model import PhraseModel

# Settings that are handed to both the phrase model and the fuzzy searcher
config = {
    'levenshtein_threshold': 0.8,
    'ngram_threshold': 0.7,
    'ngram_size': 3,
    'skip_size': 1,
    'include_variants': True
}

# Turn every person reference term into a fuzzy search phrase that carries
# both the top-level category and its own sub-category as labels
phrases = [
    {"phrase": term, "label": ["person_reference", sub_category]}
    for sub_category, sub_category_terms in term_dict['person_reference'].items()
    for term in sub_category_terms
]
print("number of person reference phrases:", len(phrases))

number of person reference phrases: 1190


In [7]:
# Create a fuzzy search phrase model from the list of phrases,
# using the same config that is passed to the searcher
phrase_model = PhraseModel(phrases, config=config)
# configure a searcher
person_ref_searcher = FuzzyContextSearcher(config)
# Add the phrase model as lexicon to the searcher, so the searcher can be
# used to look for the person reference phrases in a text
person_ref_searcher.index_phrase_model(phrase_model)



In [8]:
# Field/value pairs that a document has to match to be selected:
# resolutions of the year 1672 that are based on propositions of type request
required_fields = {
    "metadata.type": "resolution",
    "metadata.session_year": 1672,
    "metadata.proposition_type": "requeste",
}

# Combine the pairs into a boolean query with one match clause per field
query = {
    "bool": {
        "must": [{"match": {field: value}} for field, value in required_fields.items()]
    }
}

resolutions = rep_es.retrieve_resolutions_by_query(query, size=1000)



Each request proposition starts with a fixed formula, followed by details of the proposer, location and date, and then a _proposition verb_ that introduces the content of the proposition. To identify the proposer's role or profession, we use the fuzzy searcher and the `person_reference` lexicon on the text between the opening formula and the _proposition verb_.

In [14]:
from collections import defaultdict
from collections import Counter

# How often each lexicon phrase is matched
person_ref_freq = Counter()
# How often each label (category or sub-category) is attached to a match
person_ref_type_freq = Counter()
# Per label, how often each lexicon phrase is matched
person_role = defaultdict(Counter)

# One observation per (resolution, matched phrase, label) combination,
# stored column-wise so it can be turned into a DataFrame
obs = {
    "res": [],
    "term": [],
    "label": []
}


def get_proposition_text(res):
    """Return the part of the first paragraph that describes the proposer.

    The text runs from the end of the opening formula up to the end of the
    first proposition verb that ends after that formula, or up to the end of
    the paragraph if there is no such verb.

    Returns None when the resolution has no paragraphs or when its first
    paragraph has no match labelled 'proposition_opening', so the caller can
    skip the resolution instead of failing with an IndexError.
    """
    if not res.paragraphs:
        return None
    # the opening formula is always in the first paragraph
    first_para = res.paragraphs[0]

    # The resolution evidence consists of fuzzy search matches based
    # on the resolution opening phrase lexicon.
    # Select only the matches in the first paragraph
    first_para_matches = [match for match in res.evidence if match.text_id == first_para.id]

    # From there, pick the first match phrase that is an opening formula.
    # The end of the formula is the start of the proposition text.
    opening_match = next(
        (match for match in first_para_matches if match.has_label('proposition_opening')),
        None
    )
    if opening_match is None:
        return None
    proposition_start = opening_match.end

    # Then, pick the first proposition verb as the end of the proposition text,
    # or the end of the paragraph if there is no proposition verb.
    # A verb that ends before the opening formula cannot close the proposition
    # (it would give an empty slice), so it is ignored.
    verb_match = next(
        (match for match in first_para_matches
         if match.has_label('proposition_verb') and match.end > proposition_start),
        None
    )
    proposition_end = verb_match.end if verb_match is not None else len(first_para.text)

    # Select the text of the first paragraph between the opening formula and the proposition verb
    return first_para.text[proposition_start:proposition_end]


# ids of resolutions for which no proposition text could be determined
skipped_resolutions = []

for res in resolutions:
    proposition_text = get_proposition_text(res)
    if proposition_text is None:
        skipped_resolutions.append(res.id)
        continue
    print(proposition_text, '\n')

    # look for person reference terms
    matches = person_ref_searcher.find_matches(proposition_text)
    for match in matches:
        print(f"Phrase: {match.phrase.phrase_string: <30}\tmatch string: {match.string}")
        print("\t", match.label)
        person_ref_freq.update([match.phrase.phrase_string])
        # the label can be a single string or a list of strings
        refs = match.label if isinstance(match.label, list) else [match.label]
        for ref in refs:
            person_role[ref].update([match.phrase.phrase_string])
            obs["res"].append(res.id)
            obs["term"].append(match.phrase.phrase_string)
            obs["label"].append(ref)
        person_ref_type_freq.update(refs)

# Make skipped resolutions visible instead of silently dropping them
if skipped_resolutions:
    print(f"skipped {len(skipped_resolutions)} resolutions without an opening formula "
          f"in the first paragraph: {skipped_resolutions}")
            

Johan Coorte, ende Gijsbert Zuijlen van Nieuvelt, beijde Schepenen ‛s Lants vanden Vrijen, versoeckende 

Phrase: Scheepenen                    	match string: Schepenen
	 ['person_reference', 'person_profession']
Jan van Eede geweest hebbende 

Johan d'Arbaij, Major van een regiment te voet, ten dienst deser Landen, guarnisoen houdende 

Phrase: Major                         	match string: Major
	 ['person_reference', 'person_profession']
Balthasar van geersbergen, Secretaris van Derssel, Wessen & Beersen alle inde Meijerije van s' Hertogenbos, houdende 

Phrase: Sekretaris                    	match string: Secretaris
	 ['person_reference', 'person_profession']
Phrase: Secretaris                    	match string: Secretaris
	 ['person_reference', 'person_profession']
N. Cauberecht, Licentiaet inde rechten tot Maestricht, houdende 

Phrase: Licentiaat                    	match string: Licentiaet
	 ['person_reference', 'person_profession']
Boulliu, Burgemeesteren ende Schepenen der Stede

Phrase: Burgemeester                  	match string: Burgemeesteren
	 ['person_reference', 'person_profession']
Phrase: Burgemeesteren                	match string: Burgemeesteren
	 ['person_reference', 'person_profession']
Phrase: Scheepenen                    	match string: Schepenen
	 ['person_reference', 'person_profession']
Johan Schoock, houdende 

Johannes Amilius, Commis Generael vande Convoijen ende Licenten, houdende 

Henrick Graham, Lieutenant Colonnel ten dienste deser Landen, houdende 

Phrase: Lieutenant                    	match string: Lieutenant
	 ['person_reference', 'person_profession']
Otto Grave van Limburch, ende Bronchorst, Heer van Stierum, houdende 

Phrase: Grave                         	match string: Grave
	 ['person_reference', 'person_title']
Phrase: Heer                          	match string: Heer
	 ['person_reference', 'person_title']
francisco van Lisidro, Coopman tot Amsterdam, houdende 

Phrase: Koopman                       	match string: Coopman
	 

Phrase: schippers                     	match string: schipper
	 ['person_reference', 'person_profession']
Phrase: Schipper                      	match string: schipper
	 ['person_reference', 'person_profession']
Phrase: Schepen                       	match string: Schepe
	 ['person_reference', 'person_profession']
Phrase: Suppliants                    	match string: Suppliant
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliantes                   	match string: Suppliant
	 ['person_reference', 'person_meeting_role']
Phrase: Supplianten                   	match string: Suppliant
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliante                    	match string: Suppliant
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliant                     	match string: Suppliant
	 ['person_reference', 'person_meeting_role']
Phrase: Borger                        	match string: Borgers
	 ['person_reference', 'person_citizen']
Phrase: Borgers                      

Phrase: Scheepenen                    	match string: Schepenen
	 ['person_reference', 'person_profession']
Phrase: Armmeester                    	match string: armmeesters
	 ['person_reference', 'person_profession']
Phrase: Inwoonder                     	match string: Inwoonderen
	 ['person_reference', 'person_citizen']
Phrase: Inwoonders                    	match string: Inwoonderen
	 ['person_reference', 'person_citizen']
Frans Christoffel Muntz, gelicentieert Lieutenant Colonel inden Jaere 1668 houdende 

Phrase: Lieutenant                    	match string: Lieutenant
	 ['person_reference', 'person_profession']
Phrase: Collonel                      	match string: Colonel
	 ['person_reference', 'person_profession']
Phrase: Kolonel                       	match string: Colonel
	 ['person_reference', 'person_profession']
Jacques Alvares, ende Joseph frances Burgers ende Coopluijden tot Amsterdam, houdende 

Phrase: Burger                        	match string: Burgers
	 ['person_referenc

Phrase: Weduwe                        	match string: Weduwe
	 ['person_reference', 'person_legal_status']
Phrase: Suppliants                    	match string: Suppliante
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliantes                   	match string: Suppliante
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliant                     	match string: Suppliante
	 ['person_reference', 'person_meeting_role']
Phrase: Supplianten                   	match string: Suppliante
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliante                    	match string: Suppliante
	 ['person_reference', 'person_meeting_role']
Nicolaes Reve Engelsch Coop„ man tot Rotterdam, versoeckende 

Willem Weijt, Schipper van,, den Schepe genaempt de Margriet, ende Jacob Balvert, Schipper vanden Schepe de Jacob uijt Schotlandt, ver„ soeckende 

Phrase: Schippers                     	match string: Schipper
	 ['person_reference', 'person_profession']
Phrase: Schipper           

Phrase: Burgemeester                  	match string: Burgermeester
	 ['person_reference', 'person_profession']
Phrase: Burgermeester                 	match string: Burgermeester
	 ['person_reference', 'person_profession']
Phrase: Heer                          	match string: Heer
	 ['person_reference', 'person_title']
Phrase: Ontfanger                     	match string: Ontfanger
	 ['person_reference', 'person_meeting_role']
Phrase: Ste                           	match string: Ste
	 ['person_reference', 'person_title']
Pr. Duijrcant, Stadthouder tot Oosterwijck, houdende 

Phrase: Stadhouder                    	match string: Stadthouder
	 ['person_reference', 'person_profession']
Jan van Groen Schipper van santen int Landt van Cleeff, hou„ dende 

Phrase: Schippers                     	match string: Schipper
	 ['person_reference', 'person_profession']
Phrase: Schipper                      	match string: Schipper
	 ['person_reference', 'person_profession']
mattheus Andresson, Willem Balf

Phrase: Graef                         	match string: Graeft
	 ['person_reference', 'person_title']
Phrase: Supplianten                   	match string: Sup„ plianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliants                    	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliantes                   	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliant                     	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliante                    	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Supplianten                   	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliants                    	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliantes                   	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Sup

Lambert Jans gewesen pachter vande Cool accijs tot Maestricht, versoeckende 

Phrase: Wachter                       	match string: pachter
	 ['person_reference', 'person_profession']
Petrus Hassenaer Predicant op Ets, leggende inde Luijder Zee, ontrent de Riviere van IJssel, te kennen gevende dat hij Suppliant nu een gansch Jaer het Woort Godts aldaer heeft geleert, ende alsnoch met een goeden ijver, daerin volherdende, dat oock, vermits het overgaen vande Provincie van Overijssel den Suppliant in allen dien tijt met een stuijver heeft genoten ende alsoo hij van hem selven geen middelen heeft om te connen sub„ sisteren, ende echter geen sijn ge,, meente in dese trouble tijden met heere ende onderwijsinge soude bij staen, Soo versochte den Suppliant, dat haer Ho:Mo: gei lieffden te ordonneren, dat hij gelijck als andere predicanten onder de Generaliteijt sijn tractement mochte ontfangen, ende daertoe een vast Comptoir aen hem werden aengewesen Waerop gedelibereert sijnde, Is goetge„ von

Phrase: Burgemeester                  	match string: Burgermeester
	 ['person_reference', 'person_profession']
Phrase: Burgermeester                 	match string: Burgermeester
	 ['person_reference', 'person_profession']
Phrase: Scheepenen                    	match string: Schepenen
	 ['person_reference', 'person_profession']
Phrase: Suppliants                    	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliantes                   	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliant                     	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliante                    	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Supplianten                   	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: St                            	match string: St
	 ['person_reference', 'person_title']
Phrase: Princes      

Phrase: Suppliants                    	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliantes                   	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliant                     	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliante                    	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Supplianten                   	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Directeur                     	match string: Directeurs
	 ['person_reference', 'person_profession']
Phrase: Directeuren                   	match string: Directeurs
	 ['person_reference', 'person_profession']
Phrase: Suppliants                    	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase: Suppliantes                   	match string: Supplianten
	 ['person_reference', 'person_meeting_role']
Phrase:

**Note**: this is a very coarse analysis, containing plenty of mistakes. Check especially the output for resolutions where the first paragraph has no proposition verb, because then the entire paragraph is used and many person references will not be about the proposer.

In [10]:
# List every matched lexicon phrase with its number of matches, most frequent first
for phrase_string, n_matches in person_ref_freq.most_common():
    print(f"{phrase_string: <30}{n_matches: >5}")

Schipper                         31
Schippers                        29
Koopman                          22
Schepen                          18
Supplianten                      18
Suppliants                       16
Suppliantes                      16
Suppliante                       16
Suppliant                        16
Scheepenen                       15
Heer                             14
Burgemeester                     12
Heere                            11
Bailliuw                         11
Borger                            9
Borgers                           9
Burgermeester                     9
Mr                                8
Scheepen                          7
Capitein                          7
Burger                            7
Burgers                           7
Burgemeesteren                    6
Griffier                          6
Commissaris                       6
Gedeputeerde                      6
Gedeputeerden                     6
Lieutenant                  

In [32]:
# List every match label with its number of matches, most frequent first
for label, n_matches in person_ref_type_freq.most_common():
    print(f"{label: <30}{n_matches: >5}")

person_reference                119
person_profession                75
person_title                     17
person_meeting_role              12
person_citizen                   12
person_family                     2
person_legal_status               1


In [13]:
# Per sub-category, list the matched phrases from most to least frequent.
# The umbrella label "person_reference" is attached to every phrase, so it is left out.
for label, phrase_counts in person_role.items():
    if label != "person_reference":
        for role, freq in phrase_counts.most_common():
            print(label, role, freq)

person_profession Schipper 31
person_profession Schippers 29
person_profession Koopman 22
person_profession Schepen 18
person_profession Scheepenen 15
person_profession Burgemeester 12
person_profession Bailliuw 11
person_profession Burgermeester 9
person_profession Scheepen 7
person_profession Capitein 7
person_profession Burgemeesteren 6
person_profession Griffier 6
person_profession Commissaris 6
person_profession Lieutenant 6
person_profession schippers 5
person_profession Major 3
person_profession Secretaris 3
person_profession Consuls 3
person_profession Consul 3
person_profession Contrarolleur 3
person_profession Schout 3
person_profession Meester 3
person_profession Procureur 3
person_profession Sekretaris 2
person_profession Koningh 2
person_profession Batmeester 2
person_profession Poorter 2
person_profession Collonel 2
person_profession Commandeur 2
person_profession Agent 2
person_profession Visscher 2
person_profession Stadhouder 2
person_profession Wachter 2
person_profes

In [16]:
import pandas as pd

# Turn the column-wise observations (res, term, label) into a DataFrame
df = pd.DataFrame(data=obs)

In [18]:
# Show the observations of all resolutions whose id contains this session identifier.
# NOTE(review): this is a substring match, so ids containing e.g. "num-10" are selected as well.
in_session = df["res"].str.contains("session-1672-10-29-num-1")
df[in_session]

Unnamed: 0,res,term,label
666,session-1672-10-29-num-1-resolution-3,Scheepen,person_reference
667,session-1672-10-29-num-1-resolution-3,Scheepen,person_profession
668,session-1672-10-29-num-1-resolution-3,Schepen,person_reference
669,session-1672-10-29-num-1-resolution-3,Schepen,person_profession
670,session-1672-10-29-num-1-resolution-3,Scheepen,person_reference
671,session-1672-10-29-num-1-resolution-3,Scheepen,person_profession
672,session-1672-10-29-num-1-resolution-3,Schepen,person_reference
673,session-1672-10-29-num-1-resolution-3,Schepen,person_profession
