In this notebook, we match the laws titles as yielded from the xslt processing to the manually annotated data. 
We can check to what extent the approprate titles have been found
Furthermore, we can connect the detected laws to the annotations in order to create labeled data. 

In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import io, os
import difflib 
import nltk, re
import random
import numpy as np
from collections import Counter

In [2]:
# load ground truth (gt)

with open('../categories/Gelderlandboek_cat3.csv', 'r') as f: 
    gt= pd.read_csv(f, sep=';')     

gt.rename(columns={'pdfpaginastart':'page', 'Volledigetiteltekst':'title', 'CodeMPIER1':'CODEMPIER1'}, inplace=True )
gt= gt.loc[gt.page!=' '] # file has empty lines with whitespace in cells
gt['page']=gt['page'].apply(lambda x: int(x)) # fix relevant dtypes
gt.tail()


Unnamed: 0,TranskribusID,NUMBERTEST,Gewest,Instrument,Instrumentoverige,Jaarhand,Maand,Dagnr,title,page,...,CODEMPIER1,CODEMPIER2,CODEMPIER3,CODEMPIER4,CODEMPIER5,CODEMPIER6,CODEMPIER7,CODEMPIER8,CODEMPIER9,CODEMPIER10
437,168882,460.0,1,15,6.0,1698,7,23,Waerschouwingh dat niemant gelt sal schieten o...,333,...,4ESP2G,,,,,,,,,
438,168882,461.0,1,1,,1699,1,6,"Placaet tegens lantlopers, heydens, vagabonden...",334,...,1SO3A,2PSO2J,2PSO2G,2PSO4J2,1SO3C,,,,,
439,168882,462.0,1,2,,1699,6,8,Resolutie invoerende testament-makinge binnen ...,335,...,1SO7A5,,,,,,,,,
440,168882,463.0,1,1,,1699,10,3,Placaet tegens de dubbelde stuyvers in Zeelant...,337,...,4ESP7D4,4ESP7D18,4ESP7D5,,,,,,,
441,168882,,1,3,,1699,11,9,Ordonnantie op het wegh-gelt van het Loo op Zu...,338,...,5LDP2B18,,,,,,,,,


In [3]:
# load processed info, the xslt output (doc_df) 
def iter_laws(root):
    for law in root.iter('Law'):
        law_dict = law.attrib
        law_dict['page']=int(law_dict['start-page'])
        law_dict['title'] =law_dict['title-prefix']
        law_dict['fulltext']=law.text
        yield law_dict

with open('process_page_phase2_output.xml','r') as f:
    etree = ET.parse(f)
    
doc_df = pd.DataFrame(list(iter_laws(etree.getroot())))

doc_df.head()

Unnamed: 0,fulltext,keyword,page,rel-size,start-page,start-region,title,title-prefix,years
0,"PLACATEN, ORDONNANTIEN ENDE RESOLUTIEN",0,0,0.0,0,0,Mock title that does not exist,Mock title that does not exist,
1,HET TWEEDE DEEL,false,11,3.5294117647058822,11,r_3_3,HET TWEEDE DEEL,HET TWEEDE DEEL,
2,Resolutie tegens het slaen der Hegh-munten DEE...,true,11,1.6470588235294117,11,r_4_1,Resolutie tegens het slaen der Hegh-munten,Resolutie tegens het slaen der Hegh-munten,
3,Placaet waer by alle Vasallen en Leenplichtige...,true,11,1.6470588235294117,11,r_7_1,Placaet waer by alle Vasallen en,Placaet waer by alle Vasallen en,1581.0
4,Placaet op den prys van den gelde en tegens he...,true,12,1.5294117647058822,12,r_3_1,Placaet op den prys van den,Placaet op den prys van den,


Let's see to what extent the same laws are in both dataframes.
NB:
- There may be errors in the titles of either one, so we use fuzzy matching based on string edit distance. 
- The xslt output only has the first line of the title

In [4]:
def match_lr(left, right):
    # find corresponding law in right for every entry in left
    for l_i, row in left.iterrows(): 
        # set index
        n=row.page 
        r_i = (right.page>n-3)&(right.page<n+3) # Candidates: only look in surrounding pages (+/- 2) 
        # to find corresponding law (both for efficiency and correctness)    

        if max(r_i)==False: # nothing to match with 
            left.loc[l_i,'closest_i']= -1
            continue

        # determine longest prefix that could match
        prefix = row.title[:max(right.loc[r_i,'title'].str.len())]
        # determine distance to candidates, only compare to longest prefix that could match: [:len(row.title)]
        # NB: penalty on substitution bigger than deletion because it might be a prefix
        distances=right.loc[r_i,'title'].apply(lambda x:  nltk.edit_distance(prefix, x[:len(row.title)], substitution_cost=3))

        ii = distances.idxmin() # index of the closest match in the right frame    
        if distances[ii] > 14: # match isn't good enough
            left.loc[l_i,'closest_i']= -1 
        else:
            left.loc[l_i,'closest_i'], left.loc[l_i,'distance']=ii, distances[ii] # save index + corresponding distance in left df
    

First, see if we can find a match for every title from the processing.

In [5]:
match_lr(doc_df, gt)
merged_1 = doc_df.add_suffix( '_L').merge(gt.add_suffix('_R'), how='left', left_on='closest_i_L', right_index=True)


In [6]:
# have a look at the worst matches
merged_1[['title_L', 'title_R', 'distance_L']].sort_values(by='distance_L', ascending=False).head(10)

Unnamed: 0,title_L,title_R,distance_L
244,Placaet waer by to0. Ryx-,Placaet waer by hondert ryxdaelers belooft wor...,14.0
401,Resolutie om by ’t Hoff in Leen-,"Resolutie dat 't Hoff in leen-saken vier, vyff...",14.0
309,Publicatie waer by Doctor Lu-,"Placaet waer by Doctor Lucas Harckens, en Gera...",12.0
344,Placaet tegens de beerlose vaga¬,"Placaet tegens de heerlose knechten, vagabodne...",12.0
324,Ordonnantie van den Hove en,Ordonnantie van Hoff en Reecken-kamer op het k...,12.0
291,Placaet rahende N aengebael-,Placaet raeckende 31 aengehaelde coper-vaten t...,12.0
158,Kercken- ordeninghe s goet ge-,Kercken-ordeninge goet-gevonden ende gearreste...,12.0
179,Resolutie dat den krychs-rtaet,Resolutie dat den Krygs-Raet geen jurisdictie ...,10.0
91,Placaet om te wicarien te ver¬,Placaet om de vicaryen te vergeven ende de ter...,8.0
375,Placaet op het wisschen in de,Placaet over het visschen in de Zuyder Zee.,8.0


In [7]:
missing_links = merged_1[merged_1.distance_L.isnull()] # false positives or problems in gt?
ml = len(missing_links)
print("Precision is "+ str(1-ml/len(doc_df)) +".", ml, "missing links (False positives):" )
print(missing_links[['title_L', 'title_R', 'distance_L']])

Precision is 0.9366812227074236. 29 missing links (False positives):
                                         title_L title_R  distance_L
0                 Mock title that does not exist     NaN         NaN
1                                HET TWEEDE DEEL     NaN         NaN
83                  Op-ten tweden van profanatie     NaN         NaN
103                    Placaet van myn Heeren de     NaN         NaN
104     Staten Generael van de Vereentrde Neder-     NaN         NaN
139               Artyckelen van den Gelderschen     NaN         NaN
148                Son-en feest ende bede-dagen.     NaN         NaN
159          Van de Kerckelycke t samen-kemsten.     NaN         NaN
160             Volgen de articulen daer inne de     NaN         NaN
239            Nxvidkt uyttet reces-sito Lqinst-     NaN         NaN
259        BBaer en Latum aen de Graefschap Zut¬     NaN         NaN
272              Exrract uyttet reces des Landt¬     NaN         NaN
277                               

Now, see if we can find a match for every title in the gold truth

In [8]:
match_lr(gt, doc_df)
merged_2 = gt.add_suffix( '_L').merge(doc_df.add_suffix( '_R'), how='left', left_on='closest_i_L', right_index=True)

In [9]:
missing_links = merged_2[merged_2.distance_L.isnull()] # false negatives
# because of processing or HTR errors?
# (or problems in gt?)
ml = len(missing_links)
print("Recall is "+ str(1-ml/len(gt)) +".", ml, "missing links (False positives):" )
print(len(missing_links), "missing links (False negatives): ")
print(missing_links[['title_L', 'title_R', 'distance_L']])

Recall is 0.9570135746606335. 19 missing links (False positives):
19 missing links (False negatives): 
                                               title_L title_R  distance_L
2    Placaet op den prys van den gelde en tegens he...     NaN         NaN
74   Placaet tegens eenige nieuwe heele en halve sc...     NaN         NaN
77   Resolutie dat registers en leger-boecken over ...     NaN         NaN
90              Resolutie aengaende de keursmatigheyt.     NaN         NaN
104  Placaet van haer Hoogh Mogende op de paspoorte...     NaN         NaN
110  Placaet tegens vagabonden, lantlopers, bedelae...     NaN         NaN
135  Resolutie inhoudnede dat tienden geen schattin...     NaN         NaN
141  Eerste en tweede affischeyt van wegen de Heere...     NaN         NaN
148  Placeaet tegens de gestrooyde lasteringen en c...     NaN         NaN
243  Placaet waer by hondert ryxdaelers belooft wor...     NaN         NaN
254     Placaet van de twee Jaer-merckten tot Zutphen.     NaN         N

In [10]:
merged_3 = doc_df.add_suffix( '_L').merge(gt.add_suffix('_R'), how='inner', left_on='closest_i_L', right_index=True)

In [11]:
merged_3

Unnamed: 0,fulltext_L,keyword_L,page_L,rel-size_L,start-page_L,start-region_L,title_L,title-prefix_L,years_L,closest_i_L,...,CODEMPIER3_R,CODEMPIER4_R,CODEMPIER5_R,CODEMPIER6_R,CODEMPIER7_R,CODEMPIER8_R,CODEMPIER9_R,CODEMPIER10_R,closest_i_R,distance_R
2,Resolutie tegens het slaen der Hegh-munten DEE...,true,11,1.6470588235294117,00000011,r_4_1,Resolutie tegens het slaen der Hegh-munten,Resolutie tegens het slaen der Hegh-munten,,0.0,...,,,,,,,,,2.0,0.0
3,Placaet waer by alle Vasallen en Leenplichtige...,true,11,1.6470588235294117,00000011,r_7_1,Placaet waer by alle Vasallen en,Placaet waer by alle Vasallen en,1581,1.0,...,,,,,,,,,3.0,10.0
4,Placaet op den prys van den gelde en tegens he...,true,12,1.5294117647058822,00000012,r_3_1,Placaet op den prys van den,Placaet op den prys van den,,2.0,...,4ESP7D11,,,,,,,,-1.0,
5,"Placaet op ’t vangen der vyanden, en tegens he...",true,15,1.1,00000015,r_1_2,Placaet op ’t vangen der vyan-,Placaet op ’t vangen der vyan-,,3.0,...,2PSO2E2,2PSO2E7,2PSO2K4,,,,,,5.0,5.0
6,Placaet inhoudende confiscatie en verval der l...,true,16,1.5294117647058822,00000016,r_1_2,Placaet inhoudende confiscatie,Placaet inhoudende confiscatie,,4.0,...,2PSO4A18,2PSO2E2,2PSO4K4,2PSO4K8,2PSO2J,,,,6.0,1.0
7,Placaet van geene als de ware Evangelische Apo...,true,16,1.5294117647058822,00000016,r_4_1,Placaet van geene als de ware,Placaet van geene als de ware,1582,5.0,...,,,,,,,,,7.0,2.0
8,Placaet waer by de Patroni werden gelaft baer ...,true,17,0.9411764705882353,00000017,r_3_1,Placaet waer by de Patroni wer-,Placaet waer by de Patroni wer-,1552,6.0,...,,,,,,,,,8.0,2.0
9,Placaet waer by de geene die het met de Spaens...,true,17,1.5294117647058822,00000017,r_5_1,Placaet waer by de geene die,Placaet waer by de geene die,,7.0,...,1SO2K3,2PSO4I,,,,,,,9.0,3.0
10,Placaet tot invoeringh ende uvorderingh der ge...,true,20,1.6470588235294117,00000020,r_3_1,Placaet tot invoeringh ende u¬,Placaet tot invoeringh ende u¬,,8.0,...,,,,,,,,,10.0,6.0
11,Resolutie dat in spolio violento den denlegger...,true,21,1.1,00000021,r_2_1,Resolutie dat in spolio violento,Resolutie dat in spolio violento,15821582,9.0,...,2PSO2K5,,,,,,,,11.0,0.0


In [12]:
cols={'skos:altLabel@en':'short','URI':'URI', 'skos:broader(lookupColumn="URI")':'broader', 'skos:prefLabel@nl':'dutch'}

voc = pd.read_csv('../categories/skos_catmpier_DuEnNL_flat.csv',delimiter=';', usecols=cols.keys(), skip_blank_lines=True)
voc.rename(columns=cols, inplace=True)
voc.dropna(subset=['URI'], inplace=True)
voc['broader']=voc['broader'].apply(lambda x: 'CatMPIeR:'+str(x))
voc.head()


Unnamed: 0,URI,dutch,short,broader
0,CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4,Politieordonnanties,0Politia,CatMPIeR:nan
1,CatMPIeR:3cbd30ac-ddfb-11e9-9d36-2a2ae2dbcce4,Internationaal recht,0Other,CatMPIeR:nan
3,CatMPIeR:3cbd33ae-ddfb-11e9-9d36-2a2ae2dbcce4,Sociale orde en religie,1SO,CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4
4,CatMPIeR:3cbd352a-ddfb-11e9-9d36-2a2ae2dbcce4,Religieuze aangelegenheden,1SO1,CatMPIeR:3cbd33ae-ddfb-11e9-9d36-2a2ae2dbcce4
5,CatMPIeR:3cbd380e-ddfb-11e9-9d36-2a2ae2dbcce4,Bedelmonniken,1SO1A,CatMPIeR:3cbd352a-ddfb-11e9-9d36-2a2ae2dbcce4


In [13]:
with open('MPIER_vocabulary.tsv','w') as f:
    f.write('\n'.join([uri+'\t'+str(short)+'_'+str(dutch) for uri,short, dutch in voc[['URI','short','dutch']].values]))


In [15]:
keynames = [n for n in merged_3.columns if 'CODEMPIER' in n]

merged_3['is_testset']=np.random.choice(a=[False, True], size=(len(merged_3)),p=[.9,.1])
merged_3['split']=np.random.choice(a=range(10), size=(len(merged_3)))

count_terms_perlevel={k:Counter() for k in range(1,6)}

for i, row in merged_3.iterrows():
    identifier='p'+'-'.join([row['start-page_L'],row['start-region_L']])    
    
    keys = row[keynames].apply(lambda x: None if x.isspace() else x).dropna().values
    if len(keys)==0:
        print('No keys for', row.URI)
        continue
    keysPerLevel = {5:[],4:[],3:[],2:[],1:[]}
    # Determine level for each assigned key, and store in dict
    for key in keys:
        m = re.match('(0[A-Z]+)?([1-9][A-Z]+)?([0-9]+)?([A-Z]+)?([0-9]+)?', key)
        for level in [5,4,3,2,1]:
            if m.group(level) is not None: #find highest level for which there exists a group
                keysPerLevel[level].append(key)
                break
       
    # enter all keys plus their broader terms in a dataframe
    broaderterms = pd.DataFrame()
    for level in [5,4,3,2,1]:
        broaderterms=broaderterms.append(voc.loc[voc.short.isin(keysPerLevel[level])].add_suffix('_'+str(level)), sort=False)
        broaderterms = broaderterms.merge(voc.add_suffix('_'+str(level-1)), left_on='broader_'+str(level), how='left', right_on='URI_'+str(level-1), suffixes=('',str(level-1)))
    
    for split in range(10):
        kind = 'test' if row.split==split else 'train'

        for level in [1,2,3,4,5]:
            directory=os.path.join('annif-data','split_'+str(split), kind, 'level'+str(level))
            info = broaderterms[['URI_'+str(level),'dutch_'+str(level)]].drop_duplicates().dropna()
            for uri in info['URI_'+str(level)]: count_terms_perlevel[level][uri] +=1
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(os.path.join(directory,identifier+'.txt'), 'w') as f:
                f.write(row['fulltext_L'])              
            with open(os.path.join(directory,identifier+'.key'),'w') as f:                
                f.write('\n'.join(['<'+uri+'>\t'+dutch for uri,dutch in info.values]))
        

In [16]:
merged_3.head()

Unnamed: 0,fulltext_L,keyword_L,page_L,rel-size_L,start-page_L,start-region_L,title_L,title-prefix_L,years_L,closest_i_L,...,CODEMPIER5_R,CODEMPIER6_R,CODEMPIER7_R,CODEMPIER8_R,CODEMPIER9_R,CODEMPIER10_R,closest_i_R,distance_R,is_testset,split
2,Resolutie tegens het slaen der Hegh-munten DEE...,True,11,1.6470588235294117,11,r_4_1,Resolutie tegens het slaen der Hegh-munten,Resolutie tegens het slaen der Hegh-munten,,0.0,...,,,,,,,2.0,0.0,False,3
3,Placaet waer by alle Vasallen en Leenplichtige...,True,11,1.6470588235294117,11,r_7_1,Placaet waer by alle Vasallen en,Placaet waer by alle Vasallen en,1581.0,1.0,...,,,,,,,3.0,10.0,False,2
4,Placaet op den prys van den gelde en tegens he...,True,12,1.5294117647058822,12,r_3_1,Placaet op den prys van den,Placaet op den prys van den,,2.0,...,,,,,,,-1.0,,False,3
5,"Placaet op ’t vangen der vyanden, en tegens he...",True,15,1.1,15,r_1_2,Placaet op ’t vangen der vyan-,Placaet op ’t vangen der vyan-,,3.0,...,2PSO2K4,,,,,,5.0,5.0,False,4
6,Placaet inhoudende confiscatie en verval der l...,True,16,1.5294117647058822,16,r_1_2,Placaet inhoudende confiscatie,Placaet inhoudende confiscatie,,4.0,...,2PSO4K4,2PSO4K8,2PSO2J,,,,6.0,1.0,False,5


In [17]:
count_terms_perlevel


{1: Counter({'CatMPIeR:3cbd30ac-ddfb-11e9-9d36-2a2ae2dbcce4': 40,
          'CatMPIeR:3cbd33ae-ddfb-11e9-9d36-2a2ae2dbcce4': 350,
          'CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4': 4080}),
 2: Counter({'CatMPIeR:3cbd33ae-ddfb-11e9-9d36-2a2ae2dbcce4': 1860,
          'CatMPIeR:3cbd352a-ddfb-11e9-9d36-2a2ae2dbcce4': 350,
          'CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4': 10,
          'CatMPIeR:50f309ca-ddfb-11e9-8a34-2a2ae2dbcce4': 1550,
          'CatMPIeR:50f4a1b8-ddfb-11e9-8a34-2a2ae2dbcce4': 130,
          'CatMPIeR:d5706b52-ddfb-11e9-8a34-2a2ae2dbcce4': 770,
          'CatMPIeR:e8b0ca22-ddfb-11e9-8a34-2a2ae2dbcce4': 560}),
 3: Counter({'CatMPIeR:3cbd352a-ddfb-11e9-9d36-2a2ae2dbcce4': 760,
          'CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4': 10,
          'CatMPIeR:3cbdfbae-ddfb-11e9-9d36-2a2ae2dbcce4': 1150,
          'CatMPIeR:3cbee122-ddfb-11e9-9d36-2a2ae2dbcce4': 170,
          'CatMPIeR:3cbf698a-ddfb-11e9-9d36-2a2ae2dbcce4': 10,
          'CatMPIeR:3cbfae7

In [21]:
voc.loc[voc.broader == 'CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4']

Unnamed: 0,URI,dutch,short,broader
3,CatMPIeR:3cbd33ae-ddfb-11e9-9d36-2a2ae2dbcce4,Sociale orde en religie,1SO,CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4
466,CatMPIeR:50f309ca-ddfb-11e9-8a34-2a2ae2dbcce4,Publieke veiligheid en orde,2PSO,CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4
708,CatMPIeR:50f49a88-ddfb-11e9-8a34-2a2ae2dbcce4,Lagere rechtbank,2PSO4K6,CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4
712,CatMPIeR:50f4a1b8-ddfb-11e9-8a34-2a2ae2dbcce4,Sociale diensten. gezondheidszorg. Onderwijs. ...,3PRH,CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4
983,CatMPIeR:d5706b52-ddfb-11e9-8a34-2a2ae2dbcce4,Economische orde. Werk- en professionele voors...,4ESP,CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4
1655,CatMPIeR:e8b0ca22-ddfb-11e9-8a34-2a2ae2dbcce4,Bodem-inrichting,5LDP,CatMPIeR:3cbdb69e-ddfb-11e9-9d36-2a2ae2dbcce4
