## Evaluating Index Structure Extraction

This notebook creates spreadsheets for correcting extract structure with the goal of creating a ground truth dataset to evaluate the automatic structure extraction.



In [1]:
# This reload library is just used for developing the REPUBLIC hOCR parser 
# and can be removed once this module is stable.
%reload_ext autoreload
%autoreload 2


# This is needed to add the repo dir to the path so jupyter
# can load the republic modules directly from the notebooks
import os
import sys
repo_name = 'republic-project'
repo_dir = os.path.split(os.getcwd())[0].split(repo_name)[0] + repo_name
print("adding project dir to path:", repo_dir)
if repo_dir not in sys.path:
    sys.path.append(repo_dir)



adding project dir to path: /Users/marijnkoolen/Code/Huygens/republic-project


In [38]:
import pandas as pd

latest_file = '../../data/indices/index_entries-3806-3864-latest.csv.gz'
df = pd.read_csv(latest_file, compression='gzip', sep='\t', index_col=False)

print(df.shape)
df['scan_id'] = df.scan_id.str.replace("['", '', regex=False)
df['scan_id'] = df.scan_id.str.replace("']", '', regex=False)
df['scan_id'] = df.scan_id.str.split("', '")
df = df.explode('scan_id')
print(df.shape)
df.head(13)

(225182, 9)
(229453, 9)


Unnamed: 0,main_term,sub_lemma,date_locator,date_locator_day,date_locator_month,page_locator,inventory_num,scan_id,page_id
0,Aalbregt.,VX Fygeslagen sijn Rey VW versoek om WES uit d...,16 July,16.0,Jul,419,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
1,Aardenburg.,Sie op Vlaanderen.,,,,,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
2,-Aarle-Rixtel.,"Sie op 's Hertogenbosch en Meyerye,.",,,,,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
3,d' AblaingvanGies,Requeste van de Heer d'Ahlaing van Gies,,,,,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
4,Jenburg.,"senburg, verso-kende dat het Jagtgerigt van Vl...",4 Mey,4.0,Mey,266,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
5,Jenburg.,Eerigt en Commissarissen gaande na Vlaanderen ...,22 dito,22.0,Mey,305,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
6,Jenburg.,Rapport van Commissarissen: Aan her Jagtgerigt...,10 July,10.0,Jul,407,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
7,d'AblaingwvanGiessenburg.,Jagtgerigt om berigt-l Berigt dien aangaande: ...,10 July 13 Aug. 3 Maart,,,407 47. 174,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
8,d'AblaingwvanGiessenburg.,Verbaal van die Commissie ter Griffie geseponeert,9 July,9.0,Jul,405,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
9,Ackersdyk.,Geapprobeert de collatie van de Beneficienm va...,12 Mey,12.0,Mey,283,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007


In [39]:
df.scan_id.nunique()

4289

In [41]:
df.scan_id.value_counts()

NL-HaNA_1.01.02_3859_0337    160
NL-HaNA_1.01.02_3806_0047    147
NL-HaNA_1.01.02_3810_0022    140
NL-HaNA_1.01.02_3812_0067    137
NL-HaNA_1.01.02_3813_0066    137
                            ... 
NL-HaNA_1.01.02_3853_0087      6
NL-HaNA_1.01.02_3862_0214      6
NL-HaNA_1.01.02_3820_0079      4
NL-HaNA_1.01.02_3834_0105      3
NL-HaNA_1.01.02_3813_0089      2
Name: scan_id, Length: 4289, dtype: int64

### Sampling random scans for creating ground truth

In [47]:
sample_scans = set(df.scan_id.sample(50))
sample_scans

{'NL-HaNA_1.01.02_3806_0038',
 'NL-HaNA_1.01.02_3811_0026',
 'NL-HaNA_1.01.02_3812_0021',
 'NL-HaNA_1.01.02_3812_0044',
 'NL-HaNA_1.01.02_3813_0009',
 'NL-HaNA_1.01.02_3813_0082',
 'NL-HaNA_1.01.02_3815_0009',
 'NL-HaNA_1.01.02_3815_0016',
 'NL-HaNA_1.01.02_3815_0053',
 'NL-HaNA_1.01.02_3815_0054',
 'NL-HaNA_1.01.02_3815_0083',
 'NL-HaNA_1.01.02_3817_0068',
 'NL-HaNA_1.01.02_3819_0011',
 'NL-HaNA_1.01.02_3820_0014',
 'NL-HaNA_1.01.02_3820_0032',
 'NL-HaNA_1.01.02_3821_0062',
 'NL-HaNA_1.01.02_3822_0073',
 'NL-HaNA_1.01.02_3826_0065',
 'NL-HaNA_1.01.02_3827_0082',
 'NL-HaNA_1.01.02_3827_0083',
 'NL-HaNA_1.01.02_3827_0091',
 'NL-HaNA_1.01.02_3828_0092',
 'NL-HaNA_1.01.02_3830_0069',
 'NL-HaNA_1.01.02_3830_0079',
 'NL-HaNA_1.01.02_3831_0056',
 'NL-HaNA_1.01.02_3832_0021',
 'NL-HaNA_1.01.02_3832_0088',
 'NL-HaNA_1.01.02_3833_0039',
 'NL-HaNA_1.01.02_3834_0076',
 'NL-HaNA_1.01.02_3834_0080',
 'NL-HaNA_1.01.02_3836_0023',
 'NL-HaNA_1.01.02_3836_0035',
 'NL-HaNA_1.01.02_3839_0373',
 'NL-HaNA_

In [61]:
sample_scans = {
    'NL-HaNA_1.01.02_3806_0038',
    'NL-HaNA_1.01.02_3811_0026',
    'NL-HaNA_1.01.02_3812_0021',
    'NL-HaNA_1.01.02_3812_0044',
    'NL-HaNA_1.01.02_3813_0009',
    'NL-HaNA_1.01.02_3813_0082',
    'NL-HaNA_1.01.02_3815_0009',
    'NL-HaNA_1.01.02_3815_0016',
    'NL-HaNA_1.01.02_3815_0053',
    'NL-HaNA_1.01.02_3815_0054',
    'NL-HaNA_1.01.02_3815_0083',
    'NL-HaNA_1.01.02_3817_0068',
    'NL-HaNA_1.01.02_3819_0011',
    'NL-HaNA_1.01.02_3820_0014',
    'NL-HaNA_1.01.02_3820_0032',
    'NL-HaNA_1.01.02_3821_0062',
    'NL-HaNA_1.01.02_3822_0073',
    'NL-HaNA_1.01.02_3826_0065',
    'NL-HaNA_1.01.02_3827_0082',
    'NL-HaNA_1.01.02_3827_0083',
    'NL-HaNA_1.01.02_3827_0091',
    'NL-HaNA_1.01.02_3828_0092',
    'NL-HaNA_1.01.02_3830_0069',
    'NL-HaNA_1.01.02_3830_0079',
    'NL-HaNA_1.01.02_3831_0056',
    'NL-HaNA_1.01.02_3832_0021',
    'NL-HaNA_1.01.02_3832_0088',
    'NL-HaNA_1.01.02_3833_0039',
    'NL-HaNA_1.01.02_3834_0076',
    'NL-HaNA_1.01.02_3834_0080',
    'NL-HaNA_1.01.02_3836_0023',
    'NL-HaNA_1.01.02_3836_0035',
    'NL-HaNA_1.01.02_3839_0373',
    'NL-HaNA_1.01.02_3844_0046',
    'NL-HaNA_1.01.02_3844_0062',
    'NL-HaNA_1.01.02_3844_0097',
    'NL-HaNA_1.01.02_3847_0436',
    'NL-HaNA_1.01.02_3848_0109',
    'NL-HaNA_1.01.02_3851_0280',
    'NL-HaNA_1.01.02_3851_0320',
    'NL-HaNA_1.01.02_3851_0327',
    'NL-HaNA_1.01.02_3852_0012',
    'NL-HaNA_1.01.02_3852_0078',
    'NL-HaNA_1.01.02_3853_0048',
    'NL-HaNA_1.01.02_3854_0016',
    'NL-HaNA_1.01.02_3854_0041',
    'NL-HaNA_1.01.02_3856_0079',
    'NL-HaNA_1.01.02_3856_0096',
    'NL-HaNA_1.01.02_3861_0355',
    'NL-HaNA_1.01.02_3862_0011'
}

In [49]:
sample_df = df[df.scan_id.isin(sample_scans)]
sample_df

Unnamed: 0,main_term,sub_lemma,date_locator,date_locator_day,date_locator_month,page_locator,inventory_num,scan_id,page_id
2076,vanUmmeren.,Staate om advis Advis dien aangaande en Resolu...,25 Aug. 10 Dec. 12 Juny,,,495 751 352,3806,NL-HaNA_1.01.02_3806_0038,"['NL-HaNA_1.01.02_3806_0037', 'NL-HaNA_1.01.02..."
2077,vanUmmeren.,Item,14 July,14,Jul,414,3806,NL-HaNA_1.01.02_3806_0038,NL-HaNA_1.01.02_3806_0038
2078,vanUmmeren.,Berigt: De Generaliteits Reekenkamerte adviseeren,26 Oct.,26,Oct,654,3806,NL-HaNA_1.01.02_3806_0038,NL-HaNA_1.01.02_3806_0038
2079,vanUmmeren.,Advis en afgeweesen(.,17 Dee.,17,Dee,773),3806,NL-HaNA_1.01.02_3806_0038,NL-HaNA_1.01.02_3806_0038
2080,Oorschot.,Sie op 's Hertogenbosch en Meyerye.,,,,,3806,NL-HaNA_1.01.02_3806_0038,NL-HaNA_1.01.02_3806_0038
...,...,...,...,...,...,...,...,...,...
213797,Admiraliteit opde Maaze.,"Missive van dezelve, verzoekende haar Hoog Mog...",dito,dito,dito,188,3862,NL-HaNA_1.01.02_3862_0011,NL-HaNA_1.01.02_3862_0011
213798,Admiraliteit opde Maaze.,"Missive van dezelve, kennis gevende van een op...",dito,dito,dito,188,3862,NL-HaNA_1.01.02_3862_0011,NL-HaNA_1.01.02_3862_0011
213799,Admiraliteit opde Maaze.,"Geauthoriseert om aan het Gezelfchap, be staan...",27 dito,27,dito,228,3862,NL-HaNA_1.01.02_3862_0011,NL-HaNA_1.01.02_3862_0011
213800,Admira.liteit te,Advis van dit Collegie op haar Hoog Mog.,,,,,3862,NL-HaNA_1.01.02_3862_0011,NL-HaNA_1.01.02_3862_0011


In [31]:
df['scan_id'] = df.scan_id.str.split(', ').head(13)

In [32]:
df.explode('scan_id').head(13)

Unnamed: 0,main_term,sub_lemma,date_locator,date_locator_day,date_locator_month,page_locator,inventory_num,scan_id,page_id
0,Aalbregt.,VX Fygeslagen sijn Rey VW versoek om WES uit d...,16 July,16.0,Jul,419,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
1,Aardenburg.,Sie op Vlaanderen.,,,,,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
2,-Aarle-Rixtel.,"Sie op 's Hertogenbosch en Meyerye,.",,,,,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
3,d' AblaingvanGies,Requeste van de Heer d'Ahlaing van Gies,,,,,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
4,Jenburg.,"senburg, verso-kende dat het Jagtgerigt van Vl...",4 Mey,4.0,Mey,266,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
5,Jenburg.,Eerigt en Commissarissen gaande na Vlaanderen ...,22 dito,22.0,Mey,305,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
6,Jenburg.,Rapport van Commissarissen: Aan her Jagtgerigt...,10 July,10.0,Jul,407,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
7,d'AblaingwvanGiessenburg.,Jagtgerigt om berigt-l Berigt dien aangaande: ...,10 July 13 Aug. 3 Maart,,,407 47. 174,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
8,d'AblaingwvanGiessenburg.,Verbaal van die Commissie ter Griffie geseponeert,9 July,9.0,Jul,405,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007
9,Ackersdyk.,Geapprobeert de collatie van de Beneficienm va...,12 Mey,12.0,Mey,283,3806,NL-HaNA_1.01.02_3806_0007,NL-HaNA_1.01.02_3806_0007


In [60]:
df.columns

Index(['main_term', 'sub_lemma', 'date_locator', 'date_locator_day',
       'date_locator_month', 'page_locator', 'inventory_num', 'scan_id',
       'page_id'],
      dtype='object')

In [76]:
import datetime

from openpyxl import Workbook


cols = [
    'main_term', 'sub_lemma', 'date_locator', 'date_locator_day',
    'date_locator_month', 'page_locator', 'inventory_num', 'scan_id',
    #'page_id'
]

wb = Workbook()
ws = wb.active

today = datetime.date.today().isoformat()

gt_file = f'../../ground_truth/indices/index_pages-1750-1796-sample-{today}.xlsx'

ws.append(cols)

prev_scan_id = None
prev_main_term = None
prev_inv_num = None

row_num = 1
add_url = True

for row_index, df_row in sample_df.iterrows():
    if prev_scan_id is not None and df_row['scan_id'] != prev_scan_id:
        print(df_row['scan_id'])
        ws.append([])
        row_num += 1
    scan_url = f"https://images.diginfra.net/iiif/NL-HaNA_1.01.02/{df_row['inventory_num']}/{df_row['scan_id']}.jpg/full/full/0/default.jpg"
    row_num += 1
    row = [df_row[col] for col in cols]
    row[cols.index('main_term')] = df_row['main_term'] if prev_main_term != df_row['main_term'] else ''
    row[cols.index('inventory_num')] = df_row['inventory_num'] if prev_inv_num != df_row['inventory_num'] else ''
    row[cols.index('scan_id')] = df_row['scan_id'] if prev_scan_id is None or df_row['scan_id'] != prev_scan_id else ''
    ws.append(row)
    if row[-1] != '':
        ws[f'H{row_num}'].hyperlink = scan_url
    print(row)
    prev_scan_id = df_row['scan_id']
    prev_main_term = df_row['main_term']
    prev_inv_num = df_row['inventory_num']

    
wb.save(gt_file)

['vanUmmeren.', 'Staate om advis Advis dien aangaande en Resolutie ReER Requeste om approbatie van collatie: Den Rentmeester de Kempenaer te berigten', '25 Aug. 10 Dec. 12 Juny', nan, nan, '495 751 352', 3806, 'NL-HaNA_1.01.02_3806_0038']
['', 'Item', '14 July', '14', 'Jul', '414', '', '']
['', 'Berigt: De Generaliteits Reekenkamerte adviseeren', '26 Oct.', '26', 'Oct', '654', '', '']
['', 'Advis en afgeweesen(.', '17 Dee.', '17', 'Dee', '773)', '', '']
['Oorschot.', "Sie op 's Hertogenbosch en Meyerye.", nan, nan, nan, nan, '', '']
['nVort.', "Sie op 's Hertogenbosch en MMeyerye.", nan, nan, nan, nan, '', '']
['Vost.', 'Sie op Overmaaze.', nan, nan, nan, nan, '', '']
['Uosen.', 'Sie op Uverquartier van Gelderland.', nan, nan, nan, nan, '', '']
['Oestenryksche Nederlanden.', 'Misfive van den Heer van Haren houdende advertentie', 'H Jan.', 'H', 'Jan', 'Is5', '', '']
['', 'Item', '28 dito', '28', 'Jan', '57', '', '']
['', 'ltem', '30 dito', '30', 'Jan', '61', '', '']
['', 'ltem', '6 Maar

['', 'Geaccordeert', '9 Aug.', '9', 'Aug', '647', '', '']
['vandePerre.', 'Weegens Zeeland ter Generaliteit gecommitteert', '22 Maart', '22', 'Maa', '216', '', '']
['Fest.', 'ole op besmettelyke Jiekte.', nan, nan, nan, nan, '', '']
['Petitien.', 'Consent van Zeeland inl de Equipage van tweel en dertig Scheepen', '7 Jan.', '7', 'Jan', '11', '', '']
['', 'Utrecht versogt hun Confent te willen inbrengen', '6 April', '6', 'Apr', '258', '', '']
['', 'Consent van Uirecht daar in', '26 dito', '26', 'Apr', '315', '', '']
['', 'Consent van Vriesland in de generaale Petitie en Staaten van Oorlog', '19 Maart', '19', 'Maa', '213', '', '']
['', 'Item van Overyssel', '30 dito', '30', 'Maa', '241', '', '']
['', 'van Holland', '6 April', '6', 'Apr', '256', '', '']
['', 'van Utrecht', '26 dito', '26', 'Apr', '314', '', '']
['', 'van Gelderland', '3 Mey', '3', 'Mey', '339', '', '']
['', 'van Zeeland', 'dito', '3', 'Mey', '341', '', '']
['', 'van Stad en Lan de', '4 dito', '4', 'Mey', '356', '', '']
[''