In [1]:
%load_ext autoreload
%autoreload 2
# for development



In [2]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict

In [3]:
# as recommended in the pandas documentation
# pd.options.mode.copy_on_write = True

In [4]:
from data_definitions import *
# coll, common_tables, ids, nids, content_cols, cols, uniq_per_table, tblregister

In [5]:
from help_functions import * 
# sup, deldupids, my_conv, try_padding, makedate_from_givendate, get_unique_lower, case_insensitive_unique_list, replace_ids


## steps

1. get old tables from mysql database (this could be substituted with filesystem). Note: there are different databases for each period
2. which tables are common to different databases (they do not all contain the same tables)
3. which id columns and which content columns do the tables contain (once again, not all original databases are exactly the same)
4. merge common tables and deduplicate references 
5. dump this intermediate result to disk
6. transformations of the persoon table (checking ids, deduplicate, normalizing dates)
7. transformations of the aanstellingen table (same as step 6)
8. id reference transformations



In [6]:
# step 2
transposed_graph = defaultdict(list)
for node, neighbours in coll.items():
    for neighbour in neighbours:
        transposed_graph[neighbour].append(node)


In [7]:
common_tables

['AcademischeTitel',
 'AdellijkeTitel',
 'aliassen',
 'Bron',
 'BronRegentDetails',
 'College',
 'Functie',
 'FunctieBovenLokaal',
 'FunctieLokaal',
 'lokaal',
 'provinciaal',
 'Regent',
 'regionaal',
 'stand',
 'BovenLokaalCollegeRegentDetails']

In [8]:
import json
with open('connection.json','r') as infl:
    con = json.load(infl)
    
connection_string = con['raa_old']
toedirs = con['toe_dirs']
batfralokatie = con['batfralokatie']

In [9]:
table_dict = defaultdict(dict)
for table in common_tables:
    for periode in transposed_graph[table]:
        periode = periode.strip().replace(' ', '_')
        tname = '_'.join([periode, table]).strip()
        print(f"getting {tname}")
        dftable = pd.read_sql_table(con=connection_string, table_name=tname)
        dftable = dftable.rename(columns={c:c.lower() for c in dftable.columns})
        dftable.reset_index(inplace=True)
        if table.lower() in replacements:
            targettable = replacements[table.lower()]
        else:
            targettable = table.lower()
        print(f"adding {table.lower()} to {periode} as {targettable}")
        table_dict[periode][targettable] = dftable

getting batfra_AcademischeTitel
adding academischetitel to batfra as academischetitel
getting negentiende_eeuw_AcademischeTitel
adding academischetitel to negentiende_eeuw as academischetitel
getting me_AcademischeTitel
adding academischetitel to me as academischetitel
getting divperioden_AcademischeTitel
adding academischetitel to divperioden as academischetitel
getting republiek_AcademischeTitel
adding academischetitel to republiek as academischetitel
getting batfra_AdellijkeTitel
adding adellijketitel to batfra as adellijketitel
getting negentiende_eeuw_AdellijkeTitel
adding adellijketitel to negentiende_eeuw as adellijketitel
getting me_AdellijkeTitel
adding adellijketitel to me as adellijketitel
getting divperioden_AdellijkeTitel
adding adellijketitel to divperioden as adellijketitel
getting republiek_AdellijkeTitel
adding adellijketitel to republiek as adellijketitel
getting batfra_aliassen
adding aliassen to batfra as aliassen
getting negentiende_eeuw_aliassen
adding aliassen to

In [10]:
common_tables = [replacements.get(t.lower()) or t.lower() for t in common_tables]

In [11]:
for tbl in common_tables:
    for key in table_dict.keys():
        if tbl in table_dict[key].keys():
            addtbl = table_dict[key][tbl]
        else:
            print (f"no {tbl} in {key}")

no functiebovenlokaal in negentiende_eeuw


In [12]:
# step 4: join tables
# joined means merged as in making one table from different tables
# we first change the ids and reference ids to old ids and add the table origin to the id

idmappings = defaultdict(list)
raw_joined_tables = {}
for tbl in common_tables:
    for key in table_dict.keys():
        try:
            if tbl in table_dict[key].keys():
                addtbl = table_dict[key][tbl]
                assert len(addtbl) > 0
            else:
                print (f"no {tbl} in {key}")
            try:
                for idnr in tblregister[tbl]['oldids']:
                    idnr = idnr.lower()
                    addtbl[f'old_{idnr}'] = addtbl[idnr].astype(pd.Int64Dtype()).apply(lambda x: f"{key}_{x}")
                    idmappings[f'old_{idnr}'].append(tbl)
            except (KeyError, TypeError):
                print (f"no {idnr} on {tbl} in {key}")
            if tbl in replacements:
                ntbl = replacements[tbl]
            else:
                ntbl = tbl
            if tbl in raw_joined_tables.keys():
                print(f"joining {key}, {tbl}")
                raw_joined_tables[ntbl] = pd.concat([raw_joined_tables[tbl],addtbl],ignore_index=True)
            else:
                raw_joined_tables[ntbl] = addtbl
        except AssertionError:
            print(f'skipping {key}-{tbl} because it contains no values')
            # joined_tables[tbl].reset_index(inplace=True, drop=True)

joining negentiende_eeuw, academischetitel
joining me, academischetitel
joining divperioden, academischetitel
joining republiek, academischetitel
joining negentiende_eeuw, adellijketitel
joining me, adellijketitel
joining divperioden, adellijketitel
joining republiek, adellijketitel
joining negentiende_eeuw, aliassen
joining me, aliassen
joining divperioden, aliassen
joining republiek, aliassen
joining negentiende_eeuw, bron
joining me, bron
joining divperioden, bron
joining republiek, bron
joining negentiende_eeuw, bronregentdetails
joining me, bronregentdetails
joining divperioden, bronregentdetails
joining republiek, bronregentdetails
joining negentiende_eeuw, college
joining me, college
joining divperioden, college
joining republiek, college
joining negentiende_eeuw, functie
joining me, functie
joining divperioden, functie
joining republiek, functie
skipping batfra-functiebovenlokaal because it contains no values
no functiebovenlokaal in negentiende_eeuw
skipping me-functiebovenlok

In [13]:
nids =  {k.lower():[x.lower() for x in v.get('oldids') or []] for k,v in tblregister.items()}

In [14]:
# step 4: deduplicate common tables

joined_tables = {}
for tablename in common_tables:
    if tablename in replacements:
       tablename = replacements[tablename]
    tbl = raw_joined_tables[tablename]
    val_column = tblregister[tablename].get('uniq') or ''
    id_cols = nids[tablename]
    try:
        idc = [i for i in id_cols if i.find(val_column)>-1][0]
    except (IndexError, KeyError):
        # a bit of a lame fallback for the aliassen table
        idc = id_cols[0]
    oldidc = f'old_{idc}'
    if val_column in tbl.columns and tablename not in list(replacements.keys())+['aliassen','bronregentdetails']:
        dedup = deldupids(tbl, val_column, oldidc)
        r = dedup
    else:
        r = tbl
    joined_tables[tablename] = r
    # newid_mappings[tablename] = make_idmapping(r, val_column, oldidc)
    # else:
    #     print('fout', tablename, tbl.columns, oldidc)
    

In [15]:
# step 4a: add aanstelling and person tables to joined_tables 
# and we remove old tables from the dictionary, just to be sure

joined_tables['aanstelling'] = raw_joined_tables.pop('aanstelling')
joined_tables['persoon'] = raw_joined_tables.pop('persoon')

In [16]:
# step 5: dump the joined tables to disk
# note these tables are still not cleaned up

ids = nids # to keep things confused

try:
    os.makedirs(outdir)
except FileExistsError:
    pass
for key in joined_tables.keys():
    dfout = joined_tables[key]
    dfout.to_csv(os.path.join(outdir, key + '.csv'))
    

In [17]:
# step 6: transformations of the persoon table
# we still need to set a new id on regent 
persoon = joined_tables['persoon']
persoon.reset_index(inplace=True, drop=True)
persoon["persoon_id"] = persoon.index + 1 


In [18]:
# step 6: check if ids are duplicated
persoon.old_idregent.value_counts().sort_values(ascending=False)

old_idregent
batfra_52         1
republiek_1348    1
batfra_1          1
batfra_2          1
batfra_3          1
                 ..
republiek_7228    1
republiek_6584    1
republiek_7184    1
republiek_3719    1
republiek_6166    1
Name: count, Length: 15775, dtype: int64

In [19]:
persoon.persoon_id.value_counts().sort_values(ascending=False)

persoon_id
39       1
15775    1
1        1
2        1
3        1
        ..
15739    1
15740    1
15741    1
15742    1
15743    1
Name: count, Length: 15775, dtype: int64

In [20]:
joined_tables['aanstelling'].columns

Index(['index', 'id', 'periode', 'idregent', 'idfunctie', 'idcollege',
       'begindag', 'beginmaand', 'beginjaar', 'einddag', 'eindmaand',
       'eindjaar', 'vertegenwoordigend', 'provinciaal', 'opmerkingen', 'stand',
       'lokaal', 'regio', 'old_id', 'old_idregent', 'old_idfunctie',
       'old_idcollege', 'old_lokaal', 'old_provinciaal', 'old_regio',
       'old_stand', 'old_vertegenwoordigend'],
      dtype='object')

In [21]:
cols = ['geboortejaar','geslachtsnaam', 'heerlijkheid',
       'overlijdensjaar', 'overlijdensmaand',
       'tussenvoegsel','voornaam']

In [22]:
# step 6: remove duplicates
duplicates = persoon.loc[(persoon[cols].duplicated(keep=False))].sort_values('geslachtsnaam')

In [23]:
# step 6 remove duplicates
xx = duplicates.merge(duplicates, left_on=cols,right_on=cols, suffixes=('l', 's'))

In [24]:
# step 6: there are duplicate ids that have to be cleaned up
keypairs = xx.loc[xx.idregentl != xx.idregents][['idregentl' ,'idregents']]
dup = persoon.loc[persoon.duplicated(cols)]
zz = keypairs.loc[keypairs.idregentl.isin(dup.idregent)]


In [25]:
# step 6: continue cleaning up
persoon[cols].drop_duplicates(inplace=True)
persoon.reset_index(drop=True, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  persoon[cols].drop_duplicates(inplace=True)


In [26]:
# step 6: more person table cleaning up 
# (the NAs appear as NA in the resulting web representation)


persoon.heerlijkheid.fillna('', inplace=True)
persoon.heerlijkheid2.fillna('', inplace=True)
persoon.heerlijkheid = persoon.apply(lambda x: x.heerlijkheid + ' en ' + x.heerlijkheid2  if x.heerlijkheid2 !='' else x.heerlijkheid, axis=1)

# step 6: normalize dates
# we replace overlijdensjaar with ? as last digit with 9
# that is the only real edit, all other non-standard date elements are replaced
persoon.overlijdensjaar.replace('?','9', inplace=True)

# step 6: more normalizing dates
for j in ["geboortedag", "geboortemaand", "geboortejaar", 
          "overlijdensdag", "overlijdensmaand", "overlijdensjaar"]:
    persoon[j] = pd.to_numeric(persoon[j], errors="coerce")

# step 6: there is a lot of dates that need to be normalized
persoon["geboortedatum_als_bekend"] = persoon.apply(lambda x: '-'.join([try_padding(d) for d in [
                        x.geboortedag, x.geboortemaand, x.geboortejaar ] if d and not np.isnan(d)]),
                        axis=1)

# the d==d is a workaround for all None and NaN, as they are the only items that have no identity
persoon["overlijdensdatum_als_bekend"] = persoon.apply(lambda x: '-'.join([try_padding(d) for d in [
                        x.overlijdensdag, x.overlijdensmaand, x.overlijdensjaar ] if d and d==d]),
                        axis=1)


# step 6: date normalizing
persoon["geboortedatum"] = persoon.geboortedatum_als_bekend.apply(lambda x: makedate_from_givendate(x,start=True))
persoon["overlijdensdatum"] = persoon.overlijdensdatum_als_bekend.apply(lambda x: makedate_from_givendate(x,start=False))



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  persoon.heerlijkheid.fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  persoon.heerlijkheid2.fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [27]:
# step 6 NaN filling for some fields
persoon.tussenvoegsel.fillna('', inplace=True)
persoon.geslachtsnaam.fillna('', inplace=True) # gross
persoon['searchable'] = persoon[["tussenvoegsel", "geslachtsnaam"]].apply(lambda x: ' '.join(x), axis=1).str.strip()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  persoon.tussenvoegsel.fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  persoon.geslachtsnaam.fillna('', inplace=True) # gross


In [28]:
# step 6: test for birth dates
persoon[['geboortedatum_als_bekend', 'geboortedatum']].sample(25)

Unnamed: 0,geboortedatum_als_bekend,geboortedatum
884,1758,1758-01-01
13007,,
3398,11-03-1789,1789-03-11
6362,,
4157,04-12-1754,1754-12-04
11553,,
3120,1759,1759-01-01
9730,1632,1632-01-01
4440,,
6284,17-04-1807,1807-04-17


In [29]:
# step 6: we add this to the joined_tables
persoon.reset_index(drop=True, inplace=True)
joined_tables['persoon'] = persoon
#regent.searchable

In [30]:
# step 7: transformations of aanstelling table 
aanstelling = joined_tables['aanstelling']

In [31]:
# step 7: cleaning up strange dates with technical brute force

# we tried to do this with more heuristics, but in the end we just edit out faulty stuff as we do not know
# what to make of it anyway
for d in ['begindag', 'beginmaand', 'beginjaar', 'einddag','eindmaand','eindjaar']:
    try:
        #blcrd[d] = blcrd[d].str.replace(r'[^0-9]+','',regex=True)
        #blcrd[d] = blcrd[d].str.replace(r'[\)\<\-\?\/\>]|Jaco|Duve|Leid|Clee|Will|Krus','',regex=True)
        aanstelling[d] = pd.to_numeric(aanstelling[d], errors='coerce') # this automatically kicks out mistakes in dates
    except AttributeError:
        pass

In [32]:
# step 7: another round of date mangling for aanstelling table
aanstelling["van_als_bekend"] = aanstelling.apply(lambda x: '-'.join([try_padding(d) for d in [
                        x.begindag, x.beginmaand, x.beginjaar ] if d and not np.isnan(d)]),
                        axis=1)

aanstelling["tot_als_bekend"] = aanstelling.apply(lambda x: '-'.join([try_padding(d) for d in [
                        x.einddag, x.eindmaand, x.eindjaar] if d and d==d]),
                        axis=1)

In [33]:
# step 7: renaming
for item in ['van', 'tot']:
    if item == 'tot':
        start = False
    else:
        start=True
    aanstelling[item] = aanstelling[f"{item}_als_bekend"].apply(lambda x: makedate_from_givendate(x,start=start))
    aanstelling[item] = aanstelling[item].astype(str)
   

In [34]:
# step 7: add to joined_tables 

joined_tables['aanstelling'] = aanstelling

In [35]:

college = joined_tables['college']
college['toelichting'] = ''

In [36]:
from bs4 import BeautifulSoup
clgs = {}
for toedir in con['toe_dirs']:
    fllst = os.listdir(toedir)
    for item in fllst:
        nm = os.path.splitext(item)[0]
        knm = nm.split('(')[0].strip()
        # print(nm)
        with open(os.path.join(toedir,item),encoding='latin1') as infl:
            htmlsource = BeautifulSoup(infl, 'html.parser')
            toelichting = htmlsource('body')[0].renderContents().strip()
            clgs[nm] = toelichting
            clgs[knm] = toelichting # not nice, but some names appear with and others without years

In [37]:
college['toelichting'] = college.college.map(clgs)

In [38]:
college.loc[college.toelichting.notna()]

Unnamed: 0,college_id,college,old_idcollege,toelichting
4,4,Admiraliteit in Friesland (1596-1795),"[divperioden_6, republiek_34]","b""<h3>Institutionele toelichting Admiraliteit ..."
5,5,Admiraliteit in Zeeland (1584-1795),[republiek_32],b'<h3>Institutionele toelichting Admiraliteit ...
6,6,Admiraliteit in het Noorderkwartier (1589-1795),"[divperioden_48, republiek_33]","b""<h3>Institutionele toelichting Admiraliteit ..."
7,7,Admiraliteit op de Maze (1586-1795),"[divperioden_45, republiek_30]",b'<h3>Institutionele toelichting Admiraliteit ...
8,8,Admiraliteit te Amsterdam (1586-1795),"[divperioden_7, republiek_31]","b""<h3>Institutionele toelichting Admiraliteit ..."
...,...,...,...,...
454,454,Staten van Holland (1572-1795),[republiek_26],b'<h3>Institutionele toelichting Staten van Ho...
457,457,Staten van Zeeland (1578-1795),[republiek_49],b'<h3>Institutionele toelichting Staten van Ze...
458,458,Staten-Generaal (1814-1815),[negentiende_eeuw_3],"b""<h3>Institutionele toelichting Staten-Genera..."
464,464,Tweede Kamer der Staten-Generaal (1815-1861),[negentiende_eeuw_4],"b""<h3>Institutionele toelichting Tweede Kamer ..."


In [39]:
batfratijd = pd.read_csv(con['batfralokatie'])

In [40]:
from string import Template
templ = Template('<br>https://resources.huygens.knaw.nl/bataafsfransetijd/Instellingen/$nid')

In [41]:
tdict = batfratijd[['insNaam','insID']].to_dict(orient='records')

In [42]:
inm = college.college.to_list()


In [43]:
tcollege = college.merge(batfratijd[['insNaam','insID']], left_on='college', right_on='insNaam', how='left')
tcollege.toelichting.fillna('',inplace=True)
tcollege['link'] = np.where(tcollege.insID.notna(), templ.substitute(nid=tcollege.insID.astype('str')), '')
tcollege['link'].str.replace('\.[0-9]+','', regex=True)
tcollege['link'].fillna('',inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tcollege.toelichting.fillna('',inplace=True)


In [44]:
tcollege.toelichting = tcollege.toelichting.astype(str) + tcollege.link
joined_tables['college'] = tcollege


## reference replacement

- gegeven een tbl met daarin titel, titelid, oude titel willen we hebben unieke titel, unieke id, referentie naar oude titel
- dus idealiter oude titel  -> nieuwe id, nieuwe titel
- daarom moeten we  unieke lijst van titels hebben
- daaruit genereren we een unieke lijst van titels + id
- we voegen die dan toe aan de tabel waarin die titel wordt gerefereerd

In [45]:
# step 8: 
# we have a number of tables that are referenced

idmappings = {i:list(set(idmappings[i])) for i in idmappings.keys()}
#idmappings

In [46]:
idmappings

{'old_idacademischetitel': ['academischetitel', 'persoon'],
 'old_idadellijketitel': ['persoon', 'adellijketitel'],
 'old_idpersoon': ['aliassen'],
 'old_idbron': ['bronregentdetails', 'bron'],
 'old_idregent': ['aanstelling', 'bronregentdetails', 'persoon'],
 'old_idcollege': ['aanstelling', 'college'],
 'old_id': ['aanstelling', 'functie', 'college'],
 'old_idfunctie': ['aanstelling', 'functie'],
 'old_id functiebovenlokaal': ['functiebovenlokaal'],
 'old_id functielokaal': ['functielokaal'],
 'old_idlokaal': ['lokaal'],
 'old_idprovincie': ['provinciaal'],
 'old_idregio': ['regionaal'],
 'old_idstand': ['stand'],
 'old_lokaal': ['aanstelling'],
 'old_provinciaal': ['aanstelling'],
 'old_regio': ['aanstelling'],
 'old_stand': ['aanstelling'],
 'old_vertegenwoordigend': ['aanstelling']}

In [47]:
# reverse map the ids and references we have to change
graph=idmappings
transposed_graph = defaultdict(list)
for node, neighbours in graph.items():
    for neighbour in neighbours:
        transposed_graph[neighbour.lower()].append(node.lower())
#transposed_graph

In [48]:
reftables = [t for t in tblregister if tblregister[t]['is_reference'] is True]
list(set(reftables))

['lokaal',
 'bron',
 'functie',
 'adellijketitel',
 'functiebovenlokaal',
 'stand',
 'provinciaal',
 'academischetitel',
 'functielokaal',
 'regionaal',
 'college']

In [49]:
# this function now converts selected columns like function and titles to lowercase 

clean_references = {}


In [50]:
references = {}
for tbln in tblregister: 
    print(f"working on {tbln}")
    uniqcolumn = tblregister[tbln]['uniq'], 
    old_id_column = tblregister[tbln]['old_id']
    if tbln in reftables:
        updated_table = get_unique_lower(tbln, joined_tables, tblregister)
        ntbl = updated_table['newtable']
        ntbl.reset_index(inplace=True, drop=True)
        references[tbln] = ntbl
    else:
        ntbl = joined_tables[tbln]
        # ntbl.reset_index(inplace=True)
        uniqcolumn = tblregister[tbln]['id']
    if not old_id_column:
        ntbl.rename(columns={'index':f"{tbln}_id"}, inplace=True)
    try:
        if tblregister[tbln]['is_reference'] is False:
            clref = make_idmapping(ntbl, old_id_column=old_id_column, is_nested=False)
        else:
            clref = make_idmapping(ntbl, old_id_column=old_id_column, is_nested=True)
    except TypeError: 
        clref = alt_idmapping(ntbl, val_column=uniqcolumn, old_id_column=old_id_column)
    except KeyError:
        pass
    clean_references[tbln] = clref    
    print (f"updated {tbln}")
    


working on persoon
updated persoon
working on aliassen
updated aliassen
working on bronregentdetails
updated bronregentdetails
working on aanstelling
updated aanstelling
working on academischetitel
updated academischetitel
working on adellijketitel
updated adellijketitel
working on bron
updated bron
working on college
updated college
working on functie
updated functie
working on functiebovenlokaal
updated functiebovenlokaal
working on functielokaal
updated functielokaal
working on lokaal
updated lokaal
working on provinciaal
updated provinciaal
working on regionaal
updated regionaal
working on stand
updated stand


In [51]:
# all tables with references have to be updated
updatetables = [t for t in tblregister if 'reftables' in tblregister[t]]

In [52]:
joined_tables['aanstelling'].columns

Index(['aanstelling_id', 'id', 'periode', 'idregent', 'idfunctie', 'idcollege',
       'begindag', 'beginmaand', 'beginjaar', 'einddag', 'eindmaand',
       'eindjaar', 'vertegenwoordigend', 'provinciaal', 'opmerkingen', 'stand',
       'lokaal', 'regio', 'old_id', 'old_idregent', 'old_idfunctie',
       'old_idcollege', 'old_lokaal', 'old_provinciaal', 'old_regio',
       'old_stand', 'old_vertegenwoordigend', 'van_als_bekend',
       'tot_als_bekend', 'van', 'tot'],
      dtype='object')

In [53]:
nwetabellen = {}
worktable = None
for tbln in tblregister:
    if tbln in updatetables:
        worktable = references.get(tbln)
        updated_reference_columns = {}
        if worktable is None:
            worktable = joined_tables[tbln]
        for reference in tblregister[tbln]['reftables'].keys():
            referencecolumn = tblregister[tbln]['reftables'][reference]
            workreference = clean_references.get(reference)
            if workreference is not None:
                column = tblregister[tbln]['reftables'][reference] #['id']
                uniqcolumn = tblregister[reference]['uniq']
                # print(templ.substitute(tbln=tbln, referencecolumn=referencecolumn, reference=reference, column=column, uniqcolumn=uniqcolumn))
                columnlist = list(worktable.columns)+[f'{reference}_id']
                maptable = clean_references[reference]

                if len(worktable[column][worktable[column].str.contains('.0')])>0:
                    maptable = {k+'.0': v for k,v in maptable.items()}
                mapped_table = worktable[column].map(maptable)
                worktable[f'{reference}_id'] = mapped_table
                try:
                    print(f'updating {tbln} with {reference}')
                    worktable[f'{reference}_id'] = mapped_table
                    worktable = worktable.copy()
                    # worktable[f'{reference}_id'] = worktable[column].map(maptable)
                    # worktable = worktable.merge(workreference[[column, f'{rtbl}_id']], left_on=referencecolumn, right_on=column, how='left')
                    mclist = [c for c in columnlist if c in worktable.columns]
                    updatedworktable = worktable[mclist]
                except KeyError:
                    print(f'could not update {tbln} with {column}', worktable.columns)
            if reference not in nwetabellen.keys():
                nwetabellen[reference] = references[reference]
            worktable = updatedworktable
            worktable.reset_index(drop=True, inplace=True)
        nwetabellen[tbln] = worktable
        print('nwetabellen updated with ', tbln)
    else:
        nwetabellen[tbln] = references[tbln]

# we need to fix some more

updating persoon with academischetitel
updating persoon with adellijketitel
nwetabellen updated with  persoon
updating aliassen with persoon
nwetabellen updated with  aliassen
updating bronregentdetails with bron
updating bronregentdetails with persoon
nwetabellen updated with  bronregentdetails
updating aanstelling with college
updating aanstelling with functie
updating aanstelling with lokaal
updating aanstelling with provinciaal
updating aanstelling with regionaal
updating aanstelling with stand
updating aanstelling with persoon
nwetabellen updated with  aanstelling


In [54]:
nwetabellen['aanstelling'].columns

Index(['aanstelling_id', 'id', 'periode', 'idregent', 'idfunctie', 'idcollege',
       'begindag', 'beginmaand', 'beginjaar', 'einddag', 'eindmaand',
       'eindjaar', 'vertegenwoordigend', 'provinciaal', 'opmerkingen', 'stand',
       'lokaal', 'regio', 'old_id', 'old_idregent', 'old_idfunctie',
       'old_idcollege', 'old_lokaal', 'old_provinciaal', 'old_regio',
       'old_stand', 'old_vertegenwoordigend', 'van_als_bekend',
       'tot_als_bekend', 'van', 'tot', 'college_id', 'functie_id', 'lokaal_id',
       'provinciaal_id', 'regionaal_id', 'stand_id', 'persoon_id'],
      dtype='object')

In [55]:
for tbl in nwetabellen:
    idnm = f'{tbl}_id'
    if idnm not in nwetabellen[tbl].columns:
        nwetabellen[tbl].reset_index(inplace=True, drop=True)
        nwetabellen[tbl][idnm] = nwetabellen[tbl].index

In [58]:
aanstelling = nwetabellen['aanstelling']
persoon = nwetabellen['persoon']

In [59]:
# a bit of checking. If the values in the two tables are different this throws an assertion error
compr = aanstelling.persoon_id.value_counts() == nwetabellen['aanstelling'].persoon_id.value_counts()
assert(len(compr.loc[~compr])==0)

In [60]:
from collections import defaultdict
def map_dict(rv,):
    rv = re.sub('[a-z]+_<NA>','',rv)
    nk = [e for e in list(set(re.split('_[0-9]+', rv))) if e!='']
    # nk = [re.split('_[0-9]+','',i) for i in rv]
        # print(nk)
    result = {} #defaultdict(list)
    for k,v in pmap.items():
        if k in nk:
            result[v] = 1
        else:
            result[v] = 0
    return pd.Series(result)


In [61]:
pmap = {'me':1,'republiek':2,'batfra':3,'negentiende_eeuw':4,'divperioden':5}

In [62]:
# load reference tables first
torder = ['academische_titel', 
          'adellijke_titel',  
          'bron',
          'instelling', 
          'functie', 
          'lokaal', 
          'provincie', 
          'regio', 
          'stand', 
          'persoon', 
          'alias',
          #'aanstelling', 
          'bron_details']

In [63]:
extab = {}
for tbl in nwetabellen:
    t = nwetabellen[tbl]
    oldcols = [c for c in t.columns if type(c)==str and 'old_' in c]
    # print(tbl, oldcols)
    target = columnmaps[tbl]
    if target:
        if f"{tbl}_id" in t.columns:
            if 'id' in target:
                del target['id']
        xtab = t.rename(columns=target)
        drops = [c for c in xtab.columns if c not in target.values()]
        tabn = exporttabellen.get(tbl) or ''
        if 'naam' in xtab.columns:
            xtab.fillna({'naam':''}, inplace=True)
        if len(oldcols)>0:
            if len(oldcols) == 1:
                perioden = t[oldcols[0]].str.join('').apply(map_dict)
            else:
                perioden = t[oldcols].sum(axis=1).apply(map_dict)
        xtab = pd.concat([xtab, perioden],axis=1)
        extab[tabn] = xtab.drop(columns=drops)
        


In [64]:
nwetabellen['aanstelling'].columns

Index(['aanstelling_id', 'id', 'periode', 'idregent', 'idfunctie', 'idcollege',
       'begindag', 'beginmaand', 'beginjaar', 'einddag', 'eindmaand',
       'eindjaar', 'vertegenwoordigend', 'provinciaal', 'opmerkingen', 'stand',
       'lokaal', 'regio', 'old_id', 'old_idregent', 'old_idfunctie',
       'old_idcollege', 'old_lokaal', 'old_provinciaal', 'old_regio',
       'old_stand', 'old_vertegenwoordigend', 'van_als_bekend',
       'tot_als_bekend', 'van', 'tot', 'college_id', 'functie_id', 'lokaal_id',
       'provinciaal_id', 'regionaal_id', 'stand_id', 'persoon_id'],
      dtype='object')

In [66]:
extab['aanstelling'].rename(columns=columnmaps['aanstelling'])

Unnamed: 0,id,id.1,periode,begindag,beginmaand,beginjaar,einddag,eindmaand,eindjaar,vertegenwoordigend,...,lokaal_id,provincie_id,regio_id,stand_id,persoon_id,1,2,3,4,5
0,0,1,2,2.0,10.0,1801.0,8.0,12.0,1801.0,0,...,,,,,241.0,0,0,1,0,0
1,1,2,2,16.0,7.0,1799.0,2.0,10.0,1801.0,0,...,,,,,241.0,0,0,1,0,0
2,2,3,2,3.0,12.0,1801.0,26.0,5.0,1802.0,0,...,,,,,241.0,0,0,1,0,0
3,3,8,2,1.0,12.0,1807.0,1.0,1.0,1809.0,0,...,,,,,127.0,0,0,1,0,0
4,4,9,2,19.0,8.0,1809.0,31.0,12.0,1811.0,0,...,,,,,127.0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39806,22386,24812,1,,,1684.0,,,1712.0,1,...,,,,,15728.0,0,1,0,0,0
39807,22387,25443,1,,,1783.0,,,1795.0,1,...,,,,,14241.0,0,1,0,0,0
39808,22388,21660,1,,,1604.0,,,1604.0,1,...,,,,,15723.0,0,1,0,0,0
39809,22389,26358,1,,,1782.0,,,1795.0,1,...,,,,,15757.0,0,1,0,0,0


In [67]:
extab['persoon'] = extab['persoon'].replace('NaT', pd.NA)
extab['persoon'] = extab['persoon'].replace('Null', pd.NA)

In [68]:
# pandas cannot handle period in to_sql so we convert it to text

for column in ['geboortedatum', 'overlijdensdatum']:
    persoon[column] = persoon[column].astype('str')
    persoon[column] = persoon[column].apply(lambda x: 'Null' if isinstance(x, str) and x in ['','NaT','Null','None'] else x)
    
for column in ['van', 'tot']:
    #aanstelling[column] = aanstelling[column].fillna(0)
    aanstelling[column] = aanstelling[column].astype('str')
    aanstelling[column] = aanstelling[column].apply(lambda x: 'Null' if isinstance(x, str) and x in ['','NaT','Null','None'] else x)

In [69]:
aanstelling.columns

Index(['aanstelling_id', 'id', 'periode', 'idregent', 'idfunctie', 'idcollege',
       'begindag', 'beginmaand', 'beginjaar', 'einddag', 'eindmaand',
       'eindjaar', 'vertegenwoordigend', 'provinciaal', 'opmerkingen', 'stand',
       'lokaal', 'regio', 'old_id', 'old_idregent', 'old_idfunctie',
       'old_idcollege', 'old_lokaal', 'old_provinciaal', 'old_regio',
       'old_stand', 'old_vertegenwoordigend', 'van_als_bekend',
       'tot_als_bekend', 'van', 'tot', 'college_id', 'functie_id', 'lokaal_id',
       'provinciaal_id', 'regionaal_id', 'stand_id', 'persoon_id'],
      dtype='object')

## Write to Database

In [70]:
import json
with open('connection.json','r') as infl:
    con = json.load(infl)
    
connection_string = con['raa_out']

In [71]:
connection_string

'mysql+pymysql://rik:X0chi@localhost/raa_nw'

In [72]:
import sys, subprocess, os
from io import StringIO
import pandas as pd
VERBOSE = True

from sqlalchemy import create_engine, inspect
from sqlalchemy.schema import CreateSchema

engine = create_engine(connection_string, echo=False)

In [73]:
#from https://stackoverflow.com/questions/50927740/sqlalchemy-create-schema-if-not-exists
from sqlalchemy_utils.functions import database_exists, create_database

if not database_exists(connection_string):
    create_database(connection_string)

In [74]:


def mdb_to_sql(engine, database_path, prefix='',basedir='.'):
    """via pandas"""
    subprocess.call(["mdb-schema", database_path, "mysql"])
    # Get the list of table names with "mdb-tables"
    table_names = subprocess.Popen(["mdb-tables", "-1", database_path],
                                   stdout=subprocess.PIPE).communicate()[0]
    tables = table_names.splitlines()
    sys.stdout.flush()
    # Dump each table as a stringio using "mdb-export",
    out_tables = {}
    prefix = prefix.replace(' ', '_') + '_'
    for rtable in tables:
        table = rtable.decode()
        if VERBOSE: print('running table:',table)
        if table != '':
            tname = prefix+table
            if VERBOSE: print("Dumping " + table)
            contents = subprocess.Popen(["mdb-export", database_path, table],
                                        stdout=subprocess.PIPE).communicate()[0]
            temp_io = StringIO(contents.decode("utf8")) # not f*cking latin1
            #print(table, temp_io)
            table = pd.read_csv(temp_io)
            try:
                table.to_sql(con=engine, name=tname, index=False, if_exists="replace")
                t_out_name = os.path.join(basedir,'mdbdump',tname)+'.csv'
                print('outfile', t_out_name)
                table.to_csv(t_out_name,sep="\t")
                print ("written: ", tname)
                done.append(tname)
            except:
                print('fout', tname)
                # table.to_sql(con=engine, name=tname, index=False, if_exists="replace")
                # t_out_name = os.path.join(basedir,'mdbdump',tname)+'.csv'
                # table.to_csv(t_out_name,sep="\t")


In [75]:
[c for c in extab['aanstelling'] if c not in columnmaps['aanstelling'].values()]

[1, 2, 3, 4, 5]

In [76]:
from pymysql import OperationalError


engine = create_engine(connection_string, echo=False)

for tbl in torder:
    table = extab[tbl]
    table.to_sql(con=engine, name=tbl, index=False, if_exists="append")

In [77]:
# we do aanstelling separately because duplicate ids cause too much trouble and we do not need them for anything anyway
aanstelling = extab['aanstelling']
truncated_aanstelling = aanstelling[[c for c in aanstelling.columns if c not in ['id']]]

In [78]:
for column in ['van', 'tot']:
    #aanstelling[column] = aanstelling[column].fillna(0)
    truncated_aanstelling[column] = aanstelling[column].astype('str')
    truncated_aanstelling[column] = aanstelling[column].apply(lambda x: pd.NA if isinstance(x, str) and x in ['','NaT','Null','None'] else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  truncated_aanstelling[column] = aanstelling[column].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  truncated_aanstelling[column] = aanstelling[column].apply(lambda x: pd.NA if isinstance(x, str) and x in ['','NaT','Null','None'] else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t

In [80]:
truncated_aanstelling.to_sql(con=engine, name='aanstelling', index=False, if_exists="append")

39811

In [81]:
dboutdir = './targetdb_dump'
for table in extab:
    tn = os.path.join(f"{dboutdir}_{table}.pkl")
    extab[table].to_pickle(tn)
truncated_aanstelling.to_pickle(f"{dboutdir}_aanstelling.pkl")