In [1]:
# Imports
import re
import spacy
import string
import numpy as np
import pandas as pd
import dateparser
from datetime import datetime, timezone
import pickle

In [2]:
# Output location of the gzip-compressed pickle written by the final to_pickle() call.
save_path = 'data/minverzoek.pkl.gz'

In [3]:
# Load the input DataFrame from a gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
# The code further down reads row.name[1] and groups on index level 0, so it assumes a
# two-level index (presumably document id, page number - TODO confirm) and a
# 'foi_bodyTextOCR' text column.
min2023 = pd.read_pickle('data/minextracted.pkl.gz', compression='gzip')

==== Verzoeksdatum ====


In [44]:
# The extract_dates function comes from Maarten Marx's notebook 'DutchDatumParser';
# the dateparser settings were added here.
# spaCy pipeline used by extract_dates for entity recognition
# (the "nl_core_news_lg" model must be installed).
nlp = spacy.load("nl_core_news_lg")
# Entity labels that extract_dates treats as date mentions.
date_label = ['DATE']

def extract_dates(text):
    """Find date entities in ``text`` and parse each of them with dateparser.

    Every entity whose label is listed in ``date_label`` is handed to
    ``dateparser.parse`` with ``REQUIRE_PARTS`` set to day, month and year.

    Returns a dict that maps the entity text to whatever ``dateparser.parse``
    returned for it (callers must expect ``None`` values).
    """
    parsed_by_text = {}
    for entity in nlp(text).ents:
        if entity.label_ not in date_label:
            continue
        mention = entity.text
        parsed_by_text[mention] = dateparser.parse(
            mention,
            settings={'REQUIRE_PARTS': ['day', 'month', 'year']},
        )
    return parsed_by_text

def join_numbers(s):
    """Normalise whitespace and re-join digits that were split by a space.

    Turns dates such as '2 1 juni 2023' into '21 juni 2023'.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` with every run of whitespace collapsed to a single space,
        leading/trailing whitespace removed, and split digits joined.
    """
    # split()/join collapses any whitespace run (spaces, tabs, newlines) to one space.
    s = ' '.join(s.split())
    # Match "<digit> <digit><whitespace>" and drop the separating whitespace. The
    # second digit must be followed by whitespace, so a multi-digit number on the
    # right-hand side (e.g. the year) is never glued to the digit before it.
    # NOTE(review): the left-hand digit may be the last digit of a longer number,
    # so '12 3 ' becomes '123 ' - confirm this is acceptable for the OCR input.
    # The replacement is a raw string: '\g' in a normal literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    return re.sub(r"(\s?\d)(\s)(\d\s)", r'\g<1>\g<3>', s)

def sort_dates(dates):
    """Return the plausible dates from ``dates`` in ascending order.

    Parameters
    ----------
    dates : dict or None
        Mapping whose values are ``datetime`` objects or ``None``
        (the shape produced by ``extract_dates``).

    Returns
    -------
    list of datetime or None
        Timezone-naive datetimes sorted from earliest to latest, or ``None``
        when ``dates`` is ``None``/empty or no value survives the filtering.
    """
    if dates is None or len(dates) == 0:
        return None

    # tzinfo is dropped (not converted) so that aware and naive values can be
    # compared with each other. Entries that failed to parse (None) and
    # unrealistic years are discarded; both year bounds are exclusive.
    plausible = sorted(
        parsed.replace(tzinfo=None)
        for parsed in dates.values()
        if parsed is not None and 1950 < parsed.year < 2050
    )
    return plausible if plausible else None

# The request date sometimes appears next to the letter's own date; in that case
# the request date is always the earliest of the dates.
def grab_earliest_date(dates):
    """Return the earliest date that ``sort_dates`` keeps, or None if it keeps none."""
    candidates = sort_dates(dates)
    return None if candidates is None else min(candidates)

# Fetch the text fragment surrounding the request date.
def regex_context_verzoek(row):
    """Return the cleaned text fragment in which the request date is looked for.

    Only rows whose second index element (``row.name[1]``) equals 1, 2 or 3 are
    searched. The patterns are tried in order and the first match wins. The
    matched text is stripped of all punctuation except '-', lower-cased and
    passed through ``join_numbers``.

    Returns None for other rows, for non-string text and when nothing matches.
    """
    if row.name[1] not in (1, 2, 3):
        return None

    text = row['foi_bodyTextOCR']
    if type(text) is not str:
        return None

    patterns = (
        r"Geachte.{0,10}(\s+\S+){15}",
        r"In uw.{0,10}(\s+\S+){15}",
        r"[^.]*verzoek[^.]*\.",
        r"[^.]*verzocht[^.]*\.",
        r"[^.]*ontving[^.]*\.",
    )
    for pattern in patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found is None:
            continue
        # Delete every punctuation character except the hyphen.
        without_punctuation = str.maketrans('', '', string.punctuation.replace('-', ''))
        cleaned = found[0].translate(without_punctuation).lower()
        return join_numbers(cleaned)
    return None

# Return the request date itself instead of the surrounding text.
def extract_verzoek(row):
    """Extract the request date from the OCR text of one row.

    Only rows whose second index element (``row.name[1]``) equals 1, 2 or 3 are
    searched (presumably the first pages of a document - TODO confirm). The
    patterns are tried in order; for each match the fragment is cleaned (all
    punctuation except '-' removed, lower-cased, ``join_numbers`` applied) and
    searched for dates with ``extract_dates``.

    Parameters
    ----------
    row : pandas.Series
        A DataFrame row carrying a 'foi_bodyTextOCR' entry and a tuple ``name``.

    Returns
    -------
    datetime or None
        The earliest usable date of the first fragment that yields one;
        None for other rows, non-string text, or when no fragment yields a date.
    """
    if row.name[1] not in (1, 2, 3):
        return None

    s = row['foi_bodyTextOCR']
    if type(s) is not str:
        return None

    regex_queries = [r"Geachte.{0,10}(\s+\S+){15}",
                     r"In uw.{0,10}(\s+\S+){15}",
                     r"[^.]*verzoek[^.]*\.",
                     r"[^.]*verzocht[^.]*\.",
                     r"[^.]*ontving[^.]*\."]
    # Translation table that deletes every punctuation character except the hyphen.
    strip_punctuation = str.maketrans('', '', string.punctuation.replace('-', ''))

    for query in regex_queries:
        search_result = re.search(query, s, re.IGNORECASE)
        if search_result is None:
            continue
        fragment = join_numbers(search_result[0].translate(strip_punctuation).lower())
        earliest = grab_earliest_date(extract_dates(fragment))
        # Only stop when a usable date was found. A fragment whose date entities
        # all failed to parse (or were filtered out as unrealistic) must not end
        # the search, otherwise the remaining patterns are never tried.
        if earliest is not None:
            return earliest
    return None

In [45]:
# Compute the request-date context and the request date for every row; this takes a while.
min2023['verzoek_context'] = min2023.apply(regex_context_verzoek, axis=1)

min2023['verzoek_datum'] = min2023.apply(extract_verzoek, axis=1)

# Only some rows yield a value. Spread it over all rows that share index level 0 by
# forward-filling and then back-filling within each group.
# GroupBy.ffill()/bfill() are used because groupby(...).fillna(method=...) is deprecated.
min2023['verzoek_context'] = (
    min2023['verzoek_context'].groupby(level=0).ffill().groupby(level=0).bfill()
)

min2023['verzoek_datum'] = (
    min2023['verzoek_datum'].groupby(level=0).ffill().groupby(level=0).bfill()
)

In [7]:
# Persist the DataFrame (now including the verzoek_context and verzoek_datum columns)
# as a gzip-compressed pickle at save_path.
min2023.to_pickle(path=save_path, compression='gzip')