In [47]:
import spacy
from IPython.display import Markdown, display
from IPython.display import clear_output
import pandas as pd
from spacy.matcher import Matcher
import re
from spacy.tokens import Span, DocBin
from spacy.matcher import Matcher

spacy.prefer_gpu()
nlp = spacy.load("nl_core_news_lg")
df = pd.read_csv('..\\data\\ocred\\files_df.csv', index_col = 0)

In this notebook, I'll be testing out proximity search for dates. The process is as follows: find dates with the extractor made in [this notebook](https://github.com/JustinBon/thesis/blob/main/experiments/Extractor%201%20dates.ipynb), use the span it gives and look at the tokens 3 before and 3 after to see if there is a token that indicates what the date means. See the list dateTypes in the cell below for all the words I'll be looking for. This can hopefully give an indication of meaning. Small problem is that I do not have any labeled data for this...

In [52]:
# set variables
months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli', 'augustus', 'september', 'oktober', 'november', 'december']
days = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag', 'zondag']
dateTypes = ['datum', 'verzonden', 'sent', 'date', 'ingediend', 'afgehandeld', 'update']

datesPattern = [{"LOWER" : {"IN" : days}, "OP" : "?"}, 
           {"IS_DIGIT": True}, 
           {"LOWER" : {"IN" : months}},
           {"IS_DIGIT": True, "OP" : "?"}]

datesPatternContext = [{"LOWER" : {"IN" : dateTypes}},
           {"IS_PUNCT" : True, "OP" : "?"},
           {"LOWER" : {"IN" : days}, "OP" : "?"}, 
           {"IS_DIGIT": True}, 
           {"LOWER" : {"IN" : months}},
           {"IS_DIGIT": True, "OP" : "?"}]

In [68]:
# gets context for every found date
def getContext(match, doc):
    dateTypes = ['datum', 'verzonden', 'sent', 'date', 'ingediend', 'afgehandeld', 'update']
    
    # look 3 tokens before and after match
    span = doc[match[0] - 3:match[1] + 3]
    
    # check if one of the indicators are in the context
    for token in span:
        
        # return context if indicator is found
        if str(token) in dateTypes:
            return (span)
    
    return None

def findDates(text, pattern, nlp):
    
    # add pattern
    matcher = Matcher(nlp.vocab)
    matcher.add("Dates", [pattern])
    
    # create doc and find matches
    text = str(text)    
    doc = nlp(text)
    matches = matcher(doc)

    results = []
    previous = 0
    temp = []
    
    # get all matches in a temp list
    for match_id, start, end in matches:
        temp.append((start, end, doc[start:end].text))
    
    # this loop removes duplicates like:
    # 12 maart and maandag 12 maart
    # the most complete date is kept, maandag 12 maart in this example 
    
    for i in range(len(temp)):
        
        # check if this is final loop
        if i + 1 == len(temp):
            
            # get contexts and add to results
            context = getContext(temp[i], doc)
            if context:
                results.append(context)
            break
        
        # if the next match starts at the same index it is a duplicate so skip
        if temp[i][0] == temp[i+1][0]:
            continue
            
        # if the end of the prvious match is the same index, skip
        if temp[i][2] in temp[i - 2][2] and temp[i][2] != temp[i - 2][2]:
            continue
        
        # if loop gets here, its either not a duplicate or the the most complete date of duplicates.
        context = getContext(temp[i], doc)
        if context:
            results.append(context)
    
    # return every unique result
    return set(results)



def findContext(df, datesPattern, nlp):
    
    # run findDates on all rows
    dates = df.text.apply(lambda x: findDates(x, datesPattern, nlp))
    
    # show results
    for date in dates:
        if date:
            print(date)


In [69]:
findContext(df.sample(n=1000, random_state=1), datesPattern, nlp)

{de update van 18 maart 14:00 zijn er}
{de update van 13 maart 14:00 zijn er}
{de update van 13 maart 14:00 zijn er}
{: Up date 23 maart

Dank Annelies, : Up date 23 maart

Ha collega's}
{20

datum 4 februari 2020
onze ref}
{
datum: 13 maart 2020

Geachte Minister}
{de update van 25 maart 10.00 uur zijn}
{en worden uiterlijk 7 oktober bij AEP ingediend, worden
uiterlijk 7 oktober bij AEP ingediend}
{de update van 10 maart om 14:00 uur}
{Covid-19, update 16 april

Er is}
{de update van 25 maart 10.00 uur zijn}
{de update van 24 maart 10:00 uur zijn}


As a comparison of this proximity search, below is the pattern made in [this notebook](https://github.com/JustinBon/thesis/blob/main/experiments/Extractor%201%20dates.ipynb) that includes a check if the previous word is an indicator word. This is shown below

In [66]:
def findPattern(text, pattern, nlp):  

    # add pattern
    matcher = Matcher(nlp.vocab)
    matcher.add("Dates", [pattern])
    
    # create doc and find matches
    text = str(text)    
    doc = nlp(text)
    matches = matcher(doc)

    results = []
    previous = 0
    temp = []
    
    # get all matches in a temp list
    for match_id, start, end in matches:
        temp.append((start, end, doc[start:end].text))
    
    results = []
    
    for i in range(len(temp)):
        
        # check if this is final loop
        if i + 1 == len(temp):
            
            # get contexts and add to results
            results.append(temp[i][2])
            break
        
        # if the next match starts at the same index it is a duplicate so skip
        if temp[i][0] == temp[i+1][0]:
            continue
            
        # if the end of the prvious match is the same index, skip
        if temp[i][2] in temp[i - 2][2] and temp[i][2] != temp[i - 2][2]:
            continue
        
        # if loop gets here, its either not a duplicate or the the most complete date of duplicates.
        results.append(temp[i][2])
    
    # return every unique result
    return results

def findContext2(df, datesPattern, nlp):
    
    # run findDates on all rows
    dates = df.text.apply(lambda x: findPattern(x, datesPattern, nlp))
    
    # show results
    for date in dates:
        if date:
            print(list(set(date)))


In [67]:
findContext2(df.sample(n=1000, random_state=1), datesPatternContext, nlp)

['Verzonden: donderdag 23 januari 2020']
['Verzonden: vrijdag 20 maart 2020']
['Verzonden: zondag 29 maart 2020']
['Verzonden: dinsdag 11 februari 2020']
['Sent: woensdag 5 februari 2020']
['Verzonden: maandag 23 maart 2020', 'Verzonden: vrijdag 20 maart 2020']
['Verzonden: donderdag 19 november 2020']
['Sent: zondag 30 mei 2021']
['Verzonden: vrijdag 1 mei 2020']
['Verzonden: woensdag 1 april 2020']
['Datum: zondag 14 februari 2021', 'Verzonden: zondag 14 februari 2021']
['Verzonden: zaterdag 29 februari 2020']
['Verzonden: vrijdag 31 januari 2020']
['Datum: dinsdag 15 december 2020']
['Datum: woensdag 15 april 2020']
['Datum: dinsdag 2 februari 2021']
['date 23 maart']
['Verzonden: zondag 8 maart 2020']
['Sent: zondag 29 maart 2020']
['Sent: dinsdag 24 maart 2020', 'Sent: woensdag 18 maart 2020', 'Verzonden: woensdag 18 maart 2020', 'Verzonden: dinsdag 17 maart 2020']
['Verzonden: woensdag 13 mei 2020']
['Sent: maandag 30 maart 2020', 'Verzonden: dinsdag 31 maart 2020', 'Verzonden: m

These results show that the normal pattern matcher without the proximity search finds way more dates on the same sample. For dates then, this pattern can be used to find when a document was made or sent.