In [51]:
import spacy
from IPython.display import Markdown, display
import os
from IPython.display import clear_output
from spacy import displacy
from itertools import combinations
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
from spacy.tokens import Span, DocBin
import random
import pprint

spacy.prefer_gpu()
nlp = spacy.load("nl_core_news_lg")
df = pd.read_csv('..\\data\\ocred\\files_df.csv', index_col = 0)

This notebook supplies the spacy ner model with more custom training data. This data is manually labeled by running the showText function. 

To see if this works better than the rule based matcher, I randomly selected 500 pages and labeled every mention of a ministery. The showTokens function prints out the tokens with its index above to make this a bit easier seeing that spacy requeres a Span of tokens to be added. The span consists of a doc (the current page), a start and end index, and a label (ministery). ShowTokens can also highlight certain words, in this case ministerie, ministeries and all of the abbriviations of all ministeries.




In [2]:
def getMinisteries():
    page = requests.get('https://nl.wikipedia.org/wiki/Lijst_van_Nederlandse_ministeries')
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all("td")[-1]

    results.find_all('a', href = True)
    wikis = {}

    abrr = []
    for item in str(results.find_all('p')[0]).split('\n')[:-1] + str(results.find_all('p')[1]).split('\n')[1:-1]:
        temp = re.findall('(?<=\()(.*?)(?=\))', item)
        if temp == []:
            abrr.append(None)
        elif temp[-1] == 'Nederland':
            abrr.append(None)
            if 'Overzeese Gebiedsdelen' in item:
                abrr.append(None)
        else:
            abrr.append(temp[-1].replace('&amp;', '&'))


    counter = 0
    for ministerie in results.find_all('a')[:12]:
        wikis[ministerie.text] = {'Link': 'https://nl.wikipedia.org' + ministerie['href'], 'Abbriviation' : abrr[counter]}
        counter += 1
    
    return wikis


In [3]:
def printHilight(string):
    print('\x1b[1;31m'+string + ' ' +'\x1b[0m', end='')

def showTokens(text, allMinisteries):

    try:
        text = re.sub(' +', ' ', text)
        text = text.replace('\n', ' ')
    except:
        pass
    doc = nlp(str(text))
    # get list of entities as strings
    listTokens = [str(ent.text) for ent in doc.ents]

    # control variables
    nToken = 0
    tempTokens = []
    lenTemp = 0

    # loop over tokens
    for i in range(len(doc)):

        # get string representation of token and add to a temporary list with token index
        t = str(doc[i])
        tempTokens.append((t, nToken))

        # increase n tokens and length of all strings in tempTokens
        nToken += 1
        lenTemp += (len(t) + 1)

        # once lenght of all strings in tempTokens is 100 or larger, start printing the line
        if lenTemp >= 100 or i >= len(doc) - 1:

            # print the index for every token
            for word in tempTokens:

                # if index number is more chars than the token, dont print the index number
                if len(word[0]) < len(str(word[1])):
                    print(' ' * (len(word[0]) + 1), end='')

                # else just print index number plus a number of spaces
                else:
                    print(word[1], ' ' * (len(word[0]) - len(str(word[1]))), end='')
            print('')

            # print all tokens
            for word in tempTokens:

                # highlight tokens in red if named entity
                if word[0].lower() in allMinisteries + ['ministerie', 'ministeries']:
                    printHilight(word[0])
                else:
                    print(word[0] + ' ', end='')

            # add some space between lines
            print('\n\n')

            # reset control variables
            tempTokens = []
            lenTemp = 0
    
    # inputs:
    # no input: next page
    # q: quit 
    # remove: remove last label
    # number1 number2: adds a label from span number1 to number2
    # everything else is invalid, you will be prompted again
    
    
    newEnts = []
    while True:
        x = input()
        if x == '':
            break
        elif x == 'q':
            raise Exception("Stopped the program")
        elif x == 'remove':
            del newEnts[-1]
            print('Removed')
            continue
        
        try:
            x = x.split()
            newEnts.append(Span(doc, int(x[0]), int(x[1]), label='MINISTERIE'))
            print(doc[int(x[0]):int(x[1])])
        except:
            print('Two numbers seperated by a space')
            continue
            
        
    try:
        doc.ents = newEnts
        clear_output()
    except:
        pass
        
    return doc

In [4]:
def showText(df, n):
    
    ministeries = getMinisteries()
    abrr = [ministeries[x]['Abbriviation'] for x in ministeries if ministeries[x]['Abbriviation'] != None]
    allMinisteries = list(ministeries.keys()) + abrr
    allMinisteries = [x.lower() for x in allMinisteries]
    
    samples = df.sample(n)
    docs = list(samples.text.apply(lambda x: showTokens(x, allMinisteries)))
    return docs

In [89]:
docs = []

for i in range(30):
    docs += showText(df, 10)
    print(f'done {i + 1}0')

done 300


In [90]:
random.shuffle(docs)
train_docs = docs[:len(docs) // 2]
dev_docs = docs[len(docs) // 2:]

# Create and save a collection of training docs
train_docbin = DocBin(docs=train_docs)
train_docbin.to_disk("..\\data\\spacy labeled\\train2.spacy")
# Create and save a collection of evaluation docs
dev_docbin = DocBin(docs=dev_docs)
dev_docbin.to_disk("..\\data\\spacy labeled\\dev2.spacy")

In [85]:
# do this in folder with the test and train data files

# generate default confic file   
# $ python -m spacy init config ./config.cfg --lang nl --pipeline ner

# training 
# python -m spacy train ./config.cfg --output ./output --paths.train train2.spacy --paths.dev dev2.spacy

nlp1 = spacy.load("..\\data\\spacy labeled\\output\\model-last")


In [5]:
def getLabeledData():
    with open('..\\data\\ministeries.txt', 'r', encoding='utf-8') as f:
        m = f.read()
        m = m.split('\n\n')
        
    labeledMinisteries = {}
    for file in m:
        lines = file.split('\n')
        
        labeledMinisteries[lines[0]] = {}
        
        for line in lines[1:]:
            line = line.lower()
            if line in labeledMinisteries[lines[0]]:
                labeledMinisteries[lines[0]][line] += 1
            else:
                labeledMinisteries[lines[0]][line] = 1


    return labeledMinisteries


In [67]:
def extractMinisteries(df, files):
    nlp1 = spacy.load("..\\data\\spacy labeled\\output\\model-last")
    
    found = {}
    
    for file in files:
        found[file] = {}
        
        with open('..\\data\\covid wob text without ocr\\' + file +'.txt', 'r', encoding='utf-8') as f:
            text = f.read()
            text = re.sub(' +', ' ', text)
        
        doc = nlp1(text)
        for ent in doc.ents:
            ent = str(ent).lower()
            ent = ent.replace('\n', ' ')
            
            if 'ministerie van ' in ent:
                ent = ent[15:]
            
            if ent in found[file]:
                found[file][ent] += 1
            else:
                found[file][ent] = 1
        
    
    return found
    

In [68]:
found = extractMinisteries(df, getLabeledData().keys())

In [69]:
def evaluate(df, found=None):
    labeled = getLabeledData()
    
    if not found:
        found = extractMinisteries(df, labeled.keys())
    
    tp = 0
    fp = 0
    fn = 0
    fpList = []
    fnList = []
    
    
    for file in found:
        
        for ministerie in found[file]:
            if ministerie not in labeled[file]:
                fp += found[file][ministerie]
                fpList.append(ministerie)
                
            elif found[file][ministerie] == labeled[file][ministerie]:
                tp += found[file][ministerie]
                
            elif found[file][ministerie] > labeled[file][ministerie]:
                tp += labeled[file][ministerie]
                fp += found[file][ministerie] - labeled[file][ministerie]
                fpList.append(ministerie)
                
            elif found[file][ministerie] < labeled[file][ministerie]:
                tp += found[file][ministerie]
                fn += labeled[file][ministerie] - found[file][ministerie]
                fnList.append(ministerie)
    
        for ministerie in labeled[file]:
            if ministerie not in found:
                fn += labeled[file][ministerie]
                fnList.append(ministerie)
                
    
    for file in found.keys():
        print(file)
        pprint.pprint(found[file])
        print('')
        pprint.pprint(labeled[file])
        print('')
        print('')
        print('')

    print(tp, fp, fn)
    print('recall', tp / (tp + fn))
    print('precision', tp / (tp + fp))
    print('')
    

In [70]:
evaluate(df, found = found)

0068ed0b40cca6270f857d2614cc63c0_besluit.pdf
{'financiën': 1, 'szw': 1, 'vws': 2}

{'economische zaken en klimaat': 1,
 'ezk': 1,
 'financiën': 1,
 'ienw': 1,
 'infrastructuur en w aterstaat': 1,
 'sociale zaken en werkgelegenheid ': 1,
 'szw': 1,
 'volksgezondheid, welzijn en sport': 1,
 'vws': 1}



0068ed0b40cca6270f857d2614cc63c0_document.pdf
{'financiën': 1}

{'financiën': 1}



0335b3f498dbbd7c537ad23abe8c08dc_deelbesluit-1-wob-verzoek-dd-11-augustus-2021-inzake-het-europees-herstelfonds.pdf
{'defensie': 1}

{'buitenlandse zaken': 2, 'economische zaken en klimaat': 1, 'financiën': 2}



07e2b274045cb5b4f54371a3c905cae9_wobverzoek-mccb-catshuis.pdf
{'az': 1,
 'bz': 1,
 'bzk': 5,
 'ezk': 5,
 'fin': 4,
 'i&w': 1,
 'j&v': 1,
 'jenv': 2,
 'justitie en veiligheid': 1,
 'ocw': 8,
 'sociale zaken en werkgelegenheiddirectie': 1,
 'szw': 46,
 'vws': 17}

{'ezk': 1,
 'justitie en veiligheid': 1,
 'sociale zaken en werkgelegenheid': 6,
 'szw': 1}



17967f10340f6de2a79ba984209b4a2c_besluit.p

## Conclusion

Hmmm these results are a bit weird. The calculated recall and precision are thrash but looking at the results in the output of the cell above is a bit more promising (the first dict after the file name is the results of the model, the second is the labeled data). There are definitely some big mistakes in the updated spacy model but it also found a lot more than I actually labeled (shows how good my labeling skills are...). 

The thing that the model does well compared to my rule based matcher is when there is a list of ministries like "de ministeries van ezk, vws en szw". These abbriviations are also caught a lot more with this model than with the rule based matcher. Take a look at the results for the file "stukken-bij-besluit-wob-verzoek-notities-besluitvorming-coronacrisis-20.pdf" for a good example of this.

Then the things that the model doesn't do well. Sometimes it recognizes a ministery as "ministerie van binnenlandse zaken" in stead of "ministerie van binnenlandse zaken en koninkrijksrelaties". It doesn't take extract the whole name of the ministery. A second place where it messes up is when it does the oppisite. In the last file in the cell above, it extracts "ezk zou hoeven te geschieden , maar dat u" while it only should have gotten "ezk".

In short, what the model finds seems to be mostly correct albeit not completely correct. 

I did not run the model on the ocr'ed text because the labeled data came from the text that was extracted using pypdf2. I did this to keep everything consistent