In [1]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

from IPython.display import Markdown, display
from IPython.display import clear_output

from itertools import combinations
import pandas as pd
from datetime import datetime

import requests
from bs4 import BeautifulSoup

import os
import re
import json

  'CUDA path could not be detected.'
  'CUDA path could not be detected.'


In [2]:
spacy.prefer_gpu()
nlp = spacy.load("nl_core_news_lg")

In [3]:
nlp_ministries = spacy.load("..\\data\\spacy labeled\\output\\model-last")
df = pd.read_csv('..\\data\\ocred\\files_df.csv', index_col = 0)

# dates
This first part will be the evalutation for the base dates extractor

In [4]:
months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli', 'augustus', 'september', 'oktober', 'november', 'december',
         'january', 'february', 'march', 'april', 'may', 'june', 'juli', 'august', 'september', 'october', 'november', 'december',
         'jan', 'feb', 'mrt', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
days = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag', 'zondag',
       'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
sent = ['datum', 'verzonden', 'sent', 'date', 'received']

In [5]:
def printHilight(string):
    print('\x1b[1;31m'+string + ' ' +'\x1b[0m', end='')
    
def showMatches(doc, matches, regexMatches):   
    indexOfMatches = []
    for matchid, start, end in matches:
        for i in range(start, end):
            indexOfMatches.append(i)
            
    indexOfMatches = set(indexOfMatches)
    
    for token in doc:
        if token.i in indexOfMatches:
            printHilight(str(token.text))
        else:
            if token.text in regexMatches:
                printHilight(str(token.text))
            else:
                print(token, end=' ')
    
    return
    
    

In [6]:
def validate(dates, sep, pat):
    goodDates = []
    for date in dates:
        date = date.replace(' ', '')
        try:
            date = date.replace(sep, ' ')
            datetime.strptime(date, pat)
            goodDates.append(date.replace(' ', sep))
        except:
            try:
                if len(date.split(' ')) == 3 and len(date.split(' ')[2]) == 2:
                    datetime.strptime(date, '%d %m %y')
                    goodDates.append(date.replace(' ', sep))
            except:
                pass     
    return goodDates
            

def regexMatcher(text):
    results = []
    
    results += validate(re.findall('[0-3]{0,1}[0-9]\/[0-1]{0,1}[0-9]', text), '/', '%d %m')

    results += validate(re.findall('[0-3]{0,1}[0-9]\/[0-1]{0,1}[0-9]\/[0-9]{2,4}', text), '/', '%d %m %Y')

    results += validate(re.findall('[0-3]{0,1}[0-9]-[0-1]{0,1}[0-9]', text), '-', '%d %m')

    results += validate(re.findall('[0-3]{0,1}[0-9]-[0-1]{0,1}[0-9]-[0-9]{2,4}', text), '-', '%d %m %Y')
        
    return results

In [7]:
def inputHandling(message):
    while(True):
        i = input(message)
        if i == 'exit':
            return -1
        
        elif i == '':
            return 0
        
        try:
            i = int(i)
            return i
        
        except:
            print("input number, exit or nothing")
        

def evaluate(nlp):
    
    months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli', 'augustus', 'september', 'oktober', 'november', 'december',
         'january', 'february', 'march', 'april', 'may', 'june', 'juli', 'august', 'september', 'october', 'november', 'december',
         'jan', 'feb', 'mrt', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    days = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag', 'zondag',
           'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
           'ma', 'di', 'wo', 'woe', 'do', 'vrij', 'za', 'zat', 'zo', 'vr']
    
    datesPattern = [{"LOWER" : {"IN" : days}, "OP" : "?"}, 
           {"IS_DIGIT": True}, 
           {"LOWER" : {"IN" : months}},
           {"IS_PUNCT" : True, "OP" : "?", "TEXT":'.'},
           {"IS_DIGIT": True, "OP" : "?"}]
    matcher = Matcher(nlp.vocab)
    matcher.add("Dates", [datesPattern])
    
    # cor = exact match, inc = match is wrong (wrong bounds or label), mis = missing match, spu = found something that isnt a mactch
    if os.path.isfile('..\\data\\results\\dates_results,json'):
        with open('..\\data\\results\\dates_results,json') as f:
            results = json.load(f)
        
        pass
    else:
        results = {
            'correct':0,
            'incorrect':0,
            'missing':0,
            'spurious':0,
            'america' :0,
            'ocr':0,
            'oneoff':0,
            'other':0
        }
    
    print(results)
    cases = ['correct','incorrect','missing','spurious','america','ocr','oneoff','other']
    documents = 0
    
    while(True):
        try:
            print(sum([results[x] for x in results.keys() if x in cases[:4]]), results['documents'])
            sample = df.sample(1)
            print(sample.name.values[0], sample.page.values[0], '\n\n')
            text = sample.text.values[0]
            text = re.sub('\n+', '\n', text)
            text = re.sub(' +', ' ', text)
            doc = nlp(text)
            matches = matcher(doc)

            regexMatches = regexMatcher(text)

            showMatches(doc, matches, regexMatches)

            for case in cases:
                result = inputHandling(case)
                if result == -1:
                    results['documents'] += documents
                    return results
                else:
                    results[case] += result
            documents += 1
            clear_output()
        except:
            print('error')
            continue
            

In [8]:
def calcResults(r):
    precision = r['correct'] / (r['correct']+r['incorrect']+r['spurious'])
    recall = r['correct'] / (r['correct']+r['incorrect']+r['missing'])
    f1 = 2 * ((recall * precision)/(recall + precision))
    results['precision'] = precision
    results['recall'] = recall
    results['f1'] = f1 
    print(f'precision = {precision}, recall = {recall}, f1 = {f1}')
    return results

In [9]:
results = evaluate(nlp)
results = calcResults(results)

with open('..\\data\\results\\dates_results,json', 'w') as f:
    json.dump(results, f)


499 227
d8d9c5015c9ceb952052f29e1a27ed1f_openbaar-te-maken-documenten-deel-2 108 


Departementaal vertrouwelijk 
 Crisisteam [1;31m9 [0m[1;31mapril [0m[1;31m2020 [0m, 10:00 uur Notulist : Joyce Corver 
 
 
 Akkoord : Johan Gro _ 
 
 
 
 
 
 
 
 
 
 
 35 . | Versoepeling [1;31m24/7 [0mbeleid aanwezigheid arts/verpleegkundige bij besmette gedetineerden c Afgerond 
 < 70jr , [1;31m23-03-2020 [0m
 . | e e . O O hoeveel justitiabelen er Afgerond 
 per plek aanwezig zijn , zodat het mogelijk wordt naar rato beschermingsmiddelen te [1;31m23-03-2020 [0m
 leveren . 
 37 . | e e t de procedure uit hoe het werkt met het verkrijgen van bijstand van Afgerond 
 Defensie . [1;31m23-03-2020 [0m
 DGS&B doet de aanvraag . Er is een standaard format voor . 
 3 . | e t u t de factsheets die op intranet staan , t.b.v. justitiabelen naar Afgerond 
 EE n n Zij u caten de factsheets . [1;31m23-03-2020 [0m
 
 
 3 . | e e een voorstel voor het Beraad van komende Afgerond 
 
 
 
 
 
 
 
 
 
 
 


In [10]:
results

{'correct': 424,
 'incorrect': 9,
 'missing': 54,
 'spurious': 13,
 'america': 30,
 'ocr': 19,
 'oneoff': 12,
 'other': 15,
 'documents': 265,
 'precision': 0.9506726457399103,
 'recall': 0.8706365503080082,
 'f1': 0.9088960342979635}

removed all access newlines and spaces

Part of the reason not all dates were caught was an oversight with american date formats. To validate the if a found match is actually a date, it needs a date format to compare the match to. Only the dd-mm-yyyy format was checked and not the american format of mm-dd-yyyy with the month before the year. This means it excludes dates like 04-14-2022 as this cannot be a date in the dd-mm-yyyy format because there obviously isn't a 14th month. In the american system this is just april 14th 2022.

Another reason for not finding some dates is OCR mistakes. For example, in one case the date that was supposed to be found was "5/12/2021" but in the OCR process that string was read as "542/2021" where the "/" and "1" were seen as one character, a 4.

Missed dates in file names were excluded from the evaluation because these are not the dates your looking for

precision = 0.9197860962566845, recall = 0.8911917098445595, f1 = 0.9052631578947369

# Ministries
This next part is the evalutation of the ministries

In [3]:
def getMinisteries():
    page = requests.get('https://nl.wikipedia.org/wiki/Lijst_van_Nederlandse_ministeries')
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all("td")[-1]

    results.find_all('a', href = True)
    wikis = {}

    abrr = []
    for item in str(results.find_all('p')[0]).split('\n')[:-1] + str(results.find_all('p')[1]).split('\n')[1:-1]:
        temp = re.findall('(?<=\()(.*?)(?=\))', item)
        if temp == []:
            abrr.append(None)
        elif temp[-1] == 'Nederland':
            abrr.append(None)
            if 'Overzeese Gebiedsdelen' in item:
                abrr.append(None)
        else:
            abrr.append(temp[-1].replace('&amp;', '&'))


    counter = 0
    for ministerie in results.find_all('a')[:12]:
        wikis[ministerie.text] = {'Link': 'https://nl.wikipedia.org' + ministerie['href'], 'Abbriviation' : abrr[counter]}
        counter += 1
    
    minList = list(wikis.keys()) + [wikis[x]['Abbriviation'] for x in wikis.keys()]
    for x in minList:
        temp+=x.replace(',', '').split(' ')
    
    return [x.lower() for x in temp if x != 'en' and x != '']
minList = getMinisteries()

In [60]:
def printHilight(string):
    print('\x1b[1;31m'+string + ' ' +'\x1b[0m', end='')

def printHilightUnderline(string):
    print('\033[4m'+string + ' ' +'\x1b[0m', end='')
    
def showMatchesMinistries(doc, minList):   
    indexOfMatches = []
    for ent in doc.ents:
        for i in range(int(ent.start), int(ent.end)):
            indexOfMatches.append(i)

    indexOfMatches = set(indexOfMatches)

    for token in doc:
        flag = False
        for mini in minList:
            if token.text.lower() == mini:
                flag = True
                break
        
        if token.i in indexOfMatches:
            printHilight(str(token.text))
            
        elif flag:
            printHilightUnderline(str(token.text))
            
        else:
            print(token, end=' ')
    
    return
    

In [61]:
def inputHandling(message):
    while(True):
        i = input(message)
        if i == 'q':
            return -1
        
        elif i == '':
            return 0
        
        try:
            i = int(i)
            return i
        
        except:
            print("input number, exit or nothing")

def evaluateMinistries(nlp, minList):
    # cor = exact match, inc = match is wrong (wrong bounds or label), mis = missing match, spu = found something that
    # isnt a mactch, 
    results = {
        'correct':0,
        'incorrect':0,
        'missing':0,
        'spurious':0,
        'partial':0
    }
    
    minList = set(minList)
    while(True):
        try:
            sample = df.sample(1)
            text = sample.text.values[0]
            text = re.sub('\n+', '\n', text)
            text = re.sub(' +', ' ', text)
            for m in minList:
                if m in text.split(' '):
                    print(m, 'AHHHHHHHHHHHHHHHH')
                    break
            else:
                continue
                
            print(sum(results.values()))
            print(sample.name.values[0], sample.page.values[0], '\n\n')
            doc = nlp(text)
            
            showMatchesMinistries(doc, minList)

            for case in results.keys():
                result = inputHandling(case)
                if result == -1:
                    return results
                else:
                    results[case] += result

            clear_output()
        except:
            continue
            
        

In [62]:
def calcResults(r):
    precision = (r['correct'] + (0.5 * r['partial'])) / (r['correct']+r['incorrect']+r['spurious'])
    recall = (r['correct'] + (0.5 * r['partial'])) / (r['correct']+r['incorrect']+r['missing'])
    f1 = 2 * ((recall * precision)/(recall + precision))
    print(f'precision = {precision}, recall = {recall}, f1 = {f1}')

In [63]:
results = evaluateMinistries(nlp_ministries, minList)
calcResults(results)

zaken AHHHHHHHHHHHHHHHH
96
6762214604a58986abf9cc852b4202e9_bijlagen-deel-7-bij-besluit-wob-verzoek-over-covid-19 59 


529309 
 
 
 | Crisiscoördinator DCC [1;31mVWS [0m| 
 Ministerie van Volksgezondheld , [4mWelzijn [0men [4mSport [0m| Directie Publieke Gezondheid | 
 Afdeling Crisisbeheersing en Infectieziekten | Etage : 8 flex 
 102 _ G 
 Parnassusplein 5| 2511 VX | Den Haag | 
 Postbus 20350 | 2500 EJ | Den Haag 
 
 
 
 Van : NCC - NCTV 
 Verzonden : woensdag 29 januari 2020 14:54:01 ( UTC+01:00 ) Amsterdam , Berlijn , Bern , Rome , Stockholm , Wenen 
 Onderwerp : RECTIFICATIE uitnodiging IAO maandag 3 februari om 14.00 uur met betrekking tot de stand van [4mzaken [0mrondom de 
 uitbraak van het Corona virus in China 
 RECTIFICATIE : 14:00 uur i.p.v. 13.30 uur 
 Geachte heer/mevrouw , 
 Op verzoek van [1;31mVWS [0men NCTV nodig ik u uit voor een Interdepartementaal Afstemmings Overleg ( IAO ) op maandag 3 
 februari om 14.00 uur met betrekking tot de stand van [4mzaken 

In [41]:
t = nlp_ministries('het Ministerie is een beetje dom')
tl = ['ministerie']
for w in t:
    print(str(w.text).lower() in tl)

False
True
False
False
False
False


In [170]:
minList

['wvc',
 'algemene',
 'zaken',
 'binnenlandse',
 'zaken',
 'koninkrijksrelaties',
 'buitenlandse',
 'zaken',
 'defensie',
 'economische',
 'zaken',
 'klimaat',
 'financiën',
 'infrastructuur',
 'waterstaat',
 'justitie',
 'veiligheid',
 'landbouw',
 'natuur',
 'voedselkwaliteit',
 'onderwijs',
 'cultuur',
 'wetenschap',
 'sociale',
 'zaken',
 'werkgelegenheid',
 'volksgezondheid',
 'welzijn',
 'sport',
 'az',
 'bzk',
 'bz',
 'def',
 'ez',
 'fin',
 'i&w',
 'j&v',
 'lnv',
 'ocw',
 'szw',
 'vws']

Missing things in situations like this: RVO/LNV. LNV should be have been caught here. Places with a lot of extra newlines or spaces within the name of a ministry will also trip up the model.

# SpaCy

This next part is the spacy analysis

In [4]:
def loadGroundTruth():
    with open('..\\data\\ner labeled data\\test.conllu', 'r', encoding='utf8') as f:
        ground = f.read()
        ground = ground.split('\n')
        ground = [x.split('\t') for x in ground]
    
    text = []
    for word in ground:
        try:
            text.append(word[1])
        except IndexError:
            text.append('\n')
    

    text = ' '.join(text)
    return text, ground


In [5]:
# this function makes sure that an index refers to the same token in both the ground truth and the doc
def sync(ground, doc):
    
    # newlines in ground truth are given as an empty string in a list
    # spacy doesnt do tokens for newlines so these can be removed
    ground = [x for x in ground if x != ['']]
    
    groundIndex, docIndex = 0, 0
    docNew, groundNew = [], []
    
    while True:
        if groundIndex+2 > len(ground) or docIndex+2 > len(doc):
            return groundNew, docNew
        
        
        if str(doc[docIndex].text) != ground[groundIndex][1]:
            # see if next token in ground truth equals current doc token
            if str(doc[docIndex].text) == ground[groundIndex + 1][1]:
                groundIndex +=1
            elif str(doc[docIndex].text) == ground[groundIndex + 2][1]:
                groundIndex +=2

            # see if next token in doc equals current ground truth token
            elif str(doc[docIndex + 1].text) == ground[groundIndex ][1]:
                docIndex+=1
            elif str(doc[docIndex + 2].text) == ground[groundIndex ][1]:
                docIndex+=2


            # checks if doc split a token that ground truth didnt
            elif str(doc[docIndex].text) + str(doc[docIndex + 1].text) == ground[groundIndex ][1]:
                docIndex += 2
                groundIndex += 1

            # checks if ground split a token that doc didnt
            elif str(doc[docIndex].text) == ground[groundIndex ][1] + ground[groundIndex + 1][1]:
                docIndex += 1 
                groundIndex += 2

            # checks if doc split a token that ground truth didnt
            elif str(doc[docIndex].text) + str(doc[docIndex+1].text)+ str(doc[docIndex+2].text) == ground[groundIndex][1]:
                docIndex += 3
                groundIndex += 1

            # checks if ground split a token that doc didnt
            elif str(doc[docIndex].text) == ground[groundIndex][1] + ground[groundIndex+1][1] + ground[groundIndex+2][1]:
                docIndex += 1 
                groundIndex += 3

            else:

                print(str(doc[docIndex].text), str(doc[docIndex+1].text),str(doc[docIndex+2].text)) 
                print(ground[groundIndex][1],ground[groundIndex+1][1],ground[groundIndex+2][1])
                return groundNew, docNew
        
        # add good tokens to new lists
        groundNew.append((ground[groundIndex][1], ground[groundIndex][2]))
        docNew.append((str(doc[docIndex].text), doc[docIndex].ent_type_))
            
        groundIndex += 1
        docIndex += 1

def testSync(ground, doc):
    for i in range(len(doc)):
        if doc[i][0] != ground[i][0]:
            return False
    return True

In [6]:
# this gets the span of all entities in ground truth
def getSpans(ground):
    span = []
    i = 0
    flag = False
    begin = 0
    
    while i < len(ground):
        
        # base case: no continuation of entiy or start of entity
        if ground[i][1] == 'O' and flag == False:
            pass
        
        # end of entity span no new entity, reset flag and add entity
        elif ground[i][1] == 'O' and flag == True:
            span.append(((begin, i), ground[begin][1]))
            flag = False
        
        # end of entity span, new entity starts. Reset flag, add entity, and start new entity
        elif ground[i][1][0] == 'B' and flag == True:
            span.append(((begin, i), ground[begin][1]))
            begin = i
        
        # start of new entity, set flag and begin
        elif ground[i][1][0] == 'B' and flag == False:
            flag = True
            begin = i
        i+=1
        
    
    return span

In [40]:
# save predictions to csv
def savePredictions(predictions):
    begin = [x[0][0] for x in predictions]
    end = [x[0][1] for x in predictions]
    entType = [x[1] for x in predictions]
    ent = [x[2] for x in predictions]
    pred = pd.DataFrame.from_dict({'begin':begin, 'end':end, 'entType':entType, 'ent':ent})
    pred.to_csv('..\\data\\predictions.csv')

# get entities found by spacy in correct format
def getPredictionSpans(doc, tokens):
    spans = []
    
    i = 0
    entityIndex = 0
    
    # check for every entity
    while i < len(tokens):
        try:
            
            # if current token has an entity label
            if tokens[i][1] != '':
                
                # get entity type and string representation
                entity = doc.ents[entityIndex]
                entityType = doc[entity.start].ent_type_
                entity = str(entity)
                
                # get number of tokens in entity and add to list
                nTokens = len(entity.split(' '))
                spans.append(((i, i + nTokens), entityType, entity))
                
                # increase token index by number of tokens in current entity
                i += nTokens
                
                # set entity index to next
                entityIndex += 1

            else:
                i += 1
        except IndexError:
            savePredictions(spans)
            return
        
    
    savePredictions(spans)
    return

In [41]:
# load predictions from csv
def getPredictions():
    predictions = pd.read_csv('..\\data\\predictions.csv')
    begin = list(predictions.begin)
    end = list(predictions.end)
    entType = list(predictions.entType)
    ent = list(predictions.ent)
    
    predictionsList = []
    for i in range(len(begin)):
        predictionsList.append(((begin[i], end[i]), entType[i], ent[i]))
        
    return predictionsList

In [98]:
# text = full text as string
# groundOld = tokens with ground truth labels
# doc = spacy doc of full text
# ground = list of tokens with ground truth labels in sync with docList
# docList = list of tokens from spacy with predicted labels in sync with ground
# predictions = list of predicted entities by spacy with begin, end, type and text
# span = list of ground truth entites with begin, end, type


text, groundOld = loadGroundTruth()
doc = nlp(text[:1000000])
ground, docList = sync(groundOld, doc)
predictions = getPredictions()
span = getSpans(ground)

In [102]:
len(groundOld)
len(docList)

170685

## SpaCy eval normal (Doenst work, is not used)
also very bad and doesnt even do what it is supposed to

In [88]:
def calcF1(tp, tn, fp, fn):
    print(tp, tn, fp, fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1 = 2 * ((recall * precision) / (recall + precision))
    return recall, precision, f1
    

In [112]:
def evalNER(ground, doc):
    
    groundIndex, docIndex = 0, 0
    tp, tn, fp, fn = 0, 0, 0, 0
    nerCats = ['FAC', 'PERSON', 'ORG', 'GPE', 'LOC']
    groundCats = ['ORG', 'LOC', 'PER']
    
    while True:
        try:
            if str(doc[docIndex].text) != ground[groundIndex][1]:
                
                # checks for if the words are still in sync
                if str(doc[docIndex + 1].text) != ground[groundIndex][1]:
                    docIndex += 1
                    continue
                
                elif str(doc[docIndex].text) != ground[groundIndex + 1][1]:
                    groundIndex += 1
                    continue
                    
                else:
                    for j in range(docIndex-3,docIndex+3):
                        print(doc[j], ground[j])
                
            else:
                if doc[docIndex].ent_type_ == '' and ground[groundIndex][2] == 'O':
                    tn += 1
                    
                elif doc[docIndex].ent_type_ in nerCats and ground[groundIndex][2] == 'O':
                    fp += 1
                    
                elif doc[docIndex].ent_type_ in nerCats and ground[groundIndex][2] != 'O':
                    tp += 1
                    
                elif doc[docIndex].ent_type_ == '' and ground[groundIndex][2] != 'O':
                    fn += 1
        
        except IndexError:
            pass
        
        groundIndex +=1
        docIndex +=1
        
        if groundIndex > len(ground) and docIndex > len(doc):
            print('good')
            return calcF1(tp, tn, fp, fn)

In [113]:
recall, precision, f1 = evalNER(ground, doc)
print('recall', recall)
print('precision', precision)
print('f1', f1)

good
1240 10349 47 185
recall 0.8701754385964913
precision 0.9634809634809635
f1 0.9144542772861357


## SpaCy eval strict

In [49]:
def calcF1Strict(cor, inc, spu, mis):
    print(cor,inc,spu,mis)
    recall = cor / (cor+inc+mis)
    precision = cor / (cor+inc+spu)
    f1 = 2 * ((recall * precision) / (recall + precision))
    print('recall', recall)
    print('precision', precision)
    print('f1', f1)
    results = {'total':cor + inc + spu + mis,
              'correct':cor,
              'incorrect':inc,
              'missing':mis,
              'spurious':spu,
              'precision':precision,
              'recall':recall,
              'f1':f1}
    
    return results

In [103]:
def evalNER(ground, pred, method):
    correct, incorrect, spurious, missing = 0, 0, 0, 0
    
    spacyBanList = ['CARDINAL','DATE','LAW','MONEY','ORDINAL','PERCENT','QUANTITY','TIME']
    cats = {'B-ORG': ['ORG'],
           'B-PER': ['PERSON'],
           'B-LOC': ['FAC', 'GPE', 'LOC'],
           'B-ORG': ['ORG']}
    
    groundIndex = 0
    predIndex = 0
    
    while True:
        if groundIndex >= len(ground) or predIndex >= len(pred):
            return calcF1Strict(correct, incorrect, spurious, missing)
        
        # set current tokens 
        groundEnt = ground[groundIndex]
        predEnt = pred[predIndex]
        
        # correct span
        if groundEnt[0] == predEnt[0]:
            if method == 'exact':
                correct += 1
            
            else:
                # correct type
                if groundEnt[1] == 'B-MISC':
                    correct += 1

                # also correct type
                elif predEnt[1] in cats[groundEnt[1]]:
                    correct += 1

                # not correct type
                else:
                    incorrect += 1
                
            groundIndex += 1
            predIndex += 1
                
        # no overlap between spans
        elif groundEnt[0][0] > predEnt[0][1]:
            # ground is higher, increase predEnt
            # spurious
            # check ents: some do not count
            if predEnt[1] not in spacyBanList:
                spurious += 1
            
            predIndex += 1

        elif groundEnt[0][1] < predEnt[0][0]:
            # ground is lower, increase ground
            # missing
            groundIndex += 1
            missing += 1
        
        # overlap between spans
        else:
            incorrect += 1
            groundIndex += 1
            predIndex += 1   
        


In [106]:
results = evalNER(span, predictions, 'strict')
with open('..\\data\\results\\ner_strict_results,json', 'w') as f:
    json.dump(results, f)
results

10359 1935 1093 1029
recall 0.7775275838775051
precision 0.7738104130873236
f1 0.7756645451141895


{'total': 14416,
 'correct': 10359,
 'incorrect': 1935,
 'missing': 1029,
 'spurious': 1093,
 'precision': 0.7738104130873236,
 'recall': 0.7775275838775051,
 'f1': 0.7756645451141895}

## SpaCy eval exact

In [107]:
results = evalNER(span, predictions, 'exact')
with open('..\\data\\results\\ner_exact_results,json', 'w') as f:
    json.dump(results, f)
results

10976 1318 1093 1029
recall 0.8238384748179839
precision 0.8198999028908642
f1 0.8218644702358667


{'total': 14416,
 'correct': 10976,
 'incorrect': 1318,
 'missing': 1029,
 'spurious': 1093,
 'precision': 0.8198999028908642,
 'recall': 0.8238384748179839,
 'f1': 0.8218644702358667}

These are really good scores for the model, especially the exact one. But we need to keep in mind that the permorance of a NER model can depend on the sort of text. It could be that the spaCy model was trained on documents that have a very high resemblance to this test set. The performance on the WOB document can be lower than these results suggest.

For the evaluation, spacy has a lot more things it looks for, like monotary values or percentages. These were skipped, so if spacy found an entity that the ground truth didnt have and the label was one of the banned labels, it was not considered a spurious match. The law label was also ignored because that will be done better by hand (hopefully)