In [4]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

from IPython.display import Markdown, display
from IPython.display import clear_output

import os
import re
import json

In [5]:
spacy.prefer_gpu()
nlp = spacy.load("nl_core_news_lg")

This notebook is about testing the accuracy of the spaCy NER model. This was done by using a pre-exisiting labeled dataset. The code in this notebook is to answer 2.1.


The first part is all of the data handeling. It took quite some effort to get the data in the correct format to actually be  able to calculate the performance. 

In [6]:
def loadGroundTruth():
    with open('..\\data\\ner labeled data\\test.conllu', 'r', encoding='utf8') as f:
        ground = f.read()
        ground = ground.split('\n')
        ground = [x.split('\t') for x in ground]
    
    text = []
    for word in ground:
        try:
            text.append(word[1])
        except IndexError:
            text.append('\n')
    

    text = ' '.join(text)
    return text, ground


In [7]:
# this function makes sure that an index refers to the same token in both the ground truth and the doc
# its not a robust approach but it works
def sync(ground, doc):
    
    # newlines in ground truth are given as an empty string in a list
    # spacy doesnt do tokens for newlines so these can be removed
    ground = [x for x in ground if x != ['']]
    
    groundIndex, docIndex = 0, 0
    docNew, groundNew = [], []
    
    while True:
        if groundIndex+2 > len(ground) or docIndex+2 > len(doc):
            return groundNew, docNew
        
        
        if str(doc[docIndex].text) != ground[groundIndex][1]:
            # see if next token in ground truth equals current doc token
            if str(doc[docIndex].text) == ground[groundIndex + 1][1]:
                groundIndex +=1
            elif str(doc[docIndex].text) == ground[groundIndex + 2][1]:
                groundIndex +=2

            # see if next token in doc equals current ground truth token
            elif str(doc[docIndex + 1].text) == ground[groundIndex ][1]:
                docIndex+=1
            elif str(doc[docIndex + 2].text) == ground[groundIndex ][1]:
                docIndex+=2


            # checks if doc split a token that ground truth didnt
            elif str(doc[docIndex].text) + str(doc[docIndex + 1].text) == ground[groundIndex ][1]:
                docIndex += 2
                groundIndex += 1

            # checks if ground split a token that doc didnt
            elif str(doc[docIndex].text) == ground[groundIndex ][1] + ground[groundIndex + 1][1]:
                docIndex += 1 
                groundIndex += 2

            # checks if doc split a token that ground truth didnt
            elif str(doc[docIndex].text) + str(doc[docIndex+1].text)+ str(doc[docIndex+2].text) == ground[groundIndex][1]:
                docIndex += 3
                groundIndex += 1

            # checks if ground split a token that doc didnt
            elif str(doc[docIndex].text) == ground[groundIndex][1] + ground[groundIndex+1][1] + ground[groundIndex+2][1]:
                docIndex += 1 
                groundIndex += 3

            else:

                print(str(doc[docIndex].text), str(doc[docIndex+1].text),str(doc[docIndex+2].text)) 
                print(ground[groundIndex][1],ground[groundIndex+1][1],ground[groundIndex+2][1])
                return groundNew, docNew
        
        # add good tokens to new lists
        groundNew.append((ground[groundIndex][1], ground[groundIndex][2]))
        docNew.append((str(doc[docIndex].text), doc[docIndex].ent_type_))
            
        groundIndex += 1
        docIndex += 1

def testSync(ground, doc):
    for i in range(len(doc)):
        if doc[i][0] != ground[i][0]:
            return False
    return True

In [8]:
# this gets the span of all entities in ground truth
def getSpans(ground):
    span = []
    i = 0
    flag = False
    begin = 0
    
    while i < len(ground):
        
        # base case: no continuation of entiy or start of entity
        if ground[i][1] == 'O' and flag == False:
            pass
        
        # end of entity span no new entity, reset flag and add entity
        elif ground[i][1] == 'O' and flag == True:
            span.append(((begin, i), ground[begin][1]))
            flag = False
        
        # end of entity span, new entity starts. Reset flag, add entity, and start new entity
        elif ground[i][1][0] == 'B' and flag == True:
            span.append(((begin, i), ground[begin][1]))
            begin = i
        
        # start of new entity, set flag and begin
        elif ground[i][1][0] == 'B' and flag == False:
            flag = True
            begin = i
        i+=1
        
    
    return span

In [9]:
# save predictions to csv
def savePredictions(predictions):
    begin = [x[0][0] for x in predictions]
    end = [x[0][1] for x in predictions]
    entType = [x[1] for x in predictions]
    ent = [x[2] for x in predictions]
    pred = pd.DataFrame.from_dict({'begin':begin, 'end':end, 'entType':entType, 'ent':ent})
    pred.to_csv('..\\data\\predictions.csv')

# get entities found by spacy in correct format
def getPredictionSpans(doc, tokens):
    spans = []
    
    i = 0
    entityIndex = 0
    
    # check for every entity
    while i < len(tokens):
        try:
            
            # if current token has an entity label
            if tokens[i][1] != '':
                
                # get entity type and string representation
                entity = doc.ents[entityIndex]
                entityType = doc[entity.start].ent_type_
                entity = str(entity)
                
                # get number of tokens in entity and add to list
                nTokens = len(entity.split(' '))
                spans.append(((i, i + nTokens), entityType, entity))
                
                # increase token index by number of tokens in current entity
                i += nTokens
                
                # set entity index to next
                entityIndex += 1

            else:
                i += 1
        except IndexError:
            savePredictions(spans)
            return
        
    
    savePredictions(spans)
    return

In [10]:
# load predictions from csv
def getPredictions():
    predictions = pd.read_csv('..\\data\\predictions.csv')
    begin = list(predictions.begin)
    end = list(predictions.end)
    entType = list(predictions.entType)
    ent = list(predictions.ent)
    
    predictionsList = []
    for i in range(len(begin)):
        predictionsList.append(((begin[i], end[i]), entType[i], ent[i]))
        
    return predictionsList

In [11]:
# text = full text as string
# groundOld = tokens with ground truth labels
# doc = spacy doc of full text
# ground = list of tokens with ground truth labels in sync with docList
# docList = list of tokens from spacy with predicted labels in sync with ground
# predictions = list of predicted entities by spacy with begin, end, type and text
# span = list of ground truth entites with begin, end, type


text, groundOld = loadGroundTruth()
doc = nlp(text[:1000000])
ground, docList = sync(groundOld, doc)
predictions = getPredictions()
span = getSpans(ground)

NameError: name 'pd' is not defined

The last 4 cells contian the code that does the actual evaluation. 

In [None]:
def calcF1Strict(cor, inc, spu, mis):
    print(cor,inc,spu,mis)
    recall = cor / (cor+inc+mis)
    precision = cor / (cor+inc+spu)
    f1 = 2 * ((recall * precision) / (recall + precision))
    print('recall', recall)
    print('precision', precision)
    print('f1', f1)
    results = {'total':cor + inc + spu + mis,
              'correct':cor,
              'incorrect':inc,
              'missing':mis,
              'spurious':spu,
              'precision':precision,
              'recall':recall,
              'f1':f1}
    
    return results

In [None]:
def evalNER(ground, pred, method):
    correct, incorrect, spurious, missing = 0, 0, 0, 0
    
    spacyBanList = ['CARDINAL','DATE','LAW','MONEY','ORDINAL','PERCENT','QUANTITY','TIME']
    cats = {'B-ORG': ['ORG'],
           'B-PER': ['PERSON'],
           'B-LOC': ['FAC', 'GPE', 'LOC'],
           'B-ORG': ['ORG']}
    
    groundIndex = 0
    predIndex = 0
    
    while True:
        if groundIndex >= len(ground) or predIndex >= len(pred):
            return calcF1Strict(correct, incorrect, spurious, missing)
        
        # set current tokens 
        groundEnt = ground[groundIndex]
        predEnt = pred[predIndex]
        
        # correct span
        if groundEnt[0] == predEnt[0]:
            if method == 'exact':
                correct += 1
            
            else:
                # correct type
                if groundEnt[1] == 'B-MISC':
                    correct += 1

                # also correct type
                elif predEnt[1] in cats[groundEnt[1]]:
                    correct += 1

                # not correct type
                else:
                    incorrect += 1
                
            groundIndex += 1
            predIndex += 1
                
        # no overlap between spans
        elif groundEnt[0][0] > predEnt[0][1]:
            # ground is higher, increase predEnt
            # spurious
            # check ents: some do not count
            if predEnt[1] not in spacyBanList:
                spurious += 1
            
            predIndex += 1

        elif groundEnt[0][1] < predEnt[0][0]:
            # ground is lower, increase ground
            # missing
            groundIndex += 1
            missing += 1
        
        # overlap between spans
        else:
            incorrect += 1
            groundIndex += 1
            predIndex += 1   
        


## Strict evaluation

In [None]:
results = evalNER(span, predictions, 'strict')
with open('..\\data\\results\\ner_strict_results,json', 'w') as f:
    json.dump(results, f)
results

## Excact evaluation

In [None]:
results = evalNER(span, predictions, 'exact')
with open('..\\data\\results\\ner_exact_results,json', 'w') as f:
    json.dump(results, f)
results

## Conclusion

These are really good scores for the model, especially the exact one. But we need to keep in mind that the permorance of a NER model can depend on the sort of text. It could be that the spaCy model was trained on documents that have a very high resemblance to this test set. The performance on the WOB document can be lower than these results suggest.

For the evaluation, spacy has a lot more things it looks for, like monotary values or percentages. These were skipped, so if spacy found an entity that the ground truth didnt have and the label was one of the banned labels, it was not considered a spurious match. The law label was also ignored because that will be done better by hand (hopefully)

There are also some limitations concerning the language. The spaCy model uses the dutch language pack as most docs are in dutch, but there are some documents in other languages, mostly english.