# Improving Named Entity Recognition address parsing
Data & AI course, UC Leuven, 2021 Fall
### Project supervisors
- Tom Magerman
- Aimée Lynn Backiel

### Project team (Group 4)
- Karolis Medekša
- Pedro Teixeira Palma Rosa
- Hysa Mello de Alcântara
- Josep Jacob Chetrit Valdepeñas

## Goals
The goal of the assignment is to try and improve the existing solution for parsing addresses using NLP.

# Existing improvements
### The following improvements were already implemented with the first draft of the solution, which, in our opinion, are noteworthy:
- Fixing mistakes in the training dataset by hand (there might still be mistakes in the validation set)
- Fixing conflicting entity errors when training the model
- Pre-parsing the data so that tokenizer can recognize all tokens
- Improving the algorithm to handle overlapping entities
- Fine-tuning the `drop` criteria and iteration count

For more information how these improvements were implemented please consult the `DOCUMENTATION` notebook.

# Evaluating NER model performance by country
One thing to look into with the model is how well does it perform with different regions. Postal codes are different among different countries, moreover, cities and regions can differ greatly among regions.

To conduct the experiment, we first train a baseline model as the result of the initial exercise (refer to `DOCUMENTATION` for more information about it):

In [4]:
import pandas as pd
import re

## Define utility functions:

def read_DataFrame_from_excel(filename: str, numberOfRows: int = None):
    return pd.read_excel(filename, nrows = numberOfRows, keep_default_na=False)


def preprocess_data(data: pd.DataFrame):
    for col in data.columns:
        data[col] = data.apply(lambda row: re.sub(r'([^\s])([,;])([^\s])', r'\1\2 \3', str(row[col])), axis=1)


def entities_overlap(entry):
    entities = entry[1]['entities']
    for first in entities:
        for second in entities:
            if (first == second): continue
            if (first[0] < second[0] and first[1] > second[0]) or (first[0] > second[0] and first[1] < second[0]) or (first[0]==second[0] or first[1]==second[1]):
                print('Entities {} and {} overlap in "{}"'.format(first, second, entry[0]))
                return True
    return False


def get_entity_list(entry: dict, adr: str):
    address = str(adr)
    entities: list = []
    present_tokens = filter(lambda item: item[0] in TOKEN_TYPES and item[1] and str(item[1]).strip(), entry.items())

    ## tokens to retry matching
    retry_tokens: set = set()

    for item in present_tokens:
        token_value = str(item[1]).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            # If multiple occurences can be matched, save the token to be matched later
            if (len(re.findall(re.escape(token_value), address)) > 1):
                retry_tokens.add((token_value, item[0]))
                continue
            span = match.span()
            entities.append((span[0], span[1], item[0]))
            # Replace matched entity with symbols, so that parts of it cannot be matched again
            address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
        else:
            # Try and resolve multiple tokens separated by ';'
            split_items = map(lambda token: token.strip(), token_value.split(';'))
            for token in split_items:
                split_match = re.search(re.escape(token), address)
                if split_match:
                    # If multiple occurences can be matched, save the token to be matched later
                    if (len(re.findall(re.escape(token), address)) > 1):
                        retry_tokens.add((token, item[0]))
                        continue
                    span = split_match.span()
                    entities.append((span[0], span[1], item[0]))
                    # Replace matched entity with symbols, so that parts of it cannot be matched again
                    address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
                else:
                    print('WARNING: could not find token "{}" in address "{}"'.format(token, adr))
    
    # Try and match previously marked tokens, now that single-match entities were eliminated
    for token, tkn_type in retry_tokens:
        token_value = str(token).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            span = match.span()
            entities.append((span[0], span[1], tkn_type))
            address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
        else:
            print('WARNING: could not find token "{}" in address "{}"'.format(token, adr))

    return entities


def map_to_training_entry(entry: dict):
    address = entry['person_address']
    return (address, {
        'entities': get_entity_list(entry, address)
    })

In [8]:
from sklearn.model_selection import train_test_split
import numpy as np

TOKEN_TYPES: set = {'co', 'building', 'street', 'nr', 'area', 'postal', 'city', 'region', 'country'}

raw_data: pd.DataFrame = read_DataFrame_from_excel('../files/training_data_fixed.xlsx', 999)
preprocess_data(raw_data)

train_data = list(
    map(map_to_training_entry, raw_data.to_dict('records'))
)
train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

train_sample, test_sample = train_test_split(
    train_data, test_size = 0.2, random_state = 420
)
print('train entries: {} | test entries: {}'.format(len(train_sample), len(test_sample)))

train entries: 799 | test entries: 200


In [13]:
import spacy
import random
from spacy.util import minibatch, compounding

nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

for token in TOKEN_TYPES:
    ner.add_label(token)

optimizer = nlp.begin_training()
for itn in range(20):
    random.shuffle(train_sample)
    losses = {}

    batches = minibatch(train_sample, size=compounding(4, 32, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  
            annotations,  
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
    print('Iteration: {} | Losses: {}'.format(itn, losses))

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Iteration: 0 | Losses: {'ner': 3977.276416558074}
Iteration: 1 | Losses: {'ner': 3580.2742863950175}
Iteration: 2 | Losses: {'ner': 3282.5660284712194}
Iteration: 3 | Losses: {'ner': 3045.0924848197546}
Iteration: 4 | Losses: {'ner': 2925.8011155040053}
Iteration: 5 | Losses: {'ner': 2792.9772251557188}
Iteration: 6 | Losses: {'ner': 2665.614183408595}
Iteration: 7 | Losses: {'ner': 2623.1970171048315}
Iteration: 8 | Losses: {'ner': 2583.3433715226765}
Iteration: 9 | Losses: {'ner': 2490.063479144209}
Iteration: 10 | Losses: {'ner': 2361.784224360041}
Iteration: 11 | Losses: {'ner': 2312.013685919401}
Iteration: 12 | Losses: {'ner': 2199.3338798884456}
Iteration: 13 | Losses: {'ner': 2244.988300194779}
Iteration: 14 | Losses: {'ner': 2119.654617292578}
Iteration: 15 | Losses: {'ner': 2089.4424322877303}
Iteration: 16 | Losses: {'ner': 2079.1763659212284}
Iteration: 17 | Losses: {'ner': 1929.159082558892}
Iteration: 18 | Losses: {'ner': 1862.7613717149006}
Iteration: 19 | Losses: {'ner'

In [16]:
%%capture
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def results_per_entity_to_df(res: dict):
    columns = ['Token', 'Precision', 'Recall', 'F1 score']
    df = pd.DataFrame(columns=columns)
    total = pd.concat(
        [pd.DataFrame([['Total', res['ents_p'], res['ents_r'], res['ents_f']]], columns=columns)]
        , ignore_index=True
    )
    per_entity = pd.concat(
        [pd.DataFrame([
            [token, 
             res['ents_per_type'][token]['p'], 
             res['ents_per_type'][token]['r'], 
             res['ents_per_type'][token]['f']]
        ], columns=columns) for token in TOKEN_TYPES], ignore_index=True
    )
    return pd.concat([per_entity, total], ignore_index=True)


def map_to_evaluation_model(entry: tuple):
    return (entry[0], entry[1]['entities'])


def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

train_results = evaluate(nlp, map(map_to_evaluation_model, train_sample))
test_results = evaluate(nlp, map(map_to_evaluation_model, test_sample))

In [17]:
from IPython.display import display, HTML

print('---- GENERAL: Results on train data ----')
display(HTML(results_per_entity_to_df(train_results).to_html(index=False)))
print('---- GENERAL: Results on test data ----')
display(HTML(results_per_entity_to_df(test_results).to_html(index=False)))

---- GENERAL: Results on train data ----


Token,Precision,Recall,F1 score
country,76.190476,48.484848,59.259259
postal,94.90085,98.820059,96.820809
area,82.386364,65.315315,72.864322
city,89.164491,98.414986,93.561644
co,62.616822,65.048544,63.809524
street,78.009259,86.410256,81.995134
nr,92.164179,85.172414,88.530466
building,62.5,27.027027,37.735849
region,88.050314,93.333333,90.614887
Total,86.130206,87.116564,86.620577


---- GENERAL: Results on test data ----


Token,Precision,Recall,F1 score
country,25.0,50.0,33.333333
postal,92.391304,92.391304,92.391304
area,41.463415,28.813559,34.0
city,71.090047,84.745763,77.319588
co,47.826087,45.833333,46.808511
street,62.264151,68.041237,65.024631
nr,84.615385,81.481481,83.018868
building,16.666667,10.0,12.5
region,72.289157,77.922078,75.0
Total,70.962733,73.828756,72.367379


The precision/recall of attributes postal, city, street and house number are the most important, so we'll look into optimizing them. First, let's check how many each country's addresses are there:

In [33]:
raw_data.groupby(['person_ctry_code']).size().sort_values(ascending=False).head(10)

person_ctry_code
US    347
JP    198
DE     89
FR     48
KR     43
GB     37
CN     27
TW     18
IT     16
CA     16
dtype: int64

We can also check how accurate the predictions are with each country's addresses, including both seen and unseen data:

In [52]:
def check_correctness_by_country(country_code: str, frame: pd.DataFrame):
    filtered = frame[frame['person_ctry_code'] == country_code]
    mapped = list(
        map(map_to_training_entry, filtered.to_dict('records'))
    )
    results = evaluate(nlp, map(map_to_evaluation_model, mapped))
    print('---- Results on {} addresses ----'.format(country_code))
    display(HTML(results_per_entity_to_df(results).to_html(index=False)))

In [53]:
check_correctness_by_country('US', raw_data)

---- Results on US addresses ----


  gold = GoldParse(doc_gold_text, entities=annot)


Token,Precision,Recall,F1 score
country,80.0,80.0,80.0
postal,95.967742,100.0,97.942387
area,80.645161,40.983607,54.347826
city,89.329268,97.019868,93.015873
co,63.636364,70.0,66.666667
street,85.185185,94.262295,89.494163
nr,98.26087,93.38843,95.762712
building,75.0,42.857143,54.545455
region,95.416667,98.283262,96.828753
Total,91.374122,92.299899,91.834677


In [54]:
check_correctness_by_country('JP', raw_data)

  gold = GoldParse(doc_gold_text, entities=annot)
  gold = GoldParse(doc_gold_text, entities=annot)


---- Results on JP addresses ----


Token,Precision,Recall,F1 score
country,100.0,33.333333,50.0
postal,85.0,93.150685,88.888889
area,75.0,72.0,73.469388
city,77.0,89.534884,82.795699
co,60.869565,60.869565,60.869565
street,54.615385,62.280702,58.196721
nr,33.333333,20.0,25.0
building,37.5,15.0,21.428571
region,69.354839,70.491803,69.918699
Total,69.277108,71.65109,70.444104


In [55]:
check_correctness_by_country('DE', raw_data)

---- Results on DE addresses ----


Token,Precision,Recall,F1 score
country,50.0,20.0,28.571429
postal,98.0,100.0,98.989899
area,66.666667,57.142857,61.538462
city,92.857143,96.296296,94.545455
co,60.0,100.0,75.0
street,95.744681,88.235294,91.836735
nr,91.836735,91.836735,91.836735
building,0.0,0.0,0.0
region,20.0,100.0,33.333333
Total,91.129032,91.497976,91.313131


Training the model without Japanese addresses:

In [60]:
train_data = list(
    map(map_to_training_entry, raw_data[raw_data['person_ctry_code'] != 'JP'].to_dict('records'))
)
train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

train_sample, test_sample = train_test_split(
    train_data, test_size = 0.2, random_state = 420
)
print('train entries: {} | test entries: {}'.format(len(train_sample), len(test_sample)))

train entries: 640 | test entries: 161


In [61]:
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

for token in TOKEN_TYPES:
    ner.add_label(token)

optimizer = nlp.begin_training()
for itn in range(20):
    random.shuffle(train_sample)
    losses = {}

    batches = minibatch(train_sample, size=compounding(4, 32, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  
            annotations,  
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
    print('Iteration: {} | Losses: {}'.format(itn, losses))

  gold = GoldParse(doc, **gold)


Iteration: 0 | Losses: {'ner': 2850.7694831148256}
Iteration: 1 | Losses: {'ner': 2337.574111491176}
Iteration: 2 | Losses: {'ner': 2102.10072309518}
Iteration: 3 | Losses: {'ner': 2017.526945453805}
Iteration: 4 | Losses: {'ner': 1961.4871999486684}
Iteration: 5 | Losses: {'ner': 1925.7118151357736}
Iteration: 6 | Losses: {'ner': 1802.8459183709767}
Iteration: 7 | Losses: {'ner': 1725.1936288149025}
Iteration: 8 | Losses: {'ner': 1640.6068277296922}
Iteration: 9 | Losses: {'ner': 1646.875747922958}
Iteration: 10 | Losses: {'ner': 1653.0190410545306}
Iteration: 11 | Losses: {'ner': 1544.3840787018908}
Iteration: 12 | Losses: {'ner': 1507.046106418851}
Iteration: 13 | Losses: {'ner': 1433.7361705189355}
Iteration: 14 | Losses: {'ner': 1476.5068128975824}
Iteration: 15 | Losses: {'ner': 1417.6282565029799}
Iteration: 16 | Losses: {'ner': 1358.5674984634775}
Iteration: 17 | Losses: {'ner': 1301.4037850109555}
Iteration: 18 | Losses: {'ner': 1303.6894518329257}
Iteration: 19 | Losses: {'ne

In [63]:
train_results = evaluate(nlp, map(map_to_evaluation_model, train_sample))
test_results = evaluate(nlp, map(map_to_evaluation_model, test_sample))

print('---- Without JAPAN: Results on train data ----')
display(HTML(results_per_entity_to_df(train_results).to_html(index=False)))
print('---- Without JAPAN: Results on test data ----')
display(HTML(results_per_entity_to_df(test_results).to_html(index=False)))

  gold = GoldParse(doc_gold_text, entities=annot)


---- Without JAPAN: Results on train data ----


Token,Precision,Recall,F1 score
country,80.952381,60.714286,69.387755
postal,97.250859,98.263889,97.75475
area,87.096774,55.102041,67.5
city,91.07438,98.392857,94.592275
co,66.101695,73.584906,69.642857
street,85.119048,94.389439,89.514867
nr,96.168582,92.279412,94.183865
building,55.555556,36.363636,43.956044
region,90.650407,90.283401,90.466531
Total,89.887064,89.656938,89.771853


---- Without JAPAN: Results on test data ----


Token,Precision,Recall,F1 score
country,25.0,25.0,25.0
postal,94.029851,90.0,91.970803
area,37.5,17.647059,24.0
city,77.21519,87.769784,82.154882
co,0.0,0.0,0.0
street,62.244898,87.142857,72.619048
nr,77.631579,85.507246,81.37931
building,0.0,0.0,0.0
region,86.666667,75.362319,80.620155
Total,74.743326,77.61194,76.150628


In [64]:
check_correctness_by_country('JP', raw_data)

  gold = GoldParse(doc_gold_text, entities=annot)
  gold = GoldParse(doc_gold_text, entities=annot)


---- Results on JP addresses ----


Token,Precision,Recall,F1 score
country,33.333333,33.333333,33.333333
postal,79.012346,87.671233,83.116883
area,29.62963,16.0,20.779221
city,50.367647,79.651163,61.711712
co,49.122807,40.57971,44.444444
street,26.277372,31.578947,28.685259
nr,21.621622,53.333333,30.769231
building,0.0,0.0,0.0
region,14.285714,1.639344,2.941176
Total,43.145743,46.573209,44.794007


Only Japanese addresses:

In [65]:
train_data = list(
    map(map_to_training_entry, raw_data[raw_data['person_ctry_code'] == 'JP'].to_dict('records'))
)
train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

train_sample, test_sample = train_test_split(
    train_data, test_size = 0.2, random_state = 420
)
print('train entries: {} | test entries: {}'.format(len(train_sample), len(test_sample)))

train entries: 158 | test entries: 40


In [70]:
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

for token in TOKEN_TYPES:
    ner.add_label(token)

optimizer = nlp.begin_training()
for itn in range(20):
    random.shuffle(train_sample)
    losses = {}

    batches = minibatch(train_sample, size=compounding(4, 32, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  
            annotations,  
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
    print('Iteration: {} | Losses: {}'.format(itn, losses))

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Iteration: 0 | Losses: {'ner': 1296.288991034031}
Iteration: 1 | Losses: {'ner': 1019.4307759591611}
Iteration: 2 | Losses: {'ner': 1222.0193906500936}
Iteration: 3 | Losses: {'ner': 1190.3264100253582}
Iteration: 4 | Losses: {'ner': 1155.5151634812355}
Iteration: 5 | Losses: {'ner': 1061.6250290572643}
Iteration: 6 | Losses: {'ner': 1034.111961901188}
Iteration: 7 | Losses: {'ner': 926.5234970450401}
Iteration: 8 | Losses: {'ner': 908.6183833181858}
Iteration: 9 | Losses: {'ner': 866.6283576494316}
Iteration: 10 | Losses: {'ner': 817.3716110667738}
Iteration: 11 | Losses: {'ner': 794.978878598311}
Iteration: 12 | Losses: {'ner': 855.1516721062071}
Iteration: 13 | Losses: {'ner': 792.2007579095662}
Iteration: 14 | Losses: {'ner': 804.8770619569113}
Iteration: 15 | Losses: {'ner': 717.5790040045977}
Iteration: 16 | Losses: {'ner': 745.0605421150103}
Iteration: 17 | Losses: {'ner': 754.5750753134489}
Iteration: 18 | Losses: {'ner': 740.428767343983}
Iteration: 19 | Losses: {'ner': 698.73

In [71]:
check_correctness_by_country('JP', raw_data)

  gold = GoldParse(doc_gold_text, entities=annot)
  gold = GoldParse(doc_gold_text, entities=annot)


---- Results on JP addresses ----


Token,Precision,Recall,F1 score
country,100.0,33.333333,50.0
postal,86.486486,87.671233,87.07483
area,61.46789,67.0,64.114833
city,72.300469,89.534884,80.0
co,55.223881,53.623188,54.411765
street,54.700855,56.140351,55.411255
nr,66.666667,13.333333,22.222222
building,87.5,35.0,50.0
region,83.333333,40.983607,54.945055
Total,67.68,65.88785,66.771902
