# Improving Named Entity Recognition address parsing
Data & AI course, UC Leuven, 2021 Fall
### Project supervisors
- Tom Magerman
- Aimée Lynn Backiel

### Project team (Group 4)
- Karolis Medekša
- Pedro Teixeira Palma Rosa
- Hysa Mello de Alcântara
- Josep Jacob Chetrit Valdepeñas

## Goals
The goal of the assignment is to try and improve the existing solution for parsing addresses using NLP.

# Existing improvements
### The following improvements were already implemented with the first draft of the solution, which, in our opinion, are noteworthy:
- Fixing mistakes in the training dataset by hand (there might still be mistakes in the validation set)
- Fixing conflicting entity errors when training the model
- Pre-parsing the data so that tokenizer can recognize all tokens
- Improving the algorithm to handle overlapping entities
- Fine-tuning the `drop` criteria and iteration count

For more information how these improvements were implemented please consult the `DOCUMENTATION` notebook.

# Evaluating NER model performance by country
One thing to look into with the model is how well does it perform with different regions. Postal codes are different among different countries, moreover, cities and regions can differ greatly among regions.

To conduct the experiment, we first train a baseline model as the result of the initial exercise (refer to `DOCUMENTATION` for more information about it):

In [41]:
import pandas as pd
import re

## Define utility functions:

def read_DataFrame_from_excel(filename: str, numberOfRows: int = None):
    return pd.read_excel(filename, nrows = numberOfRows, keep_default_na=False)


def preprocess_data(data: pd.DataFrame):
    for col in data.columns:
        data[col] = data.apply(lambda row: re.sub(r'([^\s])([,;])([^\s])', r'\1\2 \3', str(row[col])), axis=1)


def entities_overlap(entry):
    entities = entry[1]['entities']
    for first in entities:
        for second in entities:
            if (first == second): continue
            if (first[0] < second[0] and first[1] > second[0]) or (first[0] > second[0] and first[1] < second[0]) or (first[0]==second[0] or first[1]==second[1]):
                print('Entities {} and {} overlap in "{}"'.format(first, second, entry[0]))
                return True
    return False


def get_entity_list(entry: dict, adr: str):
    address = str(adr)
    entities: list = []
    present_tokens = filter(lambda item: item[0] in TOKEN_TYPES and item[1] and str(item[1]).strip(), entry.items())

    ## tokens to retry matching
    retry_tokens: set = set()

    for item in present_tokens:
        token_value = str(item[1]).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            # If multiple occurences can be matched, save the token to be matched later
            if (len(re.findall(re.escape(token_value), address)) > 1):
                retry_tokens.add((token_value, item[0]))
                continue
            span = match.span()
            entities.append((span[0], span[1], item[0]))
            # Replace matched entity with symbols, so that parts of it cannot be matched again
            address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
        else:
            # Try and resolve multiple tokens separated by ';'
            split_items = map(lambda token: token.strip(), token_value.split(';'))
            for token in split_items:
                split_match = re.search(re.escape(token), address)
                if split_match:
                    # If multiple occurences can be matched, save the token to be matched later
                    if (len(re.findall(re.escape(token), address)) > 1):
                        retry_tokens.add((token, item[0]))
                        continue
                    span = split_match.span()
                    entities.append((span[0], span[1], item[0]))
                    # Replace matched entity with symbols, so that parts of it cannot be matched again
                    address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
                else:
                    print('WARNING: could not find token "{}" in address "{}"'.format(token, adr))
    
    # Try and match previously marked tokens, now that single-match entities were eliminated
    for token, tkn_type in retry_tokens:
        token_value = str(token).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            span = match.span()
            entities.append((span[0], span[1], tkn_type))
            address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
        else:
            print('WARNING: could not find token "{}" in address "{}"'.format(token, adr))

    return entities


def map_to_training_entry(entry: dict):
    address = entry['person_address']
    return (address, {
        'entities': get_entity_list(entry, address)
    })

In [42]:
from sklearn.model_selection import train_test_split
import numpy as np

TOKEN_TYPES: set = {'co', 'building', 'street', 'nr', 'area', 'postal', 'city', 'region', 'country'}

raw_data: pd.DataFrame = read_DataFrame_from_excel('../files/training_data_fixed.xlsx', 999)
preprocess_data(raw_data)

train_data = list(
    map(map_to_training_entry, raw_data.to_dict('records'))
)
train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

train_sample, test_sample = train_test_split(
    train_data, test_size = 0.2, random_state = 420
)
print('train entries: {} | test entries: {}'.format(len(train_sample), len(test_sample)))

train entries: 799 | test entries: 200


In [43]:
import spacy
import random
from spacy.util import minibatch, compounding

nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

for token in TOKEN_TYPES:
    ner.add_label(token)

optimizer = nlp.begin_training()
for itn in range(20):
    random.shuffle(train_sample)
    losses = {}

    batches = minibatch(train_sample, size=compounding(4, 32, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  
            annotations,  
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
    print('Iteration: {} | Losses: {}'.format(itn, losses))

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Iteration: 0 | Losses: {'ner': 4065.3456535532605}
Iteration: 1 | Losses: {'ner': 3720.6703632468416}
Iteration: 2 | Losses: {'ner': 3326.003106552136}
Iteration: 3 | Losses: {'ner': 3094.0743300091885}
Iteration: 4 | Losses: {'ner': 2985.8272367392465}
Iteration: 5 | Losses: {'ner': 2818.5104166490983}
Iteration: 6 | Losses: {'ner': 2711.519315949423}
Iteration: 7 | Losses: {'ner': 2604.7531162635523}
Iteration: 8 | Losses: {'ner': 2529.309734144115}
Iteration: 9 | Losses: {'ner': 2452.243666136069}
Iteration: 10 | Losses: {'ner': 2407.7455834547472}
Iteration: 11 | Losses: {'ner': 2294.013091599905}
Iteration: 12 | Losses: {'ner': 2285.2908805757506}
Iteration: 13 | Losses: {'ner': 2223.6780533953665}
Iteration: 14 | Losses: {'ner': 2133.471455242254}
Iteration: 15 | Losses: {'ner': 2049.3649079094075}
Iteration: 16 | Losses: {'ner': 2036.24709377704}
Iteration: 17 | Losses: {'ner': 1922.1303416627566}
Iteration: 18 | Losses: {'ner': 1776.243404672808}
Iteration: 19 | Losses: {'ner':

In [44]:
%%capture
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def results_per_entity_to_df(res: dict):
    columns = ['Token', 'Precision', 'Recall', 'F1 score']
    df = pd.DataFrame(columns=columns)
    total = pd.concat(
        [pd.DataFrame([['Total', res['ents_p'], res['ents_r'], res['ents_f']]], columns=columns)]
        , ignore_index=True
    )
    per_entity = pd.concat(
        [pd.DataFrame([
            [token, 
             res['ents_per_type'][token]['p'], 
             res['ents_per_type'][token]['r'], 
             res['ents_per_type'][token]['f']]
        ], columns=columns) for token in TOKEN_TYPES], ignore_index=True
    )
    return pd.concat([per_entity, total], ignore_index=True)


def map_to_evaluation_model(entry: tuple):
    return (entry[0], entry[1]['entities'])


def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

train_results = evaluate(nlp, map(map_to_evaluation_model, train_sample))
test_results = evaluate(nlp, map(map_to_evaluation_model, test_sample))

With that we can conclude values on Precision, Recall and F1 score, that are quite high, considering what would be without the improvements we already made.

In [45]:
from IPython.display import display, HTML

print('---- GENERAL: Results on train data ----')
display(HTML(results_per_entity_to_df(train_results).to_html(index=False)))
print('---- GENERAL: Results on test data ----')
display(HTML(results_per_entity_to_df(test_results).to_html(index=False)))

---- GENERAL: Results on train data ----


Token,Precision,Recall,F1 score
area,79.61165,73.873874,76.635514
country,94.117647,48.484848,64.0
city,92.5,95.965418,94.200849
postal,95.170455,98.820059,96.960926
building,67.741935,28.378378,40.0
nr,91.666667,87.241379,89.399293
street,82.435597,90.25641,86.168911
co,53.913043,60.194175,56.880734
region,89.576547,91.666667,90.609555
Total,87.4745,87.689162,87.581699


---- GENERAL: Results on test data ----


Token,Precision,Recall,F1 score
area,39.583333,32.20339,35.514019
country,0.0,0.0,0.0
city,72.330097,84.180791,77.806789
postal,94.444444,92.391304,93.406593
building,0.0,0.0,0.0
nr,83.116883,79.012346,81.012658
street,66.0,68.041237,67.005076
co,37.5,37.5,37.5
region,82.191781,77.922078,80.0
Total,72.089314,73.021002,72.552167


The precision/recall of attributes postal, city, street and house number are the most important, so we'll look into optimizing them. First, let's check how many each country's addresses are there that could have influence on the results:

In [63]:
raw_data.groupby(['person_ctry_code']).size().sort_values(ascending=False).head(10)

person_ctry_code
US    347
JP    198
DE     89
FR     48
KR     43
GB     37
CN     27
TW     18
IT     16
CA     16
dtype: int64

We can also check how accurate the predictions are with each country's addresses, including both seen and unseen data:

In [47]:
def check_correctness_by_country(country_code: str, frame: pd.DataFrame):
    filtered = frame[frame['person_ctry_code'] == country_code]
    mapped = list(
        map(map_to_training_entry, filtered.to_dict('records'))
    )
    results = evaluate(nlp, map(map_to_evaluation_model, mapped))
    print('---- Results on {} addresses ----'.format(country_code))
    display(HTML(results_per_entity_to_df(results).to_html(index=False)))

Now we can check those with more entries, that would make remarkable changes.

In [48]:
check_correctness_by_country('US', raw_data)

---- Results on US addresses ----


  gold = GoldParse(doc_gold_text, entities=annot)


Token,Precision,Recall,F1 score
area,83.783784,50.819672,63.265306
country,66.666667,40.0,50.0
city,90.0,95.364238,92.604502
postal,93.650794,99.159664,96.326531
building,70.0,50.0,58.333333
nr,95.762712,93.38843,94.560669
street,89.147287,94.262295,91.633466
co,36.363636,40.0,38.095238
region,97.446809,98.283262,97.863248
Total,91.708797,91.89463,91.801619


As we can see, American addresses have good results, so it's not worth it to change how we treat them.

In [49]:
check_correctness_by_country('JP', raw_data)

  gold = GoldParse(doc_gold_text, entities=annot)
  gold = GoldParse(doc_gold_text, entities=annot)


---- Results on JP addresses ----


Token,Precision,Recall,F1 score
area,65.789474,75.0,70.093458
country,100.0,33.333333,50.0
city,81.868132,86.627907,84.180791
postal,91.780822,91.780822,91.780822
building,44.444444,20.0,27.586207
nr,50.0,23.333333,31.818182
street,60.902256,71.052632,65.587045
co,59.15493,60.869565,60.0
region,78.571429,72.131148,75.213675
Total,71.975498,73.208723,72.586873


On the other hand, Japanese addresses treatment could be better, so we hold this thought for later.

In [51]:
check_correctness_by_country('DE', raw_data)

---- Results on DE addresses ----


Token,Precision,Recall,F1 score
area,83.333333,71.428571,76.923077
country,100.0,40.0,57.142857
city,96.385542,98.765432,97.560976
postal,96.078431,100.0,98.0
building,0.0,0.0,0.0
nr,94.0,95.918367,94.949495
street,97.916667,92.156863,94.949495
co,50.0,66.666667,57.142857
region,20.0,100.0,33.333333
Total,93.2,94.331984,93.762575


When we get to German addresses, the results come back to top, with good Precision, Recal and F1 score. We will not check the other coutries because, since they have only a few entries (each equals less than 5% of all addresses), they would't make much of a difference on the results.

## Japanese addresses
Now we can take a look on Japanese addresses and try improving them.

First, we train the model without Japanese addresses:

In [52]:
train_data = list(
    map(map_to_training_entry, raw_data[raw_data['person_ctry_code'] != 'JP'].to_dict('records'))
)
train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

train_sample, test_sample = train_test_split(
    train_data, test_size = 0.2, random_state = 420
)
print('train entries: {} | test entries: {}'.format(len(train_sample), len(test_sample)))

train entries: 640 | test entries: 161


In [53]:
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

for token in TOKEN_TYPES:
    ner.add_label(token)

optimizer = nlp.begin_training()
for itn in range(20):
    random.shuffle(train_sample)
    losses = {}

    batches = minibatch(train_sample, size=compounding(4, 32, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  
            annotations,  
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
    print('Iteration: {} | Losses: {}'.format(itn, losses))

  gold = GoldParse(doc, **gold)


Iteration: 0 | Losses: {'ner': 2865.827038628515}
Iteration: 1 | Losses: {'ner': 2340.1719870500783}
Iteration: 2 | Losses: {'ner': 2135.6300733220473}
Iteration: 3 | Losses: {'ner': 2020.9638930069116}
Iteration: 4 | Losses: {'ner': 1933.9983500085134}
Iteration: 5 | Losses: {'ner': 1838.3589959482265}
Iteration: 6 | Losses: {'ner': 1788.712926915654}
Iteration: 7 | Losses: {'ner': 1724.030529388439}
Iteration: 8 | Losses: {'ner': 1726.3081249713505}
Iteration: 9 | Losses: {'ner': 1606.4385549175456}
Iteration: 10 | Losses: {'ner': 1601.9120288613767}
Iteration: 11 | Losses: {'ner': 1540.5574368811526}
Iteration: 12 | Losses: {'ner': 1500.4573989946443}
Iteration: 13 | Losses: {'ner': 1470.2659198276144}
Iteration: 14 | Losses: {'ner': 1392.764609863558}
Iteration: 15 | Losses: {'ner': 1395.2874814510274}
Iteration: 16 | Losses: {'ner': 1302.4535763420422}
Iteration: 17 | Losses: {'ner': 1274.0767997578296}
Iteration: 18 | Losses: {'ner': 1248.2653037697391}
Iteration: 19 | Losses: {'

Now we can see that the overral Precision, Recall and F1 score are increased, meaning that the Japanese addresses do have impact on the model.

In [54]:
train_results = evaluate(nlp, map(map_to_evaluation_model, train_sample))
test_results = evaluate(nlp, map(map_to_evaluation_model, test_sample))

print('---- Without JAPAN: Results on train data ----')
display(HTML(results_per_entity_to_df(train_results).to_html(index=False)))
print('---- Without JAPAN: Results on test data ----')
display(HTML(results_per_entity_to_df(test_results).to_html(index=False)))

  gold = GoldParse(doc_gold_text, entities=annot)


---- Without JAPAN: Results on train data ----


Token,Precision,Recall,F1 score
area,78.873239,76.190476,77.508651
country,80.769231,75.0,77.777778
city,95.087719,96.785714,95.929204
postal,97.60274,98.958333,98.275862
building,55.882353,34.545455,42.696629
nr,90.877193,95.220588,92.998205
street,89.13738,92.079208,90.584416
co,65.517241,71.698113,68.468468
region,91.935484,92.307692,92.121212
Total,90.599593,91.295443,90.946187


---- Without JAPAN: Results on test data ----


Token,Precision,Recall,F1 score
area,38.888889,41.176471,40.0
country,33.333333,25.0,28.571429
city,80.666667,87.05036,83.737024
postal,93.846154,87.142857,90.37037
building,25.0,11.111111,15.384615
nr,77.777778,91.304348,84.0
street,75.308642,87.142857,80.794702
co,0.0,0.0,0.0
region,85.483871,76.811594,80.916031
Total,77.319588,79.957356,78.616352


To prove that, when we check about the Precision, Recall and F1 score on Japanese addresses, we can see that they are quite low, lower than when we trained all together.

In [55]:
check_correctness_by_country('JP', raw_data)

  gold = GoldParse(doc_gold_text, entities=annot)
  gold = GoldParse(doc_gold_text, entities=annot)


---- Results on JP addresses ----


Token,Precision,Recall,F1 score
area,29.411765,50.0,37.037037
country,100.0,33.333333,50.0
city,57.142857,62.790698,59.833795
postal,74.117647,86.30137,79.746835
building,0.0,0.0,0.0
nr,23.913043,73.333333,36.065574
street,22.727273,13.157895,16.666667
co,43.548387,39.130435,41.221374
region,0.0,0.0,0.0
Total,42.814371,44.548287,43.664122


Now it's time for we to train only Japanese addresses:

In [59]:
train_data = list(
    map(map_to_training_entry, raw_data[raw_data['person_ctry_code'] == 'JP'].to_dict('records'))
)
train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

train_sample, test_sample = train_test_split(
    train_data, test_size = 0.2, random_state = 420
)
print('train entries: {} | test entries: {}'.format(len(train_sample), len(test_sample)))

train entries: 158 | test entries: 40


In [60]:
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

for token in TOKEN_TYPES:
    ner.add_label(token)

optimizer = nlp.begin_training()
for itn in range(20):
    random.shuffle(train_sample)
    losses = {}

    batches = minibatch(train_sample, size=compounding(4, 32, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  
            annotations,  
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
    print('Iteration: {} | Losses: {}'.format(itn, losses))

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Iteration: 0 | Losses: {'ner': 1284.1569767929614}
Iteration: 1 | Losses: {'ner': 1078.0679613405373}
Iteration: 2 | Losses: {'ner': 1175.3667421340942}
Iteration: 3 | Losses: {'ner': 1129.9756172895432}
Iteration: 4 | Losses: {'ner': 1055.992091074586}
Iteration: 5 | Losses: {'ner': 1000.0957781840116}
Iteration: 6 | Losses: {'ner': 954.5107497198042}
Iteration: 7 | Losses: {'ner': 909.0037981718779}
Iteration: 8 | Losses: {'ner': 937.0342071205378}
Iteration: 9 | Losses: {'ner': 858.214796923101}
Iteration: 10 | Losses: {'ner': 861.7343583619222}
Iteration: 11 | Losses: {'ner': 813.293468308635}
Iteration: 12 | Losses: {'ner': 768.2377245387615}
Iteration: 13 | Losses: {'ner': 795.7788266167045}
Iteration: 14 | Losses: {'ner': 824.7075342992321}
Iteration: 15 | Losses: {'ner': 761.737893326208}
Iteration: 16 | Losses: {'ner': 687.36905702285}
Iteration: 17 | Losses: {'ner': 785.5316795790568}
Iteration: 18 | Losses: {'ner': 676.8775275740772}
Iteration: 19 | Losses: {'ner': 683.31517

And as we were expecting, the Precision, Recall and F1 score are increased when they are trained separately.

In [61]:
check_correctness_by_country('JP', raw_data)

  gold = GoldParse(doc_gold_text, entities=annot)
  gold = GoldParse(doc_gold_text, entities=annot)


---- Results on JP addresses ----


Token,Precision,Recall,F1 score
area,64.761905,68.0,66.341463
country,100.0,66.666667,80.0
city,77.083333,86.046512,81.318681
postal,89.61039,94.520548,92.0
building,75.0,30.0,42.857143
nr,44.444444,13.333333,20.512821
street,50.0,44.736842,47.222222
co,51.351351,55.072464,53.146853
region,75.925926,67.213115,71.304348
Total,68.539326,66.510903,67.509881


## Why not trying new libraries
There are several libraries where NLP models can be done, among them Stanza library. The reason why we did not implement them is because, since we already have a pretty good model and Stanza is a whole new library, mostly recommended to identification of word types, such as pronouns, verbs and names, we found changing it all an unnecessary work. With that said, our improvement is mostly for Japanese addresses, that are a lot, and it really improved our results.

# Final solution

## train.py

In [None]:
import pandas as pd
import re
import spacy
import random
from utils import read_DataFrame_from_excel, resolve_model_name
from spacy.util import compounding, minibatch


TRAINING_DATA_FILENAME = './files/training_data_fixed.xlsx'
TRAINING_ENTRIES_COUNT = 999
TRAINED_MODEL_FILENAME = './models/trained_model'

TOKEN_TYPES: set = {'co', 'building', 'street', 'nr', 'area', 'postal', 'city', 'region', 'country'}

TRAIN_ITERATION_COUNT = 20
TRAIN_DROP_PROPERTY = 0.5


def preprocess_data(data: pd.DataFrame):
    """
    Performs data preprocessing by adding a space after each comma and semicolon if they are missing
    
    Args:
        dataFrame (pd.DataFrame): dataset to be processed

    Returns:
        None
    """
    for col in data.columns:
        data[col] = data.apply(lambda row: re.sub(r'([^\s])([,;])([^\s])', r'\1\2 \3', str(row[col])), axis=1)


def get_entity_list(entry: dict, adr: str):
    """
    Extracts an array of tuples, indicating positions of tokens in a provided address
    
    Args:
        entry (dict): dictionary, where keys are token types.
            Example:
            dict = {
                'city': 'Vilnius',
                'street': 'Ozo g.',
                'nr': 25
            }
        adr (str): an address string.
            Example: 
            adr = 'Ozo g. 25, Vilnius'

    Returns:
        Array of tuples, where tuples follow structure of (token_position_start, token_position_end, token)
    """
    address = str(adr)
    entities: list = []
    present_tokens = filter(lambda item: item[0] in TOKEN_TYPES and item[1] and str(item[1]).strip(), entry.items())

    ## tokens to retry matching
    retry_tokens: set = set()

    for item in present_tokens:
        token_value = str(item[1]).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            # If multiple occurences can be matched, save the token to be matched later
            if (len(re.findall(re.escape(token_value), address)) > 1):
                retry_tokens.add((token_value, item[0]))
                continue
            span = match.span()
            entities.append((span[0], span[1], item[0]))
            # Replace matched entity with symbols, so that parts of it cannot be matched again
            address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
        else:
            # Try and resolve multiple tokens separated by ';'
            split_items = map(lambda token: token.strip(), token_value.split(';'))
            for token in split_items:
                split_match = re.search(re.escape(token), address)
                if split_match:
                    # If multiple occurences can be matched, save the token to be matched later
                    if (len(re.findall(re.escape(token), address)) > 1):
                        retry_tokens.add((token, item[0]))
                        continue
                    span = split_match.span()
                    entities.append((span[0], span[1], item[0]))
                    # Replace matched entity with symbols, so that parts of it cannot be matched again
                    address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
                else:
                    print('WARNING: could not find token "{}" in address "{}"'.format(token, adr))
    
    # Try and match previously marked tokens, now that single-match entities were eliminated
    for token, tkn_type in retry_tokens:
        token_value = str(token).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            span = match.span()
            entities.append((span[0], span[1], tkn_type))
            address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
        else:
            print('WARNING: could not find token "{}" in address "{}"'.format(token, adr))

    return entities


def map_to_training_entry(entry: dict):
    """
    Maps an object of address tokens into a tuple of address string and an object containing entity list.
    
    Args:
        entry (dict): dictionary, where keys include token types.
            Example:
            dict = {
                'person_address': 'Ozo g. 25, Vilnius',
                'city': 'Vilnius',
                'street': 'Ozo g.',
                'nr': 25
            }

    Returns:
        A tuple, where first element is the address, and the second one is an object containing the entity list
    """
    address = entry['person_address']
    return (address, {
        'entities': get_entity_list(entry, address)
    })


def entities_overlap(entry):
    """
    Checks whether an entry contains overlapping entities
    
    Args:
        entry (array or tuple): dictionary, where keys are token types.
            Example:
            dict = {
                'city': 'Vilnius',
                'street': 'Ozo g.',
                'nr': 25
            }
        adr (str): an address string.
            Example: 
            adr = 'Ozo g. 25, Vilnius'

    Returns:
        Array of tuples, where tuples follow structure of (token_position_start, token_position_end, token)
    """
    entities = entry[1]['entities']
    for first in entities:
        for second in entities:
            if (first == second): continue
            if (first[0] < second[0] and first[1] > second[0]) or (first[0] > second[0] and first[1] < second[0]) or (first[0]==second[0] or first[1]==second[1]):
                print('Entities {} and {} overlap in "{}"'.format(first, second, entry[0]))
                return True
    return False


def train_model(entries: pd.DataFrame, model_filename: str):
    """
    Train a NER model from a given input dataframe and saves the model to disk.

    Args:
        entries - a pandas DataFrame containing the training data
        model_filename - where on disk to output the model
    """
    train_data = map(map_to_training_entry, entries.to_dict('records'))
    train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

    for token in TOKEN_TYPES:
        ner.add_label(token)
    
    print('--- TRAINING {} MODEL IN {} ITERATIONS | DROP = {} ---'.format(model_filename, TRAIN_ITERATION_COUNT, TRAIN_DROP_PROPERTY))
    print('--- TRAIN DATA SIZE: {} ---'.format(len(entries)))
    optimizer = nlp.begin_training()

    for itn in range(TRAIN_ITERATION_COUNT):
        random.shuffle(train_data)
        losses = {}

        batches = minibatch(train_data, size=compounding(4, 32, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  
                annotations,  
                drop=TRAIN_DROP_PROPERTY,  
                sgd=optimizer,
                losses=losses)
        print('Iteration: {} | Losses: {}'.format(itn, losses))

    nlp.to_disk(model_filename)


def train_model_without_countries(raw_data: pd.DataFrame, exclude: list):
    train_model(raw_data[~raw_data['person_ctry_code'].isin(exclude)], resolve_model_name())


def train_model_for_country(raw_data: pd.DataFrame, country_code: str):
    train_model(raw_data[raw_data['person_ctry_code'] == country_code], resolve_model_name(country_code))

if __name__ == '__main__':
    raw_data: pd.DataFrame = read_DataFrame_from_excel(TRAINING_DATA_FILENAME, TRAINING_ENTRIES_COUNT)
    preprocess_data(raw_data)

    train_model_for_country(raw_data, 'JP')
    train_model_without_countries(raw_data, ['JP'])

## deploy.py

In [None]:
import pandas as pd
import re
import spacy
import random
from utils import read_DataFrame_from_excel, resolve_model_name
from spacy.util import compounding, minibatch


TRAINING_DATA_FILENAME = './files/training_data_fixed.xlsx'
TRAINING_ENTRIES_COUNT = 999
TRAINED_MODEL_FILENAME = './models/trained_model'

TOKEN_TYPES: set = {'co', 'building', 'street', 'nr', 'area', 'postal', 'city', 'region', 'country'}

TRAIN_ITERATION_COUNT = 20
TRAIN_DROP_PROPERTY = 0.5


def preprocess_data(data: pd.DataFrame):
    """
    Performs data preprocessing by adding a space after each comma and semicolon if they are missing
    
    Args:
        dataFrame (pd.DataFrame): dataset to be processed

    Returns:
        None
    """
    for col in data.columns:
        data[col] = data.apply(lambda row: re.sub(r'([^\s])([,;])([^\s])', r'\1\2 \3', str(row[col])), axis=1)


def get_entity_list(entry: dict, adr: str):
    """
    Extracts an array of tuples, indicating positions of tokens in a provided address
    
    Args:
        entry (dict): dictionary, where keys are token types.
            Example:
            dict = {
                'city': 'Vilnius',
                'street': 'Ozo g.',
                'nr': 25
            }
        adr (str): an address string.
            Example: 
            adr = 'Ozo g. 25, Vilnius'

    Returns:
        Array of tuples, where tuples follow structure of (token_position_start, token_position_end, token)
    """
    address = str(adr)
    entities: list = []
    present_tokens = filter(lambda item: item[0] in TOKEN_TYPES and item[1] and str(item[1]).strip(), entry.items())

    ## tokens to retry matching
    retry_tokens: set = set()

    for item in present_tokens:
        token_value = str(item[1]).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            # If multiple occurences can be matched, save the token to be matched later
            if (len(re.findall(re.escape(token_value), address)) > 1):
                retry_tokens.add((token_value, item[0]))
                continue
            span = match.span()
            entities.append((span[0], span[1], item[0]))
            # Replace matched entity with symbols, so that parts of it cannot be matched again
            address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
        else:
            # Try and resolve multiple tokens separated by ';'
            split_items = map(lambda token: token.strip(), token_value.split(';'))
            for token in split_items:
                split_match = re.search(re.escape(token), address)
                if split_match:
                    # If multiple occurences can be matched, save the token to be matched later
                    if (len(re.findall(re.escape(token), address)) > 1):
                        retry_tokens.add((token, item[0]))
                        continue
                    span = split_match.span()
                    entities.append((span[0], span[1], item[0]))
                    # Replace matched entity with symbols, so that parts of it cannot be matched again
                    address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
                else:
                    print('WARNING: could not find token "{}" in address "{}"'.format(token, adr))
    
    # Try and match previously marked tokens, now that single-match entities were eliminated
    for token, tkn_type in retry_tokens:
        token_value = str(token).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            span = match.span()
            entities.append((span[0], span[1], tkn_type))
            address = address[:span[0]] + '$' * (span[1] - span[0]) + address[span[1]:]
        else:
            print('WARNING: could not find token "{}" in address "{}"'.format(token, adr))

    return entities


def map_to_training_entry(entry: dict):
    """
    Maps an object of address tokens into a tuple of address string and an object containing entity list.
    
    Args:
        entry (dict): dictionary, where keys include token types.
            Example:
            dict = {
                'person_address': 'Ozo g. 25, Vilnius',
                'city': 'Vilnius',
                'street': 'Ozo g.',
                'nr': 25
            }

    Returns:
        A tuple, where first element is the address, and the second one is an object containing the entity list
    """
    address = entry['person_address']
    return (address, {
        'entities': get_entity_list(entry, address)
    })


def entities_overlap(entry):
    """
    Checks whether an entry contains overlapping entities
    
    Args:
        entry (array or tuple): dictionary, where keys are token types.
            Example:
            dict = {
                'city': 'Vilnius',
                'street': 'Ozo g.',
                'nr': 25
            }
        adr (str): an address string.
            Example: 
            adr = 'Ozo g. 25, Vilnius'

    Returns:
        Array of tuples, where tuples follow structure of (token_position_start, token_position_end, token)
    """
    entities = entry[1]['entities']
    for first in entities:
        for second in entities:
            if (first == second): continue
            if (first[0] < second[0] and first[1] > second[0]) or (first[0] > second[0] and first[1] < second[0]) or (first[0]==second[0] or first[1]==second[1]):
                print('Entities {} and {} overlap in "{}"'.format(first, second, entry[0]))
                return True
    return False


def train_model(entries: pd.DataFrame, model_filename: str):
    """
    Train a NER model from a given input dataframe and saves the model to disk.

    Args:
        entries - a pandas DataFrame containing the training data
        model_filename - where on disk to output the model
    """
    train_data = map(map_to_training_entry, entries.to_dict('records'))
    train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

    for token in TOKEN_TYPES:
        ner.add_label(token)
    
    print('--- TRAINING {} MODEL IN {} ITERATIONS | DROP = {} ---'.format(model_filename, TRAIN_ITERATION_COUNT, TRAIN_DROP_PROPERTY))
    print('--- TRAIN DATA SIZE: {} ---'.format(len(entries)))
    optimizer = nlp.begin_training()

    for itn in range(TRAIN_ITERATION_COUNT):
        random.shuffle(train_data)
        losses = {}

        batches = minibatch(train_data, size=compounding(4, 32, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  
                annotations,  
                drop=TRAIN_DROP_PROPERTY,  
                sgd=optimizer,
                losses=losses)
        print('Iteration: {} | Losses: {}'.format(itn, losses))

    nlp.to_disk(model_filename)


def train_model_without_countries(raw_data: pd.DataFrame, exclude: list):
    train_model(raw_data[~raw_data['person_ctry_code'].isin(exclude)], resolve_model_name())


def train_model_for_country(raw_data: pd.DataFrame, country_code: str):
    train_model(raw_data[raw_data['person_ctry_code'] == country_code], resolve_model_name(country_code))


# Version 2

if __name__ == '__main__':
    raw_data: pd.DataFrame = read_DataFrame_from_excel(TRAINING_DATA_FILENAME, TRAINING_ENTRIES_COUNT)
    preprocess_data(raw_data)

    train_model_for_country(raw_data, 'JP')
    train_model_without_countries(raw_data, ['JP'])