In [7]:
import pandas as pd

def read_DataFrame_from_file(filename: str, numberOfRows: int = None):
    return pd.read_excel(filename, nrows = numberOfRows, keep_default_na=False)

In [8]:
DATA_INPUT_FILENAME = 'training_data.xlsx'
NUMBER_OF_PARSABLE_RECORDS = 999

def preprocess_data(data: pd.DataFrame):
    data['person_address'] = data.apply(lambda row: re.sub(r'([^\s])(,)([^\s])', r'\1, \3', row['person_address']), axis=1)
    print(data)

raw_data: pd.DataFrame = read_DataFrame_from_file(DATA_INPUT_FILENAME, NUMBER_OF_PARSABLE_RECORDS)
preprocess_data(raw_data)

     person_id                                        person_name  \
0         3540                                   PURAC Biochem BV   
1        28753                               Tinti, Maria Ornella   
2        35108  Isobe, Shin-ichi, c/o Int. Prop. Dpt., NTT DoC...   
3        89830                                      Hata, Yoshiki   
4        94063                                       Kim, Se-Jong   
..         ...                                                ...   
994   67732461                                          ZHANG, Ou   
995   68271080                                  HIGUCHI, Shinichi   
996   68607095                                      WANG, Zhiyong   
997   68861617                                 GRIJO Patrik Nolan   
998   69712319  VSEROSSIISKII GOSUDARSTVENNOI NAUTUNO-ISSLEDOV...   

                                        person_address  \
0                    Arkelsedijk 46, 4206 AC Gorinchem   
1    SIGMA-TAU Industrie Farmaceutiche Riunite S.p.... 

In [6]:
import re
token_types: set = {'co', 'building', 'street', 'nr', 'area', 'postal', 'city', 'region', 'country'}


def get_entity_list(entry: dict, address: str):
    entities: list = []
    present_tokens = filter(lambda item: item[0] in token_types and item[1] and str(item[1]).strip(), entry.items())

    for item in present_tokens:
        token_value = str(item[1]).strip()
        match = re.search(re.escape(token_value), address)
        if match:
            span = match.span()
            entities.append((span[0], span[1], item[0]))
        else:
            # Try and resolve multiple tokens separated by ';'
            split_items = map(lambda token: token.strip(), token_value.split(';'))
            for token in split_items:
                split_match = re.search(re.escape(token), address)
                if split_match:
                    span = split_match.span()
                    entities.append((span[0], span[1], item[0]))
                else:
                    print('WARNING: could not find token "{}" in address "{}"'.format(token, address))
    
    return entities


def map_to_training_entry(entry: dict):
    address = entry['person_address']
    print(address, {
        'entities': get_entity_list(entry, address)
    })
    return (address, {
        'entities': get_entity_list(entry, address)
    })

train_data = list(
    map(map_to_training_entry, raw_data.to_dict('records'))
)
# print(train_data)

Arkelsedijk 46, 4206 AC Gorinchem {'entities': [(0, 11, 'street'), (12, 14, 'nr'), (16, 23, 'postal'), (24, 33, 'city')]}
SIGMA-TAU Industrie Farmaceutiche Riunite S.p.A., Via Pontina, km 30, 400 00040, Pomezia {'entities': [(0, 48, 'co'), (50, 61, 'street'), (63, 68, 'nr'), (70, 79, 'postal'), (81, 88, 'city')]}
Sanno Park Tower, 11-1, Nagatacho 2-chome, Chiyoda-ku, Tokyo {'entities': [(0, 16, 'building'), (43, 53, 'area'), (55, 60, 'city')]}
c/o Hitachi Appliances, Inc. 390 Muramatsu, Shimizu-ku, Shizuoka-shi, Shizuoka-ken 424-0926 {'entities': [(0, 27, 'co'), (70, 82, 'area'), (83, 91, 'postal'), (33, 42, 'region')]}
Legal & IP Team, Samsung SDI Co., Ltd., 428-5, Gongse-ri, Kiheung-eup, Yongin-si, Gyeonggi-do {'entities': [(40, 45, 'street'), (47, 56, 'area'), (58, 69, 'area'), (71, 79, 'area'), (82, 93, 'city')]}
Pappelweg 9, 15809 Gross Machnow {'entities': [(0, 9, 'street'), (10, 11, 'nr'), (13, 18, 'postal'), (19, 32, 'city')]}
1 Gwihyun-dong, Changwon, Kyungsangnam-do {'entitie

In [182]:
def entities_overlap(entry):
    entities = entry[1]['entities']
    for first in entities:
        for second in entities:
            if (first == second): continue
            if (first[0] < second[0] and first[1] > second[0]) or (first[0] > second[0] and first[1] < second[0]) or (first[0]==second[0] or first[1]==second[1]):
                print('Entities {} and {} overlap in "{}"'.format(first, second, entry[0]))
                return True
    return False

train_data = list(filter(lambda entry: not entities_overlap(entry), train_data))

Entities (25, 33, 'area') and (25, 30, 'region') overlap in "2-15, Meiwadori 3-chome, Hyogo-ku, Kobe-shi, Hyogo 652-0882"
Entities (35, 44, 'area') and (35, 40, 'city') overlap in "1-201, Fukuzaki 3-chome, Minato-ku Osaka-shi, Osaka 552-0013"
Entities (26, 38, 'street') and (26, 38, 'area') overlap in "Yamaha Corporation, 10-1, Nakazawa-cho, Hamamatsu-shi, Shizuoka-ken"
Entities (81, 84, 'nr') and (81, 84, 'postal') overlap in "c/o Toshiba Corporation, Intellectual Prop. Div., 1-1-1 Shibaura Minato-ku Tokyo 105"
Entities (23, 31, 'postal') and (23, 25, 'region') overlap in "803 Kirts Blvd., Troy, MI 48084"
Entities (24, 32, 'postal') and (24, 26, 'region') overlap in "2 Sconsett Bluff, Avon, CT 06001"
Entities (36, 45, 'city') and (36, 41, 'region') overlap in "12-4, Sagisu 5-chome, Fukushima-ku, Osaka-shi, Osaka; 5530 002"
Entities (50, 60, 'city') and (50, 56, 'region') overlap in "c/o Shinko El. Ind. Co., Ltd. 80, Oshimada-machi, Nagano-shi Nagano 381-2287"
Entities (0, 19, 'street'

In [172]:
import spacy
import random

nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

for token in token_types:
    ner.add_label(token)

optimizer = nlp.begin_training()
for itn in range(20):
    random.shuffle(train_data)
    losses = {}

    batches = spacy.util.minibatch(train_data, size=spacy.util.compounding(4, 32, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  
            annotations,  
            drop=0.5,  
            sgd=optimizer,
            losses=losses)

  proc.begin_training(
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldPars

In [173]:
nlp.to_disk('trained_model_0')
nlp2 = spacy.load('./trained_model_0')

for text, _ in train_data:
    doc = nlp2(text)
    print('{} Entities'.format(text), [(ent.text, ent.label_) for ent in doc.ents])

[('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x0000014F4F46C9A0>)]
[('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x0000014F58651AC0>)]
HSINCHU CITY Entities [('HSINCHU CITY', 'city')]
c/o Asahi Glass Company, Limited, 5-1, Marunouchi 1-chome, Chiyoda-ku, Tokyo 100-8405 Entities [('c/o Asahi Glass Company', 'co'), ('Limited', 'city'), ('5-1', 'street'), ('Marunouchi 1-chome', 'street'), ('Chiyoda-ku', 'area'), ('Tokyo', 'city'), ('100-8405', 'postal')]
920 EAST CREEK DRIVE, DRIPPING SPRINS, TEXAS 78620, E.U.A. US Entities [('920', 'nr'), ('EAST CREEK DRIVE', 'street'), ('DRIPPING SPRINS', 'city'), ('TEXAS', 'region'), ('78620', 'postal')]
Standartizatsii i Sertifikatsii Veterinarnôh Preparatov, Zvenigorodskoje u. 5 , Moskva, RUSSIAN FEDERATION Entities [('Standartizatsii i Sertifikatsii Veterinarnôh Preparatov', 'co'), ('Zvenigorodskoje u.', 'street'), ('5', 'nr'), ('Moskva', 'city'), ('RUSSIAN FEDERATION', 'region')]
2-2, MINAMIDAMACHI-2-CHOME, TOYAMA-SHI Entit

In [None]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def map_to_evaluation_model(entry: tuple):
    return (entry[0], entry[1]['entities'])


def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

results = evaluate(nlp, map(map_to_evaluation_model, train_data))
print(results)

ModuleNotFoundError: No module named 'spacy.gold'