# Training Spacy Model

- Training on customized entities using Spacy's pre-trained model 

- Updating and adding new entities to NLPruler

- Beam search algorithm for confidence score of extracted entities

# 1)- Importing key Modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import re
import string
import pdftotext # For pdfto text conversion
import docx2txt # for converting docx to .txt format
from collections import Counter
import sys
import pandas as pd
from collections import defaultdict
import codecs # for encoding scheme of text files
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm # loading bar

# 2)-Training Entities

### 2.1)- Training Titles

In [5]:
title1  = ["Agreement on Managed Data Center Services"]
title2  = ["Master Services Agreement on the Provision of IT Services"]
title3  = ["Master Services Agreement on the Provision of IT Services (“Agreement“ or “Master Services Agreement”)"]
title4  = ["MASTER SERVICES AGREEMENT ON THE PROVISION OF MANAGED SERVICES IN PUBLIC COULDS"]
title5  = ["Master Services Agreement (“Agreement“ or “Master Services Agreement”) on the provision of Managed Services in Public Clouds"]
title6  = ["Agreement on the Provision of MANAGED PRINT Services"]
title7  = ["Agreement on the Provision of MPS (Managed Print Services)"]
title8  = ["Agreement for Security Operation Center Services"]
title9  = ["AGREEMENT ON PROVISIONING OF IT AND COMMUNICATION SERVICES"]
title10 = ["Agreement on Managed Data Center Services"]
title11 = ["Master Project, Support and Maintenance Agreement"]
title12 = ["ENTERPRISE CUSTOMER AGREEMENT"]
title13 = ["AGREEMENT on the provision of managed Mobile communication Services"]
title14 = ["MASTER SERVICE AGREEMENT"]
title15 = ["Agreement for Security Operation Center Services"]

In [6]:
Titles = [title2,title2, title3, title4, title5, title6, title7, title8, title9, title10, title11, title12, title13, title14,title15]

In [7]:
TRAIN_DATA=[('Agreement on Managed Data Center Services', {'entities': [(0, len(title1[0]), 'TITLE')]})]
start=0
end=len(title1[0])
for title in Titles:
    start=end+1
    end=start+len(title[0])
    TRAIN_DATA.append(    (title[0], { 'entities': [(start, end , 'TITLE')]}) )   

In [8]:
print(TRAIN_DATA)

[('Agreement on Managed Data Center Services', {'entities': [(0, 41, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services', {'entities': [(42, 99, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services', {'entities': [(100, 157, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services (“Agreement“ or “Master Services Agreement”)', {'entities': [(158, 260, 'TITLE')]}), ('MASTER SERVICES AGREEMENT ON THE PROVISION OF MANAGED SERVICES IN PUBLIC COULDS', {'entities': [(261, 340, 'TITLE')]}), ('Master Services Agreement (“Agreement“ or “Master Services Agreement”) on the provision of Managed Services in Public Clouds', {'entities': [(341, 465, 'TITLE')]}), ('Agreement on the Provision of MANAGED PRINT Services', {'entities': [(466, 518, 'TITLE')]}), ('Agreement on the Provision of MPS (Managed Print Services)', {'entities': [(519, 577, 'TITLE')]}), ('Agreement for Security Operation Center Services', {'entities': [(578, 626, 'TITLE')]}),

# Start training Title Entity

In [9]:
# Define our variables and model path to be laoded
model = None
output_dir=Path("/Users/hassansherwani/Documents/Python/Spacy")
n_iter=100

In [10]:
if model is not None:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

Created blank 'en' model


In [11]:
# create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

In [12]:
# new entity label
LABEL = 'TITLE'

In [13]:
def extract_title(model=None, new_model_name='TITLE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for TITLE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [14]:
# Run our Function
extract_title()

Created blank 'en' model


100%|██████████| 16/16 [00:00<00:00, 65.17it/s]
 50%|█████     | 8/16 [00:00<00:00, 78.73it/s]

{'ner': 52.435285100486}


100%|██████████| 16/16 [00:00<00:00, 74.06it/s]
 44%|████▍     | 7/16 [00:00<00:00, 69.67it/s]

{'ner': 2.0051951662490137}


100%|██████████| 16/16 [00:00<00:00, 72.95it/s]
 50%|█████     | 8/16 [00:00<00:00, 75.61it/s]

{'ner': 2.0000000198337875}


100%|██████████| 16/16 [00:00<00:00, 73.61it/s]
 50%|█████     | 8/16 [00:00<00:00, 68.57it/s]

{'ner': 1.9999964656131137}


100%|██████████| 16/16 [00:00<00:00, 72.17it/s]
 50%|█████     | 8/16 [00:00<00:00, 73.61it/s]

{'ner': 1.999532006123562}


100%|██████████| 16/16 [00:00<00:00, 72.91it/s]
 44%|████▍     | 7/16 [00:00<00:00, 68.09it/s]

{'ner': 2.279190199220771}


100%|██████████| 16/16 [00:00<00:00, 72.40it/s]
 50%|█████     | 8/16 [00:00<00:00, 72.76it/s]

{'ner': 1.9975813838015315}


100%|██████████| 16/16 [00:00<00:00, 73.69it/s]
 44%|████▍     | 7/16 [00:00<00:00, 68.88it/s]

{'ner': 1.9939663740359561}


100%|██████████| 16/16 [00:00<00:00, 73.42it/s]
 50%|█████     | 8/16 [00:00<00:00, 71.91it/s]

{'ner': 1.8739948219089257}


100%|██████████| 16/16 [00:00<00:00, 73.66it/s]
 44%|████▍     | 7/16 [00:00<00:00, 64.23it/s]

{'ner': 1.9977507801571903}


100%|██████████| 16/16 [00:00<00:00, 72.79it/s]
 50%|█████     | 8/16 [00:00<00:00, 76.93it/s]

{'ner': 2.0018146582995375}


100%|██████████| 16/16 [00:00<00:00, 73.22it/s]
 50%|█████     | 8/16 [00:00<00:00, 70.05it/s]

{'ner': 2.002128539952392}


100%|██████████| 16/16 [00:00<00:00, 73.14it/s]
 50%|█████     | 8/16 [00:00<00:00, 74.10it/s]

{'ner': 2.0012061426803895}


100%|██████████| 16/16 [00:00<00:00, 73.47it/s]
 50%|█████     | 8/16 [00:00<00:00, 67.88it/s]

{'ner': 11.342907059458238}


100%|██████████| 16/16 [00:00<00:00, 73.58it/s]
 50%|█████     | 8/16 [00:00<00:00, 74.01it/s]

{'ner': 2.0387027841098733}


100%|██████████| 16/16 [00:00<00:00, 72.74it/s]
 50%|█████     | 8/16 [00:00<00:00, 73.23it/s]

{'ner': 1.9480695505579733}


100%|██████████| 16/16 [00:00<00:00, 73.99it/s]
 50%|█████     | 8/16 [00:00<00:00, 75.57it/s]

{'ner': 4.948707973874568}


100%|██████████| 16/16 [00:00<00:00, 69.31it/s]
 44%|████▍     | 7/16 [00:00<00:00, 67.94it/s]

{'ner': 1.9953518941328796}


100%|██████████| 16/16 [00:00<00:00, 71.87it/s]
 50%|█████     | 8/16 [00:00<00:00, 74.03it/s]

{'ner': 1.9835424501350531}


100%|██████████| 16/16 [00:00<00:00, 67.25it/s]

{'ner': 4.4135879682912345}
Entities in 'Trained completed for TITLE entity.'





### 2.2)- Training Supplier Entity

In [15]:
suppliers1 = ["TEASYS"]
suppliers2 = ["Teasys"]
suppliers3 = ["TEASYS GLOBAL INVEST AG"]
suppliers4 = ["Teasys Global Invest AG"]
suppliers5 = ["teasys global invest ag"]
suppliers6 = ["FTP"]
suppliers7 = ["FTP Deutschland GmbH"]
suppliers8 = ["FTP Deutschland GmbH"]
suppliers9 = ["Wisniewski & Sohn GmbH"]
suppliers10 = ["FBS"]
suppliers11 = ["Horizon Deutschland AG"]
suppliers12 = ["Horizon"]
suppliers13 = ["Harpe"]
suppliers14 = ["Harpe Deutschland GmbH"]
suppliers15 = ["ADVENTURE SERVICES GMBH"]
suppliers16 = ["Adventure Services GmbH"]
suppliers17 = ["SWIPERO LIMITED"]
suppliers18 = ["Swipero Limited"]
suppliers19 = ["Swipero"]
suppliers20 = ["Nozama Net Service"]
suppliers21 = ["NOZAMA NET SERVICE"]
suppliers22 = ["Schwyz Mail Solutions GmbH"]
suppliers23 = ["Verizon Deutschland GmbH"]


In [16]:
suppliers = [suppliers2,suppliers3,suppliers4,suppliers5,suppliers6,suppliers7,suppliers8,
            suppliers9,suppliers10,suppliers11,suppliers12,suppliers13,suppliers14,suppliers15,suppliers16,
            suppliers17,suppliers18,suppliers19, suppliers20,suppliers21,suppliers22,suppliers23]

In [17]:
TRAIN_DATA=[("TEASYS", {'entities': [(0, len(suppliers1[0]), 'SUPPLIER')]})]
start=0
end=len(suppliers1[0])
for supplier in suppliers:
    start=end+1
    end=start+len(supplier[0])
    TRAIN_DATA.append(    (supplier[0], { 'entities': [(start, end , 'SUPPLIER')]}) ) 

In [18]:
print(TRAIN_DATA)

[('TEASYS', {'entities': [(0, 6, 'SUPPLIER')]}), ('Teasys', {'entities': [(7, 13, 'SUPPLIER')]}), ('TEASYS GLOBAL INVEST AG', {'entities': [(14, 37, 'SUPPLIER')]}), ('Teasys Global Invest AG', {'entities': [(38, 61, 'SUPPLIER')]}), ('teasys global invest ag', {'entities': [(62, 85, 'SUPPLIER')]}), ('FTP', {'entities': [(86, 89, 'SUPPLIER')]}), ('FTP Deutschland GmbH', {'entities': [(90, 110, 'SUPPLIER')]}), ('FTP Deutschland GmbH', {'entities': [(111, 131, 'SUPPLIER')]}), ('Wisniewski & Sohn GmbH', {'entities': [(132, 154, 'SUPPLIER')]}), ('FBS', {'entities': [(155, 158, 'SUPPLIER')]}), ('Horizon Deutschland AG', {'entities': [(159, 181, 'SUPPLIER')]}), ('Horizon', {'entities': [(182, 189, 'SUPPLIER')]}), ('Harpe', {'entities': [(190, 195, 'SUPPLIER')]}), ('Harpe Deutschland GmbH', {'entities': [(196, 218, 'SUPPLIER')]}), ('ADVENTURE SERVICES GMBH', {'entities': [(219, 242, 'SUPPLIER')]}), ('Adventure Services GmbH', {'entities': [(243, 266, 'SUPPLIER')]}), ('SWIPERO LIMITED', {'entiti

In [19]:
# new entity label
LABEL = 'SUPPLIER'

In [20]:
def extract_supplier(model=None, new_model_name='SUPPLIER', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for SUPPLIER entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [21]:
# Run our Function
extract_supplier()

Created blank 'en' model


100%|██████████| 23/23 [00:00<00:00, 79.14it/s]
 43%|████▎     | 10/23 [00:00<00:00, 96.94it/s]

{'ner': 16.74900277562672}


100%|██████████| 23/23 [00:00<00:00, 87.12it/s]
 43%|████▎     | 10/23 [00:00<00:00, 96.50it/s]

{'ner': 1.9986699254524383}


100%|██████████| 23/23 [00:00<00:00, 97.49it/s]
 43%|████▎     | 10/23 [00:00<00:00, 95.51it/s]

{'ner': 1.996586390935281}


100%|██████████| 23/23 [00:00<00:00, 98.53it/s]
 43%|████▎     | 10/23 [00:00<00:00, 93.97it/s]

{'ner': 1.8307292795423964}


100%|██████████| 23/23 [00:00<00:00, 95.09it/s]
 43%|████▎     | 10/23 [00:00<00:00, 95.81it/s]

{'ner': 1.8804470466608694}


100%|██████████| 23/23 [00:00<00:00, 94.19it/s]
 43%|████▎     | 10/23 [00:00<00:00, 92.90it/s]

{'ner': 1.5704700659791182}


100%|██████████| 23/23 [00:00<00:00, 91.14it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.53it/s]

{'ner': 0.7698465206648359}


100%|██████████| 23/23 [00:00<00:00, 95.44it/s]
 43%|████▎     | 10/23 [00:00<00:00, 94.27it/s]

{'ner': 1.9635555879519404}


100%|██████████| 23/23 [00:00<00:00, 91.10it/s]
 43%|████▎     | 10/23 [00:00<00:00, 96.23it/s]

{'ner': 2.0063314292567767}


100%|██████████| 23/23 [00:00<00:00, 96.35it/s]
 43%|████▎     | 10/23 [00:00<00:00, 94.13it/s]

{'ner': 0.37618345657481655}


100%|██████████| 23/23 [00:00<00:00, 96.29it/s]
 39%|███▉      | 9/23 [00:00<00:00, 89.62it/s]

{'ner': 0.24653992655274307}


100%|██████████| 23/23 [00:00<00:00, 92.79it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.92it/s]

{'ner': 1.999904036521951}


100%|██████████| 23/23 [00:00<00:00, 94.75it/s]
 43%|████▎     | 10/23 [00:00<00:00, 94.25it/s]

{'ner': 0.1735746757834584}


100%|██████████| 23/23 [00:00<00:00, 94.31it/s]
 39%|███▉      | 9/23 [00:00<00:00, 88.10it/s]

{'ner': 0.27866557828476235}


100%|██████████| 23/23 [00:00<00:00, 90.86it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.53it/s]

{'ner': 9.27963235593777e-05}


100%|██████████| 23/23 [00:00<00:00, 95.07it/s]
 43%|████▎     | 10/23 [00:00<00:00, 96.17it/s]

{'ner': 1.7998504948530346e-08}


100%|██████████| 23/23 [00:00<00:00, 96.61it/s]
 43%|████▎     | 10/23 [00:00<00:00, 94.30it/s]

{'ner': 8.934755481667129e-07}


100%|██████████| 23/23 [00:00<00:00, 97.22it/s]
 39%|███▉      | 9/23 [00:00<00:00, 82.14it/s]

{'ner': 1.0090262258247398e-07}


100%|██████████| 23/23 [00:00<00:00, 85.00it/s]
 43%|████▎     | 10/23 [00:00<00:00, 93.63it/s]

{'ner': 1.89149047206314e-09}


100%|██████████| 23/23 [00:00<00:00, 90.65it/s]

{'ner': 0.033997682367053444}
Entities in 'Trained completed for SUPPLIER entity.'





### 2.3)- Training Client Entity

In [22]:
clients1 = ["F.UN"]
clients2 = ["FUN"]
clients3 = ["F.UN BUSINESS SERVICES GMBH"]
clients4 = ["F.UN Business Services GmbH"]
clients5 = ["F.UN Business Services Gmbh"]

In [23]:
clients = [clients2,clients3,clients4,clients5]

In [24]:
TRAIN_DATA=[("F.UN", {'entities': [(0, len(clients1[0]), 'CLIENT')]})]
start=0
end=len(clients1[0])
for client in clients:
    start=end+1
    end=start+len(client[0])
    TRAIN_DATA.append(    (client[0], { 'entities': [(start, end , 'CLIENT')]}) )

In [25]:
print(TRAIN_DATA)

[('F.UN', {'entities': [(0, 4, 'CLIENT')]}), ('FUN', {'entities': [(5, 8, 'CLIENT')]}), ('F.UN BUSINESS SERVICES GMBH', {'entities': [(9, 36, 'CLIENT')]}), ('F.UN Business Services GmbH', {'entities': [(37, 64, 'CLIENT')]}), ('F.UN Business Services Gmbh', {'entities': [(65, 92, 'CLIENT')]})]


In [26]:
# new entity label
LABEL = 'CLIENT'

In [27]:
def extract_client(model=None, new_model_name='CLIENT', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for CLIENT entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)


In [28]:
extract_client()

Created blank 'en' model


100%|██████████| 5/5 [00:00<00:00, 73.63it/s]
100%|██████████| 5/5 [00:00<00:00, 92.77it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 9.033649280667305}
{'ner': 6.479349724948406}


100%|██████████| 5/5 [00:00<00:00, 87.65it/s]
100%|██████████| 5/5 [00:00<00:00, 91.89it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 3.506996516138315}
{'ner': 2.2681233918992803}


100%|██████████| 5/5 [00:00<00:00, 95.26it/s]
100%|██████████| 5/5 [00:00<00:00, 99.65it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.9955286299314139}
{'ner': 1.9916828940785098}


100%|██████████| 5/5 [00:00<00:00, 98.01it/s]
100%|██████████| 5/5 [00:00<00:00, 99.41it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.999166968271468}
{'ner': 1.9976523730154945}


100%|██████████| 5/5 [00:00<00:00, 92.72it/s]
100%|██████████| 5/5 [00:00<00:00, 90.57it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.992380347663199}
{'ner': 1.8520911290637327}


100%|██████████| 5/5 [00:00<00:00, 86.81it/s]
100%|██████████| 5/5 [00:00<00:00, 95.39it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.7086376070735887}
{'ner': 0.6766176413904098}


100%|██████████| 5/5 [00:00<00:00, 98.06it/s]
100%|██████████| 5/5 [00:00<00:00, 99.52it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.832845278929554}
{'ner': 1.8428983500782268}


100%|██████████| 5/5 [00:00<00:00, 97.87it/s]
100%|██████████| 5/5 [00:00<00:00, 94.73it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.8232847003590699}
{'ner': 0.49101646308326663}


100%|██████████| 5/5 [00:00<00:00, 91.94it/s]
100%|██████████| 5/5 [00:00<00:00, 96.01it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.6402773435495825}
{'ner': 0.1277916070847916}


100%|██████████| 5/5 [00:00<00:00, 96.77it/s]
100%|██████████| 5/5 [00:00<00:00, 98.79it/s]

{'ner': 0.4369164731390499}
{'ner': 0.028647689487016934}
Entities in 'Trained completed for CLIENT entity.'





### 2.4)-For Dates

- 1-EFFECTIVE_DATE
- 2-Signature Date
- 3-Termination Date
- 4-Commencement Date
- 5-End Date

#### 2.4.a.EFFECTIVE_DATE

In [29]:
dates1 = ["29 September 2018"]
dates2 = ["01 January 2015"]
dates3 = ["01.07.2018"]
dates4 = ["August 2017"]
dates5 = ["6 December 2016"]
dates6 = ["December 2015"]

In [30]:
eff_dates = [dates2,dates3,dates4,dates5,dates6]

In [31]:
TRAIN_DATA=[("29 September 2018", {'entities': [(0, len(dates1[0]), 'EFFECTIVE_DATE')]})]
start=0
end=len(dates1[0])
for date in eff_dates :
    start=end+1
    end=start+len(date[0])
    TRAIN_DATA.append(    (date[0], { 'entities': [(start, end , 'EFFECTIVE_DATE')]}) )

In [32]:
LABEL ="EFFECTIVE_DATE"

In [33]:
def extract_effective_dates(model=None, new_model_name='EFFECTIVE_DATE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for EFFECTIVE_DATE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [34]:
extract_effective_dates()

100%|██████████| 6/6 [00:00<00:00, 78.74it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

Created blank 'en' model
{'ner': 10.30357140302658}


100%|██████████| 6/6 [00:00<00:00, 97.54it/s]
100%|██████████| 6/6 [00:00<00:00, 100.58it/s]
100%|██████████| 6/6 [00:00<00:00, 102.37it/s]
100%|██████████| 6/6 [00:00<00:00, 103.64it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 6.269028350710869}
{'ner': 3.4171867585973814}
{'ner': 1.5350218202005408}
{'ner': 1.6487036570983336}


100%|██████████| 6/6 [00:00<00:00, 100.07it/s]
100%|██████████| 6/6 [00:00<00:00, 99.85it/s]
100%|██████████| 6/6 [00:00<00:00, 89.85it/s]
100%|██████████| 6/6 [00:00<00:00, 95.19it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 1.9347324424038088}
{'ner': 1.4597308347704}
{'ner': 1.1353474978828975}
{'ner': 4.583332219072757}


100%|██████████| 6/6 [00:00<00:00, 96.71it/s]
100%|██████████| 6/6 [00:00<00:00, 99.77it/s]
100%|██████████| 6/6 [00:00<00:00, 102.76it/s]
100%|██████████| 6/6 [00:00<00:00, 95.76it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 4.496348779913544}
{'ner': 2.3104241753891577}
{'ner': 1.7793649710933408}
{'ner': 1.4606332281888106}


100%|██████████| 6/6 [00:00<00:00, 94.03it/s]
100%|██████████| 6/6 [00:00<00:00, 97.60it/s]
100%|██████████| 6/6 [00:00<00:00, 102.22it/s]
100%|██████████| 6/6 [00:00<00:00, 99.77it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 1.4546013291582018}
{'ner': 0.7485857754563525}
{'ner': 0.32172870775165097}
{'ner': 0.030788899752578643}


100%|██████████| 6/6 [00:00<00:00, 97.41it/s]
100%|██████████| 6/6 [00:00<00:00, 101.80it/s]
100%|██████████| 6/6 [00:00<00:00, 97.79it/s]

{'ner': 0.035624344387169916}
{'ner': 0.016909284573464918}
{'ner': 1.9079024015676046e-05}
Entities in 'Trained completed for EFFECTIVE_DATE entity.'





#### 2.4.b.Signature Date

In [35]:
dates1 = ["31. July 2018"]
dates2 = ["August 30, 2017"]

In [36]:
sig_dates = [dates2]

In [37]:
TRAIN_DATA=[("31. July 2018", {'entities': [(0, len(dates1[0]), 'SIGNATURE_DATE')]})]
start=0
end=len(dates1[0])
for date in sig_dates :
    start=end+1
    end=start+len(date[0])
    TRAIN_DATA.append(    (date[0], { 'entities': [(start, end , 'SIGNATURE_DATE')]}) )

In [38]:
LABEL ="SIGNATURE_DATE"

In [39]:
def extract_sign_dates(model=None, new_model_name='SIGNATURE_DATE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for SIGNATURE_DATE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [40]:
extract_sign_dates()

100%|██████████| 2/2 [00:00<00:00, 49.50it/s]
100%|██████████| 2/2 [00:00<00:00, 75.93it/s]
100%|██████████| 2/2 [00:00<00:00, 74.69it/s]
100%|██████████| 2/2 [00:00<00:00, 75.15it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

Created blank 'en' model
{'ner': 6.053876906633377}
{'ner': 5.426923424005508}
{'ner': 4.637376815080643}
{'ner': 3.8786805272102356}


100%|██████████| 2/2 [00:00<00:00, 76.33it/s]
100%|██████████| 2/2 [00:00<00:00, 75.23it/s]
100%|██████████| 2/2 [00:00<00:00, 75.43it/s]
100%|██████████| 2/2 [00:00<00:00, 78.61it/s]
100%|██████████| 2/2 [00:00<00:00, 79.22it/s]
100%|██████████| 2/2 [00:00<00:00, 76.49it/s]
100%|██████████| 2/2 [00:00<00:00, 80.04it/s]
100%|██████████| 2/2 [00:00<00:00, 73.44it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

{'ner': 2.7813880443573}
{'ner': 1.496376033872366}
{'ner': 1.7697368653607555}
{'ner': 1.3944001489580842}
{'ner': 1.5814265888475347}
{'ner': 1.6313601114852645}
{'ner': 1.6431465404888854}
{'ner': 1.2683692755062452}


100%|██████████| 2/2 [00:00<00:00, 74.66it/s]
100%|██████████| 2/2 [00:00<00:00, 80.56it/s]
100%|██████████| 2/2 [00:00<00:00, 80.99it/s]
100%|██████████| 2/2 [00:00<00:00, 83.61it/s]
100%|██████████| 2/2 [00:00<00:00, 81.92it/s]
100%|██████████| 2/2 [00:00<00:00, 82.90it/s]
100%|██████████| 2/2 [00:00<00:00, 82.15it/s]
100%|██████████| 2/2 [00:00<00:00, 82.63it/s]

{'ner': 1.490145317937131}
{'ner': 5.865847913742545}
{'ner': 6.331122940740414}
{'ner': 6.162900563512863}
{'ner': 4.929626142758082}
{'ner': 3.335532743439229}
{'ner': 3.450244697142516}
{'ner': 3.1720150279145543}
Entities in 'Trained completed for SIGNATURE_DATE entity.'





#### 2.4.c.Termination Date

In [41]:
dates1 = ["period of 48 months"]
dates2 = ["36 months"]

In [42]:
ter_dates = [dates2]

In [43]:
TRAIN_DATA=[("period of 48 months", {'entities': [(0, len(dates1[0]), 'TERMINATION_DATE')]})]
start=0
end=len(dates1[0])
for date in ter_dates :
    start=end+1
    end=start+len(date[0])
    TRAIN_DATA.append(    (date[0], { 'entities': [(start, end , 'TERMINATION_DATE')]}) )

In [44]:
LABEL = "TERMINATION_DATE"

In [45]:
def extract_termination_dates(model=None, new_model_name='TERMINATION_DATE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for TERMINATION_DATE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [46]:
extract_termination_dates()

100%|██████████| 2/2 [00:00<00:00, 52.61it/s]
100%|██████████| 2/2 [00:00<00:00, 88.64it/s]
100%|██████████| 2/2 [00:00<00:00, 89.09it/s]
100%|██████████| 2/2 [00:00<00:00, 86.55it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

Created blank 'en' model
{'ner': 4.943156003952026}
{'ner': 4.733199179172516}
{'ner': 4.356341063976288}
{'ner': 3.5204191505908966}


100%|██████████| 2/2 [00:00<00:00, 82.57it/s]
100%|██████████| 2/2 [00:00<00:00, 87.85it/s]
100%|██████████| 2/2 [00:00<00:00, 90.77it/s]
100%|██████████| 2/2 [00:00<00:00, 91.79it/s]
100%|██████████| 2/2 [00:00<00:00, 94.12it/s]
100%|██████████| 2/2 [00:00<00:00, 90.31it/s]
100%|██████████| 2/2 [00:00<00:00, 91.02it/s]
100%|██████████| 2/2 [00:00<00:00, 85.55it/s]
100%|██████████| 2/2 [00:00<00:00, 91.38it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

{'ner': 2.9071980714797974}
{'ner': 2.270190954208374}
{'ner': 1.7271071132272482}
{'ner': 1.337209813296795}
{'ner': 1.469145882758312}
{'ner': 1.3036364294675877}
{'ner': 1.0354187743846524}
{'ner': 0.9137580034630446}
{'ner': 1.7502376503074402}


100%|██████████| 2/2 [00:00<00:00, 87.78it/s]
100%|██████████| 2/2 [00:00<00:00, 91.55it/s]
100%|██████████| 2/2 [00:00<00:00, 89.61it/s]
100%|██████████| 2/2 [00:00<00:00, 91.30it/s]
100%|██████████| 2/2 [00:00<00:00, 93.38it/s]
100%|██████████| 2/2 [00:00<00:00, 90.62it/s]
100%|██████████| 2/2 [00:00<00:00, 89.49it/s]

{'ner': 6.1775904347072625}
{'ner': 0.8103851367929167}
{'ner': 5.661051008140973}
{'ner': 5.383663495234032}
{'ner': 3.7841036609124217}
{'ner': 2.8038497413799632}
{'ner': 3.456122545365247}
Entities in 'Trained completed for TERMINATION_DATE entity.'





#### 2.4.d.Commencement Date

In [47]:
dates1 = ["31.01.2017"]
dates2 = ["31.03.2019"]
dates3 = ["1 October 2018"]
dates4 = ["September 1st, 2017"]

In [48]:
comm_dates = [dates2,dates3,dates4]

In [49]:
TRAIN_DATA=[("31.01.2017", {'entities': [(0, len(dates1[0]), 'COMMENCEMENT_DATE')]})]
start=0
end=len(dates1[0])
for date in comm_dates :
    start=end+1
    end=start+len(date[0])
    TRAIN_DATA.append(    (date[0], { 'entities': [(start, end , 'COMMENCEMENT_DATE')]}) )

In [50]:
LABEL ="COMMENCEMENT_DATE"

In [51]:
def extract_commencement_dates(model=None, new_model_name='COMMENCEMENT_DATE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for Commencement Date entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [52]:
extract_commencement_dates()

100%|██████████| 4/4 [00:00<00:00, 73.81it/s]
100%|██████████| 4/4 [00:00<00:00, 98.24it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

Created blank 'en' model
{'ner': 7.089648544788361}
{'ner': 6.0705262422561646}


100%|██████████| 4/4 [00:00<00:00, 93.96it/s]
100%|██████████| 4/4 [00:00<00:00, 94.29it/s]
100%|██████████| 4/4 [00:00<00:00, 90.35it/s]
100%|██████████| 4/4 [00:00<00:00, 96.63it/s]
100%|██████████| 4/4 [00:00<00:00, 92.45it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

{'ner': 4.51494225859642}
{'ner': 3.0207329988479614}
{'ner': 1.4106221608817577}
{'ner': 1.6813012509373948}
{'ner': 1.7543828021160834}


100%|██████████| 4/4 [00:00<00:00, 94.00it/s]
100%|██████████| 4/4 [00:00<00:00, 98.63it/s]
100%|██████████| 4/4 [00:00<00:00, 94.16it/s]
100%|██████████| 4/4 [00:00<00:00, 99.85it/s]
100%|██████████| 4/4 [00:00<00:00, 98.70it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

{'ner': 1.9406036896752994}
{'ner': 1.321605007190616}
{'ner': 1.668143688781521}
{'ner': 0.8650690611055069}
{'ner': 1.2267703778363552}


100%|██████████| 4/4 [00:00<00:00, 96.86it/s]
100%|██████████| 4/4 [00:00<00:00, 92.98it/s]
100%|██████████| 4/4 [00:00<00:00, 91.82it/s]
100%|██████████| 4/4 [00:00<00:00, 94.44it/s]
100%|██████████| 4/4 [00:00<00:00, 93.89it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

{'ner': 0.5571290441307533}
{'ner': 0.7114385982803557}
{'ner': 0.4528223641294699}
{'ner': 0.5023184886911283}
{'ner': 0.9247863397019926}


100%|██████████| 4/4 [00:00<00:00, 96.13it/s]
100%|██████████| 4/4 [00:00<00:00, 93.30it/s]
100%|██████████| 4/4 [00:00<00:00, 94.57it/s]

{'ner': 0.029219874588463278}
{'ner': 0.007346255715742007}
{'ner': 0.0019570532583987406}
Entities in 'Trained completed for Commencement Date entity.'





#### 2.4.e.End Date

In [53]:
dates1 = ["31.12.2018"]
dates2 = ["Apr 11th 2023"]
dates3 = ["19.01.2020"]
dates4 = ["July 31"]
dates5 = ["2017"]

In [54]:
end_dates = [dates2,dates3,dates4]

In [55]:
TRAIN_DATA=[("31.12.2018", {'entities': [(0, len(dates1[0]), 'END_DATE')]})]
start=0
end=len(dates1[0])
for date in end_dates :
    start=end+1
    end=start+len(date[0])
    TRAIN_DATA.append(    (date[0], { 'entities': [(start, end , 'END_DATE')]}) )

In [56]:
LABEL ="END_DATE"

In [57]:
def extract_end_dates(model=None, new_model_name='END_DATE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for END_DATE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [58]:
extract_end_dates()

100%|██████████| 4/4 [00:00<00:00, 69.88it/s]
100%|██████████| 4/4 [00:00<00:00, 95.73it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

Created blank 'en' model
{'ner': 4.947470456361771}
{'ner': 4.0748313665390015}


100%|██████████| 4/4 [00:00<00:00, 91.88it/s]
100%|██████████| 4/4 [00:00<00:00, 97.24it/s]
100%|██████████| 4/4 [00:00<00:00, 99.59it/s]
100%|██████████| 4/4 [00:00<00:00, 100.93it/s]
100%|██████████| 4/4 [00:00<00:00, 84.38it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

{'ner': 2.7247445061802864}
{'ner': 2.191546332091093}
{'ner': 1.4127049886155874}
{'ner': 1.8131061284839234}
{'ner': 1.649262026409815}


100%|██████████| 4/4 [00:00<00:00, 78.19it/s]
100%|██████████| 4/4 [00:00<00:00, 92.55it/s]
100%|██████████| 4/4 [00:00<00:00, 93.52it/s]
100%|██████████| 4/4 [00:00<00:00, 97.04it/s]
100%|██████████| 4/4 [00:00<00:00, 98.48it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

{'ner': 1.918912617567969}
{'ner': 1.5959832744164064}
{'ner': 1.5920226904661938}
{'ner': 1.0450129452198968}
{'ner': 0.6247774846432943}


100%|██████████| 4/4 [00:00<00:00, 95.78it/s]
100%|██████████| 4/4 [00:00<00:00, 98.19it/s]
100%|██████████| 4/4 [00:00<00:00, 100.63it/s]
100%|██████████| 4/4 [00:00<00:00, 99.07it/s]
100%|██████████| 4/4 [00:00<00:00, 101.58it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

{'ner': 0.4037211252523479}
{'ner': 0.22260800075199222}
{'ner': 0.11878446683609399}
{'ner': 0.18414629697058355}
{'ner': 0.34372974077741314}


100%|██████████| 4/4 [00:00<00:00, 91.34it/s]
100%|██████████| 4/4 [00:00<00:00, 81.90it/s]
100%|██████████| 4/4 [00:00<00:00, 93.45it/s]

{'ner': 0.0012590960008085263}
{'ner': 0.0008138548026595006}
{'ner': 3.798874985644923e-06}
Entities in 'Trained completed for END_DATE entity.'





### 2.5)- Countries

In [59]:
countries1 = ["UK"]
countries2 = ["Germany"]
countries3 = ["France"]
countries4 = ["Italy"]
countries5 = ["Netherlands"]
countries6 = ["Russia"]
countries7 = ["Hungary"]
countries8 = ["India"]
countries9 = ["Slovakia"]
countries10 = ["Czech"]
countries11 = ["Australia"]
countries12 = ["Vietnam"]
countries13 = ["Japan"]
countries14 = ["Philippines"]
countries15 = ["Romania"]
countries16 = ["Sweden"]
countries17 = ["Czech Republic"]
countries18 = ["United Kingdom"]
countries19 = ["Switzerland"]

In [60]:
countr = [countries2,countries3,countries4,countries5,countries6,countries7,countries8,countries9,
         countries10,countries11,countries12,countries13,countries14,countries15,countries16,countries17,
         countries18,countries19]

In [61]:
TRAIN_DATA=[("UK", {'entities': [(0, len(countries1[0]), 'COUNTRIES')]})]
start=0
end=len(dates1[0])
for country in countr:
    start=end+1
    end=start+len(country[0])
    TRAIN_DATA.append(    (country[0], { 'entities': [(start, end , 'COUNTRIES')]}) )

In [62]:
LABEL ="COUNTRIES"

In [63]:
def extract_countries(model=None, new_model_name='COUNTRIES', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for COUNTRIES entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [64]:
extract_countries()

 47%|████▋     | 9/19 [00:00<00:00, 81.83it/s]

Created blank 'en' model


100%|██████████| 19/19 [00:00<00:00, 79.07it/s]
100%|██████████| 19/19 [00:00<00:00, 102.50it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 7.701164316560607}
{'ner': 2.0027100906128954}


100%|██████████| 19/19 [00:00<00:00, 90.51it/s]
100%|██████████| 19/19 [00:00<00:00, 94.50it/s]

{'ner': 1.9981808834804895}



100%|██████████| 19/19 [00:00<00:00, 104.54it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 1.972686264959852}
{'ner': 0.7604672268679362}


100%|██████████| 19/19 [00:00<00:00, 85.84it/s]
100%|██████████| 19/19 [00:00<00:00, 105.15it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 0.5250287853463376}
{'ner': 0.19536710097971688}


100%|██████████| 19/19 [00:00<00:00, 92.57it/s]
100%|██████████| 19/19 [00:00<00:00, 105.87it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 0.054136164835510696}
{'ner': 0.017382937662132742}


100%|██████████| 19/19 [00:00<00:00, 104.63it/s]
100%|██████████| 19/19 [00:00<00:00, 97.58it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 1.73794436138067e-05}
{'ner': 1.463857707091081e-06}


100%|██████████| 19/19 [00:00<00:00, 103.54it/s]
100%|██████████| 19/19 [00:00<00:00, 99.81it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 0.00016493705034498386}
{'ner': 2.7257451091900185e-10}


100%|██████████| 19/19 [00:00<00:00, 104.86it/s]
100%|██████████| 19/19 [00:00<00:00, 105.04it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 6.709131347594189e-09}
{'ner': 4.740873786632132e-06}


100%|██████████| 19/19 [00:00<00:00, 104.18it/s]
100%|██████████| 19/19 [00:00<00:00, 106.94it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 3.036144710781991e-07}
{'ner': 4.027426137865939e-09}


100%|██████████| 19/19 [00:00<00:00, 95.55it/s]
100%|██████████| 19/19 [00:00<00:00, 102.10it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

{'ner': 5.03091968963497e-09}
{'ner': 7.276473127447225e-09}


100%|██████████| 19/19 [00:00<00:00, 103.94it/s]

{'ner': 1.3457693911205835e-06}
Entities in 'Trained completed for COUNTRIES entity.'





# 3)-Reassuring list of Data

In [74]:
# for title
title1  = ["Agreement on Managed Data Center Services"]
title2  = ["Master Services Agreement on the Provision of IT Services"]
title3  = ["Master Services Agreement on the Provision of IT Services (“Agreement“ or “Master Services Agreement”)"]
title4  = ["MASTER SERVICES AGREEMENT ON THE PROVISION OF MANAGED SERVICES IN PUBLIC COULDS"]
title5  = ["Master Services Agreement (“Agreement“ or “Master Services Agreement”) on the provision of Managed Services in Public Clouds"]
title6  = ["Agreement on the Provision of MANAGED PRINT Services"]
title7  = ["Agreement on the Provision of MPS (Managed Print Services)"]
title8  = ["Agreement for Security Operation Center Services"]
title9  = ["AGREEMENT ON PROVISIONING OF IT AND COMMUNICATION SERVICES"]
title10 = ["Agreement on Managed Data Center Services"]
title11 = ["Master Project, Support and Maintenance Agreement"]
title12 = ["ENTERPRISE CUSTOMER AGREEMENT"]
title13 = ["AGREEMENT on the provision of managed Mobile communication Services"]
title14 = ["MASTER SERVICE AGREEMENT"]
title15 = ["Agreement for Security Operation Center Services"]
#for supplier
suppliers1 = ["TEASYS", "Teasys", "TEASYS GLOBAL INVEST AG", "Teasys Global Invest AG","teasys global invest ag"]
suppliers2 = ["FTP", "FTP Deutschland GmbH", "FTP Deutschland GmbH"]
suppliers3 = ["Wisniewski & Sohn GmbH", "FBS"]
suppliers4 = ["Horizon Deutschland AG", "Horizon", "Harpe", "Harpe Deutschland GmbH"]
suppliers5 = ["ADVENTURE SERVICES GMBH", "Adventure Services GmbH", "SWIPERO LIMITED", "Swipero Limited",
                          "Swipero"]
suppliers6 = ["Nozama Net Service","NOZAMA NET SERVICE"]
suppliers7 = ["Schwyz Mail Solutions GmbH"]
suppliers8 = ["Verizon Deutschland GmbH"]
# for client
clients = ["F.UN", "FUN", "F.UN BUSINESS SERVICES GMBH", "F.UN Business Services GmbH"]
#for client manager
cli_cont_managr1 = ["Amanda Kyzwani"]
#for supplier manager
supp_cont_manar1 = ["Tim Big"]
# for Dates
dates1 = ["29 September 2018", "01 January 2015", "01.07.2018"," August 2017","6 December 2016","December 2015"]
dates2 = ["31. July 2018","August 30, 2017"]
dates3 = ["period of 48 months","36 months"]
dates4 = [ "31.01.2017", "31.03.2019", "1 October 2018","September 1st, 2017"]
dates5 = ["31.12.2018", "Apr 11th 2023","19.01.2020","July 31, 2017"]
# Countries
countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"]
countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"]
countries3 = ["Sweden","Czech Republic","United Kingdom","Switzerland"]

# 4)- Buidling NLPruler using all trained entities

In [75]:
rulerAll = EntityRuler(nlp, overwrite_ents=True)

In [76]:
# For Title entity

for tit1 in title1:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}])

for tit2 in title2:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}])

for tit3 in title3:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}])

for tit4 in title4:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}])

for tit5 in title5:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}])

for tit6 in title6:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}])

for tit7 in title7:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit7}])

for tit8 in title8:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}])
    
for tit9 in title9:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}])
    
for tit10 in title10:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit10}])

for tit11 in title11:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit11}])

for tit12 in title12:
	rulerAll.add_patterns([{"label": "TITLE", "pattern": tit12}])
    
# for supplier

for s1 in suppliers1:
        rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}])

for s2 in suppliers2:
    rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}])

for s3 in suppliers3:
    rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}])

for s4 in suppliers4:
    rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}])

for s5 in suppliers5:
    rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}])
    
for s6 in suppliers6:
    rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s6}])
    
for s7 in suppliers7:
    rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s7}])
    
for s8 in suppliers8:
    rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s8}])

# for clients

for c1 in clients:
	rulerAll.add_patterns([{"label": "CLIENT", "pattern": c1}])

# Pattern for DATES

for t1 in dates1:
	rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}])

for t2 in dates2:
	rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}])

for t3 in dates3:
	rulerAll.add_patterns([{"label": "Termination-DATES", "pattern": t3}])

for t4 in dates4:
	rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}])

for t5 in dates5:
	rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}])

# for countries

for count1 in countries1:
    rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}])

for count2 in countries2:
    rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}])
    
for count3 in countries3:
    rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count3}])

#CLIENT_CONTRACT_MANAGER

for c_mangr in cli_cont_managr1:
    rulerAll.add_patterns([{"label": "CLIENT_CONTRACT_MANAGER", "pattern": c_mangr}])

# SUPPLIER_CONTRACT_MANAGER

for supp_mangr in supp_cont_manar1:
    rulerAll.add_patterns([{"label": "SUPPLIER_CONTRACT_MANAGER", "pattern": supp_mangr}])

In [71]:
rulerAll = EntityRuler(nlp, overwrite_ents=True)

In [77]:
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [78]:
rulerAll.name = 'rulerAll'
nlp.add_pipe(rulerAll)
print(nlp.pipe_names)

['tagger', 'parser', 'ner', 'rulerAll']


# 5)-  Test_text

In [79]:
text = """Agreement for Security Operation Center Services between Verizon Deutschland GmbH Sebrathweg 20 D-44149 Dortmund‑ hereinafter referred to as “Contractor” ‑and F.UN Business Services GmbH Humboldtstraße 33 D-30169 Hannover‑ hereinafter referred to as “EBS” - both hereinafter collectively referred to as the “Contracting Parties” ‑ October 09, 2018."""

In [80]:
#convert to spacy token
with nlp.disable_pipes('ner'):
    doc = nlp(text)

# 6)- Confidence Score

Using beam algorithm

In [81]:
from spacy.scorer import Scorer
scorer = Scorer()

In [82]:
threshold = 0.2
beams = nlp.entity.beam_parse([ doc ], beam_width = 3, beam_density = 0.0001)
beams

[<thinc.extra.search.Beam at 0x11ee845f0>]

In [84]:
entity_scores = defaultdict(float)
for beam in beams:
    for score, ents in nlp.entity.moves.get_beam_parses(beam):
        for start, end, label in ents:
            entity_scores[(start, end, label)] += score
ent_found=[]
ent_label=[]
ent_score=[]

for key in entity_scores:
    start, end, label = key
    score = entity_scores[key]
    if ( score > threshold):
        ent_found.append(label)
        ent_label.append(doc[start:end])
        ent_score.append(score)

In [85]:
df_ent_score = pd.DataFrame({'ENT_DETECT': [], 'ENT_LABEL': [],'CONFIDENCE':[]})
df_ent_score['ENT_DETECT']=ent_found
df_ent_score['ENT_LABEL']=ent_label
df_ent_score['CONFIDENCE']=ent_score
df_ent_score[(df_ent_score.ENT_DETECT=="TITLE") | (df_ent_score.ENT_DETECT=="CLIENT") |(df_ent_score.ENT_DETECT=="SUPPLIER")
             | (df_ent_score.ENT_DETECT=="COUNTRIES")| (df_ent_score.ENT_DETECT=="Effective-DATES")| (df_ent_score.ENT_DETECT=="Signature-DATES")
             | (df_ent_score.ENT_DETECT=="Termination-DATES")| (df_ent_score.ENT_DETECT=="Commencement-DATES")| (df_ent_score.ENT_DETECT=="END-DATES")
             | (df_ent_score.ENT_DETECT=="CLIENT_CONTRACT_MANAGER")| (df_ent_score.ENT_DETECT=="SUPPLIER_CONTRACT_MANAGER")]

Unnamed: 0,ENT_DETECT,ENT_LABEL,CONFIDENCE
0,TITLE,"(Agreement, for, Security, Operation, Center, ...",1.0
1,SUPPLIER,"(Verizon, Deutschland, GmbH)",1.0
3,CLIENT,"(F.UN, Business, Services, GmbH)",1.0


# END