# Training Spacy Model

# 1)- Importing key Modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import spacy
import re
import string
import pandas as pd
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm # loading bar

# 2)-Training Entities

### 2.1)- Training Titles

In [5]:
title1  = ["Agreement on Managed Data Center Services"]
title2  = ["Master Services Agreement on the Provision of IT Services"]
title3  = ["Master Services Agreement on the Provision of IT Services (“Agreement“ or “Master Services Agreement”)"]
title4  = ["MASTER SERVICES AGREEMENT ON THE PROVISION OF MANAGED SERVICES IN PUBLIC COULDS"]
title5  = ["Master Services Agreement (“Agreement“ or “Master Services Agreement”) on the provision of Managed Services in Public Clouds"]
title6  = ["Agreement on the Provision of MANAGED PRINT Services"]
title7  = ["Agreement on the Provision of MPS (Managed Print Services)"]
title8  = ["Agreement for Security Operation Center Services"]
title9  = ["AGREEMENT ON PROVISIONING OF IT AND COMMUNICATION SERVICES"]
title10 = ["Agreement on Managed Data Center Services"]
title11 = ["Master Project, Support and Maintenance Agreement"]
title12 = ["ENTERPRISE CUSTOMER AGREEMENT"]
title13 = ["AGREEMENT on the provision of managed Mobile communication Services"]
title14 = ["MASTER SERVICE AGREEMENT"]
title15 = ["Agreement for Security Operation Center Services"]

In [6]:
Titles = [title2,title2, title3, title4, title5, title6, title7, title8, title9, title10, title11, title12, title13, title14,title15]

In [7]:
TRAIN_DATA=[('Agreement on Managed Data Center Services', {'entities': [(0, len(title1[0]), 'TITLE')]})]
start=0
end=len(title1[0])
for title in Titles:
    start=end+1
    end=start+len(title[0])
    TRAIN_DATA.append(    (title[0], { 'entities': [(start, end , 'TITLE')]}) )   

In [8]:
print(TRAIN_DATA)

[('Agreement on Managed Data Center Services', {'entities': [(0, 41, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services', {'entities': [(42, 99, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services', {'entities': [(100, 157, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services (“Agreement“ or “Master Services Agreement”)', {'entities': [(158, 260, 'TITLE')]}), ('MASTER SERVICES AGREEMENT ON THE PROVISION OF MANAGED SERVICES IN PUBLIC COULDS', {'entities': [(261, 340, 'TITLE')]}), ('Master Services Agreement (“Agreement“ or “Master Services Agreement”) on the provision of Managed Services in Public Clouds', {'entities': [(341, 465, 'TITLE')]}), ('Agreement on the Provision of MANAGED PRINT Services', {'entities': [(466, 518, 'TITLE')]}), ('Agreement on the Provision of MPS (Managed Print Services)', {'entities': [(519, 577, 'TITLE')]}), ('Agreement for Security Operation Center Services', {'entities': [(578, 626, 'TITLE')]}),

# Start training Title Entity

In [9]:
# Define our variables and model path to be laoded
model = None
output_dir=Path("/Users/hassansherwani/Documents/Python/Spacy")
n_iter=100

In [10]:
if model is not None:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

Created blank 'en' model


In [11]:
# create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

In [12]:
# new entity label
LABEL = 'TITLE'

In [13]:
def extract_title(model=None, new_model_name='TITLE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for TITLE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [14]:
# Run our Function
extract_title()

Created blank 'en' model


100%|██████████| 16/16 [00:00<00:00, 64.88it/s]
 44%|████▍     | 7/16 [00:00<00:00, 69.37it/s]

{'ner': 60.51998977167386}


100%|██████████| 16/16 [00:00<00:00, 74.14it/s]
 44%|████▍     | 7/16 [00:00<00:00, 69.82it/s]

{'ner': 2.002526590367576}


100%|██████████| 16/16 [00:00<00:00, 71.62it/s]
 50%|█████     | 8/16 [00:00<00:00, 76.81it/s]

{'ner': 2.0000000000036713}


100%|██████████| 16/16 [00:00<00:00, 72.52it/s]
 50%|█████     | 8/16 [00:00<00:00, 71.78it/s]

{'ner': 2.0000000919416285}


100%|██████████| 16/16 [00:00<00:00, 72.86it/s]
 50%|█████     | 8/16 [00:00<00:00, 74.33it/s]

{'ner': 1.9986847411539674}


100%|██████████| 16/16 [00:00<00:00, 72.82it/s]
 50%|█████     | 8/16 [00:00<00:00, 74.04it/s]

{'ner': 2.135781039047962}


100%|██████████| 16/16 [00:00<00:00, 72.21it/s]
 50%|█████     | 8/16 [00:00<00:00, 74.18it/s]

{'ner': 1.9966063640570786}


100%|██████████| 16/16 [00:00<00:00, 72.00it/s]
 50%|█████     | 8/16 [00:00<00:00, 75.47it/s]

{'ner': 2.0000408445912035}


100%|██████████| 16/16 [00:00<00:00, 73.45it/s]
 44%|████▍     | 7/16 [00:00<00:00, 69.69it/s]

{'ner': 1.9765049039612907}


100%|██████████| 16/16 [00:00<00:00, 70.74it/s]
 44%|████▍     | 7/16 [00:00<00:00, 69.95it/s]

{'ner': 2.0141488461971155}


100%|██████████| 16/16 [00:00<00:00, 71.77it/s]
 50%|█████     | 8/16 [00:00<00:00, 69.90it/s]

{'ner': 1.684016176106284}


100%|██████████| 16/16 [00:00<00:00, 68.85it/s]
 50%|█████     | 8/16 [00:00<00:00, 79.47it/s]

{'ner': 2.4662655303508316}


100%|██████████| 16/16 [00:00<00:00, 72.39it/s]
 50%|█████     | 8/16 [00:00<00:00, 78.26it/s]

{'ner': 2.0988210450073344}


100%|██████████| 16/16 [00:00<00:00, 71.78it/s]
 50%|█████     | 8/16 [00:00<00:00, 71.33it/s]

{'ner': 1.996878275361673}


100%|██████████| 16/16 [00:00<00:00, 72.24it/s]
 50%|█████     | 8/16 [00:00<00:00, 75.08it/s]

{'ner': 2.0518299015175434}


100%|██████████| 16/16 [00:00<00:00, 72.22it/s]
 50%|█████     | 8/16 [00:00<00:00, 73.83it/s]

{'ner': 1.9978471565619436}


100%|██████████| 16/16 [00:00<00:00, 72.81it/s]
 56%|█████▋    | 9/16 [00:00<00:00, 83.13it/s]

{'ner': 2.4950549698902873}


100%|██████████| 16/16 [00:00<00:00, 72.54it/s]
 44%|████▍     | 7/16 [00:00<00:00, 67.54it/s]

{'ner': 1.9411763723591742}


100%|██████████| 16/16 [00:00<00:00, 71.83it/s]
 50%|█████     | 8/16 [00:00<00:00, 73.46it/s]

{'ner': 2.0000381066986783}


100%|██████████| 16/16 [00:00<00:00, 72.67it/s]

{'ner': 9.974401021135241}
Entities in 'Trained completed for TITLE entity.'





### 2.2)- Training Supplier Entity

In [15]:
suppliers1 = ["TEASYS"]
suppliers2 = ["Teasys"]
suppliers3 = ["TEASYS GLOBAL INVEST AG"]
suppliers4 = ["Teasys Global Invest AG"]
suppliers5 = ["teasys global invest ag"]
suppliers6 = ["FTP"]
suppliers7 = ["FTP Deutschland GmbH"]
suppliers8 = ["FTP Deutschland GmbH"]
suppliers9 = ["Wisniewski & Sohn GmbH"]
suppliers10 = ["FBS"]
suppliers11 = ["Horizon Deutschland AG"]
suppliers12 = ["Horizon"]
suppliers13 = ["Harpe"]
suppliers14 = ["Harpe Deutschland GmbH"]
suppliers15 = ["ADVENTURE SERVICES GMBH"]
suppliers16 = ["Adventure Services GmbH"]
suppliers17 = ["SWIPERO LIMITED"]
suppliers18 = ["Swipero Limited"]
suppliers19 = ["Swipero"]
suppliers20 = ["Nozama Net Service"]
suppliers21 = ["NOZAMA NET SERVICE"]
suppliers22 = ["Schwyz Mail Solutions GmbH"]
suppliers23 = ["Verizon Deutschland GmbH"]


In [16]:
suppliers = [suppliers2,suppliers3,suppliers4,suppliers5,suppliers6,suppliers7,suppliers8,
            suppliers9,suppliers10,suppliers11,suppliers12,suppliers13,suppliers14,suppliers15,suppliers16,
            suppliers17,suppliers18,suppliers19, suppliers20,suppliers21,suppliers22,suppliers23]

In [17]:
TRAIN_DATA=[("TEASYS", {'entities': [(0, len(suppliers1[0]), 'SUPPLIER')]})]
start=0
end=len(suppliers1[0])
for supplier in suppliers:
    start=end+1
    end=start+len(supplier[0])
    TRAIN_DATA.append(    (supplier[0], { 'entities': [(start, end , 'SUPPLIER')]}) ) 

In [18]:
print(TRAIN_DATA)

[('TEASYS', {'entities': [(0, 6, 'SUPPLIER')]}), ('Teasys', {'entities': [(7, 13, 'SUPPLIER')]}), ('TEASYS GLOBAL INVEST AG', {'entities': [(14, 37, 'SUPPLIER')]}), ('Teasys Global Invest AG', {'entities': [(38, 61, 'SUPPLIER')]}), ('teasys global invest ag', {'entities': [(62, 85, 'SUPPLIER')]}), ('FTP', {'entities': [(86, 89, 'SUPPLIER')]}), ('FTP Deutschland GmbH', {'entities': [(90, 110, 'SUPPLIER')]}), ('FTP Deutschland GmbH', {'entities': [(111, 131, 'SUPPLIER')]}), ('Wisniewski & Sohn GmbH', {'entities': [(132, 154, 'SUPPLIER')]}), ('FBS', {'entities': [(155, 158, 'SUPPLIER')]}), ('Horizon Deutschland AG', {'entities': [(159, 181, 'SUPPLIER')]}), ('Horizon', {'entities': [(182, 189, 'SUPPLIER')]}), ('Harpe', {'entities': [(190, 195, 'SUPPLIER')]}), ('Harpe Deutschland GmbH', {'entities': [(196, 218, 'SUPPLIER')]}), ('ADVENTURE SERVICES GMBH', {'entities': [(219, 242, 'SUPPLIER')]}), ('Adventure Services GmbH', {'entities': [(243, 266, 'SUPPLIER')]}), ('SWIPERO LIMITED', {'entiti

In [19]:
# new entity label
LABEL = 'SUPPLIER'

In [20]:
def extract_supplier(model=None, new_model_name='SUPPLIER', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for SUPPLIER entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [21]:
# Run our Function
extract_supplier()

Created blank 'en' model


100%|██████████| 23/23 [00:00<00:00, 90.20it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.48it/s]

{'ner': 20.828335899832382}


100%|██████████| 23/23 [00:00<00:00, 97.06it/s]
 43%|████▎     | 10/23 [00:00<00:00, 95.13it/s]

{'ner': 1.9621293089619274}


100%|██████████| 23/23 [00:00<00:00, 95.43it/s]
 43%|████▎     | 10/23 [00:00<00:00, 92.61it/s]

{'ner': 1.9997165612492285}


100%|██████████| 23/23 [00:00<00:00, 93.77it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.91it/s]

{'ner': 1.999008047578653}


100%|██████████| 23/23 [00:00<00:00, 97.03it/s]
 43%|████▎     | 10/23 [00:00<00:00, 93.80it/s]

{'ner': 1.9315424464626312}


100%|██████████| 23/23 [00:00<00:00, 96.11it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.90it/s]

{'ner': 1.938999886770669}


100%|██████████| 23/23 [00:00<00:00, 98.81it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.48it/s]

{'ner': 1.7621035302801915}


100%|██████████| 23/23 [00:00<00:00, 95.67it/s]
 43%|████▎     | 10/23 [00:00<00:00, 95.23it/s]

{'ner': 0.2606256821624407}


100%|██████████| 23/23 [00:00<00:00, 97.67it/s]
 48%|████▊     | 11/23 [00:00<00:00, 100.59it/s]

{'ner': 0.513857896635444}


100%|██████████| 23/23 [00:00<00:00, 97.26it/s] 
 43%|████▎     | 10/23 [00:00<00:00, 98.88it/s]

{'ner': 0.01739760301023017}


100%|██████████| 23/23 [00:00<00:00, 99.10it/s]
 43%|████▎     | 10/23 [00:00<00:00, 95.89it/s]

{'ner': 0.007478549574456397}


100%|██████████| 23/23 [00:00<00:00, 96.18it/s]
 43%|████▎     | 10/23 [00:00<00:00, 99.57it/s]

{'ner': 0.004246786854980652}


100%|██████████| 23/23 [00:00<00:00, 97.99it/s] 
 43%|████▎     | 10/23 [00:00<00:00, 97.43it/s]

{'ner': 2.1490215111997477}


100%|██████████| 23/23 [00:00<00:00, 96.59it/s]
 48%|████▊     | 11/23 [00:00<00:00, 101.48it/s]

{'ner': 4.51043578928763e-06}


100%|██████████| 23/23 [00:00<00:00, 98.38it/s] 
 43%|████▎     | 10/23 [00:00<00:00, 98.57it/s]

{'ner': 1.918183357929619e-06}


100%|██████████| 23/23 [00:00<00:00, 96.21it/s]
 43%|████▎     | 10/23 [00:00<00:00, 96.90it/s]

{'ner': 0.00033322394570448716}


100%|██████████| 23/23 [00:00<00:00, 97.04it/s]
 43%|████▎     | 10/23 [00:00<00:00, 98.73it/s]

{'ner': 8.75548268572628e-09}


100%|██████████| 23/23 [00:00<00:00, 96.81it/s]
 43%|████▎     | 10/23 [00:00<00:00, 94.01it/s]

{'ner': 1.5742011014391503e-09}


100%|██████████| 23/23 [00:00<00:00, 90.28it/s]
 39%|███▉      | 9/23 [00:00<00:00, 89.66it/s]

{'ner': 3.96604538967088e-09}


100%|██████████| 23/23 [00:00<00:00, 83.62it/s]

{'ner': 1.6639237861188606e-07}
Entities in 'Trained completed for SUPPLIER entity.'





### 2.3)- Training Client Entity

In [22]:
clients1 = ["F.UN"]
clients2 = ["FUN"]
clients3 = ["F.UN BUSINESS SERVICES GMBH"]
clients4 = ["F.UN Business Services GmbH"]
clients5 = ["F.UN Business Services Gmbh"]

In [23]:
clients = [clients2,clients3,clients4,clients5]

In [24]:
TRAIN_DATA=[("F.UN", {'entities': [(0, len(clients1[0]), 'CLIENT')]})]
start=0
end=len(clients1[0])
for client in clients:
    start=end+1
    end=start+len(client[0])
    TRAIN_DATA.append(    (client[0], { 'entities': [(start, end , 'CLIENT')]}) )

In [25]:
print(TRAIN_DATA)

[('F.UN', {'entities': [(0, 4, 'CLIENT')]}), ('FUN', {'entities': [(5, 8, 'CLIENT')]}), ('F.UN BUSINESS SERVICES GMBH', {'entities': [(9, 36, 'CLIENT')]}), ('F.UN Business Services GmbH', {'entities': [(37, 64, 'CLIENT')]}), ('F.UN Business Services Gmbh', {'entities': [(65, 92, 'CLIENT')]})]


In [26]:
# new entity label
LABEL = 'CLIENT'

In [27]:
def extract_client(model=None, new_model_name='CLIENT', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for CLIENT entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)


In [28]:
extract_client()

Created blank 'en' model


100%|██████████| 5/5 [00:00<00:00, 55.79it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 9.293746873736382}


100%|██████████| 5/5 [00:00<00:00, 79.01it/s]
100%|██████████| 5/5 [00:00<00:00, 91.52it/s]
100%|██████████| 5/5 [00:00<00:00, 92.21it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 6.2023725882172585}
{'ner': 3.233329739421606}
{'ner': 1.6969372120165644}


100%|██████████| 5/5 [00:00<00:00, 90.49it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.7312981267545402}


100%|██████████| 5/5 [00:00<00:00, 88.52it/s]
100%|██████████| 5/5 [00:00<00:00, 93.40it/s]
100%|██████████| 5/5 [00:00<00:00, 94.56it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.898283965912694}
{'ner': 1.9746101853484292}
{'ner': 1.874770919901006}


100%|██████████| 5/5 [00:00<00:00, 82.30it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.9484791273563136}


100%|██████████| 5/5 [00:00<00:00, 59.63it/s]
100%|██████████| 5/5 [00:00<00:00, 57.54it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.313150381496463}
{'ner': 1.5978639036347486}


100%|██████████| 5/5 [00:00<00:00, 57.52it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.6651197049097841}


100%|██████████| 5/5 [00:00<00:00, 52.46it/s]
100%|██████████| 5/5 [00:00<00:00, 61.78it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.7181362955660672}
{'ner': 0.4106546340895002}


100%|██████████| 5/5 [00:00<00:00, 70.99it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.1099900756564245}


100%|██████████| 5/5 [00:00<00:00, 85.98it/s]
100%|██████████| 5/5 [00:00<00:00, 88.88it/s]
100%|██████████| 5/5 [00:00<00:00, 87.58it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.1539290872341344}
{'ner': 0.34035893339953566}
{'ner': 0.004204121510027885}


100%|██████████| 5/5 [00:00<00:00, 73.45it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.000358960268647179}


100%|██████████| 5/5 [00:00<00:00, 64.43it/s]

{'ner': 0.00037192883479112646}
Entities in 'Trained completed for CLIENT entity.'





### 2.4)-For Dates

- 1-EFFECTIVE_DATE
- 2-Signature Date
- 3-Termination Date
- 4-Commencement Date
- 5-End Date

#### 2.4.a.EFFECTIVE_DATE

In [29]:
dates1 = ["29 September 2018"]
dates2 = ["01 January 2015"]
dates3 = ["01.07.2018"]
dates4 = ["August 2017"]
dates5 = ["6 December 2016"]
dates6 = ["December 2015"]

In [30]:
eff_dates = [dates2,dates3,dates4,dates5,dates6]

In [31]:
TRAIN_DATA=[("29 September 2018", {'entities': [(0, len(dates1[0]), 'EFFECTIVE_DATE')]})]
start=0
end=len(dates1[0])
for date in eff_dates :
    start=end+1
    end=start+len(date[0])
    TRAIN_DATA.append(    (date[0], { 'entities': [(start, end , 'EFFECTIVE_DATE')]}) )

In [32]:
LABEL ="EFFECTIVE_DATE"

In [33]:
def extract_effective_dates(model=None, new_model_name='EFFECTIVE_DATE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for EFFECTIVE_DATE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [34]:
extract_effective_dates()

Created blank 'en' model


100%|██████████| 6/6 [00:00<00:00, 74.55it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 10.522645115852356}


100%|██████████| 6/6 [00:00<00:00, 66.73it/s]
100%|██████████| 6/6 [00:00<00:00, 84.51it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 7.407541900873184}
{'ner': 2.1654501035809517}


100%|██████████| 6/6 [00:00<00:00, 83.53it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 1.9853608075111424}


100%|██████████| 6/6 [00:00<00:00, 80.46it/s]
100%|██████████| 6/6 [00:00<00:00, 80.25it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 1.9862690375294818}
{'ner': 1.952861649585425}


100%|██████████| 6/6 [00:00<00:00, 87.63it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 1.6768141658626217}


100%|██████████| 6/6 [00:00<00:00, 86.85it/s]
100%|██████████| 6/6 [00:00<00:00, 89.32it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 0.8315753486471092}
{'ner': 4.566283622699573}


100%|██████████| 6/6 [00:00<00:00, 91.38it/s]
100%|██████████| 6/6 [00:00<00:00, 93.89it/s]


{'ner': 1.0705107101267108}
{'ner': 4.573198500053397}


100%|██████████| 6/6 [00:00<00:00, 87.99it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 2.2150161481342185}


100%|██████████| 6/6 [00:00<00:00, 89.22it/s]
100%|██████████| 6/6 [00:00<00:00, 83.30it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 1.7825203577866016}
{'ner': 2.0173687677627012}


100%|██████████| 6/6 [00:00<00:00, 81.95it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 0.9320981770302932}


100%|██████████| 6/6 [00:00<00:00, 80.61it/s]
100%|██████████| 6/6 [00:00<00:00, 87.49it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 0.6002779233553119}
{'ner': 0.3254298737537633}


100%|██████████| 6/6 [00:00<00:00, 84.96it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 0.1380553818713438}


100%|██████████| 6/6 [00:00<00:00, 80.13it/s]
100%|██████████| 6/6 [00:00<00:00, 87.80it/s]

{'ner': 0.007929223374499883}
{'ner': 0.004143117138260166}
Entities in 'Trained completed for EFFECTIVE_DATE entity.'





#### 2.4.b.Signature Date

In [35]:
dates1 = ["31. July 2018"]
dates2 = ["August 30, 2017"]

In [36]:
sig_dates = [dates2]

In [37]:
TRAIN_DATA=[("31. July 2018", {'entities': [(0, len(dates1[0]), 'SIGNATURE_DATE')]})]
start=0
end=len(dates1[0])
for date in sig_dates :
    start=end+1
    end=start+len(date[0])
    TRAIN_DATA.append(    (date[0], { 'entities': [(start, end , 'SIGNATURE_DATE')]}) )

In [38]:
LABEL ="SIGNATURE_DATE"

In [39]:
def extract_sign_dates(model=None, new_model_name='SIGNATURE_DATE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for SIGNATURE_DATE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [40]:
extract_sign_dates()

100%|██████████| 2/2 [00:00<00:00, 45.82it/s]
100%|██████████| 2/2 [00:00<00:00, 72.96it/s]
100%|██████████| 2/2 [00:00<00:00, 72.49it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

Created blank 'en' model
{'ner': 6.118950456380844}
{'ner': 5.597102373838425}
{'ner': 5.010941565036774}


100%|██████████| 2/2 [00:00<00:00, 68.32it/s]
100%|██████████| 2/2 [00:00<00:00, 63.38it/s]
100%|██████████| 2/2 [00:00<00:00, 66.42it/s]
100%|██████████| 2/2 [00:00<00:00, 72.23it/s]
100%|██████████| 2/2 [00:00<00:00, 73.76it/s]
100%|██████████| 2/2 [00:00<00:00, 74.23it/s]
100%|██████████| 2/2 [00:00<00:00, 75.47it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

{'ner': 4.1385601460933685}
{'ner': 3.1759718656539917}
{'ner': 1.9892259165644646}
{'ner': 1.5503272414207458}
{'ner': 1.4738717761356384}
{'ner': 1.5970650845411}
{'ner': 1.4237902216009388}


100%|██████████| 2/2 [00:00<00:00, 69.53it/s]
100%|██████████| 2/2 [00:00<00:00, 74.79it/s]
100%|██████████| 2/2 [00:00<00:00, 79.52it/s]
100%|██████████| 2/2 [00:00<00:00, 78.39it/s]
100%|██████████| 2/2 [00:00<00:00, 79.90it/s]
100%|██████████| 2/2 [00:00<00:00, 77.16it/s]
100%|██████████| 2/2 [00:00<00:00, 76.41it/s]
100%|██████████| 2/2 [00:00<00:00, 78.74it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

{'ner': 1.7820847800477395}
{'ner': 1.8038894381018054}
{'ner': 1.8912404639398375}
{'ner': 1.304378313214137}
{'ner': 1.0880918921028915}
{'ner': 2.438935870666678}
{'ner': 1.5147939166599143}
{'ner': 1.8861417788596058}


100%|██████████| 2/2 [00:00<00:00, 72.88it/s]
100%|██████████| 2/2 [00:00<00:00, 76.33it/s]

{'ner': 1.09727729805414}
{'ner': 4.260099779300696}
Entities in 'Trained completed for SIGNATURE_DATE entity.'





#### 2.4.c.Termination Date

In [41]:

dates1 = ["period of 48 months"]
dates2 = ["36 months"]

#### 2.4.d.Commencement Date

In [42]:
dates1 = ["31.01.2017"]
dates2 = ["31.03.2019"]
dates3 = ["1 October 2018"]
dates4 = ["September 1st, 2017"]


#### 2.4.e.End Date

In [43]:
dates1 = ["31.12.2018"]
dates2 = ["Apr 11th 2023"]
dates3 = ["19.01.2020"]
dates4 = ["July 31"]
dates5 = ["2017"]




### 2.5)- Countries

### 2.6)-CLIENT_CONTRACT_MANAGER

### 2.7)-SUPPLIER_CONTRACT_MANAGER