# Training Spacy Model

# 1)- Importing key Modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import spacy
import re
import string
import pandas as pd
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm # loading bar

# 2)-Training Entities

### 2.1)- Training Titles

In [5]:
title1  = ["Agreement on Managed Data Center Services"]
title2  = ["Master Services Agreement on the Provision of IT Services"]
title3  = ["Master Services Agreement on the Provision of IT Services (“Agreement“ or “Master Services Agreement”)"]
title4  = ["MASTER SERVICES AGREEMENT ON THE PROVISION OF MANAGED SERVICES IN PUBLIC COULDS"]
title5  = ["Master Services Agreement (“Agreement“ or “Master Services Agreement”) on the provision of Managed Services in Public Clouds"]
title6  = ["Agreement on the Provision of MANAGED PRINT Services"]
title7  = ["Agreement on the Provision of MPS (Managed Print Services)"]
title8  = ["Agreement for Security Operation Center Services"]
title9  = ["AGREEMENT ON PROVISIONING OF IT AND COMMUNICATION SERVICES"]
title10 = ["Agreement on Managed Data Center Services"]
title11 = ["Master Project, Support and Maintenance Agreement"]
title12 = ["ENTERPRISE CUSTOMER AGREEMENT"]
title13 = ["AGREEMENT on the provision of managed Mobile communication Services"]
title14 = ["MASTER SERVICE AGREEMENT"]
title15 = ["Agreement for Security Operation Center Services"]

In [6]:
Titles = [title2,title2, title3, title4, title5, title6, title7, title8, title9, title10, title11, title12, title13, title14,title15]

In [7]:
TRAIN_DATA=[('Agreement on Managed Data Center Services', {'entities': [(0, len(title1[0]), 'TITLE')]})]
start=0
end=len(title1[0])
for title in Titles:
    start=end+1
    end=start+len(title[0])
    TRAIN_DATA.append(    (title[0], { 'entities': [(start, end , 'TITLE')]}) )   

In [8]:
print(TRAIN_DATA)

[('Agreement on Managed Data Center Services', {'entities': [(0, 41, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services', {'entities': [(42, 99, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services', {'entities': [(100, 157, 'TITLE')]}), ('Master Services Agreement on the Provision of IT Services (“Agreement“ or “Master Services Agreement”)', {'entities': [(158, 260, 'TITLE')]}), ('MASTER SERVICES AGREEMENT ON THE PROVISION OF MANAGED SERVICES IN PUBLIC COULDS', {'entities': [(261, 340, 'TITLE')]}), ('Master Services Agreement (“Agreement“ or “Master Services Agreement”) on the provision of Managed Services in Public Clouds', {'entities': [(341, 465, 'TITLE')]}), ('Agreement on the Provision of MANAGED PRINT Services', {'entities': [(466, 518, 'TITLE')]}), ('Agreement on the Provision of MPS (Managed Print Services)', {'entities': [(519, 577, 'TITLE')]}), ('Agreement for Security Operation Center Services', {'entities': [(578, 626, 'TITLE')]}),

# Start training Title Entity

In [9]:
# Define our variables and model path to be laoded
model = None
output_dir=Path("/Users/hassansherwani/Documents/Python/Spacy")
n_iter=100

In [10]:
if model is not None:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

Created blank 'en' model


In [11]:
# create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

In [12]:
# new entity label
LABEL = 'TITLE'

In [13]:
def extract_title(model=None, new_model_name='TITLE', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for TITLE entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [14]:
# Run our Function
extract_title()

Created blank 'en' model


100%|██████████| 16/16 [00:00<00:00, 64.66it/s]
 50%|█████     | 8/16 [00:00<00:00, 75.79it/s]

{'ner': 48.63483965834348}


100%|██████████| 16/16 [00:00<00:00, 73.56it/s]
 50%|█████     | 8/16 [00:00<00:00, 68.24it/s]

{'ner': 2.051892207011676}


100%|██████████| 16/16 [00:00<00:00, 73.10it/s]
 50%|█████     | 8/16 [00:00<00:00, 76.41it/s]

{'ner': 2.0000000000340945}


100%|██████████| 16/16 [00:00<00:00, 71.28it/s]
 50%|█████     | 8/16 [00:00<00:00, 72.30it/s]

{'ner': 1.9999794967549442}


100%|██████████| 16/16 [00:00<00:00, 72.97it/s]
 50%|█████     | 8/16 [00:00<00:00, 71.79it/s]

{'ner': 1.997525052390797}


100%|██████████| 16/16 [00:00<00:00, 73.17it/s]
 50%|█████     | 8/16 [00:00<00:00, 74.95it/s]

{'ner': 1.9483657764528084}


100%|██████████| 16/16 [00:00<00:00, 74.51it/s]
 50%|█████     | 8/16 [00:00<00:00, 72.05it/s]

{'ner': 2.0091208148336785}


100%|██████████| 16/16 [00:00<00:00, 72.40it/s]
 50%|█████     | 8/16 [00:00<00:00, 73.52it/s]

{'ner': 1.9999301641130678}


100%|██████████| 16/16 [00:00<00:00, 73.96it/s]
 50%|█████     | 8/16 [00:00<00:00, 74.53it/s]

{'ner': 1.9948113578549713}


100%|██████████| 16/16 [00:00<00:00, 74.54it/s]
 44%|████▍     | 7/16 [00:00<00:00, 62.92it/s]

{'ner': 1.8600147028558751}


100%|██████████| 16/16 [00:00<00:00, 72.02it/s]
 44%|████▍     | 7/16 [00:00<00:00, 66.35it/s]

{'ner': 2.0000005054719137}


100%|██████████| 16/16 [00:00<00:00, 70.86it/s]
 44%|████▍     | 7/16 [00:00<00:00, 62.85it/s]

{'ner': 2.0019152115325265}


100%|██████████| 16/16 [00:00<00:00, 66.38it/s]
 44%|████▍     | 7/16 [00:00<00:00, 69.71it/s]

{'ner': 1.9999045421196264}


100%|██████████| 16/16 [00:00<00:00, 70.87it/s]
 38%|███▊      | 6/16 [00:00<00:00, 59.82it/s]

{'ner': 1.157445248082422}


100%|██████████| 16/16 [00:00<00:00, 64.37it/s]
 44%|████▍     | 7/16 [00:00<00:00, 69.43it/s]

{'ner': 9.427282010520871}


100%|██████████| 16/16 [00:00<00:00, 69.09it/s]
 44%|████▍     | 7/16 [00:00<00:00, 59.22it/s]

{'ner': 2.508942544832952}


100%|██████████| 16/16 [00:00<00:00, 64.78it/s]
 50%|█████     | 8/16 [00:00<00:00, 76.26it/s]

{'ner': 5.669197597228042}


100%|██████████| 16/16 [00:00<00:00, 72.41it/s]
 50%|█████     | 8/16 [00:00<00:00, 72.92it/s]

{'ner': 3.920159740889542}


100%|██████████| 16/16 [00:00<00:00, 71.56it/s]
 50%|█████     | 8/16 [00:00<00:00, 73.54it/s]

{'ner': 10.518484897357597}


100%|██████████| 16/16 [00:00<00:00, 72.60it/s]

{'ner': 4.345709078511231}
Entities in 'Trained completed for TITLE entity.'





### 2.2)- Training Supplier Entity

In [15]:
suppliers1 = ["TEASYS"]
suppliers2 = ["Teasys"]
suppliers3 = ["TEASYS GLOBAL INVEST AG"]
suppliers4 = ["Teasys Global Invest AG"]
suppliers5 = ["teasys global invest ag"]
suppliers6 = ["FTP"]
suppliers7 = ["FTP Deutschland GmbH"]
suppliers8 = ["FTP Deutschland GmbH"]
suppliers9 = ["Wisniewski & Sohn GmbH"]
suppliers10 = ["FBS"]
suppliers11 = ["Horizon Deutschland AG"]
suppliers12 = ["Horizon"]
suppliers13 = ["Harpe"]
suppliers14 = ["Harpe Deutschland GmbH"]
suppliers15 = ["ADVENTURE SERVICES GMBH"]
suppliers16 = ["Adventure Services GmbH"]
suppliers17 = ["SWIPERO LIMITED"]
suppliers18 = ["Swipero Limited"]
suppliers19 = ["Swipero"]
suppliers20 = ["Nozama Net Service"]
suppliers21 = ["NOZAMA NET SERVICE"]
suppliers22 = ["Schwyz Mail Solutions GmbH"]
suppliers23 = ["Verizon Deutschland GmbH"]


In [16]:
suppliers = [suppliers2,suppliers3,suppliers4,suppliers5,suppliers6,suppliers7,suppliers8,
            suppliers9,suppliers10,suppliers11,suppliers12,suppliers13,suppliers14,suppliers15,suppliers16,
            suppliers17,suppliers18,suppliers19, suppliers20,suppliers21,suppliers22,suppliers23]

In [17]:
TRAIN_DATA=[("TEASYS", {'entities': [(0, len(suppliers1[0]), 'SUPPLIER')]})]
start=0
end=len(suppliers1[0])
for supplier in suppliers:
    start=end+1
    end=start+len(supplier[0])
    TRAIN_DATA.append(    (supplier[0], { 'entities': [(start, end , 'SUPPLIER')]}) ) 

In [18]:
print(TRAIN_DATA)

[('TEASYS', {'entities': [(0, 6, 'SUPPLIER')]}), ('Teasys', {'entities': [(7, 13, 'SUPPLIER')]}), ('TEASYS GLOBAL INVEST AG', {'entities': [(14, 37, 'SUPPLIER')]}), ('Teasys Global Invest AG', {'entities': [(38, 61, 'SUPPLIER')]}), ('teasys global invest ag', {'entities': [(62, 85, 'SUPPLIER')]}), ('FTP', {'entities': [(86, 89, 'SUPPLIER')]}), ('FTP Deutschland GmbH', {'entities': [(90, 110, 'SUPPLIER')]}), ('FTP Deutschland GmbH', {'entities': [(111, 131, 'SUPPLIER')]}), ('Wisniewski & Sohn GmbH', {'entities': [(132, 154, 'SUPPLIER')]}), ('FBS', {'entities': [(155, 158, 'SUPPLIER')]}), ('Horizon Deutschland AG', {'entities': [(159, 181, 'SUPPLIER')]}), ('Horizon', {'entities': [(182, 189, 'SUPPLIER')]}), ('Harpe', {'entities': [(190, 195, 'SUPPLIER')]}), ('Harpe Deutschland GmbH', {'entities': [(196, 218, 'SUPPLIER')]}), ('ADVENTURE SERVICES GMBH', {'entities': [(219, 242, 'SUPPLIER')]}), ('Adventure Services GmbH', {'entities': [(243, 266, 'SUPPLIER')]}), ('SWIPERO LIMITED', {'entiti

In [19]:
# new entity label
LABEL = 'SUPPLIER'

In [20]:
def extract_supplier(model=None, new_model_name='SUPPLIER', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Training completion, saving and ready to be loaded for future use
    show_text = 'Trained completed for SUPPLIER entity.'
    doc = nlp(show_text)
    print("Entities in '%s'" % show_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [21]:
# Run our Function
extract_supplier()

 35%|███▍      | 8/23 [00:00<00:00, 77.51it/s]

Created blank 'en' model


100%|██████████| 23/23 [00:00<00:00, 87.91it/s]
 43%|████▎     | 10/23 [00:00<00:00, 99.19it/s]

{'ner': 19.770881283067297}


100%|██████████| 23/23 [00:00<00:00, 97.98it/s]
 43%|████▎     | 10/23 [00:00<00:00, 99.36it/s]

{'ner': 2.0002631139763456}


100%|██████████| 23/23 [00:00<00:00, 98.26it/s] 
 48%|████▊     | 11/23 [00:00<00:00, 100.04it/s]

{'ner': 1.9998337712961962}


100%|██████████| 23/23 [00:00<00:00, 98.26it/s] 
 43%|████▎     | 10/23 [00:00<00:00, 95.28it/s]

{'ner': 1.9639298122464597}


100%|██████████| 23/23 [00:00<00:00, 93.88it/s]
 39%|███▉      | 9/23 [00:00<00:00, 81.54it/s]

{'ner': 2.012278264123495}


100%|██████████| 23/23 [00:00<00:00, 87.99it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.42it/s]

{'ner': 1.9981881155025059}


100%|██████████| 23/23 [00:00<00:00, 96.98it/s]
 48%|████▊     | 11/23 [00:00<00:00, 102.27it/s]

{'ner': 0.9920426615018303}


100%|██████████| 23/23 [00:00<00:00, 99.06it/s] 
 39%|███▉      | 9/23 [00:00<00:00, 87.16it/s]

{'ner': 0.8749774103396961}


100%|██████████| 23/23 [00:00<00:00, 91.73it/s]
 48%|████▊     | 11/23 [00:00<00:00, 100.75it/s]

{'ner': 1.6239382162807654}


100%|██████████| 23/23 [00:00<00:00, 96.97it/s] 
 43%|████▎     | 10/23 [00:00<00:00, 96.05it/s]

{'ner': 1.1270397774965486}


100%|██████████| 23/23 [00:00<00:00, 95.56it/s]
 43%|████▎     | 10/23 [00:00<00:00, 97.00it/s]

{'ner': 0.06379693039089386}


100%|██████████| 23/23 [00:00<00:00, 96.35it/s]
 39%|███▉      | 9/23 [00:00<00:00, 84.38it/s]

{'ner': 0.25373492926097824}


100%|██████████| 23/23 [00:00<00:00, 90.00it/s]
 43%|████▎     | 10/23 [00:00<00:00, 96.26it/s]

{'ner': 2.196645966320689e-06}


100%|██████████| 23/23 [00:00<00:00, 97.41it/s]
 43%|████▎     | 10/23 [00:00<00:00, 99.63it/s]

{'ner': 0.01698209200679633}


100%|██████████| 23/23 [00:00<00:00, 99.07it/s] 
 43%|████▎     | 10/23 [00:00<00:00, 99.21it/s]

{'ner': 9.021845089339998e-05}


100%|██████████| 23/23 [00:00<00:00, 97.80it/s]
 43%|████▎     | 10/23 [00:00<00:00, 95.97it/s]

{'ner': 1.3099066399556012e-05}


100%|██████████| 23/23 [00:00<00:00, 97.85it/s]
 43%|████▎     | 10/23 [00:00<00:00, 98.99it/s]

{'ner': 6.791446810035508e-05}


100%|██████████| 23/23 [00:00<00:00, 98.19it/s]
 43%|████▎     | 10/23 [00:00<00:00, 94.97it/s]

{'ner': 2.1261541633575448e-08}


100%|██████████| 23/23 [00:00<00:00, 92.95it/s]
 39%|███▉      | 9/23 [00:00<00:00, 89.91it/s]

{'ner': 1.8915495569555992e-07}


100%|██████████| 23/23 [00:00<00:00, 88.20it/s]

{'ner': 1.9086321498697e-06}
Entities in 'Trained completed for SUPPLIER entity.'





### 2.3)- Training Client Entity

In [None]:
clients1 = ["F.UN", "", "", ""]
clients2 = ["FUN"]
clients3 = ["F.UN BUSINESS SERVICES GMBH"]
clients4 = [""]
clients5 = [""]