<a href="https://colab.research.google.com/github/Keerthu8999/GoogleColab/blob/main/NER_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Packages

In [None]:
!pip install plac

In [3]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

In [None]:
!python3 -m spacy download 'en_core_web_lg'

In [6]:
nlp1 = spacy.load('en_core_web_lg')

## Working of NER

In [7]:
docx1 = nlp1(u"Who is Nishanth?")

In [8]:
for token in docx1.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Nishanth 7 15 PERSON


In [9]:
docx2 = nlp1(u"Who is Kamal Khumar?")

In [10]:
for token in docx2.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Kamal Khumar 7 19 PERSON


## Train Data

In [11]:
TRAIN_DATA = [
    ('Who is Nishanth?', {
        'entities': [(7, 15, 'PERSON')]
    }),
     ('Who is Kamal Khumar?', {
        'entities': [(7, 19, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

## Define our variables

In [12]:
model = None
output_dir=Path("/content/sample_data")
n_iter=100

## Load the model

In [13]:
if model is not None:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")

Created blank 'en' model


## Set up the pipeline

In [16]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

## Train the Recognizer

In [20]:
from spacy.training import Example
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

examples = []

for text, annotations in TRAIN_DATA:
    doc = nlp(text)
    examples.append(Example.from_dict(doc, annotations))
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for example in tqdm(examples):
            nlp.update(
                [example],
                drop=0.5,
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 43.26it/s]


{'ner': 13.196353912353516}


100%|██████████| 3/3 [00:00<00:00, 46.94it/s]


{'ner': 12.018358170986176}


100%|██████████| 3/3 [00:00<00:00, 54.27it/s]


{'ner': 10.887681424617767}


100%|██████████| 3/3 [00:00<00:00, 49.51it/s]


{'ner': 9.932930797338486}


100%|██████████| 3/3 [00:00<00:00, 42.58it/s]


{'ner': 7.613298177719116}


100%|██████████| 3/3 [00:00<00:00, 51.47it/s]


{'ner': 6.885888755321503}


100%|██████████| 3/3 [00:00<00:00, 48.52it/s]


{'ner': 5.704853501170874}


100%|██████████| 3/3 [00:00<00:00, 53.46it/s]


{'ner': 6.359439414925873}


100%|██████████| 3/3 [00:00<00:00, 52.80it/s]


{'ner': 6.164544252445921}


100%|██████████| 3/3 [00:00<00:00, 51.46it/s]


{'ner': 5.720086422050372}


100%|██████████| 3/3 [00:00<00:00, 45.90it/s]


{'ner': 5.632585215898871}


100%|██████████| 3/3 [00:00<00:00, 51.15it/s]


{'ner': 4.403370333602652}


100%|██████████| 3/3 [00:00<00:00, 51.26it/s]


{'ner': 4.593773083033739}


100%|██████████| 3/3 [00:00<00:00, 51.95it/s]


{'ner': 4.388311261915078}


100%|██████████| 3/3 [00:00<00:00, 45.86it/s]


{'ner': 3.462013615266187}


100%|██████████| 3/3 [00:00<00:00, 38.02it/s]


{'ner': 3.5901187916260824}


100%|██████████| 3/3 [00:00<00:00, 38.47it/s]


{'ner': 6.300942811263667}


100%|██████████| 3/3 [00:00<00:00, 35.08it/s]


{'ner': 2.7440529965097085}


100%|██████████| 3/3 [00:00<00:00, 33.67it/s]


{'ner': 3.95179954762898}


100%|██████████| 3/3 [00:00<00:00, 32.77it/s]


{'ner': 4.900296560637685}


100%|██████████| 3/3 [00:00<00:00, 33.24it/s]


{'ner': 2.53536700446449}


100%|██████████| 3/3 [00:00<00:00, 32.11it/s]


{'ner': 3.862013238154759}


100%|██████████| 3/3 [00:00<00:00, 35.10it/s]


{'ner': 3.8219256410586127}


100%|██████████| 3/3 [00:00<00:00, 41.38it/s]


{'ner': 3.4136461601830206}


100%|██████████| 3/3 [00:00<00:00, 37.87it/s]


{'ner': 2.367449880161587}


100%|██████████| 3/3 [00:00<00:00, 38.07it/s]


{'ner': 3.203577603920312}


100%|██████████| 3/3 [00:00<00:00, 41.26it/s]


{'ner': 1.7648925292094582}


100%|██████████| 3/3 [00:00<00:00, 39.31it/s]


{'ner': 1.1607243490244383}


100%|██████████| 3/3 [00:00<00:00, 40.04it/s]


{'ner': 1.2705905949846872}


100%|██████████| 3/3 [00:00<00:00, 34.82it/s]


{'ner': 0.7421344211322012}


100%|██████████| 3/3 [00:00<00:00, 40.28it/s]


{'ner': 0.7245197097500835}


100%|██████████| 3/3 [00:00<00:00, 35.87it/s]


{'ner': 1.498490847186111}


100%|██████████| 3/3 [00:00<00:00, 39.88it/s]


{'ner': 0.45604244191235344}


100%|██████████| 3/3 [00:00<00:00, 38.87it/s]


{'ner': 0.17323788257137507}


100%|██████████| 3/3 [00:00<00:00, 40.36it/s]


{'ner': 0.30501856559844764}


100%|██████████| 3/3 [00:00<00:00, 38.04it/s]


{'ner': 0.24261466003560933}


100%|██████████| 3/3 [00:00<00:00, 40.96it/s]


{'ner': 0.10986089426410732}


100%|██████████| 3/3 [00:00<00:00, 37.36it/s]


{'ner': 0.45015951451964614}


100%|██████████| 3/3 [00:00<00:00, 35.01it/s]


{'ner': 0.36553726786311747}


100%|██████████| 3/3 [00:00<00:00, 32.15it/s]


{'ner': 0.005388276200931934}


100%|██████████| 3/3 [00:00<00:00, 35.65it/s]


{'ner': 0.0009651849502661597}


100%|██████████| 3/3 [00:00<00:00, 32.56it/s]


{'ner': 0.07960649747298759}


100%|██████████| 3/3 [00:00<00:00, 33.61it/s]


{'ner': 0.020081194222185246}


100%|██████████| 3/3 [00:00<00:00, 35.72it/s]


{'ner': 0.000906082051608514}


100%|██████████| 3/3 [00:00<00:00, 35.61it/s]


{'ner': 0.00029114285116695553}


100%|██████████| 3/3 [00:00<00:00, 35.10it/s]


{'ner': 2.2121239238548748e-05}


100%|██████████| 3/3 [00:00<00:00, 33.63it/s]


{'ner': 0.10867541281342999}


100%|██████████| 3/3 [00:00<00:00, 37.50it/s]


{'ner': 0.005100746248540447}


100%|██████████| 3/3 [00:00<00:00, 37.75it/s]


{'ner': 0.07177975507462334}


100%|██████████| 3/3 [00:00<00:00, 36.95it/s]


{'ner': 6.022632070414909e-05}


100%|██████████| 3/3 [00:00<00:00, 36.14it/s]


{'ner': 0.00028209885120468784}


100%|██████████| 3/3 [00:00<00:00, 37.16it/s]


{'ner': 0.09103488472142476}


100%|██████████| 3/3 [00:00<00:00, 37.73it/s]


{'ner': 8.728484067260127e-06}


100%|██████████| 3/3 [00:00<00:00, 49.08it/s]


{'ner': 6.3068511963420185e-06}


100%|██████████| 3/3 [00:00<00:00, 55.90it/s]


{'ner': 0.0004567038225558972}


100%|██████████| 3/3 [00:00<00:00, 50.39it/s]


{'ner': 2.7748023613493603e-05}


100%|██████████| 3/3 [00:00<00:00, 52.73it/s]


{'ner': 3.572502493328602e-05}


100%|██████████| 3/3 [00:00<00:00, 51.96it/s]


{'ner': 4.3324276372806445e-06}


100%|██████████| 3/3 [00:00<00:00, 53.98it/s]


{'ner': 1.052863991806761e-05}


100%|██████████| 3/3 [00:00<00:00, 48.45it/s]


{'ner': 1.4888708333605366e-07}


100%|██████████| 3/3 [00:00<00:00, 51.42it/s]


{'ner': 5.01628203227864e-05}


100%|██████████| 3/3 [00:00<00:00, 52.87it/s]


{'ner': 2.0378322729164844e-05}


100%|██████████| 3/3 [00:00<00:00, 52.10it/s]


{'ner': 4.327352020016625e-05}


100%|██████████| 3/3 [00:00<00:00, 51.86it/s]


{'ner': 1.4271790378376031e-07}


100%|██████████| 3/3 [00:00<00:00, 49.34it/s]


{'ner': 1.4711201527672829e-05}


100%|██████████| 3/3 [00:00<00:00, 51.28it/s]


{'ner': 5.350830392097896e-06}


100%|██████████| 3/3 [00:00<00:00, 50.39it/s]


{'ner': 3.0261002312422803e-05}


100%|██████████| 3/3 [00:00<00:00, 44.80it/s]


{'ner': 0.0003565802402219625}


100%|██████████| 3/3 [00:00<00:00, 48.82it/s]


{'ner': 4.9042666725362185e-06}


100%|██████████| 3/3 [00:00<00:00, 49.13it/s]


{'ner': 7.235449585113681e-06}


100%|██████████| 3/3 [00:00<00:00, 49.75it/s]


{'ner': 4.043308778841283e-08}


100%|██████████| 3/3 [00:00<00:00, 50.38it/s]


{'ner': 4.946999414498172e-05}


100%|██████████| 3/3 [00:00<00:00, 54.86it/s]


{'ner': 0.00013067821412752166}


100%|██████████| 3/3 [00:00<00:00, 49.00it/s]


{'ner': 0.0007225680792357757}


100%|██████████| 3/3 [00:00<00:00, 47.91it/s]


{'ner': 0.010214660669192563}


100%|██████████| 3/3 [00:00<00:00, 49.03it/s]


{'ner': 4.0982247896902064e-08}


100%|██████████| 3/3 [00:00<00:00, 49.12it/s]


{'ner': 6.226735930464284e-06}


100%|██████████| 3/3 [00:00<00:00, 53.55it/s]


{'ner': 5.559016676919162e-07}


100%|██████████| 3/3 [00:00<00:00, 47.77it/s]


{'ner': 8.342409585125545e-08}


100%|██████████| 3/3 [00:00<00:00, 52.22it/s]


{'ner': 8.690059013942435e-06}


100%|██████████| 3/3 [00:00<00:00, 52.64it/s]


{'ner': 4.405256282048046e-07}


100%|██████████| 3/3 [00:00<00:00, 55.69it/s]


{'ner': 0.17000128817249183}


100%|██████████| 3/3 [00:00<00:00, 45.87it/s]


{'ner': 2.381237591879304e-07}


100%|██████████| 3/3 [00:00<00:00, 46.17it/s]


{'ner': 4.5745310603824936e-07}


100%|██████████| 3/3 [00:00<00:00, 50.49it/s]


{'ner': 0.0012235043479647313}


100%|██████████| 3/3 [00:00<00:00, 52.21it/s]


{'ner': 2.0131850322782995e-05}


100%|██████████| 3/3 [00:00<00:00, 49.85it/s]


{'ner': 1.5285132634268517}


100%|██████████| 3/3 [00:00<00:00, 47.51it/s]


{'ner': 1.1672771168529372e-08}


100%|██████████| 3/3 [00:00<00:00, 52.12it/s]


{'ner': 0.051032694221333205}


100%|██████████| 3/3 [00:00<00:00, 53.04it/s]


{'ner': 2.8453016154402095e-10}


100%|██████████| 3/3 [00:00<00:00, 50.14it/s]


{'ner': 4.722941543221593e-08}


100%|██████████| 3/3 [00:00<00:00, 51.98it/s]


{'ner': 6.227952497644839e-06}


100%|██████████| 3/3 [00:00<00:00, 49.93it/s]


{'ner': 2.9552435323185207}


100%|██████████| 3/3 [00:00<00:00, 49.07it/s]


{'ner': 2.9966935603981163e-09}


100%|██████████| 3/3 [00:00<00:00, 56.95it/s]


{'ner': 4.14613232560083e-06}


100%|██████████| 3/3 [00:00<00:00, 49.58it/s]


{'ner': 5.3292699506674404e-05}


100%|██████████| 3/3 [00:00<00:00, 54.30it/s]


{'ner': 1.0927703629287221e-08}


100%|██████████| 3/3 [00:00<00:00, 49.38it/s]


{'ner': 5.258758478440095e-08}


100%|██████████| 3/3 [00:00<00:00, 41.57it/s]


{'ner': 6.868119228953407e-05}


100%|██████████| 3/3 [00:00<00:00, 49.78it/s]

{'ner': 4.1909075808095314e-07}





## Test the trained model

In [21]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
Entities [('Nishanth', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Nishanth', 'PERSON', 3), ('?', '', 2)]
Entities [('Kamal Khumar', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Kamal', 'PERSON', 3), ('Khumar', 'PERSON', 1), ('?', '', 2)]


## Save the model

In [None]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to C:\Users\nithi\Documents\ner


## Test the saved model

In [None]:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from C:\Users\nithi\Documents\ner
Entities [('Kamal Khumar', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Kamal', 'PERSON', 3), ('Khumar', 'PERSON', 1), ('?', '', 2)]
Entities [('Nishanth', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Nishanth', 'PERSON', 3), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
