In [1]:
"""
NER_spaCy
# Adapted from Medium (https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175).
# Implemented  by Haolin Tang
"""
from google.colab import drive
drive.mount('/content/drive')

# Load dataset
dataturks_JSON_FilePath = '/content/drive/My Drive/Entity Recognition in Resumes.json'

Mounted at /content/drive


In [2]:
import numpy as np 
import pandas as pd
import json
import re

**Data Preprocessing**

In [3]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines=[]
    with open(dataturks_JSON_FilePath, 'r') as f:
        lines = f.readlines()

    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n", " ")
        entities = []
        data_annotations = data['annotation']
        if data_annotations is not None:
            for annotation in data_annotations:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']

                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    if lstrip_diff != 0:
                        point_start = point_start + lstrip_diff
                    if rstrip_diff != 0:
                        point_end = point_end - rstrip_diff
                    entities.append((point_start, point_end + 1 , label))
        training_data.append((text, {"entities" : entities}))
    return training_data

In [4]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [5]:
data = trim_entity_spans(convert_dataturks_to_spacy(dataturks_JSON_FilePath))

In [6]:
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [7]:
!pip install spacy==2.1.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==2.1.4
  Downloading spacy-2.1.4-cp37-cp37m-manylinux1_x86_64.whl (29.8 MB)
[K     |████████████████████████████████| 29.8 MB 1.2 MB/s 
Collecting preshed<2.1.0,>=2.0.1
  Downloading preshed-2.0.1-cp37-cp37m-manylinux1_x86_64.whl (82 kB)
[K     |████████████████████████████████| 82 kB 441 kB/s 
Collecting thinc<7.1.0,>=7.0.2
  Downloading thinc-7.0.8-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 66.8 MB/s 
Collecting blis<0.3.0,>=0.2.2
  Downloading blis-0.2.4-cp37-cp37m-manylinux1_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 69.4 MB/s 
[?25hCollecting srsly<1.1.0,>=0.0.5
  Downloading srsly-1.0.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (208 kB)
[K     |████████████████████████████████| 208 kB 76.7 MB/s 
[?25hCollecting plac<1.0.0,>=0.9.6
  Downloading plac-0.9.6-py2.py3-none-any

**Split Dataset**

In [8]:
import random
import math

def train_test_split(data, test_size, random_state):

    random.Random(random_state).shuffle(data)
    test_idx = len(data) - math.floor(test_size * len(data))
    train_set = data[0: test_idx]
    test_set = data[test_idx: ]

    return train_set, test_set


train_data, test_data = train_test_split(data, test_size = 0.1, random_state = 42)


**Training SpaCy**

In [9]:
import spacy

def train_spacy():
    
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
        
    # add labels
    for _, annotations in train_data:
         for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [10]:
nlp = train_spacy()

Statring iteration 0
{'ner': 24234.051309814662}
Statring iteration 1
{'ner': 19230.048606677243}
Statring iteration 2
{'ner': 14368.234976497673}
Statring iteration 3
{'ner': 13835.972596655893}
Statring iteration 4
{'ner': 12513.459990561973}
Statring iteration 5
{'ner': 10841.755567976837}
Statring iteration 6
{'ner': 10298.993076364999}
Statring iteration 7
{'ner': 9706.967570654651}
Statring iteration 8
{'ner': 9387.009082304934}
Statring iteration 9
{'ner': 9948.62753915907}


**Evaluation**

In [11]:
from spacy.gold import GoldParse
from itertools import groupby

def doc_to_bilou(nlp, text):
    
    doc = nlp(text)
    tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
    entities = []
    for entity, group in groupby(tokens, key=lambda t: t[-1]):
        if not entity:
            continue
        group = list(group)
        _, start, _ = group[0]
        word, last, _ = group[-1]
        end = last + len(word)
        
        entities.append((
                start,
                end,
                entity
            ))

    gold = GoldParse(nlp(text), entities = entities)
    pred_ents = gold.ner
    
    return pred_ents

y_test = []
y_pred = []

for text, annots in test_data:
    
    gold = GoldParse(nlp.make_doc(text), entities = annots.get("entities"))
    ents = gold.ner
    pred_ents = doc_to_bilou(nlp, text)
    
    y_test.append(ents)
    y_pred.append(pred_ents)
    
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from itertools import chain

def ner_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    ), accuracy_score(y_true_combined, y_pred_combined)
    
report, accuracy = ner_report(y_test, y_pred)
print(report)

                       precision    recall  f1-score   support

                    -       0.00      0.00      0.00       142
       B-College Name       0.79      0.84      0.82        32
       I-College Name       0.73      0.87      0.80        63
       L-College Name       0.74      0.78      0.76        32
       U-College Name       0.00      0.00      0.00         1
B-Companies worked at       0.59      0.43      0.50        30
I-Companies worked at       0.20      0.25      0.22         4
L-Companies worked at       0.64      0.45      0.53        31
U-Companies worked at       0.50      0.20      0.29        45
             B-Degree       0.87      0.83      0.85        24
             I-Degree       0.97      0.92      0.95        66
             L-Degree       0.87      0.83      0.85        24
             U-Degree       0.50      0.67      0.57         3
        B-Designation       0.83      0.53      0.65        45
        I-Designation       0.95      0.50      0.66  

  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
print(accuracy)

0.9180339314389444


In [13]:
!pip install seqeval==0.0.12

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval==0.0.12
  Downloading seqeval-0.0.12.tar.gz (21 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-py3-none-any.whl size=7434 sha256=6596182dbbf1764ec0bd0aad0fee3ec9a454c88c3777b63d31aa468342e8932e
  Stored in directory: /root/.cache/pip/wheels/dc/cc/62/a3b81f92d35a80e39eb9b2a9d8b31abac54c02b21b2d466edc
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [16]:
from seqeval.metrics import classification_report

In [17]:
print(classification_report(y_test, y_pred,digits=4))

                     precision    recall  f1-score   support

             Skills     0.6667    0.3448    0.4545        29
           Location     0.8571    0.3429    0.4898        35
        Designation     0.8000    0.5000    0.6154        48
      Email Address     0.7619    0.9412    0.8421        17
               Name     0.8636    0.8636    0.8636        22
             Degree     0.8148    0.8148    0.8148        27
Companies worked at     0.5750    0.3067    0.4000        75
    Graduation Year     0.7000    0.3182    0.4375        22
       College Name     0.7143    0.7576    0.7353        33
                        0.0000    0.0000    0.0000        12
Years of Experience     0.5000    0.2000    0.2857         5

          micro avg     0.7361    0.4892    0.5878       325
          macro avg     0.6963    0.4892    0.5554       325

