In [1]:
"""
NER_CRF
# Adapted from Medium (https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175).
# Implemented  by Lavanya Thollamadugu
"""

from google.colab import drive
# Mount Google Drive at /content/drive so the dataset file below can be read.
drive.mount('/content/drive')

# Location of the Dataturks export; convert_dataturks_to_spacy parses it as
# one JSON document per line.
dataturks_JSON_FilePath = '/content/drive/My Drive/Entity Recognition in Resumes.json'

import numpy as np 
import pandas as pd
import json
import re

Mounted at /content/drive


**Cleaning Entities**

In [2]:
# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks JSON-lines export into spaCy-style training tuples.

    Each non-blank line of the file must be a JSON object with a ``content``
    string and an ``annotation`` list (or ``null``). Newlines in ``content``
    are replaced by single spaces, which keeps character offsets unchanged.

    For every annotation only the first entry of ``points`` is used. Leading
    and trailing whitespace inside the annotated text is excluded from the
    span, and the inclusive Dataturks ``end`` offset is converted to an
    exclusive one.

    Args:
        dataturks_JSON_FilePath: path to the JSON-lines export.

    Returns:
        list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per non-blank input line.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the expected keys.
    """
    training_data = []
    # Explicit UTF-8: the platform default encoding is not guaranteed to
    # handle the non-ASCII characters (bullets, etc.) found in resumes.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at EOF) are not records.
            if not line.strip():
                continue

            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    # The span bounds do not depend on the label, so compute
                    # them once per annotation.
                    point_text = point['text']
                    leading_ws = len(point_text) - len(point_text.lstrip())
                    trailing_ws = len(point_text) - len(point_text.rstrip())
                    point_start = point['start'] + leading_ws
                    point_end = point['end'] - trailing_ws

                    for label in labels:
                        # +1 turns the inclusive end offset into an exclusive one.
                        entities.append((point_start, point_end + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Shrink entity spans so they neither start nor end on whitespace.

    Args:
        data (list): Records of the form ``(text, {'entities': [...]})`` where
            each entity is a ``(start, end, label)`` triple with an exclusive
            ``end`` offset.

    Returns:
        list: New records ``[text, {'entities': [[start, end, label], ...]}]``
        with the adjusted offsets; the input is not modified.
    """
    whitespace = re.compile(r'\s')

    def shrink(text, left, right):
        # Advance the left edge past whitespace characters.
        while left < len(text) and whitespace.match(text[left]):
            left += 1
        # Pull the right edge back while the last covered character is whitespace.
        while right > 1 and whitespace.match(text[right - 1]):
            right -= 1
        return left, right

    result = []
    for text, annotations in data:
        trimmed = []
        for start, end, label in annotations['entities']:
            new_start, new_end = shrink(text, start, end)
            trimmed.append([new_start, new_end, label])
        result.append([text, {'entities': trimmed}])
    return result

In [3]:
# Parse the Dataturks export and trim whitespace from the entity span edges.
data = trim_entity_spans(convert_dataturks_to_spacy(dataturks_JSON_FilePath))
# Show the first record as a quick sanity check.
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [4]:
!pip install spacy==2.1.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==2.1.4
  Downloading spacy-2.1.4-cp37-cp37m-manylinux1_x86_64.whl (29.8 MB)
[K     |████████████████████████████████| 29.8 MB 1.5 MB/s 
Collecting preshed<2.1.0,>=2.0.1
  Downloading preshed-2.0.1-cp37-cp37m-manylinux1_x86_64.whl (82 kB)
[K     |████████████████████████████████| 82 kB 367 kB/s 
[?25hCollecting jsonschema<3.1.0,>=2.6.0
  Downloading jsonschema-3.0.2-py2.py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 3.0 MB/s 
[?25hCollecting srsly<1.1.0,>=0.0.5
  Downloading srsly-1.0.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (208 kB)
[K     |████████████████████████████████| 208 kB 55.0 MB/s 
[?25hCollecting plac<1.0.0,>=0.9.6
  Downloading plac-0.9.6-py2.py3-none-any.whl (20 kB)
Collecting thinc<7.1.0,>=7.0.2
  Downloading thinc-7.0.8-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.

**Entity Mapping**

In [5]:
from spacy.lang.en import English  # Or whichever language you need
from spacy.gold import biluo_tags_from_offsets

def bilou_tags(data):
    """Split every text into tokens and attach one B/I/O-style tag per token.

    Tags come from ``biluo_tags_from_offsets`` and are then collapsed:
    a leading ``U`` becomes ``B`` and a leading ``L`` becomes ``I``. Any token
    that is at most one character long and not alphanumeric is forced to
    ``"O"``. Other tags are passed through unchanged.

    Args:
        data: iterable of ``(text, annotations)`` pairs where
            ``annotations["entities"]`` holds the character-offset spans.

    Returns:
        pandas.DataFrame with one row per text and two columns:
        ``docs`` (list of token strings) and ``annots`` (list of tags).
    """
    tokenizer = English()
    token_rows = []
    tag_rows = []

    for text, annotations in data:
        spans = annotations["entities"]
        doc = tokenizer(text)
        row_tags = biluo_tags_from_offsets(doc, spans)

        for idx, tag in enumerate(row_tags):
            if tag.startswith("U"):
                tag = "B" + tag[1:]
            elif tag.startswith("L"):
                tag = "I" + tag[1:]

            token_text = doc[idx].text
            # Lone punctuation / symbol tokens never carry an entity tag.
            if len(token_text) <= 1 and not token_text.isalnum():
                tag = "O"
            row_tags[idx] = tag

        token_rows.append([token.text for token in doc])
        tag_rows.append(row_tags)

    return pd.DataFrame({'docs': token_rows, 'annots': tag_rows})

# Token/tag table for the whole corpus (columns: docs, annots).
df_data = bilou_tags(data)

**Remove mislabeled examples**

In [6]:
# Drop every resume whose tag list contains "-" (per the spaCy docs, the tag
# given to tokens that could not be aligned with an annotated span).
# Building a filtered copy and resetting the index with drop=True keeps this
# cell re-runnable and avoids adding a stray "index" column to the frame.
has_unaligned_tag = df_data["annots"].apply(lambda tags: "-" in tags).astype(bool)
df_data = df_data[~has_unaligned_tag].reset_index(drop=True)

In [7]:
import nltk
# Download NLTK's averaged perceptron tagger data (presumably what the
# pos_tag call in the next cell relies on).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
from nltk import pos_tag
# One list of (word, POS tag, entity tag) triples per resume. POS tagging runs
# on the full token list; tokens that are a single non-alphanumeric character
# (or empty) are filtered out afterwards.
sentences = [
    [
        (word, pos, tag)
        for word, (_, pos), tag in zip(
            df_data["docs"][row],
            pos_tag(df_data["docs"][row]),
            df_data["annots"][row],
        )
        if len(word) > 1 or word.isalnum()
    ]
    for row in range(len(df_data))
]

**Feature extraction**

In [9]:
def word2features(sent, i):
    """Describe position ``i`` of ``sent`` as a feature dict.

    ``sent`` is a sequence of tuples whose first two fields are the word and
    its POS tag. The result holds features of the word itself plus features
    of the previous and next words when they exist; ``'BOS'`` / ``'EOS'``
    flags are set instead when there is no previous / next word.

    Args:
        sent: sequence of ``(word, postag, ...)`` tuples.
        i: index of the word to describe.

    Returns:
        dict mapping feature names to str, bool or float values.
    """
    token, pos = sent[i][0], sent[i][1]

    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # Left context.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats
def sent2features(sent):
    """Return the ``word2features`` dict for every position of ``sent``, in order."""
    feature_seq = []
    for position in range(len(sent)):
        feature_seq.append(word2features(sent, position))
    return feature_seq
def sent2labels(sent):
    """Return the label (third field) of each ``(token, postag, label)`` triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels
def sent2tokens(sent):
    """Return the token (first field) of each ``(token, postag, label)`` triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

**Split dataset**

In [10]:
from sklearn.model_selection import train_test_split

# One feature-dict sequence and one label sequence per resume.
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
# Hold out 10% of the resumes for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [11]:
!pip install python-crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-crfsuite
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 4.6 MB/s 
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.8


**Train and load model**

In [12]:
import pycrfsuite

trainer = pycrfsuite.Trainer(verbose=True)

# Hand every training sequence (feature dicts + matching labels) to the trainer.
for feature_seq, label_seq in zip(X_train, y_train):
    trainer.append(feature_seq, label_seq)

In [13]:
# Training hyper-parameters passed to the trainer before training starts.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 100,  # upper bound on the number of training iterations

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [14]:
# List the parameter names this trainer accepts.
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [15]:
# Train on the appended sequences; the model is written to the given file,
# which the tagger cell opens afterwards.
trainer.train('resume-ner.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 31463
Seconds required: 0.147

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 66286.272148
Feature norm: 1.000000
Error norm: 52719.111470
Active features: 16326
Line search trials: 1
Line search step: 0.000014
Seconds required for this iteration: 0.249

***** Iteration #2 *****
Loss: 61490.778898
Feature norm: 4.829755
Error norm: 18396.795414
Active features: 15796
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.133

***** Iteration #3 *****
Loss: 48301.118654
Feature norm: 4.246567
Error norm: 18344.349561
Active features: 13507
Line search trials: 1
Line search step: 1.000000
Seconds required for

In [16]:
# Statistics the trainer's log parser recorded for the final training iteration.
trainer.logparser.last_iteration

{'num': 100,
 'scores': {},
 'loss': 3305.627613,
 'feature_norm': 44.808218,
 'error_norm': 454.597723,
 'active_features': 3160,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.122}

In [17]:
# Open the model file written by trainer.train so it can be used for tagging.
tagger = pycrfsuite.Tagger()
tagger.open('./resume-ner.crfsuite')

<contextlib.closing at 0x7f29f0793f90>

**Evaluation**

In [18]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from itertools import chain

def ner_report(y_true, y_pred):
    """
    Token-level classification report for lists of BIO-encoded sequences.

    All sequences are flattened into one token stream and the tags are
    binarized with a LabelBinarizer fitted on ``y_true``. Every tag seen in
    ``y_true`` gets its own row in the report, including "O". Rows are
    ordered by entity name first and by B/I prefix second.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Args:
        y_true: list of tag sequences (the reference labels).
        y_pred: list of tag sequences (the predictions), aligned with ``y_true``.

    Returns:
        tuple ``(report, accuracy)``: the text report produced by scikit-learn's
        ``classification_report`` and the ``accuracy_score`` computed on the
        binarized tags.
    """
    # Imported locally on purpose: a later cell rebinds the global name
    # `classification_report` to seqeval's function, which would otherwise be
    # picked up here if this function is called after that cell has run.
    from sklearn.metrics import classification_report as sk_classification_report

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # Sort key: ["Name", "B"] for "B-Name", so tags of one entity stay together.
    tagset = sorted(set(lb.classes_), key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    report = sk_classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    )
    return report, accuracy_score(y_true_combined, y_pred_combined)

In [19]:
# Predicted tag sequence for every held-out feature sequence.
y_pred = list(map(tagger.tag, X_test))

In [20]:
# Token-level report text and accuracy for the held-out split.
report, accuracy = ner_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# print() keeps the line breaks and column alignment of the report string.
print(report)

                       precision    recall  f1-score   support

       B-College Name       0.60      0.46      0.52        13
       I-College Name       0.64      0.74      0.68        19
B-Companies worked at       0.75      0.44      0.56        34
I-Companies worked at       0.25      0.11      0.15         9
             B-Degree       0.57      0.57      0.57         7
             I-Degree       0.40      0.67      0.50        12
        B-Designation       1.00      0.56      0.71        27
        I-Designation       0.96      0.56      0.71        39
      B-Email Address       0.82      1.00      0.90         9
      I-Email Address       0.83      1.00      0.91        15
    B-Graduation Year       0.50      0.14      0.22        14
           B-Location       0.64      0.33      0.44        21
           I-Location       0.00      0.00      0.00         4
               B-Name       1.00      1.00      1.00        11
               I-Name       1.00      0.92      0.96  

In [22]:
# Accuracy value returned by ner_report alongside the report text.
print(accuracy)

0.9068273092369478


In [23]:
!pip install seqeval==0.0.12

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval==0.0.12
  Downloading seqeval-0.0.12.tar.gz (21 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-py3-none-any.whl size=7434 sha256=a2133c977f30c8935a0b715de277f3992c63fed77788bdaec31b3a9a986fbb0c
  Stored in directory: /root/.cache/pip/wheels/dc/cc/62/a3b81f92d35a80e39eb9b2a9d8b31abac54c02b21b2d466edc
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [24]:
from seqeval.metrics import classification_report

In [25]:
# classification_report here is the seqeval function imported in the previous
# cell (it replaces the scikit-learn name imported earlier).
print(classification_report(y_test, y_pred,digits=4))

                     precision    recall  f1-score   support

        Designation     0.9333    0.5185    0.6667        27
           Location     0.6364    0.3333    0.4375        21
Companies worked at     0.7500    0.4412    0.5556        34
    Graduation Year     0.5000    0.1429    0.2222        14
             Skills     0.5714    0.2857    0.3810        14
       College Name     0.5000    0.3846    0.4348        13
               Name     0.9091    0.9091    0.9091        11
             Degree     0.4286    0.4286    0.4286         7
      Email Address     0.8182    1.0000    0.9000         9
Years of Experience     0.0000    0.0000    0.0000         2

          micro avg     0.7113    0.4539    0.5542       152
          macro avg     0.6969    0.4539    0.5347       152

