In [17]:
import spacy
import pickle
import random
import json
import time
from functools import wraps
import os

In [18]:
print(spacy.__version__)

2.2.4


In [19]:
EPOCH = 10
DROPOUT = 0.2

CUSTOM_NER_MODEL_PATH = "../models/ner/resume_ner_model"
NER_DATASET_PATH = "../dataset/ner"

In [20]:
def duration(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        duration = end_time - start_time
        print(f"Function {func.__name__} took {duration:.4f} seconds")
        return result

    return wrapper

In [21]:
train_data1 = pickle.load(open(os.path.join(NER_DATASET_PATH, "train_data.pkl"), "rb"))

with open(os.path.join(NER_DATASET_PATH, "train_data.json"), "r") as f:
    data = json.load(f)

In [22]:
train_data = train_data1
train_data.extend(data)
print(f"Training data consist of {len(train_data)} manually labelled resume's.")

Training data consist of 400 manually labelled resume's.


In [23]:
training_data = []

for d in train_data:
    temp_dict = {}
    temp_dict["text"] = d[0].strip()
    temp_dict["entities"] = []
    for entities in d[1]["entities"]:
        start = entities[0]
        end = entities[1]
        label = entities[2].upper().strip().replace(" ", "_")
        temp_dict["entities"].append((start, end, label))
    training_data.append(temp_dict)

print(training_data[0])

{'text': 'Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?i

In [24]:
for ent in train_data[9][1]["entities"]:
    print(ent)

(4048, 4169, 'Skills')
(4034, 4038, 'Graduation Year')
(3977, 4022, 'College Name')
(3970, 3975, 'Degree')
(3841, 3850, 'Location')
(3822, 3837, 'Companies worked at')
(3804, 3820, 'Designation')
(3755, 3764, 'Location')
(3736, 3751, 'Companies worked at')
(3721, 3734, 'Designation')
(2053, 2062, 'Location')
(2034, 2049, 'Companies worked at')
(2023, 2032, 'Designation')
(493, 497, 'Graduation Year')
(452, 467, 'Companies worked at')
(217, 226, 'Location')
(198, 213, 'Companies worked at')
(187, 196, 'Designation')
(40, 49, 'Location')
(23, 38, 'Companies worked at')
(11, 20, 'Designation')
(0, 10, 'Name')


In [25]:
nlp = spacy.blank("en")
nlp.pipe_names

[]

In [26]:
@duration
def train_model(train_data):

    if "ner" not in nlp.pipe_names:

        ner = nlp.create_pipe("ner")

        nlp.add_pipe(ner, last=True)
    for (
        _,
        annotation,
    ) in train_data:
        for ent in annotation["entities"]:

            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    with nlp.disable_pipes(*other_pipes):

        optimizer = nlp.begin_training()
        for itn in range(EPOCH):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                try:
                    nlp.update(
                        [text],
                        [annotations],
                        drop=DROPOUT,
                        sgd=optimizer,
                        losses=losses,
                    )
                except Exception as e:
                    pass


train_model(train_data)

Starting iteration 0
Starting iteration 1
Starting iteration 2
Starting iteration 3
Starting iteration 4
Starting iteration 5
Starting iteration 6
Starting iteration 7
Starting iteration 8
Starting iteration 9
Function train_model took 284.5981 seconds


In [27]:
nlp.to_disk(CUSTOM_NER_MODEL_PATH)

# Load and Test the model


In [28]:
import spacy

In [29]:
resume_ner_model = spacy.load(CUSTOM_NER_MODEL_PATH)
resume_ner_model.pipeline

[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1fb1b229c48>)]

In [30]:
resume_ner_model.entity.labels

('College Name',
 'Companies worked at',
 'Degree',
 'Designation',
 'Email Address',
 'Graduation Year',
 'Location',
 'Name',
 'Skills',
 'UNKNOWN',
 'Years of Experience')

In [31]:
doc = resume_ner_model(
    " 412 954 8546 sumedhns SUMEDH SHAH EDUCATION Carnegie Mellon University Pittsburgh Master of Information Systems Management Business Intelligence and Data Analytics GPA 3 96 4 00 Teaching Assistant Unstructured Data Analytics NoSQL Database Management December 2022 Maharashtra Institute of Technology India Bachelor of Engineering in Electronics and Telecommunication Engineering June 2019 GPA 7 79 10 SKILLS Languages Java Python w framework Pyspark SQL Bash Linux Scala Database and Big Data SQL Oracle MySQL Postgres Hadoop w Hive and Apache Spark ETL Modeling Apache Solr MapReduce PyTorch Tableau Spacy Redis MongoDB and Neo4j database Cloud AWS Certified Solutions Architect Associate Microsoft Azure Azure Data Factory Databricks WORK EXPERIENCE PPG Industries Pittsburgh AI ML Intern May 2022 August 2022 azure pipelines Designed and managed data flows and pipelines in Azure Data Factory for ingestion from relational databases REST APIs Delta Lake Sharepoint and other cloud file storage systems automation Automated the manual process of ingesting Design of Experiment data from workbooks in Sharepoint using PySpark scripts written in Databricks thereby reducing ingestion and curation time by 90 Modak Analytics LLP India Data Engineer September 2019 July 2021 data ingestion Developed PySpark code that minimized ingestion time by 60 to consume terabytes of data from structured and unstructured sources daily using automated bot workflows into the client data lake as Hive tables leading to convenient downstream analysis and data transformation using SQL applications data curation Built complex Spark SQL queries to curate pharmaceutical data on Dataframe collection these were utilized in data analysis and modelling for a elerating the timeline of the drug discovery process by 50 pipeline automation and visualization Conducted the compilation of Python workflows using Subprocess and OS modules which executed Scala scripts to index curated data into Apache Solr and Neo4j graph database for utilization by pharmaceutical SMEs automated workflows would be triggered daily to bring in updated data communication Collaborated with clients and took the initiative to plan strategies to enhance the company s native platform and helped develop creative business requirements for a clinical trials project leadership Spearheaded a team of 5 members who collectively created an efficient data pipeline in PySpark for a client use case that involved crawling ingestion and transformation of terabytes of clinical trials data which resulted in the reduction of original processing time by 50 ACADEMIC PROJECTS Predicting Order Returns March 2022 May 2022 Predicting whether an online order will be returned to an online retailer as a step to increase net margins and improve supply chain inventory management a ording to research this is soon expected to be a trillion dollar problem Initially performing data preparation and feature engineering for downstream modelling and finally predicting the orders which would be returned using Machine Learning classification algorithms such as Decision Tree Random Forest Logistic Regression Naive Bayes and Perceptron Improvement of performance metrics mainly Recall since False Negatives in this case are a bigger concern for a retailer Python Jupyter Notebook Scikit Learn Numpy Plotly Seaborn EDA on H M Transactions Data January 2022 March 2022 Manufactured call to action CTA s for H M regarding production of category colour and pricing using EDA Associating the best channel of sales for the above actionable items Presented the output in a time series format with CTAs for each quarter Sample CTA Quarter 1 Jan March H M should focus on selling lower priced products in the Ladieswear category in Black Blue and Pink colour and sell them through online channels Python Jupyter Notebook Altair Pandas Scipy Numpy Other Projects Travel Planner GUI Recommendation System COMMUNITY SERVICE AND LEADERSHIP EXPERIENCE Live Life Love Life Charity Foundation Fundraising Coordinator October 2017 October 2019 Organized events to spread cancer awareness and raised approximately 40 000 from donations to provide medical assistance for the underprivileged to fight breast cancer Thermax Foundation June 2017 August 2017 Focused on educating underprivileged children in mathematics and science for entrance exams "
)
for ent in doc.ents:
    print(f"{ent.label_.upper():{20}} - {ent.text}")

SKILLS               - Languages Java Python w framework Pyspark SQL Bash Linux Scala Database and Big Data SQL Oracle MySQL Postgres Hadoop w Hive and Apache Spark ETL Modeling Apache Solr MapReduce PyTorch Tableau Spacy Redis MongoDB and Neo4j database Cloud AWS Certified Solutions Architect Associate Microsoft Azure Azure Data Factory Databricks WORK EXPERIENCE PPG Industries Pittsburgh AI ML Intern May 2022 August 2022 azure pipelines Designed and managed data flows and pipelines in Azure Data Factory for ingestion from relational databases REST APIs Delta Lake Sharepoint and other cloud file storage systems automation Automated the manual process of ingesting Design of Experiment data from workbooks in Sharepoint using PySpark scripts written in Databricks thereby reducing ingestion and curation time by 90 Modak Analytics LLP India Data Engineer September 2019 July 2021 data ingestion Developed PySpark code that minimized ingestion time by 60 to consume terabytes of data from stru