In [1]:
# Following this guide - https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy

import spacy
import pymongo
from pymongo import MongoClient

# Load the 'en_core_web_sm' spaCy pipeline; every later cell works on this object.
nlp = spacy.load('en_core_web_sm')

# Open the local MongoDB collection that holds the job postings used below.
# NOTE(review): the connection URI names database "WW" while the code opens
# "WW_Local" — confirm which database is intended.
client = MongoClient('mongodb://localhost:27017/WW?retryWrites=true&w=majority')
db = client['WW_Local']
NLP_jobs_collection = db['NLPJobsLocal']

In [2]:
# Show the names of the components in the loaded pipeline, in pipeline order.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [3]:
# Baseline check: run the untrained pipeline over the 'skills' field of one
# stored job posting and list each entity with its character span and label.
first_job = NLP_jobs_collection.find_one({})
x = nlp(first_job['skills'])
for entity in x.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

1Password 736 745 CARDINAL


In [6]:
# Helper function to split string into character indexes to build TRAIN_DATA

def split_text(text):
    """Print each character of ``text`` on its own line as ``<index>: <char>``.

    Indexes are zero-based, so they can be copied directly into the
    ``(start, end, label)`` entity tuples of TRAIN_DATA (``end`` is the index
    of the first character *after* the entity).

    Args:
        text: The string to enumerate. An empty string prints nothing.

    Returns:
        None. The listing is written to standard output.
    """
    # enumerate() keeps index and character in lock-step without a manual counter.
    for index, char in enumerate(text):
        print(f'{index}: {char}')
        
# Print the index of every character of this posting (the same text appears as
# a TRAIN_DATA entry) so the entity start/end offsets can be read off.
split_text('Required skills: Front End: - Angular, HTML5, CSS3, JavaScript, TypeScript, Bootstrap, ExtJS Front End Testing; - Jasmine, Karma, PostMan Backend: - Java, Spring, SpringBoot, RESTapi, Nodejs, SQL, MongoDB, Postgres Backend testing: - Junit, PowerMock - AWS is a plus.')

0: R
1: e
2: q
3: u
4: i
5: r
6: e
7: d
8:  
9: s
10: k
11: i
12: l
13: l
14: s
15: :
16:  
17: F
18: r
19: o
20: n
21: t
22:  
23: E
24: n
25: d
26: :
27:  
28: -
29:  
30: A
31: n
32: g
33: u
34: l
35: a
36: r
37: ,
38:  
39: H
40: T
41: M
42: L
43: 5
44: ,
45:  
46: C
47: S
48: S
49: 3
50: ,
51:  
52: J
53: a
54: v
55: a
56: S
57: c
58: r
59: i
60: p
61: t
62: ,
63:  
64: T
65: y
66: p
67: e
68: S
69: c
70: r
71: i
72: p
73: t
74: ,
75:  
76: B
77: o
78: o
79: t
80: s
81: t
82: r
83: a
84: p
85: ,
86:  
87: E
88: x
89: t
90: J
91: S
92:  
93: F
94: r
95: o
96: n
97: t
98:  
99: E
100: n
101: d
102:  
103: T
104: e
105: s
106: t
107: i
108: n
109: g
110: ;
111:  
112: -
113:  
114: J
115: a
116: s
117: m
118: i
119: n
120: e
121: ,
122:  
123: K
124: a
125: r
126: m
127: a
128: ,
129:  
130: P
131: o
132: s
133: t
134: M
135: a
136: n
137:  
138: B
139: a
140: c
141: k
142: e
143: n
144: d
145: :
146:  
147: -
148:  
149: J
150: a
151: v
152: a
153: ,
154:  
155: S
156: p
157: r
158:

In [2]:
# Training to identify technologies and companies

# Fetch the pipeline's existing 'ner' component; a later cell registers the
# TRAIN_DATA labels on it via ner.add_label().
ner = nlp.get_pipe('ner')

# Hand-annotated NER training examples.
#
# Each item is (text, {"entities": [(start, end, label), ...]}) where start/end
# are zero-based character offsets into text and end is exclusive, i.e.
# text[start:end] is the entity string.
#
# Three end offsets that cut off the last character of the entity were
# corrected: "PostgreSQL" (265, 275), "SpringBoot" (163, 173) and
# "Google Cloud" (59, 71).
#
# NOTE(review): a few spans deliberately cover only part of a word ("REST"
# inside "RESTapi"/"RESTful", "Vue" inside "VueJS", "NET" inside "C#.NET").
# These presumably do not line up with spaCy token boundaries and would then be
# ignored during training (warning W030) — confirm with
# spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities).
TRAIN_DATA = [
    (
        "Experience in the following is a strong asset: Java, J2EE, Spring, SOAP REST web services, XML, JSON HTML, CSS SCSS, Javascript, jQuery, React.js A focused, detail oriented approach to completing tasks",
        {"entities": [
            (47, 51, "PRODUCT"),
            (53, 57, "PRODUCT"),
            (59, 65, "PRODUCT"),
            (67, 71, "PRODUCT"),
            (72, 76, "PRODUCT"),
            (91, 94, "PRODUCT"),
            (96, 100, "PRODUCT"),
            (101, 105, "PRODUCT"),
            (107, 110, "PRODUCT"),
            (111, 115, "PRODUCT"),
            (117, 127, "PRODUCT"),
            (129, 135, "PRODUCT"),
            (137, 145, "PRODUCT"),
        ]},
    ),
    (
        "Experience in scripting programming (Python, R, VB, C, Java, HTML, JavaScript, PHP) - Experience working with Linux UNIX - Knowledge and experience utilizing virtual environments - Experience creating, managing, querying and manipulating data using a RDBMS (MySQL, PostgreSQL, or other)",
        {"entities": [
            (37, 43, "PRODUCT"),
            (45, 46, "PRODUCT"),
            (48, 50, "PRODUCT"),
            (52, 53, "PRODUCT"),
            (55, 59, "PRODUCT"),
            (61, 65, "PRODUCT"),
            (67, 77, "PRODUCT"),
            (79, 82, "PRODUCT"),
            (110, 115, "PRODUCT"),
            (116, 120, "PRODUCT"),
            (258, 263, "PRODUCT"),
            (265, 275, "PRODUCT"),  # "PostgreSQL" (was 265-274, one char short)
        ]},
    ),
    (
        "Required skills: Front End: - Angular, HTML5, CSS3, JavaScript, TypeScript, Bootstrap, ExtJS Front End Testing; - Jasmine, Karma, PostMan Backend: - Java, Spring, SpringBoot, RESTapi, Nodejs, SQL, MongoDB, Postgres Backend testing: - Junit, PowerMock - AWS is a plus.",
        {"entities": [
            (30, 37, "PRODUCT"),
            (39, 44, "PRODUCT"),
            (46, 50, "PRODUCT"),
            (52, 62, "PRODUCT"),
            (64, 74, "PRODUCT"),
            (76, 85, "PRODUCT"),
            (87, 92, "PRODUCT"),
            (114, 121, "PRODUCT"),
            (123, 128, "PRODUCT"),
            (130, 137, "PRODUCT"),
            (149, 153, "PRODUCT"),
            (155, 161, "PRODUCT"),
            (163, 173, "PRODUCT"),  # "SpringBoot" (was 163-172, one char short)
            (175, 179, "PRODUCT"),
            (184, 190, "PRODUCT"),
            (192, 195, "PRODUCT"),
            (197, 204, "PRODUCT"),
            (206, 214, "PRODUCT"),
            (234, 239, "PRODUCT"),
            (241, 250, "PRODUCT"),
            (253, 256, "PRODUCT"),
        ]},
    ),
    (
        "Experience with SQL and NoSQL systems - Knowledge of Hadoop, Spark, Kafka or other equivalent technologies - Proficiency in some of the following languages: Scala, Java, Python, Bash - Experience with automated testing systems - Mentorship, collaboration, and communication skills - Knowledge of data modelling, data warehousing, ETL processes, and business intelligence reporting tools - Experience working with CI CD, containerization, and virtualization tools such as Gitlab, Jenkins, Kubernetes, Docker - Experience with tools like Databricks, Snowflake or PowerBI",
        {"entities": [
            (16, 19, "PRODUCT"),
            (24, 29, "PRODUCT"),
            (53, 59, "PRODUCT"),
            (61, 66, "PRODUCT"),
            (68, 73, "PRODUCT"),
            (157, 162, "PRODUCT"),
            (164, 168, "PRODUCT"),
            (170, 176, "PRODUCT"),
            (178, 182, "PRODUCT"),
            (471, 477, "PRODUCT"),
            (479, 486, "PRODUCT"),
            (488, 498, "PRODUCT"),
            (500, 506, "PRODUCT"),
            (536, 546, "PRODUCT"),
            (548, 557, "PRODUCT"),
            (561, 568, "PRODUCT"),
        ]},
    ),
    (
        "Accountabilities Under the supervision of a technical lead or manager, build, enhance and troubleshoot applications designed with one or more of the following technologies:  C#.NET, SQL, VB, Microsoft Access Learn and configure low-code tools to automate business processes, including:  Server and desktop robotics tools, Sharepoint, Power BI, Unqork, DUCO",
        {"entities": [
            (177, 180, "PRODUCT"),
            (182, 185, "PRODUCT"),
            (187, 189, "PRODUCT"),
            (334, 342, "PRODUCT"),
            (344, 350, "PRODUCT"),
            (352, 356, "PRODUCT"),
        ]},
    ),
    (
        "Our current technical stack includes: Ruby on Rails 5.0, HTML5, Bootstrap 3, JQuery and Less. Leveraging MySQL and Redis for data. We deploy to AWS on Linux backed by RDS. Build an HTML5 experience for mobile users React, JQuery, Bootstrap3, D3.js, MySQL and Redis .Kubernetes, Docker, NodeJS or Kotlin",
        {"entities": [
            (38, 51, "PRODUCT"),
            (57, 62, "PRODUCT"),
            (64, 73, "PRODUCT"),
            (77, 83, "PRODUCT"),
            (105, 110, "PRODUCT"),
            (115, 120, "PRODUCT"),
            (144, 147, "PRODUCT"),
            (151, 156, "PRODUCT"),
            (181, 186, "PRODUCT"),
            (215, 220, "PRODUCT"),
            (222, 228, "PRODUCT"),
            (230, 240, "PRODUCT"),
            (242, 247, "PRODUCT"),
            (249, 254, "PRODUCT"),
            (259, 264, "PRODUCT"),
            (266, 276, "PRODUCT"),
            (278, 284, "PRODUCT"),
            (286, 292, "PRODUCT"),
            (296, 302, "PRODUCT"),
        ]},
    ),
    (
        "Experience with any of the following is an asset: HTML CSS, Javascript, AngularJS, JQuery, React, Java, C# Node.js, JEE, Apache, PHP, .Net, Spring SQL, PL SQL, Oracle, MySQL, MongoDB, NoSQL Web Services (RESTful SOAP), XML, JSON, AJAX AWS, Google GCP, Azure, Heroku, Cloud Foundry Docker, Kubernetes Git Gitlab",
        {"entities": [
            (55, 58, "PRODUCT"),
            (60, 70, "PRODUCT"),
            (72, 81, "PRODUCT"),
            (83, 89, "PRODUCT"),
            (91, 96, "PRODUCT"),
            (98, 102, "PRODUCT"),
            (104, 106, "PRODUCT"),
            (107, 114, "PRODUCT"),
            (116, 119, "PRODUCT"),
            (121, 127, "PRODUCT"),
            (129, 132, "PRODUCT"),
            (134, 138, "PRODUCT"),
            (140, 146, "PRODUCT"),
            (147, 150, "PRODUCT"),
            (152, 158, "PRODUCT"),
            (160, 166, "PRODUCT"),
            (168, 173, "PRODUCT"),
            (175, 182, "PRODUCT"),
            (184, 189, "PRODUCT"),
            (204, 208, "PRODUCT"),
            (212, 216, "PRODUCT"),
            (219, 222, "PRODUCT"),
            (224, 228, "PRODUCT"),
            (230, 234, "PRODUCT"),
            (235, 238, "PRODUCT"),
            (247, 250, "PRODUCT"),
            (252, 257, "PRODUCT"),
            (259, 265, "PRODUCT"),
            (281, 287, "PRODUCT"),
            (289, 299, "PRODUCT"),
            (300, 303, "PRODUCT"),
            (304, 310, "PRODUCT"),
        ]},
    ),
    (
        "Working experience developing with PHP, Laravel, VueJS, REST API, jQuery, Bootstrap, MySQL and Python Flask.",
        {"entities": [
            (35, 38, "PRODUCT"),
            (40, 47, "PRODUCT"),
            (49, 52, "PRODUCT"),
            (56, 60, "PRODUCT"),
            (66, 72, "PRODUCT"),
            (74, 83, "PRODUCT"),
            (85, 90, "PRODUCT"),
            (95, 101, "PRODUCT"),
            (102, 107, "PRODUCT"),
        ]},
    ),
    (
        "Experience with hybrid   native mobile application development (Cordova, Flutter, React Native) -Experience with machine learning platform (Anaconda, Tensorflow, Keras)  -Experience with machine learning libraries (OpenCV, Scikit-learn, Pandas)",
        {"entities": [
            (64, 71, "PRODUCT"),
            (73, 80, "PRODUCT"),
            (82, 94, "PRODUCT"),
            (140, 148, "PRODUCT"),
            (150, 160, "PRODUCT"),
            (162, 167, "PRODUCT"),
            (215, 221, "PRODUCT"),
            (223, 229, "PRODUCT"),
            (237, 243, "PRODUCT"),
        ]},
    ),
    (
        "Android, Java, Kotlin experience - iOS, Swift experience - Google Cloud or AWS experience - Unit testing and UI testing experience - C#, JavaScript, TypeScript, Postgres experience is a bonus",
        {"entities": [
            (0, 7, "PRODUCT"),
            (9, 13, "PRODUCT"),
            (15, 21, "PRODUCT"),
            (35, 38, "PRODUCT"),
            (40, 45, "PRODUCT"),
            (59, 71, "PRODUCT"),  # "Google Cloud" (was 59-70, one char short)
            (75, 78, "PRODUCT"),
            (133, 135, "PRODUCT"),
            (137, 147, "PRODUCT"),
            (149, 159, "PRODUCT"),
            (161, 169, "PRODUCT"),
        ]},
    ),
    (
        "Strong ability to develop and debug in Python, Java, C or C++, Proficient in git version control. Strong experience with machine learning APIs and computational packages (TensorFlow, Theano, PyTorch, Keras, Scikit-Learn, NumPy, SciPy, Pandas). Experience with big-data technologies such as Hadoop, Spark, SparkML, etc. Experience with public cloud and services (AWS, Azure) Familiarity with basic data table operations (SQL, Hive, PostGres etc.).",
        {"entities": [
            (39, 45, "PRODUCT"),
            (47, 51, "PRODUCT"),
            (53, 54, "PRODUCT"),
            (58, 61, "PRODUCT"),
            (77, 80, "PRODUCT"),
            (171, 181, "PRODUCT"),
            (183, 189, "PRODUCT"),
            (191, 198, "PRODUCT"),
            (200, 205, "PRODUCT"),
            (207, 213, "PRODUCT"),
            (221, 226, "PRODUCT"),
            (228, 233, "PRODUCT"),
            (235, 241, "PRODUCT"),
            (290, 296, "PRODUCT"),
            (298, 303, "PRODUCT"),
            (305, 312, "PRODUCT"),
            (362, 365, "PRODUCT"),
            (367, 372, "PRODUCT"),
            (420, 423, "PRODUCT"),
            (425, 429, "PRODUCT"),
            (431, 439, "PRODUCT"),
        ]},
    ),
    (
        "Working with our preferred technology stack (Primarily Elixir Phoenix, Ruby on Rails, modern JavaScript). Develop, scale, and optimize amazing GraphQL & RESTful APIs.",
        {"entities": [
            (55, 61, "PRODUCT"),
            (62, 69, "PRODUCT"),
            (71, 84, "PRODUCT"),
            (93, 103, "PRODUCT"),
            (143, 150, "PRODUCT"),
            (153, 157, "PRODUCT"),
        ]},
    ),
]

# Unused copy-paste template for further TRAIN_DATA entries. It sits after the
# closing bracket of TRAIN_DATA as a bare string literal, so it is not part of
# the training data; because it is the cell's final expression the notebook
# echoes its repr as the cell output.
# NOTE(review): the template's last two spans, (153, 157) and (153, 165),
# overlap — pick one before reusing it, and replace the empty text.
'''
    ("", 
         {"entities": [
             (55, 61, "PRODUCT"),
             (62, 69, "PRODUCT"),
             (71, 84, "PRODUCT"),
             (93, 103, "PRODUCT"),
             (143, 150, "PRODUCT"),
             (153, 157, "PRODUCT"),
             (153, 165, "PRODUCT")
         ]}),
    ("", 
         {"entities": [
             (55, 61, "PRODUCT"),
             (62, 69, "PRODUCT"),
             (71, 84, "PRODUCT"),
             (93, 103, "PRODUCT"),
             (143, 150, "PRODUCT"),
             (153, 157, "PRODUCT"),
             (153, 165, "PRODUCT")
         ]}),
    ("", 
         {"entities": [
             (55, 61, "PRODUCT"),
             (62, 69, "PRODUCT"),
             (71, 84, "PRODUCT"),
             (93, 103, "PRODUCT"),
             (143, 150, "PRODUCT"),
             (153, 157, "PRODUCT"),
             (153, 165, "PRODUCT")
         ]}),
    ("", 
         {"entities": [
             (55, 61, "PRODUCT"),
             (62, 69, "PRODUCT"),
             (71, 84, "PRODUCT"),
             (93, 103, "PRODUCT"),
             (143, 150, "PRODUCT"),
             (153, 157, "PRODUCT"),
             (153, 165, "PRODUCT")
         ]}),
    ("", 
         {"entities": [
             (55, 61, "PRODUCT"),
             (62, 69, "PRODUCT"),
             (71, 84, "PRODUCT"),
             (93, 103, "PRODUCT"),
             (143, 150, "PRODUCT"),
             (153, 157, "PRODUCT"),
             (153, 165, "PRODUCT")
         ]}),
    '''

'\n    ("", \n         {"entities": [\n             (55, 61, "PRODUCT"),\n             (62, 69, "PRODUCT"),\n             (71, 84, "PRODUCT"),\n             (93, 103, "PRODUCT"),\n             (143, 150, "PRODUCT"),\n             (153, 157, "PRODUCT"),\n             (153, 165, "PRODUCT")\n         ]}),\n    ("", \n         {"entities": [\n             (55, 61, "PRODUCT"),\n             (62, 69, "PRODUCT"),\n             (71, 84, "PRODUCT"),\n             (93, 103, "PRODUCT"),\n             (143, 150, "PRODUCT"),\n             (153, 157, "PRODUCT"),\n             (153, 165, "PRODUCT")\n         ]}),\n    ("", \n         {"entities": [\n             (55, 61, "PRODUCT"),\n             (62, 69, "PRODUCT"),\n             (71, 84, "PRODUCT"),\n             (93, 103, "PRODUCT"),\n             (143, 150, "PRODUCT"),\n             (153, 157, "PRODUCT"),\n             (153, 165, "PRODUCT")\n         ]}),\n    ("", \n         {"entities": [\n             (55, 61, "PRODUCT"),\n             (62, 69

In [3]:
# Register every label used in TRAIN_DATA on the NER component, in the order
# the annotations appear.
annotated_labels = (
    span[2]
    for _, example_annotations in TRAIN_DATA
    for span in example_annotations.get("entities")
)
for label in annotated_labels:
    ner.add_label(label)

# Components listed here stay enabled while training; every other pipe name is
# collected so the training cell can switch it off.
pipe_exceptions = ['ner', 'trf_wordpiecer', 'trf_tok2vec']
unaffected_pipes = [
    name for name in nlp.pipe_names
    if name not in pipe_exceptions
]

In [None]:
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example
import warnings

# Fine-tune the NER component of the already-trained pipeline on TRAIN_DATA.

N_ITERATIONS = 50  # passes over the training examples
DROPOUT = 0.5      # dropout rate handed to nlp.update()

# resume_training() returns an optimizer for continuing to train a loaded
# pipeline. nlp.initialize() (and v2's begin_training()) is meant for training
# from scratch and re-initialises the components, discarding the pretrained
# weights this notebook relies on.
optimizer = nlp.resume_training()

# Shuffle a private copy so the global TRAIN_DATA keeps its original order and
# re-running this cell does not depend on earlier shuffles.
training_pairs = list(TRAIN_DATA)

# select_pipes(disable=...) is the spaCy v3 replacement for disable_pipes();
# the listed components are restored when the with-block exits.
with nlp.select_pipes(disable=unaffected_pipes), warnings.catch_warnings():
    # Report each distinct spaCy UserWarning once instead of on every update.
    warnings.filterwarnings('once', category=UserWarning, module='spacy')
    for iteration in range(N_ITERATIONS):
        random.shuffle(training_pairs)
        losses = {}  # filled in by nlp.update() during this pass

        for text, annotations in training_pairs:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update(
                [example],
                drop=DROPOUT,
                sgd=optimizer,
                losses=losses,
            )
        print(losses)

# Look into doc.spans



In [None]:
# Re-run the pipeline over each training text and list the (text, label)
# pairs it now predicts.
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    found = [(span.text, span.label_) for span in doc.ents]
    print('Entities', found)

# Two unseen postings; the second assignment replaces the first, so only the
# second string is actually evaluated below.
testing_string = "Qualifications: Enrolled in computer science, computer engineering, or related disciplines 1+ year working experience with object-oriented programming (Java / Python) or JavaScript programming Familiarity with *nix Operating System is an asset Working experiences in software development lifecycle Interest in solving technical problems Solid communication skills A first rate academic record is desired   Technologies you may experience with us: Java, Node.js, Python, Vue.js, Knockout.js Amazon Web Services (EC2, Lambda, CloudFormation, DynamoDB, S3, SQS, SNS, etc.) Apache Kafka, Apache Airflow, Redis, PostgreSQL Databricks, Data Catalog, h2o AI framework GitHub, Bitbucket, Jenkins   We offer competitive salary, flexible work hours, and a great work environment.  Our office lounge has a foosball table, basketball arcade, boardgames and video game consoles, and provides free fruit, coffee, and soft drinks."
testing_string = "The student will manage large dataset in Unix. He or she will design database in SQL to facilitate the analysis. The student will create scripts (r, python or matlab) to modify the analytical pipeline."
doc = nlp(testing_string)
found = [(span.text, span.label_) for span in doc.ents]
print("Entities", found)