# This program uses spaCy's pretrained pipeline to fine tune the model for our application

In [None]:
#Import and load the spacy model
import spacy
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Span, DocBin
from spacy.language import Language
import unicodedata


nlp = spacy.load("en_core_web_lg")
import csv

#getting the data from the csv and turning it into text that can be manipulated by spaCy's pipeline
TEXTS = []
with open("../data/related_skills.csv", mode='r') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)  # Skip the first row
    for row in reader:
        TEXTS.extend(row)


we want to add a component before the data gets to the NER model
we want to add patterns to the model for the it to identify in the training data

In [16]:
#creating pattern
skills_patterns = list(nlp.pipe(TEXTS))

In [17]:

#just printing the first few skills in the list
i = 0
while i <= 10:
    print(skills_patterns[i].text)
    i += 1

a certified
network certified
laptops
n certified
computer hardware
windows 7
comptia
troubleshooting
software installation
printers
xp


In [18]:
matcher = PhraseMatcher(nlp.vocab)
matcher.add("SKILL", skills_patterns)

In [19]:
# This is the component...we may have to take a different approach
@Language.component("skills_component")
def skills_component_function(doc):
    #applying matcher to the doc
    matches = matcher(doc)
    #creating the span for each match and assigning the label "SKILL"
    spans = [Span(doc, start, end, label = "SKILL") for match_id, start, end in matches]
    #were going to overite the doc.ents with our new spans
    doc.ents = spans 
    return doc


In [20]:
# add 2 pipeline
nlp.add_pipe("skills_component", after="lemmatizer"),
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'skills_component', 'ner']


In [50]:
# Process test text and print the text and label for the doc.ents
doc = nlp("java is required, looking for IT specialist")
print([(ent.text, ent.label_) for ent in doc.ents])


doc = nlp("java is required, Good communication")
print([(ent.text, ent.label_) for ent in doc.ents])

doc = nlp("We are looking for proficient programmers to join our team to train our AI chatbots to code. You will work with the chatbots that we are building in order to measure their progress, as well as write and evaluate code.To apply to this role, you will need to be proficient in either Python and/or JavaScript. Your role will require proficiency in at least one programming language (JavaScript, Python, C#, C++, HTML, SQL, or Swift) in order to solve coding problems (think LeetCode, HackerRank, etc). For each coding problem, you must be able to explain how your solution solves the problem.As part of the application process, you will be asked to complete an assessment. If you pass, you will gain access to projects. Based on the quality of your work, you will continue to receive projects regularly. We find our most successful candidates work between 5-20 hours per week, up to 40 hours.")
print([(ent.text, ent.label_) for ent in doc.ents])


# This is a description taken from the api
doc = nlp("Greetings from IT Engagements\u2026!  IT Engagements is a global staff augmentation firm providing a wide-range of talent on-demand and total workforce solutions. We have an immediate opening for the below position with one of our premium clients.  Job Title:- JavaScript Backend Developer  Duration:- 12+ Months Contract (Possibility of further extension)  Location:- Hybrid (3day/Week Onsite) at Chicago, IL.  Visa:- USC, GC, GC-EAD, H4-EAD (No Third-Party Candidates)  Job Description:- \u2022 Communication skills are the KEY! We need a Solution type person. This person is going to have to have a great personality and awesome communication skills \u2022 JavaScript Backend Developer (NO Front-end work) \u2013 ALL JavaScript backend Development. \u2022 Must have 10 years of JavaScript backend Development experience. \u2022 Must have SDK Experience.  Regards,  Shashank Jaitly  Shashank@itengagements.com")
print([(ent.text, ent.label_) for ent in doc.ents])



[('java', 'SKILL')]
[('java', 'SKILL'), ('communication', 'SKILL')]
[('programmers', 'SKILL'), ('team', 'SKILL'), ('code', 'SKILL'), ('building', 'SKILL'), ('write', 'SKILL'), ('code', 'SKILL'), ('Python', 'ORG'), ('JavaScript', 'PRODUCT'), ('at least one', 'CARDINAL'), ('programming', 'SKILL'), ('language', 'SKILL'), ('JavaScript', 'PRODUCT'), ('Python', 'ORG'), ('C++', 'LANGUAGE'), ('HTML', 'ORG'), ('SQL', 'ORG'), ('Swift', 'PRODUCT'), ('coding', 'SKILL'), ('HackerRank', 'ORG'), ('etc', 'SKILL'), ('coding', 'SKILL'), ('application', 'SKILL'), ('process', 'SKILL'), ('assessment', 'SKILL'), ('access', 'SKILL'), ('projects', 'SKILL'), ('quality', 'SKILL'), ('projects', 'SKILL'), ('5-20 hours', 'TIME'), ('up to 40 hours', 'TIME')]


ValueError: [E1010] Unable to set entity information for token 12 which is included in more than one span in entities, blocked, missing or outside.