# This program uses spaCy's pretrained pipeline to fine-tune the model for our application

In [1]:
#Import and load the spacy model
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span, DocBin
from spacy.language import Language

# Load the pretrained "en_core_web_lg" pipeline once; the resulting `nlp` object
# is reused by the cells below to build patterns and to supply the matcher's vocab.
nlp = spacy.load("en_core_web_lg")
import csv

# Read every cell of the related-skills CSV into one flat list of strings that
# can be fed to spaCy's pipeline. The first (header) row is skipped.
TEXTS = []
# newline="" is what the csv module expects for file objects it reads; without
# it, quoted fields that contain line breaks are not parsed correctly.
with open("../data/related_skills.csv", mode="r", newline="") as f:
    reader = csv.reader(f, delimiter=",")
    # Skip the header row; the default of None keeps an empty file from
    # raising StopIteration.
    next(reader, None)
    for row in reader:
        TEXTS.extend(row)


We want to add a component that runs before the data gets to the NER model.
We want to add patterns to the model for it to identify in the training data.

In [2]:
# Creating the patterns: one tokenized Doc per skill string.
# The PhraseMatcher built from these docs is created without an `attr`
# argument, so it only compares token text. Tokenizing is therefore enough:
# nlp.tokenizer.pipe yields the same tokens as nlp.pipe but skips the tagger,
# parser and NER, which makes pattern creation much faster.
skills_patterns = list(nlp.tokenizer.pipe(TEXTS))

In [3]:

# Just printing the first few skills in the list.
# A slice prints the same 11 entries the counter loop did, but simply prints
# fewer (instead of raising IndexError) when fewer patterns were loaded.
for pattern_doc in skills_patterns[:11]:
    print(pattern_doc.text)

a certified
network certified
laptops
n certified
computer hardware
windows 7
comptia
troubleshooting
software installation
printers
xp


In [4]:
# Build a PhraseMatcher on the pipeline's vocabulary and register every Doc in
# skills_patterns under the single match key "SKILL".
# NOTE(review): no `attr` is passed, so matching presumably compares exact token
# text (case-sensitive) — confirm that is intended for skill names.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("SKILL", skills_patterns)

In [6]:
# This is the component...we may have to take a different approach
@Language.component("skills_component")
def skills_component_function(doc):
    """Mark every skill phrase found by the module-level `matcher` as an entity.

    Args:
        doc: The spaCy Doc passing through the pipeline.

    Returns:
        The same Doc, with `doc.ents` replaced by the matched "SKILL" spans.
    """
    # Apply the matcher to the doc; each match is a (match_id, start, end) triple.
    matches = matcher(doc)
    # Create a span for each match and assign the label "SKILL".
    spans = [Span(doc, start, end, label="SKILL") for _match_id, start, end in matches]
    # Phrase matches can overlap (e.g. a one-word skill inside a two-word skill),
    # and doc.ents rejects overlapping spans with a ValueError. filter_spans
    # keeps the longest span of each overlapping group.
    # We are going to overwrite doc.ents with our new spans, so any entities
    # already set on the doc are discarded.
    doc.ents = spacy.util.filter_spans(spans)
    return doc