In [1]:
# Third-party dependencies.
import pandas as pd
import spacy
from spacy.matcher import Matcher

# English pipeline whose part-of-speech tags are used by the functions below.
nlp = spacy.load('en_core_web_sm')

# Raw skills data to be classified.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
df = pd.read_csv(r"C:/Users/uchei/OneDrive/Desktop/Gozie/Raw_Skills_Dataset.csv")

In [2]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,RAW DATA
0,What ifs
1,seniority
2,familiarity
3,functionalities
4,Lambdas


In [3]:
# (rows, columns) of the raw data.
df.shape

(34116, 1)

There are 34,116 samples in the raw data. Let's extract the useful information from it.

In [4]:
def patterns(text):
    """Return the part-of-speech tag of every token in ``text``, in order.

    The text is parsed with the notebook-level ``nlp`` pipeline and the
    ``pos_`` attribute of each resulting token is collected into a list.
    """
    return [tok.pos_ for tok in nlp(text)]

Let's extract the patterns from the example dataset.

In [5]:
# Load the example technical skills and preview the first rows.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
examples = pd.read_csv(r"C:/Users/uchei/OneDrive/Desktop/Gozie/Example_Technical_Skills.csv")
examples.head()

Unnamed: 0,Technology Skills
0,SAP Fiori Developer
1,Oracle Instance Management & Strategy
2,Boomi Master Data Management
3,Digital Manufacturing on Cloud ( DMC)
4,DevOps


In [6]:
# Tag every example skill with its sequence of POS tags.
examples['Patterns'] = examples['Technology Skills'].apply(patterns)

In [7]:
# Preview the examples; at this point the frame also carries the 'Patterns' column.
examples.head()

Unnamed: 0,Technology Skills,Patterns
0,SAP Fiori Developer,"[PROPN, PROPN, PROPN]"
1,Oracle Instance Management & Strategy,"[PROPN, PROPN, PROPN, CCONJ, PROPN]"
2,Boomi Master Data Management,"[PROPN, PROPN, PROPN, PROPN]"
3,Digital Manufacturing on Cloud ( DMC),"[PROPN, PROPN, ADP, PROPN, PUNCT, PROPN, PUNCT]"
4,DevOps,[PROPN]


By looking for recurring POS sequences among these example patterns, we can define our own rule-based POS matching.

In [8]:
# POS-tag templates for technical skills, grouped by the number of
# whitespace-separated words in the input text. Each entry is
# (matcher key, template). A template is a space-separated sequence of POS
# tags; a trailing "?" marks an optional token. A template may contain more
# tags than the word count because the tokenizer can split one
# whitespace-separated word into several tokens (e.g. "DMC)" -> "DMC", ")").
_TECH_POS_TEMPLATES = {
    1: [
        ("Prop", "PROPN"),
    ],
    2: [
        ("Prop1", "PROPN PROPN"),
        ("Prop12", "NOUN VERB"),
        ("Prop13", "PROPN NOUN"),
        ("Prop14", "NOUN NUM"),
        ("Prop15", "PROPN PROPN SYM PROPN SYM PROPN"),
    ],
    3: [
        ("Prop2", "PROPN PROPN PROPN"),
        ("Prop21", "NOUN CCONJ NOUN"),
        ("Prop22", "PROPN SYM PROPN"),
        ("Prop23", "PROPN PROPN PUNCT PROPN PROPN"),
        ("Prop24", "NOUN NOUN PUNCT PROPN PROPN"),
        ("Prop25", "PROPN PROPN PUNCT PROPN PUNCT"),
        ("Prop26", "PROPN PROPN NOUN"),
    ],
    4: [
        ("Prop2", "PROPN PROPN PROPN PROPN"),
        ("Prop21", "NOUN ADP? NOUN NOUN"),
        ("Prop22", "NOUN PUNCT PROPN PROPN"),
        ("Prop23", "PROPN PROPN SYM PROPN"),
        ("Prop24", "PROPN PROPN PUNCT PROPN PROPN PUNCT"),
        ("Prop25", "NOUN PUNCT NOUN PROPN PUNCT PROPN"),
        ("Prop26", "PROPN PROPN PROPN PUNCT PROPN PUNCT"),
        ("Prop27", "PROPN PROPN PUNCT PROPN"),
        ("Prop28", "NUM PROPN SYM PROPN PROPN PROPN"),
        ("Prop29", "PROPN PROPN PUNCT NOUN NOUN NOUN"),
    ],
    5: [
        ("Prop3", "PROPN PROPN PROPN PROPN PROPN"),
        ("Prop31", "PROPN PROPN CCONJ PROPN PROPN"),
        ("Prop32", "PROPN PROPN PROPN PROPN PUNCT PROPN PUNCT"),
        ("Prop33", "NOUN SYM PROPN PUNCT PROPN CCONJ NOUN"),
        ("Prop34", "PROPN ADJ PROPN PROPN NOUN"),
        ("Prop35", "PROPN ADJ PROPN PROPN PUNCT PROPN PUNCT"),
        ("Prop36", "NOUN PUNCT PROPN PROPN PUNCT PROPN PUNCT"),
        ("Prop37", "PROPN SYM PROPN PROPN CCONJ PROPN PROPN"),
    ],
    6: [
        ("Prop4", "PROPN PROPN NOUN PROPN PROPN PROPN"),
        ("Prop41", "PROPN PROPN PUNCT PROPN ADJ NOUN"),
        ("Prop42", "PROPN PROPN PROPN CCONJ PROPN NOUN"),
        ("Prop43", "NOUN NOUN PUNCT NOUN CCONJ NOUN"),
        ("Prop44", "PROPN PUNCT PROPN NOUN NOUN NOUN"),
    ],
}

# word count -> (vocab the matcher was built with, Matcher). Building a Matcher
# is independent of the input text, so it is done once per word count instead
# of once per call.
_TECH_MATCHER_CACHE = {}


def _tech_token_specs(template):
    """Expand a POS template string into a list of Matcher token dicts."""
    specs = []
    for tag in template.split():
        if tag.endswith("?"):
            specs.append({'POS': tag[:-1], 'OP': '?'})
        else:
            specs.append({'POS': tag})
    return specs


def _tech_matcher(word_count):
    """Return the Matcher for texts of ``word_count`` words, or None if there are no templates."""
    templates = _TECH_POS_TEMPLATES.get(word_count)
    if templates is None:
        return None
    cached = _TECH_MATCHER_CACHE.get(word_count)
    # Rebuild when ``nlp`` has been reloaded since the matcher was cached, so
    # the matcher always uses the vocab of the pipeline that parses the text.
    if cached is None or cached[0] is not nlp.vocab:
        matcher = Matcher(nlp.vocab)
        for key, template in templates:
            matcher.add(key, [_tech_token_specs(template)])
        cached = (nlp.vocab, matcher)
        _TECH_MATCHER_CACHE[word_count] = cached
    return cached[1]


def extract_tech(text):
    """Extract technical-skill spans from ``text`` using POS-tag templates.

    The set of templates is chosen by the number of whitespace-separated
    words in ``text`` (1 to 6). Templates are matched against the tokens of
    the parsed text, so a match may cover only part of the text and more than
    one match may be returned.

    Parameters
    ----------
    text : str
        A raw skill entry.

    Returns
    -------
    list of str
        Text of every matched span, in the order the matcher reports them.
        Empty when nothing matches, when the word count is 0 or greater
        than 6, or when ``text`` is not a string (e.g. a missing value).
    """
    # Missing cells read by pandas arrive as float NaN; treat them as "no skills".
    if not isinstance(text, str):
        return []

    matcher = _tech_matcher(len(text.split()))
    if matcher is None:
        # No templates for this word count: skip the (expensive) parse entirely.
        return []

    doc = nlp(text)
    return [doc[start:end].text for _, start, end in matcher(doc)]

In [9]:
# POS-tag patterns for soft skills, grouped by the number of
# whitespace-separated words in the input text.
_SOFT_SKILL_PATTERNS = {
    1: [
        [{'POS': 'NOUN'}],
    ],
    2: [
        [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
        [{'POS': 'NOUN'}, {'POS': 'NOUN'}],
    ],
    3: [
        [{'POS': 'NOUN'}, {'POS': 'PART'}, {'POS': 'VERB'}],
        [{'POS': 'NOUN'}, {'POS': 'NOUN'}, {'POS': 'NOUN'}],
        [{'POS': 'NOUN'}, {'POS': 'ADJ'}, {'POS': 'NOUN'}],
    ],
}

# word count -> (vocab the matcher was built with, Matcher). Building a Matcher
# is independent of the input text, so it is done once per word count instead
# of once per call.
_SOFT_MATCHER_CACHE = {}


def _soft_matcher(word_count):
    """Return the Matcher for texts of ``word_count`` words, or None if there are no patterns."""
    word_patterns = _SOFT_SKILL_PATTERNS.get(word_count)
    if word_patterns is None:
        return None
    cached = _SOFT_MATCHER_CACHE.get(word_count)
    # Rebuild when ``nlp`` has been reloaded since the matcher was cached, so
    # the matcher always uses the vocab of the pipeline that parses the text.
    if cached is None or cached[0] is not nlp.vocab:
        matcher = Matcher(nlp.vocab)
        matcher.add("soft", word_patterns)
        cached = (nlp.vocab, matcher)
        _SOFT_MATCHER_CACHE[word_count] = cached
    return cached[1]


def soft_skills(text):
    """Extract soft-skill spans from ``text`` using POS-tag patterns.

    The set of patterns is chosen by the number of whitespace-separated words
    in ``text`` (1 to 3). Patterns are matched against the tokens of the
    parsed text, so a match may cover only part of the text and more than one
    match may be returned.

    Parameters
    ----------
    text : str
        A raw skill entry.

    Returns
    -------
    list of str
        Text of every matched span, in the order the matcher reports them.
        Empty when nothing matches, when the word count is 0 or greater
        than 3, or when ``text`` is not a string (e.g. a missing value).
    """
    # Missing cells read by pandas arrive as float NaN; treat them as "no skills".
    if not isinstance(text, str):
        return []

    matcher = _soft_matcher(len(text.split()))
    if matcher is None:
        # No patterns for this word count: skip the (expensive) parse entirely.
        return []

    doc = nlp(text)
    return [doc[start:end].text for _, start, end in matcher(doc)]

In [10]:
# Run the soft-skill matcher over every raw entry (one list of matches per row).
Soft_Skills = df['RAW DATA'].apply(soft_skills)

In [11]:
# Run the technical-skill matcher over every raw entry (one list of matches per row).
Technical_Skills = df['RAW DATA'].apply(extract_tech)

In [12]:
# Keep only the rows where at least one technical-skill match was found,
# then report how many remain.
Technical_Skills = Technical_Skills.loc[lambda matches: matches.apply(len) > 0]
len(Technical_Skills)

8421

There are 8421 technical skills according to the defined patterns derived from the example dataset.

In [13]:
# Preview the first rows that produced technical-skill matches.
Technical_Skills.head()

5             [Java Streams]
7     [Relational Databases]
8                      [SQL]
11               [Hibernate]
12                 [MyBatis]
Name: RAW DATA, dtype: object

In [14]:
# Keep only the rows where at least one soft-skill match was found,
# then report how many remain.
Soft_Skills = Soft_Skills.loc[lambda matches: matches.apply(len) > 0]
len(Soft_Skills)

12899

Using the patterns defined, there are 12899 soft skills contained in the dataset. Let's preview some.

In [15]:
# Preview the first rows that produced soft-skill matches.
Soft_Skills.head()

1          [seniority]
2        [familiarity]
3    [functionalities]
4            [Lambdas]
9                [ORM]
Name: RAW DATA, dtype: object

There are some anomalies in the extracted soft skills (for example, `Lambdas` and `ORM` above are technical terms), and correspondingly some technical skills are missing from the technical-skills set. This happens because a single-word entry tagged NOUN can be either a technical skill or a soft skill. However, single-word NOUN entries are predominantly soft skills, hence they are grouped into the soft-skills category.