### Classifying content of job posting to recognize if section mentioned is regarding job requirements  

#### A List of 459 words were scraped as key words from the primary scrapper. These words were then cleaned of characters, cleaned to remove stop words, tokenized and the top 100 commonly occuring words were gathered. <br> The words of interest where then checked against the top 100 common words and if yes, they are added to the features  and target dataframe and are one-hot-encoded. A simple logistic regression model was run. 

In [9]:
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

#Natural Language Processing Packages
import re
from nltk.tokenize import RegexpTokenizer
from collections import Counter

In [10]:
# Functions for cleaning up data and doing a simple count on the most commonly occuring values 
def clean_file_text(text):
    new_text = re.sub('\n', '', text)
    new_text = re.sub('%', '', new_text)
    new_text = re.sub('@', '', new_text)
    new_text = re.sub(r'[0-9]', '', new_text)
    new_text = new_text.lower()
    return new_text
def remove_stop_words(text):
    clean_text = []
    for t in text: 
        if t in stopwords:
            pass
        else:
            clean_text.append(t)
    return(clean_text)
def corpus_count_words(uploadedFile):
    tokenizer = RegexpTokenizer(r'\w+') # Chunking each word
    word_counter = Counter()
    for file in uploadedFile:
        file_data = clean_file_text(file)
        file_words = tokenizer.tokenize(file_data)
        non_stopwords = remove_stop_words(file_words)
        word_counter.update(non_stopwords)
    return word_counter

In [11]:
# List of commonly occuring stop words from http://www.nltk.org/nltk_data/
stopwords = open("english", "r")
stopwords = [clean_file_text(word) for word in stopwords]

In [12]:
common100words

NameError: name 'common100words' is not defined

In [13]:
qualification = pd.read_csv('qualification.csv', sep=',',header=0)
commonlyOccuring = corpus_count_words(qualification['qualifications'])
common100words = [words for words in commonlyOccuring.keys()]
common100words = common100words[:100]

In [14]:
# Transforming the string data into 
df_rows = []
tokenizer = RegexpTokenizer(r'\w+')
for row in qualification.iloc[:,0]:
    words = tokenizer.tokenize(row)
    df_rows.append([1 if word in words else 0 for word in common100words])      
    
# Transformed dataset and target into binary values 
X = pd.DataFrame(df_rows, columns = common100words)
target = qualification.iloc[:,1]
y = [1 if t == 'yes' else 0 for t in target]
print(f'Shape of features: {X.shape} And length of target: {len(y)}')

Shape of features: (460, 100) And length of target: 460


In [59]:
#X.assign(target_group=y)

### Logistic Regression: 

In [15]:
# split dataset to train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
clf = LogisticRegression(solver = 'liblinear', penalty = 'l2').fit(X_train, y_train)

In [17]:
# predict on train and test set
y_train_predict = clf.predict(X_train)
y_test_predict = clf.predict(X_test)

In [18]:
# calculate train and test accuracy
train_accuracy = accuracy_score(y_train, y_train_predict)
test_accuracy = accuracy_score(y_test, y_test_predict)

In [19]:
# report results
print(f"Train accuracy: {(train_accuracy*100):.2f}%")
print(f"Test accuracy: {(test_accuracy*100):.2f}%")

Train accuracy: 83.54%
Test accuracy: 78.99%


In [20]:
import joblib
filename = 'Reg_model.sav'
joblib.dump(clf, filename)

['Reg_model.sav']

### Regex identifiers

In [2]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [25]:
def pipeline(text):
    top100words = ['job',
 'summary',
 'qualifications',
 'education',
 'training',
 'experience',
 'health',
 'background',
 'requirements',
 'covid',
 'call',
 'center',
 'representative',
 'pay',
 'rate',
 'hr',
 'remote',
 'eastern',
 'time',
 'zone',
 'responsibilities',
 'position',
 'objective',
 'special',
 'notes',
 'essential',
 'skills',
 'competencies',
 'physical',
 'company',
 'overview',
 'mission',
 'statement',
 'core',
 'duties',
 'required',
 'additional',
 'description',
 'detail',
 'calibration',
 'drifts',
 'precision',
 'calibrations',
 'method',
 'monitoring',
 'documentation',
 'reporting',
 'daily',
 'safety',
 'must',
 'able',
 'travel',
 'emsi',
 'united',
 'states',
 'postal',
 'service',
 'external',
 'publication',
 'posting',
 'branch',
 'period',
 'title',
 'facility',
 'location',
 'information',
 'persons',
 'eligible',
 'apply',
 'check',
 'functional',
 'purpose',
 'business',
 'analyst',
 'function',
 'include',
 'limited',
 'following',
 'minimum',
 'abilities',
 'disclaimer',
 'functions',
 'licensure',
 'address',
 'city',
 'state',
 'zip',
 'code',
 'domicile',
 'eeo',
 'learning',
 'management',
 'system',
 'specialist',
 'general',
 'key',
 'knowledge',
 'work',
 'environment',
 'demands']
    clean_text = []
    tokenizer = RegexpTokenizer(r'\w+')
    word = tokenizer.tokenize(text)
    for w in word:
        c_word = re.sub('\n', '', w)
        c_word = re.sub('%', '', c_word)
        c_word = re.sub('@', '', c_word)
        c_word = re.sub(r'[0-9]', '', c_word)
        if w not in stopwords:
            clean_text.append(w)
        else:
            pass
    df_rows1 = [[1 if word in words else 0 for word in common100words]]
    X = pd.DataFrame(df_rows1, columns = common100words)
    return (clean_text)
    #y_test_predict1 = clf.predict(X)
    #test_accuracy = accuracy_score(y_test, y_test_predict1)
    #print(f"Test accuracy: {(test_accuracy*100):.2f}%")

In [3]:
porter = PorterStemmer()
lancaster=LancasterStemmer()

In [28]:
stemed_words = []
for t in qualification.iloc[:,0]:
    words = tokenizer.tokenize(t)
    clean_text = pipeline(t)
    stemed_words.append([porter.stem(w) for w in clean_text])

[['job', 'summari'], ['qualif'], ['educ'], ['train', 'experi'], ['health', 'background', 'requir'], ['covid', '19', 'call', 'center', 'repres'], ['pay', 'rate', '13', 'hr'], ['remot', 'eastern', 'time', 'zone'], ['respons'], ['requir'], ['posit', 'object'], ['respons'], ['qualif'], ['special', 'note'], ['summari'], ['essenti', 'respons'], ['qualif'], ['skill', 'compet'], ['physic', 'requir'], ['compani', 'overview'], ['mission', 'statement'], ['job', 'summari'], ['core', 'job', 'duti'], ['requir', 'experi', 'educ', 'and', 'skill'], ['addit', 'requir'], ['respons'], ['descript', 'detail'], ['calibr', 'drift', 'precis', 'calibr', 'method', '21', 'monitor'], ['document', 'report'], ['daili', 'document'], ['physic', 'requir'], ['safeti'], ['must', 'Be', 'abl', 'travel'], ['about', 'emsi'], ['unit', 'state', 'postal', 'servic'], ['extern', 'public', 'job', 'post', '10503639'], ['branch'], ['job', 'post', 'period'], ['job', 'titl'], ['facil', 'locat'], ['posit', 'inform'], ['person', 'elig',

In [71]:
count = 0 
pos = []
keywords = ['experi','requir','qualif','skill', 'educ']
for i, word in enumerate(stemed_words):
    if len(set(word)&set(keywords)) != 0: 
        pos.append(i)
        count += 1
        

In [72]:
print(f'The regex identifiers classified as yes: {count} and absolute values set {(qualification.iloc[:,1].value_counts()["yes"])}')

The regex identifiers classified as yes: 72 and absolute values set 82


### Manual checking to ensure correct classification

In [97]:
for o in range(len(qualification.iloc[:,0])): 
    if qualification.iloc[:,1][o] == 'yes':
        print(f'{o} : {qualification.iloc[:,0][o]}')


1 : QUALIFICATIONS
2 : Education
3 : Training and Experience
4 : Health and Background Requirements
9 : Requirements: 
16 : QUALIFICATIONS

17 : SKILLS / COMPETENCIES

18 : PHYSICAL REQUIREMENTS

23 : REQUIRED EXPERIENCE, EDUCATION AND SKILLS
24 : ADDITIONAL REQUIREMENTS
25 : Responsibilities: 
30 : Physical Requirements 
41 : 
Persons Eligible to Apply
44 : 
Requirements
47 : Business Analyst Responsibilities: To include but not limited to the following
48 : Business Analyst Minimum Requirements: 
49 : Business Analyst Abilities Required: 
52 : Qualifications: 
54 : 
Minimum Education
55 : 
Minimum Experience
56 : 
Required Skills, Abilities and / or Licensure
66 : 
Minimum Education
67 : 
Minimum Experience
68 : 
Required Skills, Abilities and / or Licensure
80 : Knowledge/Skills Required: 
81 : Qualifications (Education Requirements/Experience): 
87 : Qualifications
88 : 
Qualifications
89 : Minimum Job Requirements:
90 : 

91 : Preferred Qualifications:
103 : 
Essential Functions
1

In [84]:
for p in pos: 
    print(f'{p}: {qualification.iloc[:,0][p]}')

1: QUALIFICATIONS
3: Training and Experience
4: Health and Background Requirements
9: Requirements: 
12: Qualifications
16: QUALIFICATIONS

17: SKILLS / COMPETENCIES

18: PHYSICAL REQUIREMENTS

23: REQUIRED EXPERIENCE, EDUCATION AND SKILLS
24: ADDITIONAL REQUIREMENTS
30: Physical Requirements 
44: 
Requirements
48: Business Analyst Minimum Requirements: 
49: Business Analyst Abilities Required: 
52: Qualifications: 
55: 
Minimum Experience
56: 
Required Skills, Abilities and / or Licensure
67: 
Minimum Experience
68: 
Required Skills, Abilities and / or Licensure
80: Knowledge/Skills Required: 
81: Qualifications (Education Requirements/Experience): 
87: Qualifications
88: 
Qualifications
89: Minimum Job Requirements:
91: Preferred Qualifications:
105: 
Minimum Experience
106: 
Required Skills, Abilities and / or Licensure
117: 
Requirements
145: Skills/Knowledge Considered a Plus:
146: 
Minimum Qualifications:
153: PROFESSIONAL REQUIREMENTS
160: 
Qualifications
161: 
Required (Basic) 

### Model as a seperate entitiy 

In [21]:
qualification.iloc[:,0]

0                             JOB SUMMARY
1                          QUALIFICATIONS
2                               Education
3                 Training and Experience
4      Health and Background Requirements
                      ...                
455                        Nice to Haves:
456                        What We Offer:
457                Leadership Experience 
458                     Technical Skills 
459            Non-Technical Requirements
Name: qualifications, Length: 460, dtype: object

In [22]:
test_string = ['Veterinary Technician']

In [110]:
pipeline(test_string[0])

Unnamed: 0,job,summary,qualifications,education,training,experience,health,background,requirements,covid,...,learning,management,system,specialist,general,key,knowledge,work,environment,demands
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Trying out Spacy

In [23]:
import spacy
from spacy.matcher import Matcher
import en_core_web_sm
nlp = en_core_web_sm.load()

matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)

In [24]:
nlp

<spacy.lang.en.English at 0x13ae79370>

In [25]:
doc = nlp("jhjk")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

In [1]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")
doc = nlp("The a (USA) are commonly known as the United States (U.S. or US) or America.")

expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
for match in re.finditer(expression, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)

Found match: United States
Found match: U.S.
Found match: US
