Importing NLP model packages and general packages

In [1]:
import os
import pandas as pd
import numpy as np
from math import isclose

import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer

from sklearn.preprocessing import normalize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GEOFF\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GEOFF\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Setup your directory to that of the folder containing the project

In [2]:
dir = 'C:/Users/GEOFF/OneDrive/Documents/Apziva/Potential_Talents'
os.chdir(dir)

Importing the raw data

In [3]:
## Import the raw data into a dataframe
data_path = "data/raw/"
dataframe = pd.read_csv(data_path + "potential-talents - Aspiring human resources - seeking human resources.csv")
print('\nData imported')
dataframe.drop_duplicates(inplace = True, subset = ['job_title', 'location', 'connection'])
dataframe.reset_index(drop = True, inplace = True)
## Remove a non informative feature
dataframe.drop(inplace = True, labels=['id'], axis = 1)
print('''Column 'id' and duplicate rows were removed''')
##

## Display basic information about the dataframe
types = [type(c) for c in dataframe.columns]
print('\nThe dataframe columns and their types are:\n', dict(zip(dataframe.columns, types)))
print(f"\nThe dataframe shape is {dataframe.shape}")
##


Data imported
Column 'id' and duplicate rows were removed

The dataframe columns and their types are:
 {'job_title': <class 'str'>, 'location': <class 'str'>, 'connection': <class 'str'>, 'fit': <class 'str'>}

The dataframe shape is (53, 4)


Pre-processing of the job_title and location strings

In [4]:
# All the job titles, locations and number of connections will be pre-processed before encoding
jobs = list(dataframe['job_title'])
locations = list(dataframe['location'])
connections = list(dataframe['connection'])
processed_jobs = []
processed_locations = []
processed_connections = np.zeros(len(connections))

def process_string(txt):
    # Returns a list of processed words extracted from txt string
    words = [] # Will contain the words in the txt string
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # Next, separate the words
    seps = [' '] + list(string.punctuation)
    default_sep = seps[0]
    for sep in seps[1:]:
        txt = txt.replace(sep, default_sep)
    word_list = [i.strip().lower() for i in txt.split(default_sep)]
    #
    
    def not_a_number(word):
        # Returns if the word is or contains a number
        for i in word:
            if i in string.digits:
                return False
        return True

    # Remove stop words from the list and stems
    for word in word_list:
        if word == 'hr':
            words.append('human'); words.append('resourc')
        elif (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation and
                    len(word) > 1 and not_a_number(word)):  # remove punctuation
                stem_word = stemmer.stem(word)  # stemming word
                words.append(stem_word)
    return ' '.join(words)

for job in jobs:
    processed_jobs.append(process_string(job))
for loc in locations:
    processed_locations.append(process_string(loc))
for i, con in enumerate(connections):
    if con.strip() == '500+':
        processed_connections[i] = 1
    elif con.strip() == '0':
        processed_connections[i] = 0
    else:
        processed_connections[i] = np.log(int(con))/np.log(500)

processed = dict( [ (i, (processed_jobs[i], processed_locations[i], processed_connections[i], 0) ) for i in range(len(connections)) ] )
processed_data = pd.DataFrame.from_dict(processed, orient = 'index', columns = ['job_title', 'location', 'log_connections', 'fit'])
processed_example = processed_data['job_title'][0]
print('The first job title:\n', dataframe['job_title'][0])
print('The first job title is pre-processed as:\n', processed_example)
## Now all the data has been pre-processed

The first job title:
 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional
The first job title is pre-processed as:
 bauer colleg busi graduat magna cum laud aspir human resourc profession


Encoding of the job_title and location string

In [5]:
## Defining the embedding model
model_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(model_name)

def job_embedding(job):
    # Returns a saved value of the job embedding, or encodes it
    ind = dict(enumerate(np.where(processed_data['job_title'] == job)[0])).get(0)
    if encoded and ind != None:
        return encoded_data['job_title'].loc[ind]
    ind = dict(enumerate(np.where(processed_data['location'] == job)[0])).get(0)
    if encoded and ind != None:
        return encoded_data['location'].loc[ind]
    encoding = model.encode(job, normalize_embeddings = True)
    return encoding

def encode_jobs():
    # Encodes the preprocessed jobs and locations
    encoded_jobs = np.array([job_embedding(job) for job in processed_jobs])
    encoded_locations = np.array([job_embedding(loc) for loc in processed_locations])
    data = np.concatenate((encoded_jobs, encoded_locations))
    enc_samples = dict( [ (i, (data[i], data[i+len(processed_connections)], processed_connections[i], 0)) for i in range(len(connections)) ] )
    encoded_data = pd.DataFrame.from_dict(enc_samples, orient = 'index', columns = ['job_title', 'location', 'log_connections', 'fit'])
    return encoded_data

encoded = False
encoded_data = encode_jobs()
encoded = True # Now all the data has been encoded
encoded_example = encoded_data['job_title'][0]
print('The first job title is encoded as:\n', encoded_example)
job_titles = dataframe['job_title']

The first job title is encoded as:
 [ 2.29601394e-02  5.29033272e-03  1.23129478e-02 -1.52566116e-02
  2.62993630e-02  2.87623107e-02  6.21659271e-02  1.25092585e-02
  2.13480592e-02 -1.14011122e-02  5.59398942e-02 -3.47674564e-02
  2.57837474e-02  2.77773067e-02  4.16573100e-02 -2.98467092e-03
 -3.81730590e-03  1.44228656e-02 -3.12444046e-02 -2.88489778e-02
 -6.44241050e-02  1.40412776e-02 -3.37941870e-02  4.46511284e-02
  6.03639036e-02 -3.46612409e-02  4.96585965e-02  9.08143818e-03
  1.52817573e-02  3.07963211e-02  6.50279447e-02  1.14242788e-02
 -3.43585461e-02 -1.24222990e-02  1.72793864e-06 -1.12664634e-02
 -2.18778625e-02 -1.94274206e-02 -4.32560779e-02 -4.73480187e-02
  3.41828205e-02  7.90508091e-02 -2.95838807e-02 -4.17338498e-03
 -4.60210554e-02  8.49153548e-02  2.01138984e-02  4.81949002e-02
 -6.39029592e-02  3.75806564e-03  7.48163136e-03 -6.67717531e-02
 -1.49225164e-02 -5.89881837e-02 -3.35062109e-02 -6.51961863e-02
  2.76170182e-03 -4.90266308e-02  8.01380351e-02  8.47

Defining the way similar candidates are ranked

In [6]:
def cosine_similarity(vec1, vec2):
    # Returns the cosine similarity between vectors with norm == 1
    norm1 = np.linalg.norm(vec1); norm2 = np.linalg.norm(vec2)
    assert np.isclose(norm1,1); assert np.isclose(norm2,1)
    return np.dot(vec1, vec2)

def similar_jobs(job, location = None, connec = None, update = False):
    # Returns a list of jobs most similar to 'job' based on their ranking
    # If location is provided, it will be accounted for
    # If connec is provided, it will be accounted for
    # If update is set to True, dataframes will be sorted according to fitness
    cosine_list = [] # Similarity ranking
    if type(job) == str:
        emb = job_embedding(job)
    else:
        emb = job # already encoded
    if location != None:
        emb += job_embedding(location) # account for location
    if connec != None:
        emb = np.concatenate((emb, [connec])) # account for connections
    emb = np.array(normalize([emb])[0])
    # Embedding of input job complete.
    ## Next, go through all possible candidates
    for i, tup in enumerate(zip(processed_jobs, processed_locations, processed_connections)):
        job_, loc_, con_= tup
        vec = job_embedding(job_) + (location!=None)*job_embedding(loc_)
        if connec != None:
            vec = np.concatenate((vec, [con_]))
        vec = np.array(normalize([vec])[0])
        cosine_list.append(cosine_similarity(emb, vec))
    if update:
        update_dataframe(cosine_list)
    ranking = [[dataframe.index[i]] + dataframe.values[:10].tolist()[i] for i in range(10)]
    return ranking

def update_dataframe(cosine_list):
    for i, cos in enumerate(cosine_list):
        dataframe.loc[i, ['fit']] = cos
        processed_data.loc[i, ['fit']] = cos
        encoded_data.loc[i, ['fit']] = cos
    dataframe.sort_values(by=['fit'], ascending = False, inplace = True)
    processed_data.sort_values(by=['fit'], ascending = False, inplace = True)
    encoded_data.sort_values(by=['fit'], ascending = False, inplace = True)
    return None

def star_rank(keyword, star = [], location = None, connec = None, weights = [0.4, 0.5, 0.1]):
    # Returns a ranking based on keyword provided and starred candidates
    # star is the list of job_title starred (string, exact match) (in chronological order)
    key_emb = job_embedding(process_string(keyword))
    key_w, star_w, prestar_w = weights # Importance accredited to keyword vs starred vs previously starred
    if len(star) > 0:
        star_emb = job_embedding(star.pop())
        if len(star) > 0:
            prestar_emb = np.average([job_embedding(s) for s in star], axis = 0)
            prestar_emb = np.squeeze(normalize(prestar_emb.reshape(1,-1), axis = 1))
            vec = key_w*key_emb + star_w*star_emb + prestar_w*prestar_emb
        else:
            vec = key_w*key_emb + star_w*star_emb
    else:
        vec = key_emb
    # Now the vector vec is an amalgation of the embeddings of the keyword + starred candidates
    vec = np.squeeze(normalize(vec.reshape(1,-1), axis = 1))
    ranking = similar_jobs(vec, location = location, connec = connec, update = True)
    return ranking

Below are the executive functions

In [7]:
star_ind = [] # List of indices of starred candidates

def reset_starring():
    # Removes all of the stars
    star_ind = []
    return None

def star(ind):
    # Stars the candidate by providing its index ind in the dataframe
    star_ind.append(ind)
    job, loc = dataframe['job_title'][ind], dataframe['location'][ind]
    print('The candidate %s was starred with index %s'%((job, loc), ind))
    return None

def rank(keyword, location = None):
    print('\nThe keyword provided is: '+keyword)
    processed_starred = [processed_jobs[ind] for ind in star_ind]
    location, connec = None, None
    if len(star_ind) > 0:
        if location == None:
            location = processed_locations[star_ind[-1]]
        connec = processed_connections[star_ind[-1]]
    ranking = star_rank(keyword, star = processed_starred.copy(), location = location, connec = connec)
    print('\nRanking with fit probability:\n')
    for ranked in ranking:
        print(*zip(['index']+list(dataframe.columns), ranked))
    print('\nConsult the dataframe variable for the complete list\n')
    return ranking

In [8]:
reset_starring() ## This command clears any previous starring action

Below, the keyword can be specified

In [9]:
## Now providing the keywords
keywords = ['Aspiring human resources',  'seeking human resources']
keyword = keywords[1]

Below, a ranking based purely on the above-specified keyword is made

In [10]:
ranking_nostar = rank(keyword)


The keyword provided is: seeking human resources

Ranking with fit probability:

('index', 13) ('job_title', 'Seeking Human Resources Opportunities') ('location', 'Chicago, Illinois') ('connection', '390') ('fit', 0.8730392042023424)
('index', 47) ('job_title', 'Seeking Human Resources Position') ('location', 'Las Vegas, Nevada Area') ('connection', '48') ('fit', 0.8367275564055341)
('index', 49) ('job_title', 'Human Resources Generalist at Loparex') ('location', 'Raleigh-Durham, North Carolina Area') ('connection', '500+ ') ('fit', 0.6269668681912686)
('index', 36) ('job_title', 'Human Resources Management Major') ('location', 'Milpitas, California') ('connection', '18') ('fit', 0.6180900576092362)
('index', 48) ('job_title', 'Aspiring Human Resources Manager | Graduating May 2020 | Seeking an Entry-Level Human Resources Position in St. Louis') ('location', 'Cape Girardeau, Missouri') ('connection', '103') ('fit', 0.612324722633486)
('index', 22) ('job_title', 'Human Resources Profes

In [11]:
# Star a candidate
star(26)

The candidate ("Human Resources Generalist at Schwan's", 'Amerika Birleşik Devletleri') was starred with index 26


In [12]:
# Rank the candidates based on the keyword + starred candidate
ranking = rank(keyword)


The keyword provided is: seeking human resources

Ranking with fit probability:

('index', 26) ('job_title', "Human Resources Generalist at Schwan's") ('location', 'Amerika Birleşik Devletleri') ('connection', '500+ ') ('fit', 0.9729696242938572)
('index', 42) ('job_title', 'Seeking Human  Resources Opportunities. Open to travel and relocation.') ('location', 'Amerika Birleşik Devletleri') ('connection', '415') ('fit', 0.9338475852330026)
('index', 7) ('job_title', 'HR Senior Specialist') ('location', 'San Francisco Bay Area') ('connection', '500+ ') ('fit', 0.7705070568104371)
('index', 4) ('job_title', 'Advisory Board Member at Celal Bayar University') ('location', 'İzmir, Türkiye') ('connection', '500+ ') ('fit', 0.7705057298061428)
('index', 6) ('job_title', 'Student at Humber College and Aspiring Human Resources Generalist') ('location', 'Kanada') ('connection', '61') ('fit', 0.7647589408817448)
('index', 49) ('job_title', 'Human Resources Generalist at Loparex') ('location', 'Ra

In [13]:
# Star another candidate
star(21)

The candidate ('Aspiring Human Resources Manager, seeking internship in Human Resources.', 'Houston, Texas Area') was starred with index 21


In [14]:
# Re-rank based on the keyword + starred candidate (and learning from previously starred candidate(s))
ranking = rank(keyword)


The keyword provided is: seeking human resources

Ranking with fit probability:

('index', 21) ('job_title', 'Aspiring Human Resources Manager, seeking internship in Human Resources.') ('location', 'Houston, Texas Area') ('connection', '7') ('fit', 0.9352311482234041)
('index', 12) ('job_title', 'Aspiring Human Resources Management student seeking an internship') ('location', 'Houston, Texas Area') ('connection', '500+ ') ('fit', 0.7680618076906078)
('index', 10) ('job_title', 'SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR') ('location', 'Houston, Texas Area') ('connection', '500+ ') ('fit', 0.7619625080527194)
('index', 23) ('job_title', 'Nortia Staffing is seeking Human Resources, Payroll & Administrative Professionals!!  (408) 709-2621') ('location', 'San Jose, California') ('connection', '500+ ') ('fit', 0.7286836864880125)
('index', 8) ('job_title', 'Seeking Human Resources HRIS and Generalist Positions') ('location', 