Importing NLP model packages and general packages

In [1]:
import os
import pandas as pd
import numpy as np
from math import isclose

import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer

from sklearn.preprocessing import normalize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GEOFF\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GEOFF\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Setup your directory to that of the folder containing the project

In [2]:
dir = ''
os.chdir(dir)

Importing the raw data

In [3]:
## Import the raw data into a dataframe
data_path = "data/raw/"
dataframe = pd.read_csv(data_path + "potential-talents - Aspiring human resources - seeking human resources.csv")
print('\nData imported')
dataframe.drop_duplicates(inplace = True, subset = ['job_title', 'location', 'connection'])
dataframe.reset_index(drop = True, inplace = True)
## Remove a non informative feature
dataframe.drop(inplace = True, labels=['id'], axis = 1)
print('''Column 'id' and duplicate rows were removed''')
##

## Display basic information about the dataframe
types = [type(c) for c in dataframe.columns]
print('\nThe dataframe columns and their types are:\n', dict(zip(dataframe.columns, types)))
print(f"\nThe dataframe shape is {dataframe.shape}")
##


Data imported
Column 'id' and duplicate rows were removed

The dataframe columns and their types are:
 {'job_title': <class 'str'>, 'location': <class 'str'>, 'connection': <class 'str'>, 'fit': <class 'str'>}

The dataframe shape is (53, 4)


Pre-processing of the job_title and location strings

In [4]:
# All the job titles, locations and number of connections will be pre-processed before encoding
jobs = list(dataframe['job_title'])
locations = list(dataframe['location'])
connections = list(dataframe['connection'])
processed_jobs = []
processed_locations = []
processed_connections = np.zeros(len(connections))

def process_string(txt):
    '''
    Process the words in an input string:
        - Tokenizes the words in the string
        - Removes stopwords, numbers and punctuation
        - Performs stemming
    
    Arguments:
        txt: String, for example "human resources"
    
    Returns:
        processed: String, for example "human resourc"
    '''
    words = [] # Will contain the words in the txt string
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # Next, tokenize the words
    tokens = word_tokenize(txt)
    tokens = [i.strip().lower() for i in tokens]
    #

    # Remove stop words from the list and stems
    for word in tokens:
        if word == 'hr':
            words.append('human'); words.append('resourc')
        elif (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation and
                    len(word) > 1 and word.isalpha()):  # remove punctuation
                stem_word = stemmer.stem(word)  # stemming word
                words.append(stem_word)
    return ' '.join(words)

## Generate the processed inputs
for job in jobs:
    processed_jobs.append(process_string(job))
for loc in locations:
    processed_locations.append(process_string(loc))
for i, con in enumerate(connections):
    if con.strip() == '500+':
        processed_connections[i] = 1
    elif con.strip() == '0':
        processed_connections[i] = 0
    else:
        processed_connections[i] = np.log(int(con))/np.log(500)

processed = dict( [ (i, (processed_jobs[i], processed_locations[i], processed_connections[i], 0) ) for i in range(len(connections)) ] )
processed_data = pd.DataFrame.from_dict(processed, orient = 'index', columns = ['job_title', 'location', 'log_connections', 'fit'])
processed_example = processed_data['job_title'][0]
print('The first job title:\n', dataframe['job_title'][0])
print('The first job title is pre-processed as:\n', processed_example)
## Now all the data has been pre-processed

The first job title:
 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional
The first job title is pre-processed as:
 bauer colleg busi graduat magna cum laud aspir human resourc profession


Encoding of the job_title and location string

In [5]:
## Defining the embedding model
model_name = 'sentence-transformers/all-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2' # smaller model
model = SentenceTransformer(model_name)

def job_embedding(job):
    '''
    Returns a previously saved value of the job embedding, or encodes it
    
    Arguments:
        job: String, processed job information
    
    Returns:
        encoding: Array, the encoding of the job
    '''
    ind = dict(enumerate(np.where(processed_data['job_title'] == job)[0])).get(0)
    if encoded and ind != None:
        return encoded_data['job_title'].loc[ind]
    ind = dict(enumerate(np.where(processed_data['location'] == job)[0])).get(0)
    if encoded and ind != None:
        return encoded_data['location'].loc[ind]
    encoding = model.encode(job, normalize_embeddings = True)
    return encoding

def encode_jobs():
    '''
    Encodes all preprocessed jobs and locations.
    
    Returns:
        encoded_data: Pandas dataframe, candidates encoded data
    '''
    encoded_jobs = np.array([job_embedding(job) for job in processed_jobs])
    encoded_locations = np.array([job_embedding(loc) for loc in processed_locations])
    data = np.concatenate((encoded_jobs, encoded_locations))
    enc_samples = dict( [ (i, (data[i], data[i+len(processed_connections)], processed_connections[i], 0)) for i in range(len(connections)) ] )
    encoded_data = pd.DataFrame.from_dict(enc_samples, orient = 'index', columns = ['job_title', 'location', 'log_connections', 'fit'])
    return encoded_data

## Performs all the encoding
encoded = False
encoded_data = encode_jobs()
encoded = True # Now all the data has been encoded
encoded_example = encoded_data['job_title'][0]
print('The first job title is encoded as:\n', encoded_example)
job_titles = dataframe['job_title']

The first job title is encoded as:
 [-7.44135454e-02  1.78237371e-02 -5.75934462e-02 -4.78980690e-03
 -1.00911833e-01  4.42423113e-03  2.74604410e-02  6.44031242e-02
 -1.57705564e-02  1.56021798e-02  3.69933695e-02 -2.18076371e-02
 -2.17828713e-02 -1.37398625e-02 -6.94503114e-02 -8.66924673e-02
 -5.50912283e-02  2.65113302e-02 -2.25930791e-02 -3.58368754e-02
  6.25475720e-02  1.88039858e-02 -4.58394401e-02 -6.04784004e-02
 -8.29530433e-02 -3.75333242e-02  2.68533397e-02 -1.24307737e-01
  6.33475631e-02 -3.79948653e-02  4.56899107e-02  4.86548506e-02
  5.60845509e-02  2.64487378e-02 -9.00991168e-03  5.37304878e-02
  5.03494106e-02  5.42962737e-03  1.09710850e-01  4.75321040e-02
 -6.70789108e-02 -1.35334119e-01 -6.91300184e-02 -4.61152419e-02
  3.28364894e-02  3.63205164e-03  2.32290570e-02 -4.58136685e-02
  1.44065637e-02  1.70141067e-02 -1.31228015e-01 -3.80313173e-02
  6.37280345e-02 -2.71364134e-02 -3.79742086e-02 -4.32213135e-02
  2.00519527e-04 -2.86149960e-02 -5.03062131e-03 -5.06

Defining the way similar candidates are ranked

In [6]:
def cosine_similarity(vec1, vec2):
    '''
    Returns the cosine similarity between unit vectors
    
    Arguments:
        vec1: Unit array
        vec2: Unit array
    
    Returns:
        The numpy dot product
    '''
    norm1 = np.linalg.norm(vec1); norm2 = np.linalg.norm(vec2)
    assert np.isclose(norm1,1); assert np.isclose(norm2,1)
    return np.dot(vec1, vec2)

def similar_jobs(job, location = None, connec = None, update = False):
    '''
    Returns a ranking of the candidates best matching a specific job
    
    Arguments: 
        job: String or embedding representing the job to match candidates to.
             If a string is provided, it will be encoded.
             
        location: String, a specified city or country
        
        connec: Integer, number of connections on Linkedin (Max 500)
        
        update: Boolean, if True dataframes will be sorted according to ranking
    
    Returns:
        ranking: List, a ranking of the 10 best matching candidates
    '''
    cosine_list = [] # Similarity ranking
    if type(job) == str:
        emb = job_embedding(job)
    else:
        emb = job # already encoded
    if location != None:
        emb += job_embedding(location) # account for location
    if connec != None:
        emb = np.concatenate((emb, [connec])) # account for connections
    emb = np.array(normalize([emb])[0])
    # Embedding of input job complete.
    ## Next, go through all possible candidates
    for i, tup in enumerate(zip(processed_jobs, processed_locations, processed_connections)):
        job_, loc_, con_= tup
        vec = job_embedding(job_) + (location!=None)*job_embedding(loc_)
        if connec != None:
            vec = np.concatenate((vec, [con_]))
        vec = np.array(normalize([vec])[0])
        cosine_list.append(cosine_similarity(emb, vec))
    if update:
        update_dataframe(cosine_list)
    ranking = [[dataframe.index[i]] + dataframe.values[:10].tolist()[i] for i in range(10)]
    return ranking

def update_dataframe(cosine_list):
    '''
    Updates candidates dataframe based on provided ranking information.
    
    Arguments:
        cosine_list: List, contains the job cosine similarity of each candidate
    
    Updates:
        The dataframes fit values are updated and sorted in descending order.
    '''
    for i, cos in enumerate(cosine_list):
        dataframe.loc[i, ['fit']] = cos
        processed_data.loc[i, ['fit']] = cos
        encoded_data.loc[i, ['fit']] = cos
    dataframe.sort_values(by=['fit'], ascending = False, inplace = True)
    processed_data.sort_values(by=['fit'], ascending = False, inplace = True)
    encoded_data.sort_values(by=['fit'], ascending = False, inplace = True)
    return None

def star_rank(keyword, star = [], location = None, connec = None, weights = [0.4, 0.5, 0.1]):
    '''
    Provides a candidate ranking based on job characteristics and manual 
    supervisory signal (history of starred candidates).
    
    Arguments:
        keyword: String, describes the job to match candidates to.
        star: List, indices of candidates starred in chronological order
        location: String, preferred location
        connec: Integer, number of Linkedin connections (Max 500)
        weights: List [a, b, c], importance accorded to (a+b+c=1): 
                    a: keyword provided
                        and manual supervisory signals:
                    b: last starred candidate
                    c: previously starred candidates
    
    Returns:
        ranking: List, ranking of candidates
    '''
    key_emb = job_embedding(process_string(keyword))
    key_w, star_w, prestar_w = weights # Importance accredited to keyword vs starred vs previously starred
    if len(star) > 0:
        star_emb = job_embedding(star.pop())
        if len(star) > 0:
            prestar_emb = np.average([job_embedding(s) for s in star], axis = 0)
            prestar_emb = np.squeeze(normalize(prestar_emb.reshape(1,-1), axis = 1))
            vec = key_w*key_emb + star_w*star_emb + prestar_w*prestar_emb
        else:
            vec = key_w*key_emb + star_w*star_emb
    else:
        vec = key_emb
    # Now the vector vec is an amalgation of the embeddings of the keyword + starred candidates
    vec = np.squeeze(normalize(vec.reshape(1,-1), axis = 1))
    ranking = similar_jobs(vec, location = location, connec = connec, update = True)
    return ranking

Below are the executive functions

In [7]:
star_ind = [] # List of indices of starred candidates

def reset_starring():
    'Resets / Erases all manual supervisory signals previously provided.'
    star_ind = []
    return None

def star(ind):
    '''
    Stars a candidate
    
    Arguments:
        ind: Integer, candidate index in the dataframe
    
    Performs:
        Candidate starring.
    '''
    star_ind.append(ind)
    job, loc = dataframe['job_title'][ind], dataframe['location'][ind]
    print('The candidate %s was starred with index %s'%((job, loc), ind))
    return None

def rank(keyword, location = None):
    '''
    Simplified use of candidate ranking operation.
    
    Arguments:
        keyword: String, describes the job to match candidates to.
        location: String, preferred location
                  If not provided, the location of the last candidate 
                  to be starred will be used.
    
    Returns:
        ranking: List, ranking of candidates
    '''
    print('\nThe keyword provided is: '+keyword)
    processed_starred = [processed_jobs[ind] for ind in star_ind]
    location, connec = None, None
    if len(star_ind) > 0:
        if location == None:
            location = processed_locations[star_ind[-1]]
        connec = processed_connections[star_ind[-1]]
    ranking = star_rank(keyword, star = processed_starred.copy(), location = location, connec = connec)
    print('\nRanking with fit probability:\n')
    for ranked in ranking:
        print(*zip(['index']+list(dataframe.columns), ranked))
    print('\nConsult the dataframe variable for the complete list\n')
    return ranking

In [8]:
reset_starring() ## This command clears any previous starring action

Below, the keyword can be specified

In [9]:
## Now providing the keywords
keywords = ['Aspiring human resources',  'seeking human resources']
keyword = keywords[1]

Below, a ranking based purely on the above-specified keyword is made

In [10]:
ranking_nostar = rank(keyword)


The keyword provided is: seeking human resources

Ranking with fit probability:

('index', 13) ('job_title', 'Seeking Human Resources Opportunities') ('location', 'Chicago, Illinois') ('connection', '390') ('fit', 0.9503727114909426)
('index', 47) ('job_title', 'Seeking Human Resources Position') ('location', 'Las Vegas, Nevada Area') ('connection', '48') ('fit', 0.9475734174476902)
('index', 48) ('job_title', 'Aspiring Human Resources Manager | Graduating May 2020 | Seeking an Entry-Level Human Resources Position in St. Louis') ('location', 'Cape Girardeau, Missouri') ('connection', '103') ('fit', 0.7977208679184986)
('index', 8) ('job_title', 'Seeking Human Resources HRIS and Generalist Positions') ('location', 'Greater Philadelphia Area') ('connection', '500+ ') ('fit', 0.7272268794421783)
('index', 22) ('job_title', 'Human Resources Professional') ('location', 'Greater Boston Area') ('connection', '16') ('fit', 0.6782880212393358)
('index', 42) ('job_title', 'Seeking Human  Resour

In [11]:
# Star a candidate
star(26)

The candidate ("Human Resources Generalist at Schwan's", 'Amerika Birleşik Devletleri') was starred with index 26


In [12]:
# Rank the candidates based on the keyword + starred candidate
ranking = rank(keyword)


The keyword provided is: seeking human resources

Ranking with fit probability:

('index', 26) ('job_title', "Human Resources Generalist at Schwan's") ('location', 'Amerika Birleşik Devletleri') ('connection', '500+ ') ('fit', 0.966179414774744)
('index', 1) ('job_title', 'Native English Teacher at EPIK (English Program in Korea)') ('location', 'Kanada') ('connection', '500+ ') ('fit', 0.931895269068265)
('index', 42) ('job_title', 'Seeking Human  Resources Opportunities. Open to travel and relocation.') ('location', 'Amerika Birleşik Devletleri') ('connection', '415') ('fit', 0.8527217011678001)
('index', 6) ('job_title', 'Student at Humber College and Aspiring Human Resources Generalist') ('location', 'Kanada') ('connection', '61') ('fit', 0.8187394902202373)
('index', 30) ('job_title', 'Aspiring Human Resources Professional | An energetic and Team-Focused Leader') ('location', 'Austin, Texas Area') ('connection', '174') ('fit', 0.7709154404922836)
('index', 3) ('job_title', 'People

In [13]:
# Star another candidate
star(21)

The candidate ('Aspiring Human Resources Manager, seeking internship in Human Resources.', 'Houston, Texas Area') was starred with index 21


In [14]:
# Re-rank based on the keyword + starred candidate (and learning from previously starred candidate(s))
ranking = rank(keyword)


The keyword provided is: seeking human resources

Ranking with fit probability:

('index', 21) ('job_title', 'Aspiring Human Resources Manager, seeking internship in Human Resources.') ('location', 'Houston, Texas Area') ('connection', '7') ('fit', 0.9562294330572801)
('index', 12) ('job_title', 'Aspiring Human Resources Management student seeking an internship') ('location', 'Houston, Texas Area') ('connection', '500+ ') ('fit', 0.8378351165914746)
('index', 10) ('job_title', 'SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR') ('location', 'Houston, Texas Area') ('connection', '500+ ') ('fit', 0.7562853616795456)
('index', 46) ('job_title', 'Student') ('location', 'Houston, Texas Area') ('connection', '4') ('fit', 0.7276595637416854)
('index', 1) ('job_title', 'Native English Teacher at EPIK (English Program in Korea)') ('location', 'Kanada') ('connection', '500+ ') ('fit', 0.6812084567828091)
('index', 6) ('job_title', 'Stud