# TALENT SOURCING
   
In this notebook, a talent sourcing company first screens the available candidates against a set of job-title keywords and ranks them by how closely their job titles match. After reviewing the ranked list, a preferred candidate is selected and the list is re-ranked using both the job title and the location of that preferred candidate.

Data Attributes:   
id : unique identifier for candidate (numeric)   
job_title : job title for candidate (text)   
location : geographical location for candidate (text)   
connection : number of connections the candidate has; "500+" means over 500 (text)  
Output (desired target):   
fit : how well the candidate fits the role (numeric, probability between 0 and 1; in this notebook it is filled with a cosine-similarity score)  
Keywords: “Aspiring human resources” or “seeking human resources”   


In [1]:
import gensim
import pandas as pd

In [2]:
# Read the raw candidate list from a path relative to the working directory.
# In the raw file the `fit` column is empty; it is filled in further down.
csv_path = 'potential-talents - Aspiring human resources - seeking human resources.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"Ä°zmir, TÃ¼rkiye",500+,


In [22]:
# Repair two location values seen in the preview above:
#   - 'Ä°zmir, TÃ¼rkiye' appears to be 'İzmir, Türkiye' with its non-ASCII characters
#     garbled (an encoding problem rather than a typo);
#   - 'Kanada' is normalised to the English spelling 'Canada'.
# The nested mapping restricts the replacement to the `location` column, and the result
# is reassigned instead of using inplace=True so the cell does not mutate a frame that
# an earlier cell has already displayed.
df = df.replace({'location': {'Ä°zmir, TÃ¼rkiye': 'İzmir, Türkiye', 'Kanada': 'Canada'}})

In [23]:
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.497849
1,2,Native English Teacher at EPIK (English Progra...,Canada,500+,0.372279
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.927214
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.705831
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.463202


In [24]:
df.shape

(104, 5)

In [25]:
df.job_title[0]

'2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional'

In [26]:
from sentence_transformers import SentenceTransformer

In [27]:
# Load the 'bert-base-nli-mean-tokens' sentence-embedding model once. This single
# `model` object is reused by every encode call below, including inside rerank().
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [28]:
import numpy as np

In [29]:
# Collect the job titles into a NumPy array (one entry per candidate row) so they
# can be passed to the encoder as a single batch.
job_titl = np.array(df['job_title'])
job_titl

array(['2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional',
       'Native English Teacher at EPIK (English Program in Korea)',
       'Aspiring Human Resources Professional',
       'People Development Coordinator at Ryan',
       'Advisory Board Member at Celal Bayar University',
       'Aspiring Human Resources Specialist',
       'Student at Humber College and Aspiring Human Resources Generalist',
       'HR Senior Specialist',
       'Student at Humber College and Aspiring Human Resources Generalist',
       'Seeking Human Resources HRIS and Generalist Positions',
       'Student at Chapman University',
       'SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR',
       'Human Resources Coordinator at InterContinental Buckhead Atlanta',
       '2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional',
       '2019 C.T. Bau

In [30]:
# Embed every job title in one batch. The result has one row per title
# (shape (104, 768) for this data set, as the next cell shows).
sentence_embeddings = model.encode(job_titl)

In [31]:
sentence_embeddings.shape

(104, 768)

In [32]:
# Embed the search key phrase with the same model used for the job titles so the
# two can be compared directly. Only "Aspiring human resources" is scored here;
# the other listed keyword, "seeking human resources", is not used in this cell.
key_wd = model.encode('Aspiring human resources')
key_wd

array([ 4.86833394e-01, -5.93161099e-02,  2.59112954e+00,  7.33451188e-01,
        4.49116886e-01,  5.04019976e-01,  3.63583341e-02, -4.39902842e-01,
        7.15804875e-01, -7.55716443e-01, -4.22236502e-01,  7.80875564e-01,
        8.47344697e-01,  3.68243396e-01, -2.28280857e-01, -2.29795933e-01,
        1.86412290e-01, -4.40084875e-01,  7.81652391e-01, -1.43286061e+00,
       -5.99061847e-01,  4.20602337e-02,  8.05639744e-01, -7.93278813e-01,
        4.04325575e-01, -8.06080341e-01, -7.83224821e-01, -1.12466478e+00,
        1.02175951e-01,  2.55557477e-01, -2.97143131e-01, -6.94274247e-01,
       -1.85472786e-01, -1.10916749e-01, -4.57902282e-01,  5.98485112e-01,
        7.21237063e-01,  3.14758360e-01, -7.26479888e-02, -1.26594320e-01,
        6.04229331e-01,  2.37531707e-01,  2.55630553e-01,  6.91818118e-01,
       -1.03937173e+00, -2.71681786e-01, -8.89906704e-01, -2.00816728e-02,
        2.66709864e-01, -1.21575499e+00, -7.33241677e-01,  6.40898719e-02,
       -3.49752247e-01,  

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
# Cosine similarity between the key phrase and every embedded job title.
# key_wd is wrapped in a list to make it a 2-D input, so the result is a 2-D array
# with a single row holding one score per job title.

cos_sim = cosine_similarity([key_wd], sentence_embeddings)
cos_sim

array([[0.49784946, 0.37227935, 0.92721355, 0.70583117, 0.46320152,
        0.9426096 , 0.7215086 , 0.6806127 , 0.7215086 , 0.76440686,
        0.4487623 , 0.29949862, 0.3971985 , 0.49784946, 0.49784946,
        0.37227935, 0.92721355, 0.70583117, 0.49784946, 0.37227935,
        0.92721355, 0.70583117, 0.46320152, 0.9426096 , 0.7215086 ,
        0.6806127 , 0.67100704, 0.83371305, 0.67100704, 0.83371305,
        0.49784946, 0.37227935, 0.92721355, 0.70583117, 0.46320152,
        0.9426096 , 0.7215086 , 0.6806127 , 0.7215086 , 0.76440686,
        0.4487623 , 0.29949862, 0.3971985 , 0.49784946, 0.37227935,
        0.92721355, 0.70583117, 0.46320152, 0.9426096 , 0.7215086 ,
        0.6806127 , 0.7215086 , 0.76440686, 0.4487623 , 0.29949862,
        0.3971985 , 0.49784946, 0.92721355, 0.70583117, 0.9426096 ,
        0.6806127 , 0.76440686, 0.44876224, 0.29949862, 0.3971985 ,
        0.7007382 , 0.75128293, 0.6953852 , 0.4491577 , 0.42338023,
        0.5415801 , 0.6560505 , 0.7402716 , 0.78

In [35]:
# Store the scores with bracket assignment. `df.fit = ...` only writes a column because
# `fit` already exists in the CSV; for a missing column pandas would set a plain Python
# attribute instead of creating the column.
df['fit'] = np.squeeze(cos_sim)  # np.squeeze reduces the (1, n) array to 1-D

In [36]:
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.497849
1,2,Native English Teacher at EPIK (English Progra...,Canada,500+,0.372279
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.927214
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.705831
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.463202


In [115]:
# First-round ranking: best keyword match first, with a fresh 0..n-1 index.
# The score column is renamed to `job_fit` because it reflects the job title only.
candidates = (
    df.sort_values('fit', ascending=False, ignore_index=True)
      .rename(columns={'fit': 'job_fit'})
)

candidates

Unnamed: 0,id,job_title,location,connection,job_fit
0,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
1,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
2,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
3,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
4,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
...,...,...,...,...,...
99,12,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,0.299499
100,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.259422
101,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.252835
102,93,Admissions Representative at Community medical...,"Long Beach, California",9,0.164934


In [116]:
candidates.shape

(104, 5)

## Create a function to use the job title AND location information of a preferred candidate to re-rank the list

In [113]:
# the following function takes the id number of the preferred candidate from the first round and uses that candidate's
# location to rerank the candidates based on the job_title AND location

def rerank(id_num, input_df):
    '''
    Re-rank candidates by similarity to a preferred candidate's job title AND location.

    input: id_num - id number of the preferred candidate from the first round of screening
    input: input_df - the df where the data from the first screening round is stored; only its
           first four columns are kept, and they must include `id`, `job_title` and `location`
           (any score columns from an earlier round are dropped and recomputed here)

    output: a new df (input_df itself is not modified) sorted by `job_loc_fit` in descending
            order with a fresh 0..n-1 index, containing three added columns:
              job_fit      - cosine similarity to the preferred candidate's job title
              location_fit - cosine similarity to the preferred candidate's location
              job_loc_fit  - mean of job_fit and location_fit

    raises: IndexError if no row of input_df has id == id_num

    Relies on the notebook-level `model` (sentence encoder) and `cosine_similarity`.
    '''
    # Work on an independent copy: adding columns to a bare .iloc slice triggers
    # SettingWithCopyWarning and leaves it unclear which object is being written to.
    ranked = input_df.iloc[:, :4].copy()

    # Look up the preferred candidate once and fail with a readable message when the id is unknown.
    matches = ranked[ranked['id'] == id_num]
    if matches.empty:
        raise IndexError(f'no candidate with id {id_num!r} in input_df')
    preferred = matches.iloc[0]

    # Embeddings of the preferred candidate's job title and location.
    cand_job_embedding = model.encode(preferred['job_title'])
    cand_loc_embedding = model.encode(preferred['location'])

    # Embeddings of every candidate's job title and location. Plain lists are passed so the
    # result order never depends on how the encoder indexes a pandas Series whose index is
    # not 0..n-1.
    job_title_embeddings = model.encode(ranked['job_title'].tolist())
    location_embeddings = model.encode(ranked['location'].tolist())

    # Each call returns a (1, n) array: one similarity per candidate row.
    job_cos_sim = cosine_similarity([cand_job_embedding], job_title_embeddings)
    loc_cos_sim = cosine_similarity([cand_loc_embedding], location_embeddings)

    # ravel() always yields a 1-D array of length n, including when n == 1.
    ranked['job_fit'] = job_cos_sim.ravel()
    ranked['location_fit'] = loc_cos_sim.ravel()

    # Combined score: unweighted mean of the two similarities, selected by name rather than position.
    ranked['job_loc_fit'] = ranked[['job_fit', 'location_fit']].mean(axis=1)

    return ranked.sort_values('job_loc_fit', ignore_index=True, ascending=False)

In [117]:
# Second-round ranking built around the preferred candidate (id 53), scoring every
# candidate on job title and location together.
output_df = rerank(id_num=53, input_df=candidates)
output_df

Unnamed: 0,id,job_title,location,connection,job_fit,location_fit,job_loc_fit
0,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,1.000000,1.000000,1.000000
1,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,1.000000,1.000000,1.000000
2,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,1.000000,1.000000,1.000000
3,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,1.000000,1.000000,1.000000
4,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,0.831415,0.649568,0.740492
...,...,...,...,...,...,...,...
99,19,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.346019,0.315334,0.330677
100,44,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.346019,0.315334,0.330677
101,31,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.346019,0.315334,0.330677
102,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.346019,0.315334,0.330677
