In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_colwidth',None)

In [10]:
df = pd.read_csv("potential.csv")

In [11]:
df.shape

(104, 5)

In [12]:
df = df.drop(columns='fit')

In [13]:
df.duplicated(keep=False).sum()


0

In [14]:
# decription of the dataset
df.describe(include='all')

Unnamed: 0,id,job_title,location,connection
count,104.0,104,104,104.0
unique,,52,41,
top,,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,Kanada,
freq,,7,12,
mean,52.5,,,262.663462
std,30.166206,,,222.602812
min,1.0,,,1.0
25%,26.75,,,47.0
50%,52.5,,,193.0
75%,78.25,,,500.0


In [15]:
# value counts of job_title column
df['job_title'].value_counts()

2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional                 7
Aspiring Human Resources Professional                                                                                    7
Student at Humber College and Aspiring Human Resources Generalist                                                        7
People Development Coordinator at Ryan                                                                                   6
Native English Teacher at EPIK (English Program in Korea)                                                                5
Aspiring Human Resources Specialist                                                                                      5
HR Senior Specialist                                                                                                     5
Student at Chapman University                                                                                            4
SVP, CHRO, Marke

In [16]:
# create cleaned job_title column for cleaned job
df['job_cleaned'] = df['job_title'].str.replace('Human Resources', 'HR')
df.head()

Unnamed: 0,id,job_title,location,connection,job_cleaned
0,1,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,"Houston, Texas",85,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring HR professional
1,2,Native English Teacher at EPIK (English Program in Korea),Kanada,500,Native English Teacher at EPIK (English Program in Korea)
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,Aspiring HR Professional
3,4,People Development Coordinator at Ryan,"Denton, Texas",500,People Development Coordinator at Ryan
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500,Advisory Board Member at Celal Bayar University


In [17]:
# removal of punctuations in job_title_cleaned
df['job_cleaned'] = df['job_cleaned'].str.replace("[^a-zA-Z\s]",'', regex=True)
df['job_cleaned'] = df['job_cleaned'].str.lower()
df.head()

Unnamed: 0,id,job_title,location,connection,job_cleaned
0,1,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,"Houston, Texas",85,ct bauer college of business graduate magna cum laude and aspiring hr professional
1,2,Native English Teacher at EPIK (English Program in Korea),Kanada,500,native english teacher at epik english program in korea
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional
3,4,People Development Coordinator at Ryan,"Denton, Texas",500,people development coordinator at ryan
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500,advisory board member at celal bayar university


# Tokenize ----> Stopwords ----> Lemmetize ---> job_cleaned

In [18]:
# importing stopwords from nltk, wordnet lemmatizer, word tokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, TreebankWordDetokenizer

# instance of lemmatizer
lemmatizer = WordNetLemmatizer()

# loop to remove stopwords and lemmatize

for i in range(len(df)):
    tokenized_words = word_tokenize(df['job_cleaned'][i])
    removed_stopwords = [word for word in tokenized_words if word not in set(stopwords.words('english'))]
    lemmatized_sentence = []
    for word in removed_stopwords:
        lemmatized_sentence.append(lemmatizer.lemmatize(word))
    df['job_cleaned'][i] = TreebankWordDetokenizer().detokenize(lemmatized_sentence)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['job_cleaned'][i] = TreebankWordDetokenizer().detokenize(lemmatized_sentence)


In [19]:
df.head()

Unnamed: 0,id,job_title,location,connection,job_cleaned
0,1,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,"Houston, Texas",85,ct bauer college business graduate magna cum laude aspiring hr professional
1,2,Native English Teacher at EPIK (English Program in Korea),Kanada,500,native english teacher epik english program korea
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional
3,4,People Development Coordinator at Ryan,"Denton, Texas",500,people development coordinator ryan
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500,advisory board member celal bayar university


In [20]:
# importing tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [21]:
jobs_list = list(df["job_cleaned"])
# transform into a vector 
jobs_list_tfidf = tfidf.fit_transform(jobs_list)

# needs to be an array to be passed into the cosign similarity 
jobs_list_tfidf.dtype

dtype('float64')

In [22]:
tfid_vector_jobs_list = jobs_list_tfidf.toarray()

In [23]:
# finding the cosine similarity between the phrases 
# vectorized difference 
sentence1 = "Aspiring human resources"
sentence2 = "seeking human resources"

In [24]:

# lemmetize the phrase 
lemmatized_sentence = []
word_tokenized = word_tokenize(sentence1.lower())
for word in word_tokenized:
    lemmatized_sentence.append(lemmatizer.lemmatize(word))
lemmatized_sentence = TreebankWordDetokenizer().detokenize(lemmatized_sentence)




# convert sentence phrase into a vector
X1 = tfidf.transform([lemmatized_sentence])
X1_vector = X1.toarray()
print('Shape of Sentence: ',X1_vector.shape)


Shape of Sentence:  (1, 176)


In [25]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

In [26]:
# consine similarity calculation for phrase1
# grab the similarity scores usinging cosign similarity 


similarity_score = []

for i in range(len(df)):
    similarity_score.append(cosine(X1_vector, tfid_vector_jobs_list[i]))
df['similarity_score_phrase1'] = similarity_score

df['similarity_score_phrase1'] = df['similarity_score_phrase1'].apply(lambda x: 1 - x)

sorted_df = df.sort_values(by='similarity_score_phrase1', ascending=False)
sorted_df.head()

Unnamed: 0,id,job_title,location,connection,job_cleaned,similarity_score_phrase1
93,94,Seeking Human Resources Opportunities. Open to travel and relocation.,Amerika Birleşik Devletleri,415,seeking human resource opportunity open travel relocation,0.553645
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815


In [27]:
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# create temp file and save converted embedding
glove_file = get_tmpfile('word2vec.6B.50d.txt')

glove2word2vec('glove.6B.50d.txt', glove_file)

# load converted embedding into memory reference model 
glove_model = KeyedVectors.load_word2vec_format(glove_file)

# saving glove model
glove_model.save_word2vec_format('word2vec.6B.50d.bin.gz', binary=True)










  glove2word2vec('glove.6B.50d.txt', glove_file)


In [28]:
def doc_token(sentence, model, sentence_vector_list, vector_dimensions):
    word_tokens = word_tokenize(sentence)
    words_being_filtered = [word for word in word_tokens if word in model.key_to_index.keys()]
    
    for i in range(len(word_tokens)):
        if word_tokens[i] in words_being_filtered:
            token_vector = model[word_tokens[i]]
        else:
            token_vector = np.zeros(vector_dimensions)
        sentence_vector_list.append(token_vector)
    
    return sentence_vector_list

In [29]:
# vectorize job title using glove model
glove_vectors = []
for i in range(len(df)):
    glove_sentence = []
    doc_token(sentence1, glove_model, glove_sentence , 50  )
    glove_vectors.append(glove_sentence)

In [30]:
# vectorize keyword phrase using glove model
glove_search_phrase = []
doc_token(lemmatized_sentence, glove_model, glove_search_phrase, 50)

[array([-4.3762e-01,  8.3236e-01,  2.9638e-01, -7.8180e-01,  2.4432e-01,
         3.2892e-01, -8.2750e-01,  1.2692e-01, -3.9246e-01,  8.6643e-01,
         2.4300e-01, -6.6664e-04,  5.2488e-01,  1.5963e+00,  3.8362e-01,
        -3.2021e-01,  1.0324e+00,  7.0235e-01,  4.1679e-01, -2.0262e-01,
         2.8378e-01,  9.4354e-01, -3.4941e-01,  1.4981e+00,  3.2488e-01,
        -6.6963e-01, -4.2229e-01, -1.3033e+00, -5.9413e-01, -7.0466e-02,
         7.0193e-01,  3.0914e-01, -4.0663e-01, -6.4972e-01,  7.7279e-01,
         3.3412e-01, -8.0891e-01,  2.4861e-02, -5.1049e-01, -8.1757e-01,
        -1.0251e-01,  5.6904e-01,  6.7411e-01, -1.9030e-01,  4.0583e-01,
        -6.1531e-01,  2.2660e-01, -1.1493e-03, -6.5260e-01,  1.8631e-01],
       dtype=float32),
 array([ 0.61854 ,  0.11915 , -0.46786 ,  0.31368 ,  1.0334  ,  0.95964 ,
         0.87803 , -1.0346  ,  1.6322  ,  0.29347 ,  0.80844 , -0.058903,
         0.021251,  0.40986 ,  0.54443 , -0.33311 ,  0.53712 , -0.35823 ,
         0.29374 ,  0.09

In [42]:
vector_similarity_glove = []
for i in range(len(df)):
    similarity = cosine(np.mean(glove_vectors[i], axis=0), np.mean(glove_search_phrase, axis=0))
    vector_similarity_glove.append(similarity)

In [43]:
glove_similarity = []
for i in range(len(df)):
    similarity = cosine(np.mean(glove_vectors[i], axis=0), np.mean(glove_search_phrase, axis=0))
    glove_similarity.append(similarity)

    
df["glove_similarity_sentence1"] = glove_similarity

df["glove_similarity_sentence1"] = df["glove_similarity_sentence1"].apply(lambda x: 1 - x)

sorted_df = df.sort_values(by = "glove_similarity_sentence1", ascending= False)

sorted_df

Unnamed: 0,id,job_title,location,connection,job_cleaned,similarity_score_phrase1,glove_similarity_sentence1,bert_scores,mean_scores
0,1,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,"Houston, Texas",85,ct bauer college business graduate magna cum laude aspiring hr professional,0.056303,0.875265,0.434025,0.455198
1,2,Native English Teacher at EPIK (English Program in Korea),Kanada,500,native english teacher epik english program korea,0.000000,0.875265,0.141922,0.339062
76,77,Human Resources|\nConflict Management|\nPolicies & Procedures|Talent Management|Benefits & Compensation,Dallas/Fort Worth Area,409,hr conflict management policy procedurestalent managementbenefits compensation,0.000000,0.875265,0.342546,0.405937
75,76,Aspiring Human Resources Professional | Passionate about helping to create an inclusive and engaging work environment,"New York, New York",212,aspiring hr professional passionate helping create inclusive engaging work environment,0.043060,0.875265,0.544917,0.487747
74,75,"Nortia Staffing is seeking Human Resources, Payroll & Administrative Professionals!! (408) 709-2621","San Jose, California",500,nortia staffing seeking hr payroll administrative professional,0.000000,0.875265,0.411955,0.429073
...,...,...,...,...,...,...,...,...,...
31,32,Native English Teacher at EPIK (English Program in Korea),Kanada,500,native english teacher epik english program korea,0.000000,0.875265,0.141922,0.339062
30,31,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,"Houston, Texas",85,ct bauer college business graduate magna cum laude aspiring hr professional,0.056303,0.875265,0.434025,0.455198
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,seeking hr opportunity,0.000000,0.875265,0.699841,0.525035
28,29,Aspiring Human Resources Management student seeking an internship,"Houston, Texas Area",500,aspiring hr management student seeking internship,0.079304,0.875265,0.601866,0.518811


In [33]:
from sentence_transformers import SentenceTransformer



In [34]:
# shape being passed through NN = 384 
bert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
bert_model_encoded_job = bert_model.encode(list(df["job_cleaned"]))
bert_model_encoded_job.shape

(104, 384)

In [35]:
bert_model_encoding_sentence1 = bert_model.encode(sentence1)
bert_model_encoding_sentence1.shape

(384,)

In [36]:
bert_score = []
for i in range(len(df)):
    similarity = cosine(bert_model_encoded_job[i], bert_model_encoding_sentence1)
    bert_score.append(1 - similarity)
    

df['bert_scores'] = bert_score

# viewing dataset by highest glove similarity scores
df_sorted5 = df.sort_values(by='bert_scores', ascending=False)
df_sorted5.head()

    



Unnamed: 0,id,job_title,location,connection,job_cleaned,similarity_score_phrase1,glove_similarity_sentence1,bert_scores
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815,0.875265,0.803918
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815,0.875265,0.803918
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815,0.875265,0.803918
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815,0.875265,0.803918
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815,0.875265,0.803918


In [37]:
mean_scores = np.mean(df.iloc[:, -3:], axis = 1)
df["mean_scores"] = mean_scores

sorted_df = df.sort_values(ascending= False, by= "mean_scores")
sorted_df[sorted_df["id"] == 97]

Unnamed: 0,id,job_title,location,connection,job_cleaned,similarity_score_phrase1,glove_similarity_sentence1,bert_scores,mean_scores
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,aspiring hr professional,0.162815,0.875265,0.803918,0.613999


In [38]:
df2 = df.copy()

In [39]:
# getting the candidates who needs to be starred or not... takes the input and chjanges the score sequencially  
star_candidate_input = input("Candidates Starred: Enter y  or n ")
star_candidate_input.lower()

starred_candidates = []
if star_candidate_input == 'y':
    starred = [int(num) for num in input("Get the candidate id that needs to be starred").split()]

Candidates Starred: Enter y  or n y
Get the candidate id that needs to be starred7 8 7 


In [40]:
starred

[7, 8, 7]

In [44]:

df2["starred"] = df["mean_scores"]

for star in starred:
    df2.loc[df2['id'] == star, 'starred'] = 1

# viewing first 5 observations of df_rank
df2.head()



Unnamed: 0,id,job_title,location,connection,job_cleaned,similarity_score_phrase1,glove_similarity_sentence1,bert_scores,mean_scores,starred
0,1,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,"Houston, Texas",85,ct bauer college business graduate magna cum laude aspiring hr professional,0.056303,0.875265,0.434025,0.455198,0.455198
1,2,Native English Teacher at EPIK (English Program in Korea),Kanada,500,native english teacher epik english program korea,0.0,0.875265,0.141922,0.339062,0.339062
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspiring hr professional,0.162815,0.875265,0.803918,0.613999,0.613999
3,4,People Development Coordinator at Ryan,"Denton, Texas",500,people development coordinator ryan,0.0,0.875265,0.378423,0.417896,0.417896
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500,advisory board member celal bayar university,0.0,0.875265,0.201885,0.35905,0.35905


# Make the Neural Network 


In [45]:
# make the model 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [46]:
class Net(nn.Module):
    def __init__(self, n_features):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 256), 
            nn.Dropout(0.5),
            nn.ReLU(inplace= True),
            nn.Linear(256, 128),
            nn.Dropout(0.5), 
            nn.ReLU(inplace = True), 
            nn.Linear(128, 64), 
            nn.ReLU(inplace = True), 
            nn.Linear(64, 1), 
            nn.Sigmoid(),
        )
        self.output_sigmoid = nn.Sigmoid()
        
    def forward(self, x, y):
        
        system1 = self.net(x)
        system2 = self.net(y)
        output = self.output_sigmoid(system1-system2)
        return output
    
    
    def predict(self, inputs):
        
        s = self.net(inputs)
        return s
      

In [47]:
row_1 = df2.sample( n=2000, replace= True)
row_2 = df2.sample( n=2000, replace= True)
job_list1 = list(row_1["job_cleaned"])
job_list2 = list(row_2["job_cleaned"])
job_list_1_encoded = bert_model.encode(job_list1)
job_list_2_encoded = bert_model.encode(job_list2)
type(job_list_1_encoded)






numpy.ndarray

In [48]:
import torch.optim as optim


In [52]:
# changing the numpy into Tensor to pass into the model 
sentence_1 = torch.from_numpy(job_list_1_encoded).float()
sentence_2 = torch.from_numpy(job_list_2_encoded).float()


print(type(sentence_1))
print(type(sentence_2))


<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [53]:

# Make the True Values for the models 
y1 = list(row_1['starred'])
y2 = list(row_2['starred'])
y = torch.Tensor([1 if y1_i > y2_i else 0.5 if y1_i == y2_i else 0 for y1_i, y2_i in zip(y1, y2)]).float()
y = y.unsqueeze(-1)



In [54]:
model = Net(n_features= 384)


In [55]:
# define model, define the optomizer , define the  loss function

optimizer = optim.Adam(model.parameters(), lr = .0001)

loss_func = nn.BCELoss()


# defining the Epochs 
epochs = 1000
losses = []


# training the system
for i in range(epochs):
    model.zero_grad()
    # model.zero_grad() ---> predict ---> loss ---> optimization ---> backward() ---> step()
    y_pred = model(sentence_1, sentence_2)
    # predict ---> loss ---> backward ---> optimizer
    loss = loss_func(y_pred, y)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if i % 100 == 1:
        print(f"Epoch {i} Loss {loss.item()}")
    
    
    


Epoch 1 Loss 0.6929510235786438
Epoch 101 Loss 0.6280466914176941
Epoch 201 Loss 0.5096757411956787
Epoch 301 Loss 0.5012503862380981
Epoch 401 Loss 0.5003368854522705
Epoch 501 Loss 0.5000662803649902
Epoch 601 Loss 0.49996417760849
Epoch 701 Loss 0.49987098574638367
Epoch 801 Loss 0.49885860085487366
Epoch 901 Loss 0.4993155896663666


In [57]:

predictions = []

for i in range(len(df2)):
    job_sentence_embedded = bert_model.encode(df2["job_cleaned"][i])
    job_sentences_to_tensor = torch.from_numpy(job_sentence_embedded).float()
    pred = round(model.predict(job_sentences_to_tensor).detach().numpy().sum(), 2)
    predictions.append(pred)

df2["Ranked_preds"] = predictions


sorted_ranked = df2.sort_values(ascending= False, by = "Ranked_preds")
    
    
sorted_ranked

Unnamed: 0,id,job_title,location,connection,job_cleaned,similarity_score_phrase1,glove_similarity_sentence1,bert_scores,mean_scores,starred,Ranked_preds
26,27,Aspiring Human Resources Management student seeking an internship,"Houston, Texas Area",500,aspiring hr management student seeking internship,0.079304,0.875265,0.601866,0.518811,0.518811,1.0
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,aspiring hr specialist,0.146835,0.875265,0.719585,0.580561,0.580561,1.0
25,26,HR Senior Specialist,San Francisco Bay Area,500,hr senior specialist,0.000000,0.875265,0.503457,0.459574,0.459574,1.0
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,seeking hr opportunity,0.000000,0.875265,0.699841,0.525035,0.525035,1.0
28,29,Aspiring Human Resources Management student seeking an internship,"Houston, Texas Area",500,aspiring hr management student seeking internship,0.079304,0.875265,0.601866,0.518811,0.518811,1.0
...,...,...,...,...,...,...,...,...,...,...,...
33,34,People Development Coordinator at Ryan,"Denton, Texas",500,people development coordinator ryan,0.000000,0.875265,0.378423,0.417896,0.417896,0.0
31,32,Native English Teacher at EPIK (English Program in Korea),Kanada,500,native english teacher epik english program korea,0.000000,0.875265,0.141922,0.339062,0.339062,0.0
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500,advisory board member celal bayar university,0.000000,0.875265,0.201885,0.359050,0.359050,0.0
54,55,"SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR","Houston, Texas Area",500,svp chro marketing communication csr officer engie houston woodland energy gphr sphr,0.000000,0.875265,0.248271,0.374512,0.374512,0.0
