In [2]:
# Use numpy version 1.21
import numpy as np
from numpy.linalg import norm
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import nltk
from nltk.corpus import treebank
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer
#from sentence_transformers import SentenceTransformer

# Load the candidate table from CSV; later cells read its 'job_title' column
# and overwrite its 'fit' column with similarity scores.
data = pd.read_csv('potential-talents.csv')
# Preview the first rows
data.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


# NLTK Test

In [3]:
# Tokenize the first job title (by position) as a worked example
ex = data['job_title'].iloc[0]
raw_tokens = nltk.word_tokenize(ex)

# Single-character punctuation tokens to drop.
# Raw string: the backslash is meant literally, and '\,' is not a valid
# escape sequence in a normal string literal.
punctuation = list(r'¬!"£$%^&*()_+{}[]:;@#~|\,<.>/?')
punctuation.append("'")

# Drop punctuation tokens and purely numeric tokens, then lowercase the rest
word_tokens = [
    w.lower()
    for w in raw_tokens
    if w not in punctuation and not w.isnumeric()
]

word_tokens

['c.t',
 'bauer',
 'college',
 'of',
 'business',
 'graduate',
 'magna',
 'cum',
 'laude',
 'and',
 'aspiring',
 'human',
 'resources',
 'professional']

In [4]:
# Drop English stopwords from the token list
stop_words = set(stopwords.words('english'))
word_tokens = list(filter(lambda tok: tok not in stop_words, word_tokens))
word_tokens

['c.t',
 'bauer',
 'college',
 'business',
 'graduate',
 'magna',
 'cum',
 'laude',
 'aspiring',
 'human',
 'resources',
 'professional']

In [5]:
# Reduce every remaining token to its WordNet lemma
lem = WordNetLemmatizer()

word_lemmas = list(map(lem.lemmatize, word_tokens))
word_lemmas

['c.t',
 'bauer',
 'college',
 'business',
 'graduate',
 'magna',
 'cum',
 'laude',
 'aspiring',
 'human',
 'resource',
 'professional']

In [6]:
# Reduce every remaining token to its Porter stem (for comparison with the lemmas)

ps = PorterStemmer()

word_stems = list(map(ps.stem, word_tokens))
word_stems

['c.t',
 'bauer',
 'colleg',
 'busi',
 'graduat',
 'magna',
 'cum',
 'laud',
 'aspir',
 'human',
 'resourc',
 'profession']

# SentenceTransformer Test

In [7]:
# Load the sentence-embedding model that the later cells reuse
model = SentenceTransformer('all-MiniLM-L6-v2')

# Small sanity check: embed a few sentences and print each next to its vector
sens = ['My name is James', ' I am 23 years old', 'I live in the UK']

embs = model.encode(sens)

for sen, emb in zip(sens, embs):
    for label, value in (('Sentence:', sen), ('Embedding:', emb)):
        print(label, value)
    print('')

Sentence: My name is James
Embedding: [-9.01511684e-02 -1.32507868e-02  3.87960672e-02  5.86910266e-03
 -6.03657104e-02 -6.93263635e-02  1.60920948e-01 -2.63390895e-02
  7.45671242e-02 -5.80645446e-03 -7.14110062e-02 -5.42870238e-02
  6.06181584e-02  2.28828676e-02 -7.09769223e-03 -2.63211280e-02
  4.75821346e-02  1.00783043e-01 -8.63503441e-02 -8.20025206e-02
 -5.43605490e-03  2.04897486e-03 -5.27265593e-02 -5.33987656e-02
 -6.45689294e-02  2.15985384e-02  2.91622500e-03  5.62963672e-02
 -4.35764641e-02 -5.22410646e-02 -2.32097469e-02 -7.74360150e-02
  6.74616471e-02  3.99515498e-03  1.61443017e-02 -5.93233705e-02
 -9.10174772e-02  3.85154746e-02  6.97333505e-03  2.82472391e-02
 -5.39336242e-02 -3.80297080e-02  4.29379083e-02  2.74750851e-02
  6.34754979e-05  1.77999604e-02  7.67088868e-03  4.47954983e-02
  5.73809259e-02  3.07990965e-02 -8.09344575e-02 -8.72495845e-02
 -5.19435555e-02  3.16693038e-02  6.49765953e-02 -3.38829076e-03
 -4.42540944e-02  5.82441054e-02  1.87706854e-02  4.

# Cosine Similarity Score

In [11]:
# Search phrase that job titles are compared against
keyword = 'aspiring human resources'

# Split the phrase into tokens
keyword_tokens = nltk.word_tokenize(keyword)

# Drop punctuation and numeric tokens and lowercase the rest, then drop stopwords
keyword_tokens = [
    tok.lower()
    for tok in keyword_tokens
    if tok not in punctuation and not tok.isnumeric()
]
keyword_tokens = [tok for tok in keyword_tokens if tok not in stop_words]

# Reduce each token to its WordNet lemma
keyword_lemmas = list(map(lem.lemmatize, keyword_tokens))

# Function to preprocess
def preprocess_text(text):
    """Turn a piece of text into a list of lowercase, lemmatized content words.

    Each token produced by ``nltk.word_tokenize`` goes through these steps in
    order: dropped if it is one of the single-character punctuation marks
    listed below, dropped if it is purely numeric, lowercased, dropped if it
    is an English stopword, and finally lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Text to preprocess. Any non-string value (for example ``None`` or a
        float NaN) is treated as empty text.

    Returns
    -------
    list of str
        The surviving lemmas, in their original order. Empty when ``text``
        is not a string or nothing survives the filters.
    """
    # Guard before tokenizing: word_tokenize only makes sense for strings.
    if not isinstance(text, str):
        return []

    # Raw string: the backslash is meant literally, and '\,' is not a valid
    # escape sequence in a normal string literal. A set gives O(1) lookups.
    punctuation = set(r'¬!"£$%^&*()_+{}[]:;@#~|\,<.>/?')
    punctuation.add("'")
    stop_words = set(stopwords.words('english'))
    lem = WordNetLemmatizer()

    text_lemmas = []
    for w in nltk.word_tokenize(text):
        if w in punctuation or w.isnumeric():
            continue
        w = w.lower()
        if w not in stop_words:
            text_lemmas.append(lem.lemmatize(w))
    return text_lemmas

# Run the keyword through preprocess_text to get the search term as a list of lemmas
search_term = preprocess_text(keyword)
print(search_term)

['aspiring', 'human', 'resource']


In [12]:
# Use the example job title (`ex`) as the query: show the raw text, then the
# lemma list that preprocess_text produces for it
print(ex)
query = preprocess_text(ex)
print(query)

2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional
['c.t', 'bauer', 'college', 'business', 'graduate', 'magna', 'cum', 'laude', 'aspiring', 'human', 'resource', 'professional']


In [13]:
# Unite them and acquire linear vectors
def union(a, b):
    """Return the distinct items of ``a`` and ``b`` as one list in ascending order."""
    merged = set(a)
    merged.update(b)
    return sorted(merged)

# Sorted list of the distinct words found in the search term or the example query
united_list = union(search_term, query)
print(united_list)

['aspiring', 'bauer', 'business', 'c.t', 'college', 'cum', 'graduate', 'human', 'laude', 'magna', 'professional', 'resource']


In [14]:
# Build one 0/1 presence vector per text over the shared word list:
# 1 where the word occurs in that text, 0 where it does not
st_vector = []
qu_vector = []
for word in united_list:
    st_vector.append(int(word in search_term))
    qu_vector.append(int(word in query))

print(st_vector)
print(qu_vector)

[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [15]:
# Cosine similarity: dot product of the two vectors over the product of their norms
dot_product = np.dot(st_vector, qu_vector)
magnitudes = norm(st_vector) * norm(qu_vector)
cosine = dot_product / magnitudes
print(f'Cosine Sim Score: {cosine}')

Cosine Sim Score: 0.5000000000000001


# Final Version

In [16]:
# Above as function.

def search(st):
    """Score every job title in ``data`` against a search phrase.

    The phrase and each title are run through ``preprocess_text``. For each
    title, both word lists are turned into 0/1 presence vectors over their
    shared vocabulary (``union``) and compared by cosine similarity.

    Parameters
    ----------
    st : str
        The search phrase.

    Returns
    -------
    list
        One score per row of ``data``, in row order, rounded to 3 decimals.
        The score is 0.0 when the phrase or the title has no words left
        after preprocessing (the cosine is undefined there).
    """
    st = preprocess_text(st)
    st_words = set(st)  # set membership instead of scanning the list per word
    fit = []
    # Iterate the column by position: the caller assigns the returned list
    # back to the frame positionally, so this stays aligned for any index.
    for title in data['job_title']:
        qu = preprocess_text(title)
        qu_words = set(qu)
        un = union(st, qu)
        st_vector = [int(w in st_words) for w in un]
        qu_vector = [int(w in qu_words) for w in un]
        denominator = norm(st_vector) * norm(qu_vector)
        if denominator == 0:
            # One side is empty: avoid 0/0 (NaN plus a RuntimeWarning)
            fit.append(0.0)
            continue
        cosine = np.dot(st_vector, qu_vector) / denominator
        fit.append(round(cosine, 3))
    return fit

# Score every job title against the phrase, store the scores in the 'fit'
# column (overwriting any previous values), and show the rows from the
# highest score to the lowest
fit = search('aspiring human resources')
data['fit'] = fit
data.sort_values(by=['fit'], ascending=False)

Unnamed: 0,id,job_title,location,connection,fit
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.866
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866
...,...,...,...,...,...
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000


In [32]:
# Encode every job title plus the search phrase in a single batch
job_titles = list(data['job_title'])
job_titles.append('aspiring human resources')

embeddings = model.encode(job_titles)
search_term_emb = embeddings[-1]  # the phrase was appended last

# Compare all title embeddings with the phrase embedding in one call instead
# of one call per row. Assumes `embeddings` is a 2-D array with one row per
# input text, so the result has one row per title and a single column.
simscores = cosine_similarity(
    embeddings[:-1],
    np.asarray(search_term_emb).reshape(1, -1),
)
fit = simscores.ravel()

data['fit'] = fit
data.sort_values(by=['fit'], ascending=False).head(20)

Unnamed: 0,id,job_title,location,connection,fit
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.949807
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.928035
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.928035
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.928035


In [48]:
# Test starring method with function
# Score each title against the starred result and multiply that by its score against the search term
# Use ID = 39 ('Student at Humber College and Aspiring Human Resources Generalist')

def simscore(qu, st, star=0):
    """Embedding-based similarity between a query text and a search term.

    Both texts are encoded with the module-level ``model`` and compared by
    cosine similarity.

    Parameters
    ----------
    qu : str
        The text being scored (the callers pass a job title).
    st : str
        The search term.
    star : int, optional
        When greater than 0, the job title found at
        ``data['job_title'][star - 1]`` is treated as a starred example: its
        cosine similarity to ``qu`` is multiplied into the result. With the
        default of 0 no starred example is used.

    Returns
    -------
    The cosine similarity of ``st`` and ``qu``, multiplied by the cosine
    similarity of the starred title and ``qu`` when ``star > 0``.
    """
    def _cosine(u, v):
        # Dot product divided by the product of the two vector norms
        return np.dot(u, v) / (norm(u) * norm(v))

    qu_vector = model.encode(qu)
    st_vector = model.encode(st)
    cosine_sim = _cosine(st_vector, qu_vector)

    # No starred example: the plain similarity is the answer
    if star <= 0:
        return cosine_sim

    star_vector = model.encode(data['job_title'][star-1])
    return cosine_sim * _cosine(star_vector, qu_vector)

0.94980717


In [50]:
# Score every job title against the search phrase, using the row with ID 39
# as the starred example.
# Iterate the column by position so the scores line up with the positional
# assignment to data['fit'] below, whatever the frame's index looks like.

fit = [
    simscore(title, 'aspiring human resources', 39)
    for title in data['job_title']
]

data['fit'] = fit
data.sort_values(by=['fit'], ascending=False).head(20)

Unnamed: 0,id,job_title,location,connection,fit
51,52,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781
24,25,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781
49,50,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781
6,7,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781
8,9,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781
38,39,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781
36,37,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.708245
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.708245
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.708245
