In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
sheet_id = '117X6i53dKiO7w6kuA1g1TpdTlv1173h_dPlJt5cNNMU'
url = 'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_id}'.format(sheet_id=sheet_id)
df = pd.read_csv(url)


In [3]:
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85.0,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44.0,
3,4,People Development Coordinator at Ryan,"Denton, Texas",,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",,


In [4]:
df.loc[df['location'] == 'Kanada', 'location'] = 'Canada'


In [5]:
import gensim
from gensim.models import Word2Vec
import string



In [6]:
def remove_digits_punc(df):
  df['job_title'] = df['job_title'].str.replace('\d+', '')
  df['job_title'] = df['job_title'].str.replace('[^\w\s]', '')

  return df


df = remove_digits_punc(df)

df.head()


  df['job_title'] = df['job_title'].str.replace('\d+', '')
  df['job_title'] = df['job_title'].str.replace('[^\w\s]', '')


Unnamed: 0,id,job_title,location,connection,fit
0,1,CT Bauer College of Business Graduate Magna C...,"Houston, Texas",85.0,
1,2,Native English Teacher at EPIK English Program...,Canada,,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44.0,
3,4,People Development Coordinator at Ryan,"Denton, Texas",,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",,


In [13]:
# Tokenization
tokenized_titles = [gensim.utils.simple_preprocess(title) for title in df['job_title']]

# Training the Word2Vec model
model = Word2Vec(sentences=tokenized_titles, vector_size=100, window=5, min_count=1, workers=4, sg=0)

# Calculate similarity with "aspiring human resources"
keyword_vector = (model.wv['aspiring'] + model.wv['human'] + model.wv['resources']) / 3

similarities = []
title_vectors = []
for title_tokens in tokenized_titles:
    title_vector = sum([model.wv[token] for token in title_tokens if token in model.wv]) / len(title_tokens)
    sim = cosine_similarity([title_vector], [keyword_vector])[0][0]
    title_vectors.append(title_vector)
    similarities.append(sim)

# Attach similarities to the dataframe and sort
df['fit'] = similarities
df_sorted = df.sort_values(by='fit', ascending=False)


In [14]:
df = df.sort_values(by='fit', ascending=False)

In [15]:
df.head(20)

Unnamed: 0,id,job_title,location,connection,fit
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44.0,0.865508
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44.0,0.865508
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44.0,0.865508
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44.0,0.865508
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71.0,0.865508


In [20]:
from sklearn.ensemble import RandomForestRegressor


In [21]:
X = title_vectors
y = df['fit']


In [25]:
rf = RandomForestRegressor()
rf.fit(X, y)

In [27]:
df['predicted_fit'] = rf.predict(X)
df_sorted = df.sort_values(by='predicted_fit', ascending=False)
df_sorted.head(20)



Unnamed: 0,id,job_title,location,connection,fit,predicted_fit
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899,0.872899
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899,0.872899
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899,0.872899
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899,0.872899
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1.0,0.872899,0.872899
...,...,...,...,...,...,...
37,38,HR Senior Specialist,San Francisco Bay Area,,-0.068548,-0.068548
60,61,HR Senior Specialist,San Francisco Bay Area,,-0.068548,-0.068548
7,8,HR Senior Specialist,San Francisco Bay Area,,-0.068548,-0.068548
50,51,HR Senior Specialist,San Francisco Bay Area,,-0.068548,-0.068548


In [32]:
df_sorted.loc[df_sorted['predicted_fit'] > 0.5]

Unnamed: 0,id,job_title,location,connection,fit,predicted_fit
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,,0.460635,0.460004
74,75,Nortia Staffing is seeking Human Resources Pay...,"San Jose, California",,0.458063,0.460004
80,81,Senior Human Resources Business Partner at Hei...,"Chattanooga, Tennessee Area",455.0,0.460602,0.460004
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,,0.460635,0.460004
69,70,Retired Army National Guard Recruiter office m...,"Virginia Beach, Virginia",82.0,0.415977,0.453546
75,76,Aspiring Human Resources Professional Passion...,"New York, New York",212.0,0.413029,0.449444
64,65,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",,0.421053,0.444882
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,,0.460635,0.443868
55,56,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",,0.421053,0.434512
42,43,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",,0.421053,0.410396
