In [None]:
import pandas as pd
import numpy as np

import joblib

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# TFIDF & Cosine similarity

1. Get the tokenized (and lemmatized) job descriptions
2. Get the list of user skills
3. Compute TF-IDF vectors for both
4. Compute the cosine similarity between them

In [None]:
def get_tfidf(job_description_tokens):
    """Fit a TF-IDF model on the job descriptions.

    A ``TfidfVectorizer`` with English stop words removed is fitted on
    ``job_description_tokens``.

    Returns
    -------
    tuple
        ``(document_matrix, vectorizer)``: the matrix produced by
        ``fit_transform`` and the fitted vectorizer, in that order.
    """
    vectorizer = TfidfVectorizer(stop_words="english")
    document_matrix = vectorizer.fit_transform(job_description_tokens)
    return document_matrix, vectorizer

In [None]:
def get_cos_scores(vectorizer, JD_tfidf, skills):
    """Score every job description against the user's combined skills.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``transform``
    JD_tfidf : TF-IDF matrix with one row per job description
    skills : list of str

    Returns
    -------
    scores : list
        Cosine similarity of the skills query to each row of ``JD_tfidf``,
        in row order.
    idx : list of int
        Row positions ordered from highest to lowest score.
    """
    # Join the skills into ONE query document. Transforming the list directly
    # yields one row per skill, and taking element [0] of each similarity
    # column would then score the jobs against the first skill only.
    skills_tfidf = vectorizer.transform([" ".join(skills)])
    # A single (1, n_jobs) computation instead of a Python loop over rows.
    similarities = cosine_similarity(skills_tfidf, JD_tfidf)
    scores = list(similarities.ravel())
    idx = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    return scores, idx

In [None]:
# Load the saved data object for each job portal (presumably DataFrames:
# indeed_df is indexed with .loc further down). Paths are relative to the
# notebook's working directory.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
jobsdb_df = joblib.load("JobsDB_data.pkl")
indeed_df = joblib.load("Indeed_data.pkl")
talentconnect_df = joblib.load("TalentConnect_data.pkl")

In [None]:
# Fit TF-IDF on the Indeed "cleaned_text" column; keep both the matrix and the fitted vectorizer.
JD_tfidf, tfidf_vectorizer = get_tfidf(indeed_df.loc[:, "cleaned_text"])

In [None]:
# Persist the (matrix, vectorizer) pair; get_recommendations loads this file back.
joblib.dump((JD_tfidf, tfidf_vectorizer), "Indeed_tfidf.pkl")

# TFIDF & KNN

1. Repeat steps 1-3 above
2. Rank the jobs by k-nearest-neighbours (KNN) distance instead of cosine similarity

In [None]:
def get_KNN_scores(vectorizer, JD_tfidf, skills):
    """Rank all job descriptions by nearest-neighbour distance to the skills.

    A ``NearestNeighbors`` model with default settings is fitted on
    ``JD_tfidf`` and queried with the skills joined into a single string,
    asking for as many neighbours as ``JD_tfidf`` has rows.

    Returns
    -------
    tuple
        ``(scores, idx)``: the distances and the matching row positions for
        the single query, as returned by ``kneighbors``.
    """
    # The vectorizer works on documents, so the skills are merged into one.
    query = vectorizer.transform([" ".join(skills)])
    neighbour_model = NearestNeighbors().fit(JD_tfidf)
    distances, positions = neighbour_model.kneighbors(query, n_neighbors=JD_tfidf.shape[0])

    return distances[0], positions[0]

In [None]:
def min_max_scaling(lst):
    """Rescale values linearly so the smallest maps to 0 and the largest to 1.

    Parameters
    ----------
    lst : array-like of numbers (list, tuple or numpy array)

    Returns
    -------
    numpy.ndarray
        Float array of the same length. Empty input gives an empty array; if
        every value is identical the result is all zeros (instead of the NaNs
        a 0/0 division would produce).
    """
    values = np.asarray(lst, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        # All items tie: there is no range to scale over.
        return np.zeros_like(values)
    return (values - lowest) / spread

def flip_knn_scores(scores):
    """Return ``1 - scores`` so that a bigger value means a better match.

    This puts KNN scores on the same "bigger is better" footing as cosine
    similarity scores.
    """
    inverted = 1 - scores
    return inverted

In [None]:
def get_recommendations(params):
    """Return Indeed listings ranked by the mean of cosine and KNN scores.

    Parameters
    ----------
    params : dict
        Must contain "current_skills" and "new_skills" (lists of str, which
        are concatenated), "job_portal" (str, compared case-insensitively)
        and "job_title" (list of str, lower-cased before matching).

    Returns
    -------
    str
        ``DataFrame.to_json()`` output for the listings whose
        "normalized_title" is one of the requested titles, sorted by
        "average_score" descending; or the string "No such method" when the
        portal is not "indeed".
    """
    all_skills = params["current_skills"] + params["new_skills"]
    job_portal = params["job_portal"]
    job_title = [title.lower() for title in params["job_title"]]

    if job_portal.lower() != "indeed":
        return "No such method"

    JD_tfidf, tfidf_vectorizer = joblib.load("Indeed_tfidf.pkl")
    indeed_df = joblib.load("Indeed_data.pkl")

    # get the scores for both similarity measures
    cosine_scores, cosine_idx = get_cos_scores(tfidf_vectorizer, JD_tfidf, all_skills)
    knn_scores, knn_idx = get_KNN_scores(tfidf_vectorizer, JD_tfidf, all_skills)

    # normalize the scores before combining; KNN distances are flipped so
    # that, like cosine similarity, bigger means better
    cosine_scores = min_max_scaling(cosine_scores)
    knn_scores = flip_knn_scores(min_max_scaling(knn_scores))

    # cosine scores come back in the frame's row order: attach, then reorder
    indeed_df_cosine = indeed_df.assign(score=cosine_scores).iloc[cosine_idx, :]

    # KNN scores come back in neighbour order: reorder the rows, then attach
    indeed_df_knn = indeed_df.iloc[knn_idx, :].assign(score=knn_scores)

    # join on the index so both scores of a listing end up on the same row
    combined_df = indeed_df_knn.join(indeed_df_cosine.loc[:, "score"], rsuffix="_cosine")
    # column arithmetic instead of a row-wise apply: same values, also valid on an empty frame
    combined_df["average_score"] = (combined_df["score"] + combined_df["score_cosine"]) / 2
    sorted_df = combined_df.sort_values("average_score", ascending=False)

    # filter for the requested job titles
    filtered_df = sorted_df[sorted_df["normalized_title"].isin(job_title)]

    return filtered_df.reset_index(drop=True).to_json()

In [None]:
# Example request: current and new skills are combined, and only listings
# whose normalized title matches one of "job_title" are returned.
params = {"current_skills": ["Python", "Tableau"], "new_skills": ["R", "Azure"],
          "job_portal": "Indeed", "job_title": ["Data Scientist", "machine learning engineer"]}
# get_recommendations returns a JSON string; parse it back into a DataFrame for display.
pd.read_json(get_recommendations(params))

# TEST API

In [2]:
import joblib
import requests
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
def get_cos_scores(vectorizer, JD_tfidf, skills):
    """Score every job description against the user's combined skills.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``transform``
    JD_tfidf : TF-IDF matrix with one row per job description
    skills : list of str

    Returns
    -------
    scores : list
        Cosine similarity of the skills query to each row of ``JD_tfidf``,
        in row order.
    idx : list of int
        Row positions ordered from highest to lowest score.
    """
    # Join the skills into ONE query document. Transforming the list directly
    # yields one row per skill, and taking element [0] of each similarity
    # column would then score the jobs against the first skill only.
    skills_tfidf = vectorizer.transform([" ".join(skills)])
    # A single (1, n_jobs) computation instead of a Python loop over rows.
    similarities = cosine_similarity(skills_tfidf, JD_tfidf)
    scores = list(similarities.ravel())
    idx = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    return scores, idx

def get_KNN_scores(vectorizer, JD_tfidf, skills):
    """Rank all job descriptions by nearest-neighbour distance to the skills.

    A ``NearestNeighbors`` model with default settings is fitted on
    ``JD_tfidf`` and queried with the skills joined into a single string,
    asking for as many neighbours as ``JD_tfidf`` has rows.

    Returns
    -------
    tuple
        ``(scores, idx)``: the distances and the matching row positions for
        the single query, as returned by ``kneighbors``.
    """
    # The vectorizer works on documents, so the skills are merged into one.
    query = vectorizer.transform([" ".join(skills)])
    neighbour_model = NearestNeighbors().fit(JD_tfidf)
    distances, positions = neighbour_model.kneighbors(query, n_neighbors=JD_tfidf.shape[0])

    return distances[0], positions[0]

def min_max_scaling(lst):
    """Rescale values linearly so the smallest maps to 0 and the largest to 1.

    Parameters
    ----------
    lst : array-like of numbers (list, tuple or numpy array)

    Returns
    -------
    numpy.ndarray
        Float array of the same length. Empty input gives an empty array; if
        every value is identical the result is all zeros (instead of the NaNs
        a 0/0 division would produce).
    """
    # Imported here because this section's import cell does not bring in numpy.
    import numpy as np

    values = np.asarray(lst, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        # All items tie: there is no range to scale over.
        return np.zeros_like(values)
    return (values - lowest) / spread

def flip_knn_scores(scores):
    """Return ``1 - scores`` so that a bigger value means a better match.

    This puts KNN scores on the same "bigger is better" footing as cosine
    similarity scores.
    """
    inverted = 1 - scores
    return inverted

def get_recommendations(params=None):
    """Return Indeed listings ranked by the mean of cosine and KNN scores.

    Parameters
    ----------
    params : dict, optional
        Must contain "current_skills" and "new_skills" (lists of str, which
        are concatenated), "job_portal" (str, compared case-insensitively)
        and "job_title" (list of str, lower-cased before matching).
        When omitted, the module-level variable ``params`` is used, which
        keeps existing zero-argument calls working; passing the dict
        explicitly is preferred.

    Returns
    -------
    str
        ``DataFrame.to_json()`` output for the listings kept by
        ``filter_job_title``, sorted by "average_score" descending; or the
        string "No such method" when the portal is not "indeed".

    Notes
    -----
    ``filter_job_title`` is defined in a later cell of this notebook and must
    exist by the time this function is called.
    """
    if params is None:
        # Backward-compatible fallback to the notebook-global request dict.
        try:
            params = globals()["params"]
        except KeyError:
            raise NameError("name 'params' is not defined") from None

    all_skills = params["current_skills"] + params["new_skills"]
    job_portal = params["job_portal"]
    job_title = [title.lower() for title in params["job_title"]]

    if job_portal.lower() != "indeed":
        return "No such method"

    JD_tfidf, tfidf_vectorizer = joblib.load("Indeed_tfidf.pkl")
    indeed_df = joblib.load("Indeed_data.pkl")

    # get the scores for both similarity measures
    cosine_scores, cosine_idx = get_cos_scores(tfidf_vectorizer, JD_tfidf, all_skills)
    knn_scores, knn_idx = get_KNN_scores(tfidf_vectorizer, JD_tfidf, all_skills)

    # normalize the scores before combining; KNN distances are flipped so
    # that, like cosine similarity, bigger means better
    cosine_scores = min_max_scaling(cosine_scores)
    knn_scores = flip_knn_scores(min_max_scaling(knn_scores))

    # cosine scores come back in the frame's row order: attach, then reorder
    indeed_df_cosine = indeed_df.assign(score=cosine_scores).iloc[cosine_idx, :]

    # KNN scores come back in neighbour order: reorder the rows, then attach
    indeed_df_knn = indeed_df.iloc[knn_idx, :].assign(score=knn_scores)

    # join on the index so both scores of a listing end up on the same row
    combined_df = indeed_df_knn.join(indeed_df_cosine.loc[:, "score"], rsuffix="_cosine")
    # column arithmetic instead of a row-wise apply: same values, also valid on an empty frame
    combined_df["average_score"] = (combined_df["score"] + combined_df["score_cosine"]) / 2
    sorted_df = combined_df.sort_values("average_score", ascending=False)

    # filter for job titles
    filtered_df = filter_job_title(sorted_df, job_title)

    return filtered_df.reset_index(drop=True).to_json()

In [None]:
from gensim.models.word2vec import Word2Vec
import pandas as pd
import copy
import numpy as np
from numpy.linalg import norm


# get user skills vector
def user_skill_vector(user_skills, model):
    """Sum the word vectors of every in-vocabulary word in the user's skills.

    Each skill is lower-cased and split on whitespace; words missing from the
    model's vocabulary are skipped. The caller's list is left untouched.

    Parameters
    ----------
    user_skills : list of str
    model : object whose ``wv`` attribute provides ``key_to_index`` and
        ``get_vector`` (e.g. a gensim 4 ``Word2Vec`` model)

    Returns
    -------
    numpy.ndarray or None
        The summed vector, or ``None`` when no word of any skill is in the
        vocabulary.
    """
    word_vectors = model.wv
    # dict membership is O(1); ``index_to_key`` is a list and would be scanned per word
    vocabulary = word_vectors.key_to_index

    user_vector = None
    for skill in user_skills:
        for word in skill.lower().split():
            if word not in vocabulary:
                continue
            word_vector = word_vectors.get_vector(word)
            if user_vector is None:
                # Copy once so the in-place additions below can never write
                # into an array owned by the model.
                user_vector = np.array(word_vector)
            else:
                user_vector += word_vector

    return user_vector


# get cosine similarity
def get_similarity(vector1, vector2):
    """Cosine similarity of two equal-length numeric vectors.

    Returns ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero
    norm the similarity is undefined; ``0.0`` is returned in that case
    instead of the NaN (and runtime warning) a 0/0 division would give.
    """
    denominator = norm(vector1) * norm(vector2)
    if denominator == 0:
        return 0.0
    return np.dot(vector1, vector2) / denominator


# recommendation engine
def recommend(user_vector, job_vectors_string):
    """Order the jobs from most to least similar to ``user_vector``.

    Each item of ``job_vectors_string`` is a vector written as text (numbers
    separated by single spaces, optionally with "[", "]" and newline
    characters). Every item is parsed into a list of floats and compared to
    ``user_vector`` with ``get_similarity``.

    Returns
    -------
    numpy.ndarray
        Positions into ``job_vectors_string``, highest similarity first.
    """
    def _parse_vector(vector_text):
        # Skip empty tokens and lone brackets; strip brackets/newlines glued to a number.
        return [
            float(token.replace("\n", "").replace("]", "").replace("[", ""))
            for token in vector_text.split(" ")
            if token not in ("", "[", "]")
        ]

    similarities = [
        get_similarity(_parse_vector(vector_text), user_vector)
        for vector_text in job_vectors_string
    ]

    # Negate so that argsort's ascending order yields descending similarity.
    return np.argsort(-np.array(similarities))


# filter based on job title
def filter_job_title(df, job_title):
    """Keep only the rows whose "normalized_title" is one of ``job_title``.

    Matching is exact membership (``Series.isin``); an empty list or an
    "all" entry gets no special treatment.
    """
    title_matches = df["normalized_title"].isin(job_title)
    return df[title_matches]


# filter based on industries
def filter_industry(df, industries):
    """Keep the rows flagged for at least one of the given industries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have one column per name in ``industries``; a row belongs to an
        industry when its value in that column is truthy.
    industries : list of str
        Column names to test. An empty list, or any entry equal to "all"
        (case-insensitive), disables the filter.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when the filter is disabled; otherwise the matching
        rows, each exactly once and in their original order.
    """
    if len(industries) == 0 or any(industry.lower() == "all" for industry in industries):
        return df

    # A boolean mask selects by alignment, so it works for any index (not
    # only 0..n-1) and cannot return a row twice when it matches several
    # industries.
    in_any_industry = df[list(industries)].astype(bool).any(axis=1)
    return df[in_any_industry]



model = Word2Vec.load("words200_v2.model")

# read in vector_added data
# NOTE(review): the file name differs in case from "JobsDB_data.pkl" loaded
# earlier in this notebook; confirm which file is intended.
df_data = joblib.load("jobsdb_data.pkl")

# select from a list of skills
equipped_skills = ["python", "SQL"]
new_skills = ["Powerbi", "tableau"]
all_skills = equipped_skills + new_skills
# select from a list of job title, (add "all" in front-end)
job_title = ["data analyst", "data engineer"]
# select from a list of industries, (add "all" in front-end)
industries = ["Finance", "Healthcare"]

# job title filter
# drop=True keeps the column layout identical to df_data, so the positional
# column selection at the end still picks the same six columns.
df_data_job = filter_job_title(df_data, job_title).reset_index(drop=True)

# industries filter (not used by the ranking below, which works on the
# title-filtered frame)
df_data_filter_all = filter_industry(df_data_job, industries).reset_index()

# decided on the number of recommendations
items_num = 50

# one vector for equipped and new skills together
user_vector = user_skill_vector(all_skills, model)

# rank the title-filtered listings against the user's skills
indexs = recommend(user_vector, df_data_job["job_vector"])

# indexs are row positions within df_data_job, so they must be applied to
# that same frame; applying them to the unfiltered df_data picks unrelated rows.
recommendations = df_data_job.iloc[indexs, [0,1,2,3,4,5]]
recommendations.head()

In [3]:
# Request payload for a local (non-HTTP) run of the recommender.
params = {"current_skills": ["machine learning"], "new_skills": ["Azure"],
            "job_portal": "indeed", "job_title": ["machine learning engineer", "data scientist"]}
# Called without arguments, so the function relies on the module-level `params` above.
# The cell that defines get_recommendations must have been run first.
pd.read_json(get_recommendations())

NameError: name 'get_recommendations' is not defined

In [None]:
# Smoke-test the root endpoint of the API served on localhost.
url1 = "http://127.0.0.1:8080"
# Without a timeout, requests waits indefinitely if the server is not responding.
r1 = requests.get(url1, timeout=10)
r1

In [None]:
# Show the response body of the root endpoint as text.
r1.text

In [27]:
# POST a skills payload to the recommendations endpoint on localhost.
url2 = "http://127.0.0.1:8080/recommendations"
person1_skills = {"current_skills": ["machine learning"], "new_skills": ["Azure"],
                  "job_portal": "indeed", "job_title": ["data analyst"]}
person2_skills = {"current_skills": ["python", "R", "SQL"], "new_skills": ["tableau", "powerBI"],
                  "job_portal": "jobsdb", "job_title": ["machine learning engineer"]}
# Only person2_skills is sent; swap the payload to try the other profile.
# Without a timeout, requests waits indefinitely if the server is not responding.
r2 = requests.post(url2, json=person2_skills, timeout=60)
r2

<Response [200]>

In [28]:
# Parse the JSON body of the response into a DataFrame for display.
pd.read_json(r2.text)

  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

Unnamed: 0,job-title,company,location,Job description,url,date
116,Data Visualization & Analytics Lead - Associat...,JP Morgan Chase & Co,Singapore,This position is created to deliver data visua...,https://sg.jobsdb.com/job/Associate-06e3a83ad6...,2022-10-14
104,Data Analytics Manager (Internal Audit),Morgan McKinley,Raffles Place,Responsibilities With the use of data analyti...,https://sg.jobsdb.com/job/Data-Analytics-Manag...,2022-10-18
195,Analytics Manager,Vict Pte. Ltd,Singapore,This is an exciting opportunity for an experie...,https://sg.jobsdb.com/job/Analytics-Manager-36...,2022-09-27
131,"Associate Director, ESG Data Solutions & Analy...",Allegis Global Solutions,Singapore,About Standard Chartered We're an internation...,https://sg.jobsdb.com/job/Associate-Director-6...,2022-10-16
26,Lecturer / Senior Lecturer / Associate Profess...,Singapore University of Social Sciences,Singapore,Job Description We are interested in applicant...,https://sg.jobsdb.com/job/Lecturer-7b755ef374a...,2022-03-10
...,...,...,...,...,...,...
44,"Customer Engineer, Business Intelligence and A...",Google,Singapore,Minimum qualifications: Experience in a data ...,https://sg.jobsdb.com/job/Customer-Engineer-65...,2022-09-21
34,6606 - Senior Sales Engineer [ Oil & Gas / Eng...,The Supreme HR Advisory,Central Singapore,Senior Sales Engineer X2 (Oil & Gas Division) ...,https://sg.jobsdb.com/job/Senior-Sales-Enginee...,2022-04-10
43,"Senior Professional Officers, Business Analytics",CTES Consulting,Jurong Town,JOB DESCRIPTION Our client is actively search...,https://sg.jobsdb.com/job/Senior-Professional-...,2022-03-10
32,Contract Data Scientist - Business Intelligenc...,Infineon Technologies,Kallang,Part of your life. Part of tomorrow. Infineon ...,https://sg.jobsdb.com/job/Contract-Data-Scient...,2022-07-10


In [26]:
# Same request against the 0.0.0.0 address.
# NOTE(review): 0.0.0.0 is a bind address; using it as a client destination is
# platform-dependent. Prefer 127.0.0.1 unless this is what is being tested.
# person2_skills comes from the cell that defines url2 and must be run first.
url3 = "http://0.0.0.0:8080/recommendations"
# Without a timeout, requests waits indefinitely if the server is not responding.
r3 = requests.post(url3, json=person2_skills, timeout=60)
# Inspect the response headers (the bare `r3.` was a syntax error).
r3.headers

{'Content-Length': '425342', 'Content-Type': 'text/html; charset=utf-8', 'Date': 'Fri, 28 Oct 2022 09:50:55 GMT', 'Server': 'waitress'}

In [20]:
# Parse the JSON body of the second response into a DataFrame for display.
pd.read_json(r3.text)

  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  new_data = to_datetime(new_data, errors="raise", unit=date_unit)


Unnamed: 0,job-title,company,location,Job description,url,date
64,"Insights Analyst, Business Marketing, APAC*",TikTok,Singapore,Responsibilities Partner with cross-functiona...,https://sg.jobsdb.com/job/Insights-Analyst-cfd...,2022-10-10
84,Senior Adobe Analytics Consultant,NTT,Kallang,Duties Involve in different ways to impact bu...,https://sg.jobsdb.com/job/Senior-Adobe-Analyti...,2022-08-10
23,Digital Business Analyst (Data Analytics) #Job...,Government Technology Agency,Singapore,The Government Technology Agency (GovTech) see...,https://sg.jobsdb.com/job/Digital-Business-Ana...,2022-10-10
40,"Teaching Assistant, Business Analytics and Inf...",National University of Singapore,Singapore,Job Description The Department of Information ...,https://sg.jobsdb.com/job/Teaching-Assistant-3...,2022-09-30
102,[ Assistant Librarian - Digital Technology and...,Nanyang Technological University,Singapore,Libraries worldwide play a fundamental role in...,https://sg.jobsdb.com/job/Assistant-Librarian-...,2022-11-10
...,...,...,...,...,...,...
83,Data Analytics Consultant,Total eBiz Solutions Pte Ltd,Kampong Ubi,Scope of work: Translate user requirements in...,https://sg.jobsdb.com/job/Data-Analytics-Consu...,2022-11-10
22,Business Analytics Analyst##WorkNow,Citibank,Singapore,The Business Analytics Int Analyst is a develo...,https://sg.jobsdb.com/job/Business-Analytics-A...,2022-09-30
104,Data Analytics Manager (Internal Audit),Morgan McKinley,Raffles Place,Responsibilities With the use of data analyti...,https://sg.jobsdb.com/job/Data-Analytics-Manag...,2022-10-18
65,"Vice President, Analytics Translator",OCBC Bank,Singapore,OVERALL ROLE PURPOSE: Working within the Gr...,https://sg.jobsdb.com/job/Analytics-Translator...,2022-10-17
