In [1]:
import gspread

import numpy as np 
import pandas as pd 
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# clean tokens
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import re

from helper import * 

### Test profiles

In [3]:
# user profiles
important_user_columns = ['Row ID', 'Woonplaats', 'School', 'Studie', 'Studie Jaar', 'Ambitie', 'Core Values', 
                          'Industry Interest', 'Technical Skills', 'Social Skills', 'Desired Skills']

users = pd.read_csv('Main Users.csv')[important_user_columns].set_index('Row ID')

# job profiles
important_job_columns = ['🔒 Row ID', 'Job Title', 'Type of Contract', 'Industry', 'Place', 'Where', 'Salary per hour',
                         'Hours per week', 'Language', 'Education', 'WhatYouLearn', 'WhatYouDo', 'WhyUs',
                         'WhoAreYou', 'Core Values']

jobs = pd.read_csv('Vacatures.csv')[important_job_columns].set_index('🔒 Row ID')

### Actual profiles

In [360]:
# change this to the actual Glide data to test the functions
# dummy_data = pythonanywhere_api_call("Kevin Tran; University of Amsterdam; Data Science; Python, \
#     SQL, Machine Learning, DevOps Engineering")

# # connect to spreadsheets using json credentials downloaded from Google Cloud
# gc = gspread.service_account(filename='credentials.json')

# # open the worksheets and select the first; we don't have other worksheets
# sh = gc.open_by_key('1T2If_xR-fhQw6hFejDxdPLLnz1J0lDstTKZ1FJVNwQI') # extract key from https
# worksheet = sh.sheet1

# # fetch data and return a Pandas DataFrame
# results = worksheet.get_all_records()
# job_df = pd.DataFrame(results)
# job_df.set_index(job_df.columns[0], inplace=True)

### Functions

In [141]:
def clean(df):
    
    # make sure we don't fuck up anything
    profile = df.copy()
    
    try:
    
        # add spaces after the comma; only necessary for user profiles
        extra_whitespace = ['Core Values', 'Industry Interest', 'Technical Skills', 'Social Skills', 'Desired Skills']
        profile[extra_whitespace] = profile[extra_whitespace].str.replace(',', ', ')

    except:
        
        pass
    
    # remove English stopwords and lowercase all tokens
    english_stopwords = stopwords.words('english')
    cleaned = [str(word).lower() for word in profile if word not in english_stopwords]

    # stem words of the cleaned list
    stemmer = SnowballStemmer(language='english')
    stemmed_words = [stemmer.stem(word) for word in cleaned]
    
    # transform into string 
    output = " ".join(token for token in stemmed_words)
    
    # remove punctuation, unnecesarry whitespaces and new lines
    output = output.translate(str.maketrans('', '', string.punctuation)).replace('\n', ' ').replace('  ', ' ')
    
    return output

def match_and_rank(user, jobs, top_n):

    # create local student variable
    student = clean(user)

    # student ID, job ID, and the cosine similarity score
    rank_list = []
    
    bow_model = TfidfVectorizer()

    # iterate over all the jobs and match with the local student variable
    for index, job in jobs.iterrows():
        tf_idf = bow_model.fit_transform([student, clean(job)])
        rank_list.append((job.name, jobs.loc[job.name]['Job Title'], cosine_similarity(tf_idf)[0][1].round(2)))

    # return the top n results --> Job title: Similarity score
    sorted_list = sorted(rank_list, key=lambda s: s[2], reverse=True)
    
    return sorted_list[:top_n]

### Matching

https://medium.com/mlearning-ai/enhancing-information-retrieval-via-semantic-and-relevance-matching-64973ff81818

1. Create matrices for both the user and job profiles
2. Fit a bag-of-words based model (e.g. TF-IDF)
3. Calculate the cosine similarity
4. Rank the documents

In [142]:
match_and_rank(users.iloc[0], jobs, 5)    

[('k6zDYaC5RiW85McgaCzYPA', 'Data Engineer', 0.08),
 ('I-NrC.VDRfaZSo-Az6izug', 'Data & Analytics Consultant', 0.07),
 ('aX84O.QlSjKUuSfDWM23Ng', 'Business Analyst', 0.06),
 ('xpEOOL14Rv-YvdNPjp1cFQ', 'Data & Analytics Internship', 0.06),
 ('lyqjIAkxS2qYlpEpZsqggg', 'Commercial Intern London', 0.06)]