In [162]:
import gspread

import numpy as np 
import pandas as pd 
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# clean tokens
from nltk.corpus import stopwords
import string

from helper import * 

### User profiles

In [18]:
# this is the "input data from" Glide
dummy_data = pythonanywhere_api_call("Kevin Tran; University of Amsterdam; Data Science; Python, \
    SQL, Machine Learning, DevOps Engineering")

# preprocess data
user_data = dummy_data.split(';')

# create df
user_df = pd.DataFrame(data=[user_data], columns=['Name', 'University', 'Major', 'Skills'])

### Job profiles

In [19]:
# connect to spreadsheets using json credentials downloaded from Google Cloud
gc = gspread.service_account(filename='credentials.json')

# open the worksheets and select the first; we don't have other worksheets
sh = gc.open_by_key('1T2If_xR-fhQw6hFejDxdPLLnz1J0lDstTKZ1FJVNwQI') # extract key from https
worksheet = sh.sheet1

# fetch data and return a Pandas DataFrame
results = worksheet.get_all_records()
job_df = pd.DataFrame(results)
job_df.set_index(job_df.columns[0], inplace=True)

### Matching

https://medium.com/mlearning-ai/enhancing-information-retrieval-via-semantic-and-relevance-matching-64973ff81818

1. Create matrices for both the user and job profiles
2. Fit a bag-of-words based model (e.g. TF-IDF)
3. Calculate the cosine similarity
4. Rank the documents

In [247]:
def clean_data(df, user=True):

    if user == True:
        data = df.iloc[0, 1:].values
    else:
        data = df['Job title'].iloc[0] + ", " + df['Preferred skills'].iloc[0]

    # remove whitespaces and comma, and lowercase all tokens
    phase1 = "".join(word for word in data).split(',')
    phase1 = [word.strip().lower() for word in phase1]
    
    # remove stopwords
    dutch_stopwords_and_punctuation = stopwords.words('dutch') + [string.punctuation]
    phase2 = [word for word in phase1 if word not in dutch_stopwords_and_punctuation]

    # create final string
    output = " ".join(token for token in phase2)

    return output

def match(user, job):

    bow_model = TfidfVectorizer()
    tf_idf = bow_model.fit_transform([clean_data(user), clean_data(job, False)])

    return print('This user has a {}% match with this job profile.'.format((cosine_similarity(tf_idf)[0][1] * 100).round(2)))


In [248]:
match(user_df, job_df)

This user has a 20.95% match with this job profile.
