In [1]:
import gspread

import numpy as np 
import pandas as pd 
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# clean tokens
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import re

from helper import * 

### Test profiles

In [2]:
# user profiles
important_user_columns = ['Row ID', 'Woonplaats', 'School', 'Studie', 'Studie Jaar', 'Ambitie', 'Core Values', 
                          'Industry Interest', 'Technical Skills', 'Social Skills', 'Desired Skills']

users = pd.read_csv('Main Users.csv')[important_user_columns].set_index('Row ID')

# job profiles
important_job_columns = ['🔒 Row ID', 'Job Title', 'Type of Contract', 'Industry', 'Place', 'Where', 'Salary per hour',
                         'Hours per week', 'Language', 'Education', 'WhatYouLearn', 'WhatYouDo', 'WhyUs',
                         'WhoAreYou', 'Core Values']

jobs = pd.read_csv('Vacatures.csv')[important_job_columns].set_index('🔒 Row ID')

In [4]:
jobs

Unnamed: 0_level_0,Job Title,Type of Contract,Industry,Place,Where,Salary per hour,Hours per week,Language,Education,WhatYouLearn,WhatYouDo,WhyUs,WhoAreYou,Core Values
🔒 Row ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5FHiPOELRT-S1QXx8J4FdQ,Full Stack Developer,Parttime,Audit,Amsterdam,Hybrid,15.0,24,"English,Dutch",WO,As a part-time full stack developer at StuDire...,"In this role, you will have the chance to lear...","At StuDirect, we are passionate about helping ...",We are looking for a motivated and talented de...,"Personal Growth,Making Impact"
aX84O.QlSjKUuSfDWM23Ng,Business Analyst,Fulltime,Food,Tilburg,Hybride,20.0,40,"English,Dutch,German",WO,As our Business Analyst (BA) you will play a k...,- Business process (re-)engineering and docume...,Your learnings from our clients and input will...,"- Perseverance, drive and a deep feeling of ow...",
I-NrC.VDRfaZSo-Az6izug,Data & Analytics Consultant,Fulltime,Food,Tilburg,Hybride,20.0,40,"English,Dutch,German",WO,Do you want to work in the team that is buildi...,- Be the linking pin between technology and bu...,Your learnings from our clients and input will...,"- Perseverance, drive and a deep feeling of ow...",
k6zDYaC5RiW85McgaCzYPA,Data Engineer,Fulltime,Food,Tilburg,Hybride,20.0,40,"English,Dutch,German",WO,Do you want to work in the team that is buildi...,- Together with the cloud and data architects ...,Your excellent engineering capabilities will b...,"- Perseverance, drive and a deep feeling of ow...",
xpEOOL14Rv-YvdNPjp1cFQ,Data & Analytics Internship,Intern,Food,Tilburg,Hybride,2.5,40,"English,Dutch,German",WO,You are involved in unlocking data from variou...,The internship will take place at our office i...,Since Sjors & Gert-Jan started working togethe...,"An enthusiastic, independent student in the di...",
a-Lr.-KjKSEaWwvYJwITNkQ,Marketing - Internship,Intern,Food,Tilburg,Hybride,2.5,40,"English,Dutch,German",WO,"As a marketer, you are first doing a market an...",The internship will take place at our office i...,Since Sjors & Gert-Jan started working togethe...,"An enthusiastic, independent student in the di...",
lyqjIAkxS2qYlpEpZsqggg,Commercial Intern London,Intern,Travelling,Amsterdam,On-site,2.5,40,"English,German,French",HBO,"As a Commercial Intern at Hotelplanner, you’ll...",- Commercial Partnerships\n- Hotel Account Man...,HotelPlanner is a leading travel technology co...,Available to start in Jan/Feb 2023\n- Preferab...,
XBW6LNUCROKzvtyrtz66AQ,Commercial Intern Amsterdam,Intern,Travelling,London,On-site,2.5,40,"English,German,French",HBO,"As a Commercial Intern at Hotelplanner, you’ll...",- Commercial Partnerships \n- Hotel Account Ma...,HotelPlanner is a leading travel technology co...,Available to start in Jan/Feb 2023\n- Preferab...,
a.W88WIUnTJasiP2DXNALVA,Marketing Internship,Intern,Subsidies,Amsterdam,On-site,2.5,40,Dutch,HBO,Imagine: using your knowledge and experience f...,A marketing intern at a subsidy company will b...,"As a subsidy company, we are dedicated to supp...",Our company is seeking a marketing intern with...,


### Actual profiles

In [360]:
# change this to the actual Glide data to test the functions
dummy_data = pythonanywhere_api_call("Kevin Tran; University of Amsterdam; Data Science; Python, \
    SQL, Machine Learning, DevOps Engineering")

# # connect to spreadsheets using json credentials downloaded from Google Cloud
# gc = gspread.service_account(filename='credentials.json')

# # open the worksheets and select the first; we don't have other worksheets
# sh = gc.open_by_key('1T2If_xR-fhQw6hFejDxdPLLnz1J0lDstTKZ1FJVNwQI') # extract key from https
# worksheet = sh.sheet1

# # fetch data and return a Pandas DataFrame
# results = worksheet.get_all_records()
# job_df = pd.DataFrame(results)
# job_df.set_index(job_df.columns[0], inplace=True)

### Functions

In [3]:
def clean(df):
    
    # make sure we don't fuck up anything
    profile = df.copy()
    
    try:
    
        # add spaces after the comma; only necessary for user profiles
        extra_whitespace = ['Core Values', 'Industry Interest', 'Technical Skills', 'Social Skills', 'Desired Skills']
        profile[extra_whitespace] = profile[extra_whitespace].str.replace(',', ', ')

    except:
        
        pass
    
    # remove English stopwords and lowercase all tokens
    english_stopwords = stopwords.words('english')
    cleaned = [str(word).lower() for word in profile if word not in english_stopwords]

    # stem words of the cleaned list
    stemmer = SnowballStemmer(language='english')
    stemmed_words = [stemmer.stem(word) for word in cleaned]
    
    # transform into string 
    output = " ".join(token for token in stemmed_words)
    
    # remove punctuation, unnecesarry whitespaces and new lines
    output = output.translate(str.maketrans('', '', string.punctuation)).replace('\n', ' ').replace('  ', ' ')
    
    return output

def match_and_rank(user, jobs, top_n):

    # create local student variable
    student = clean(user)

    # student ID, job ID, and the cosine similarity score
    rank_list = []
    
    bow_model = TfidfVectorizer()

    # iterate over all the jobs and match with the local student variable
    for index, job in jobs.iterrows():
        tf_idf = bow_model.fit_transform([student, clean(job)])
        rank_list.append((job.name, jobs.loc[job.name]['Job Title'], cosine_similarity(tf_idf)[0][1].round(2)))

    # return the top n results --> Job title: Similarity score
    sorted_list = sorted(rank_list, key=lambda s: s[2], reverse=True)
    
    return sorted_list[:top_n]

'data analytics consult fulltim food tilburg hybrid 200 40 englishdutchgerman wo do you want to work in the team that is building the next generation data platform for the foodindustry are you the linking pin between business engineering and ready to actually make an impact stop looking any further we need you  as our data analytics consultant you will play a key role in the ffateam and our clients you will be the person business stakeholders talk to when it comes to their wishes and requests based on those wishes and requests you will define requirements both on a functional and technical level and while you are at it implement and deliver the new functionality in the clients solution as well be the linking pin between technology and business whilst truly getting down to the bottom of things design develop and implement product features into the ffa standardized products and client projects anticipate identify and solve issues concerning data management to improve data quality analyze

In [13]:
users.iloc[0]

Woonplaats                                                   Amsterdam
School                                      Universiteit van Amsterdam
Studie                                                 Informatiekunde
Studie Jaar                                          3e jaar, Bachelor
Ambitie              Lorem ipsum dolor sit amet, consectetur adipis...
Core Values          Sustainability,Personal Growth,Making Impact,S...
Industry Interest             Consultancy,BlockChain,Finance,A.I.,Web3
Technical Skills     Python,Java,R,Data Analytics,Artificial Intell...
Social Skills        Effective Communication,Conflict Resolution,Re...
Desired Skills                                        Python,Java,Perl
Name: vptDoho0QCKzqTkhBxzQrA, dtype: object

### Matching

https://medium.com/mlearning-ai/enhancing-information-retrieval-via-semantic-and-relevance-matching-64973ff81818

1. Create matrices for both the user and job profiles
2. Fit a bag-of-words based model (e.g. TF-IDF)
3. Calculate the cosine similarity
4. Rank the documents

In [142]:
" ".join(word for word in list)

[('k6zDYaC5RiW85McgaCzYPA', 'Data Engineer', 0.08),
 ('I-NrC.VDRfaZSo-Az6izug', 'Data & Analytics Consultant', 0.07),
 ('aX84O.QlSjKUuSfDWM23Ng', 'Business Analyst', 0.06),
 ('xpEOOL14Rv-YvdNPjp1cFQ', 'Data & Analytics Internship', 0.06),
 ('lyqjIAkxS2qYlpEpZsqggg', 'Commercial Intern London', 0.06)]

In [14]:
str(['appel'])

"['appel']"