In [1]:
# Import Libraries
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [2]:
# Load the job-view events; each row pairs an applicant id with a job id
JOB_VIEW_CSV = 'job_view.csv'
job_view_df = pd.read_csv(JOB_VIEW_CSV)
job_view_df.head()

Unnamed: 0,applicant,job
0,856312b2-78ea-4788-a814-412d5b6c76fe,b30044e3-fb9c-40ab-b7d1-c6df50251a64
1,7d6e8650-ab4c-4d5a-ba17-2d3c003b74f8,c721a742-1f75-42c2-9038-8fe380fe9e33
2,798e21d9-dc5a-44d3-93b5-517c83e4ad5d,7565155d-78a0-4168-92da-e3f685e52cb8
3,a57353e8-e85b-498c-8d60-d1d9a485e1b0,9e0dd838-3bef-440c-ad88-8d25da69d685
4,0d7dc2eb-848f-4824-90fd-e0b8ede397bb,ff936b5c-47f8-4e19-b762-607e377f9fa9


In [3]:
# Load the job-application events; each row pairs an applicant id with a job id
JOB_APPLICATION_CSV = 'job_application.csv'
job_application_df = pd.read_csv(JOB_APPLICATION_CSV)
job_application_df.head()

Unnamed: 0,applicant,job
0,a57353e8-e85b-498c-8d60-d1d9a485e1b0,9e0dd838-3bef-440c-ad88-8d25da69d685
1,a57353e8-e85b-498c-8d60-d1d9a485e1b0,9ccf8308-260e-4e87-abd1-cac5da434a98
2,a57353e8-e85b-498c-8d60-d1d9a485e1b0,6c8dd9d9-0e23-43de-b89a-f676646c594b
3,a57353e8-e85b-498c-8d60-d1d9a485e1b0,99eb4ff0-400e-4e65-ba60-6b8f39145870
4,a57353e8-e85b-498c-8d60-d1d9a485e1b0,c87ea47d-2ebc-4d0f-8014-c2de9ed1e850


In [4]:
# Tag every row with the kind of event it came from, then stack both sources
# into a single interaction log.
job_view_df['eventType'] = 'view'
job_application_df['eventType'] = 'application'
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. Like append, it keeps each frame's original row labels.
interactions_df = pd.concat([job_view_df, job_application_df])
interactions_df.head(5)

Unnamed: 0,applicant,job,eventType
0,856312b2-78ea-4788-a814-412d5b6c76fe,b30044e3-fb9c-40ab-b7d1-c6df50251a64,view
1,7d6e8650-ab4c-4d5a-ba17-2d3c003b74f8,c721a742-1f75-42c2-9038-8fe380fe9e33,view
2,798e21d9-dc5a-44d3-93b5-517c83e4ad5d,7565155d-78a0-4168-92da-e3f685e52cb8,view
3,a57353e8-e85b-498c-8d60-d1d9a485e1b0,9e0dd838-3bef-440c-ad88-8d25da69d685,view
4,0d7dc2eb-848f-4824-90fd-e0b8ede397bb,ff936b5c-47f8-4e19-b762-607e377f9fa9,view


In [5]:
# Weight of each event type: an application counts twice as much as a view
event_type_strength = {'view': 1.0, 'application': 2.0}

# Look up the weight for every interaction row (an unknown event type raises KeyError)
interactions_df['eventStrength'] = interactions_df['eventType'].map(
    lambda event: event_type_strength[event]
)

In [59]:
# Define the smoothing applied to summed user preference strengths
def smooth_user_preference(x):
    """Compress an accumulated interaction strength with a base-2 logarithm.

    Computes log2(1 + x), so a strength of 0 maps to 0 and larger totals
    grow only logarithmically.

    Args:
        x: Summed event strength for one applicant/job pair (must be > -1).

    Returns:
        float: log2(1 + x).

    Raises:
        ValueError: If 1 + x is not positive.
    """
    # math.log2 is the dedicated base-2 routine; math.log(v, 2) divides two
    # natural logs and can be off by an ulp (e.g. for exact powers of two).
    return math.log2(1 + x)
    
# Add up the event weights of each (applicant, job) pair, then smooth the totals
pair_strength_totals = interactions_df.groupby(['applicant', 'job'])['eventStrength'].sum()
interactions_full_df = pair_strength_totals.apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

# of unique user/item interactions: 60791


Unnamed: 0,applicant,job,eventStrength
0,000487e0-55f9-46a8-a25a-4bb9c13e575e,20bbca98-05c1-4b39-8466-d42efbe918ae,1.0
1,000487e0-55f9-46a8-a25a-4bb9c13e575e,325b9956-aad9-42c8-934a-c8d486aa42ca,1.0
2,000487e0-55f9-46a8-a25a-4bb9c13e575e,d81da1e6-34a2-431b-a4f8-fbce65b234a6,1.0
3,000487e0-55f9-46a8-a25a-4bb9c13e575e,f00a7243-15a4-4995-b30a-cb915b8c707a,1.0
4,00471d37-f357-412c-a06f-a1ee1dde7571,10bc4bae-0155-4ff8-95ff-abb29a14163e,1.0
5,00471d37-f357-412c-a06f-a1ee1dde7571,68228002-fd59-468d-8e23-d2bc6b63bd63,1.0
6,00471d37-f357-412c-a06f-a1ee1dde7571,87946b49-f05d-463b-a25b-e02c249054cd,1.0
7,00471d37-f357-412c-a06f-a1ee1dde7571,c36ed6cc-637f-4e99-9101-b3665e60ef8b,1.0
8,00471d37-f357-412c-a06f-a1ee1dde7571,fca8fdca-b180-44cf-8bad-da045885214c,1.0
9,0049339a-4620-4db6-b7b9-063abdfb5b87,2875a7b0-3c51-46d3-8192-0ecd51a5e94f,1.0


In [9]:
# Tally how many applicant/job pairs fall on each smoothed strength value
pairs_by_strength = interactions_full_df.groupby(['eventStrength'])
count = pairs_by_strength.size()
print(count)

eventStrength
1.000000    57791
1.584963      791
2.000000     2209
dtype: int64


In [70]:
## Collaborative Filtering ##

# Matrix Factorization

# Build the applicant x job matrix of smoothed strengths (applicants in rows,
# jobs in columns); pairs without any interaction are filled with 0.
# Note: the result is a dense DataFrame, not a sparse structure.
applicant_job_pivot_matrix_df = (
    interactions_full_df
    .pivot(index='applicant', columns='job', values='eventStrength')
    .fillna(0)
)

applicant_job_pivot_matrix_df.shape

(5754, 7206)

In [71]:
# Plain NumPy array of the applicant x job matrix, as input for the factorization
applicant_job_pivot_matrix = applicant_job_pivot_matrix_df.to_numpy()
applicant_job_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [72]:
# Row labels of the pivot matrix: applicant ids in matrix row order
applicant_ids = applicant_job_pivot_matrix_df.index.tolist()
applicant_ids[:10]

['000487e0-55f9-46a8-a25a-4bb9c13e575e',
 '00471d37-f357-412c-a06f-a1ee1dde7571',
 '0049339a-4620-4db6-b7b9-063abdfb5b87',
 '00508393-df8e-416e-8d94-07c4201bef73',
 '005a16c1-6bd7-4027-90a9-03b9194d7cb7',
 '00615048-808f-4ca1-8e0f-12de3d125556',
 '00618bcb-a5fd-474c-9abd-76beb0b88482',
 '006286eb-3027-423b-b560-d41bad44d623',
 '00683440-8d42-40a2-a85e-3934fc423983',
 '007d618c-361c-4693-a42f-06daffa5c779']

In [73]:
# Column labels of the pivot matrix: job ids in matrix column order
job_ids = applicant_job_pivot_matrix_df.columns.tolist()
job_ids[:10]

['00012e5d-3b1a-43ab-8d1e-95518826fe32',
 '0003c633-23ef-4619-abd8-d12792273271',
 '0008d561-e5e2-483c-bc5a-f8fe59362346',
 '000dfaa0-fd2b-4f65-9872-399bf7a5b93f',
 '000ecf1e-c0f3-42c8-beb1-e245f837f03e',
 '00279eab-8ebd-4173-bce5-4ce484e60c02',
 '003485f8-c770-402a-8a63-8ee68a41412e',
 '003557ea-1870-4062-8211-6aae9b0a8815',
 '004616f3-1b65-4393-a45a-97a2f92c9ee7',
 '006c44a8-7abb-4724-8f5e-f851bba8c3ce']

In [111]:
# Number of latent factors (rank of the truncated SVD) used to approximate the
# applicant-job matrix.
# NOTE(review): the saved outputs of the following cells report 120 factors, so
# they were produced with a different value than the one set here - re-run to refresh.
NUMBER_OF_FACTORS_MF = 240
# Truncated SVD of the applicant-job matrix: U (applicants x factors),
# sigma (singular values, 1-D) and Vt (factors x jobs)
U, sigma, Vt = svds(applicant_job_pivot_matrix, k=NUMBER_OF_FACTORS_MF)

In [112]:
# Shape of the left factor returned by svds: (rows of the applicant-job matrix, k)
U.shape

(5754, 120)

In [113]:
# Shape of the right factor returned by svds: (k, columns of the applicant-job matrix)
Vt.shape

(120, 7206)

In [114]:
# svds returns the singular values as a 1-D vector; expand them into a diagonal
# matrix so they can sit between U and Vt in the matrix product.
# np.diag applied to a 2-D array extracts its diagonal instead, so an unguarded
# re-run of this cell would collapse sigma back to a vector and break the
# reconstruction. Guarding on ndim keeps the cell idempotent.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
sigma.shape

(120, 120)

In [115]:
# Rebuild the low-rank approximation of the applicant-job matrix: (U . sigma) . Vt
all_user_predicted_ratings = U @ sigma @ Vt
all_user_predicted_ratings

array([[ 6.27199156e-04, -1.36223753e-03, -1.57108002e-03, ...,
         1.92290409e-03, -8.55859507e-03, -2.64100599e-03],
       [-3.24556154e-04, -2.87128914e-02,  1.46725289e-02, ...,
         7.57453765e-03, -1.91475991e-02,  1.68947839e-03],
       [-3.37100862e-03, -3.78477141e-02, -1.27790957e-03, ...,
         9.73811056e-04, -8.64016883e-04, -1.72202592e-03],
       ...,
       [ 2.18216677e-04,  1.20914393e-02, -1.08254024e-02, ...,
         2.90592138e-05,  5.60662557e-03,  4.26152304e-03],
       [-3.61623339e-04, -1.59049529e-02,  5.74295378e-04, ...,
        -9.28664458e-05, -5.14309621e-03,  8.16575586e-04],
       [-7.61738442e-04,  9.67438386e-01,  1.20810194e-02, ...,
         7.73312431e-03, -3.07827232e-03, -5.76144989e-03]])

In [116]:
# Wrap the reconstructed matrix in a DataFrame labelled by applicant (rows) and job (columns)
cf_preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    index=applicant_ids,
    columns=job_ids,
)
cf_preds_df.head(10)

Unnamed: 0,00012e5d-3b1a-43ab-8d1e-95518826fe32,0003c633-23ef-4619-abd8-d12792273271,0008d561-e5e2-483c-bc5a-f8fe59362346,000dfaa0-fd2b-4f65-9872-399bf7a5b93f,000ecf1e-c0f3-42c8-beb1-e245f837f03e,00279eab-8ebd-4173-bce5-4ce484e60c02,003485f8-c770-402a-8a63-8ee68a41412e,003557ea-1870-4062-8211-6aae9b0a8815,004616f3-1b65-4393-a45a-97a2f92c9ee7,006c44a8-7abb-4724-8f5e-f851bba8c3ce,...,ff982f10-9989-4dcd-a855-415a9fd290ad,ffa40787-8baa-4609-9dec-0aa578cb26f6,ffc5dfef-82c1-44d6-b6bf-cd53b7d0d12c,ffc6b005-9295-4579-b7b2-ef7d5ce3ad5f,ffc9c5d3-85f0-4f2b-a861-c2622a539bd9,ffd3f67e-86b7-4b3a-8861-74b1e315a978,ffde7b9b-0a19-40f0-ac72-823b38073d5d,ffe5e5ec-f16b-4d24-a2c0-133203d46ab5,fff06470-353a-4db2-a1d3-4b70351a37b6,fff302aa-68ad-464f-88d7-83d8ca4ed639
000487e0-55f9-46a8-a25a-4bb9c13e575e,0.000627,-0.001362,-0.001571,-0.000117,2.2e-05,0.002264,0.000209,2.4e-05,0.001436,-0.00097,...,-0.002198,-0.002582,-0.002118,-0.001459,-0.000749,-0.000206,-0.000592,0.001923,-0.008559,-0.002641
00471d37-f357-412c-a06f-a1ee1dde7571,-0.000325,-0.028713,0.014673,-0.000472,0.000267,-0.001386,-0.000282,-0.001307,0.000687,-0.002115,...,0.001478,-0.000279,-0.000216,-0.005743,0.010237,-0.000705,-6e-05,0.007575,-0.019148,0.001689
0049339a-4620-4db6-b7b9-063abdfb5b87,-0.003371,-0.037848,-0.001278,0.000979,0.001041,0.001071,-0.000116,0.0044,0.002174,-0.002818,...,0.006908,-0.008121,0.003499,-0.002202,0.011446,0.004461,0.000759,0.000974,-0.000864,-0.001722
00508393-df8e-416e-8d94-07c4201bef73,-0.002148,-0.083614,-0.013853,-0.003344,-0.00267,0.005327,0.001988,0.002775,-0.00322,0.006623,...,0.003107,0.005239,-0.011959,0.009709,-0.022335,-0.00162,0.003986,-0.002995,0.01583,0.016281
005a16c1-6bd7-4027-90a9-03b9194d7cb7,-0.000164,0.006773,0.000958,-1.6e-05,8.5e-05,0.000285,0.000439,0.000525,-0.000814,0.001358,...,-0.000817,-0.000563,-4.1e-05,0.001133,0.000111,0.000335,-0.005013,0.000675,-0.003527,0.002579
00615048-808f-4ca1-8e0f-12de3d125556,-0.000825,-0.012546,0.002297,6.4e-05,-0.000119,-0.003039,-0.00163,0.00082,0.003378,-0.004724,...,0.000365,-0.012002,0.001809,-0.011204,0.001015,-0.001794,-0.003433,0.001208,-0.001126,-0.009456
00618bcb-a5fd-474c-9abd-76beb0b88482,-0.000568,-0.008301,0.015046,-0.000275,0.00016,-0.000679,0.000309,0.001419,0.00399,0.002436,...,-0.002579,-0.008828,-0.002974,0.006109,-0.022717,-6.8e-05,-0.005882,0.008477,0.023425,0.009937
006286eb-3027-423b-b560-d41bad44d623,-5.4e-05,0.002945,0.000237,1.8e-05,3.5e-05,0.000138,0.000279,0.000623,-0.000614,0.000526,...,-0.000783,-1.3e-05,6.1e-05,0.000902,0.000602,1.2e-05,-0.002647,-0.000643,-0.002008,0.001439
00683440-8d42-40a2-a85e-3934fc423983,-7e-06,-0.020632,0.009829,-0.000314,-0.000131,-0.001444,0.000839,-5.5e-05,-0.000459,0.000342,...,-0.00257,0.009725,0.001013,-0.00341,-0.009602,0.000364,-0.000462,0.002177,0.00901,0.006947
007d618c-361c-4693-a42f-06daffa5c779,0.000433,0.094843,0.075414,-0.004395,-0.002661,-0.005109,0.003074,-0.00254,-0.002715,-0.003304,...,0.003463,0.007794,-0.005527,0.008599,-0.002627,-0.00272,-0.002008,0.003627,-0.012344,-0.004051


In [117]:
# Unpivot the prediction matrix into a long table with one row per
# (applicant, job) pair and its predicted rating.
# Note: this yields rows x columns of cf_preds_df entries, which can be very large.
cf_preds_list = (
    cf_preds_df
    .rename_axis('applicant')
    .reset_index()
    .melt(id_vars='applicant', var_name='job', value_name='predicted rating')
)

In [118]:
# Predicted ratings for one applicant (000487e0-55f9-46a8-a25a-4bb9c13e575e), best matches first
def get_single_applicant_pred(x):
    """Return all predicted ratings for applicant ``x``, highest rating first.

    Reads the module-level ``cf_preds_list`` long table.

    Args:
        x: Applicant id to look up.

    Returns:
        DataFrame with the columns of ``cf_preds_list``, restricted to rows whose
        'applicant' equals ``x`` and sorted by 'predicted rating' descending.
    """
    is_target_applicant = cf_preds_list['applicant'] == x
    applicant_rows = cf_preds_list.loc[is_target_applicant]
    return applicant_rows.sort_values('predicted rating', ascending=False)
cf_preds_list_single = get_single_applicant_pred("000487e0-55f9-46a8-a25a-4bb9c13e575e")
cf_preds_list_single.head(10)

Unnamed: 0,applicant,job,predicted rating
38977596,000487e0-55f9-46a8-a25a-4bb9c13e575e,f00a7243-15a4-4995-b30a-cb915b8c707a,0.227628
644448,000487e0-55f9-46a8-a25a-4bb9c13e575e,04d77c5f-19c7-427e-bc95-fd7dcb996e09,0.198704
38476998,000487e0-55f9-46a8-a25a-4bb9c13e575e,ed967b82-87c2-499e-8af0-ef43e68a0f3f,0.185919
35024598,000487e0-55f9-46a8-a25a-4bb9c13e575e,d81da1e6-34a2-431b-a4f8-fbce65b234a6,0.149637
5270664,000487e0-55f9-46a8-a25a-4bb9c13e575e,20bbca98-05c1-4b39-8466-d42efbe918ae,0.131881
8205204,000487e0-55f9-46a8-a25a-4bb9c13e575e,325b9956-aad9-42c8-934a-c8d486aa42ca,0.118966
15760206,000487e0-55f9-46a8-a25a-4bb9c13e575e,61a30286-2d7a-41b1-85f2-b62ac95a5783,0.114797
32607918,000487e0-55f9-46a8-a25a-4bb9c13e575e,ca532145-5d12-46bc-a5e0-6eb07bb2a88f,0.107731
7670082,000487e0-55f9-46a8-a25a-4bb9c13e575e,2f5d769d-1c83-4cd6-930e-f44b06dcfe1d,0.107137
18286212,000487e0-55f9-46a8-a25a-4bb9c13e575e,7103f1bc-53f0-4bb3-9a5c-f19ea333cbb9,0.103006


In [120]:
# Predicted ratings for one job (00012e5d-3b1a-43ab-8d1e-95518826fe32), best-matching applicants first
def get_single_job_pred(x):
    """Return all predicted ratings for job ``x``, highest rating first.

    Reads the module-level ``cf_preds_list`` long table.

    Args:
        x: Job id to look up.

    Returns:
        DataFrame with columns 'job', 'applicant', 'predicted rating' (in that
        order), restricted to rows whose 'job' equals ``x`` and sorted by
        'predicted rating' descending.
    """
    output_columns = ['job', 'applicant', 'predicted rating']
    job_rows = cf_preds_list.loc[cf_preds_list['job'] == x, output_columns]
    return job_rows.sort_values('predicted rating', ascending=False)
cf_preds_list_single = get_single_job_pred("00012e5d-3b1a-43ab-8d1e-95518826fe32")
cf_preds_list_single.head(10)

Unnamed: 0,job,applicant,predicted rating
2268,00012e5d-3b1a-43ab-8d1e-95518826fe32,63039ee5-6848-4e3b-9a86-c7954a9c6e34,0.05038
430,00012e5d-3b1a-43ab-8d1e-95518826fe32,12fd2b58-f0b9-4745-89e9-186029e303f0,0.045091
5629,00012e5d-3b1a-43ab-8d1e-95518826fe32,fad71da6-184f-489e-ba14-ea4d5615ca7d,0.033936
1484,00012e5d-3b1a-43ab-8d1e-95518826fe32,3ff0f41a-1869-4385-8868-3554953cca83,0.033527
5729,00012e5d-3b1a-43ab-8d1e-95518826fe32,ff219002-c3ed-47a5-b858-7d77c32deae6,0.02873
1529,00012e5d-3b1a-43ab-8d1e-95518826fe32,41752b65-9b98-418b-90c6-4b6a2590f016,0.024132
2163,00012e5d-3b1a-43ab-8d1e-95518826fe32,5dc34e9b-f7dd-4b43-aa65-326b3dc2f71c,0.021336
416,00012e5d-3b1a-43ab-8d1e-95518826fe32,1266cc2f-dc75-4fb8-bda6-01195b53ff2a,0.021314
168,00012e5d-3b1a-43ab-8d1e-95518826fe32,07322344-d337-4b42-8b9e-6866cbb81c4a,0.021027
5007,00012e5d-3b1a-43ab-8d1e-95518826fe32,deddd21d-f724-492d-8e8b-e0167db00869,0.019622
