In [1]:
# This notebook is inspired by: https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101

In [2]:
# Import Libraries
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [3]:
# Read the job-view log from CSV and preview its first five rows
job_view_df = pd.read_csv(filepath_or_buffer='job_view.csv')
job_view_df.head()

Unnamed: 0,applicant,job
0,856312b2-78ea-4788-a814-412d5b6c76fe,b30044e3-fb9c-40ab-b7d1-c6df50251a64
1,7d6e8650-ab4c-4d5a-ba17-2d3c003b74f8,c721a742-1f75-42c2-9038-8fe380fe9e33
2,798e21d9-dc5a-44d3-93b5-517c83e4ad5d,7565155d-78a0-4168-92da-e3f685e52cb8
3,a57353e8-e85b-498c-8d60-d1d9a485e1b0,9e0dd838-3bef-440c-ad88-8d25da69d685
4,0d7dc2eb-848f-4824-90fd-e0b8ede397bb,ff936b5c-47f8-4e19-b762-607e377f9fa9


In [4]:
# Read the job-application log from CSV and preview its first five rows
job_application_df = pd.read_csv(filepath_or_buffer='job_application.csv')
job_application_df.head()

Unnamed: 0,applicant,job
0,a57353e8-e85b-498c-8d60-d1d9a485e1b0,9e0dd838-3bef-440c-ad88-8d25da69d685
1,a57353e8-e85b-498c-8d60-d1d9a485e1b0,9ccf8308-260e-4e87-abd1-cac5da434a98
2,a57353e8-e85b-498c-8d60-d1d9a485e1b0,6c8dd9d9-0e23-43de-b89a-f676646c594b
3,a57353e8-e85b-498c-8d60-d1d9a485e1b0,99eb4ff0-400e-4e65-ba60-6b8f39145870
4,a57353e8-e85b-498c-8d60-d1d9a485e1b0,c87ea47d-2ebc-4d0f-8014-c2de9ed1e850


In [5]:
# Tag each source frame with its event type, then stack views and applications
# into a single interaction log.
job_view_df['eventType'] = 'view'
job_application_df['eventType'] = 'application'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. Like append, it keeps each frame's own row
# labels, so index values repeat across the two sources.
interactions_df = pd.concat([job_view_df, job_application_df])
interactions_df.head(5)

Unnamed: 0,applicant,job,eventType
0,856312b2-78ea-4788-a814-412d5b6c76fe,b30044e3-fb9c-40ab-b7d1-c6df50251a64,view
1,7d6e8650-ab4c-4d5a-ba17-2d3c003b74f8,c721a742-1f75-42c2-9038-8fe380fe9e33,view
2,798e21d9-dc5a-44d3-93b5-517c83e4ad5d,7565155d-78a0-4168-92da-e3f685e52cb8,view
3,a57353e8-e85b-498c-8d60-d1d9a485e1b0,9e0dd838-3bef-440c-ad88-8d25da69d685,view
4,0d7dc2eb-848f-4824-90fd-e0b8ede397bb,ff936b5c-47f8-4e19-b762-607e377f9fa9,view


In [6]:
# Weight of each interaction type: an application counts twice as much as a view
event_type_strength = dict(view=1.0, application=2.0)

# Look up the weight for every row's event type; an event type missing from the
# table raises KeyError.
interactions_df['eventStrength'] = interactions_df['eventType'].map(
    lambda event_type: event_type_strength[event_type]
)

In [7]:
# Define smoothed user preference
def smooth_user_preference(x):
    """Compress a summed interaction strength onto a log2 scale.

    Args:
        x: Total event strength for one applicant/job pair.

    Returns:
        float: log2(1 + x), e.g. 0 -> 0.0, 1 -> 1.0, 3 -> 2.0.

    Raises:
        ValueError: If 1 + x is not positive.
    """
    # math.log2 is used instead of math.log(v, 2): the two-argument form divides
    # two rounded natural logs and can miss exact powers of two. The results
    # are later used as group keys, so exact values matter.
    return math.log2(1 + x)
    
# Sum the event weights per applicant/job pair, smooth each total, and turn the
# grouped result back into a flat frame.
interactions_full_df = (
    interactions_df
    .groupby(['applicant', 'job'])['eventStrength']
    .sum()
    .apply(smooth_user_preference)
    .reset_index()
)
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

# of unique user/item interactions: 60791


Unnamed: 0,applicant,job,eventStrength
0,000487e0-55f9-46a8-a25a-4bb9c13e575e,20bbca98-05c1-4b39-8466-d42efbe918ae,1.0
1,000487e0-55f9-46a8-a25a-4bb9c13e575e,325b9956-aad9-42c8-934a-c8d486aa42ca,1.0
2,000487e0-55f9-46a8-a25a-4bb9c13e575e,d81da1e6-34a2-431b-a4f8-fbce65b234a6,1.0
3,000487e0-55f9-46a8-a25a-4bb9c13e575e,f00a7243-15a4-4995-b30a-cb915b8c707a,1.0
4,00471d37-f357-412c-a06f-a1ee1dde7571,10bc4bae-0155-4ff8-95ff-abb29a14163e,1.0
5,00471d37-f357-412c-a06f-a1ee1dde7571,68228002-fd59-468d-8e23-d2bc6b63bd63,1.0
6,00471d37-f357-412c-a06f-a1ee1dde7571,87946b49-f05d-463b-a25b-e02c249054cd,1.0
7,00471d37-f357-412c-a06f-a1ee1dde7571,c36ed6cc-637f-4e99-9101-b3665e60ef8b,1.0
8,00471d37-f357-412c-a06f-a1ee1dde7571,fca8fdca-b180-44cf-8bad-da045885214c,1.0
9,0049339a-4620-4db6-b7b9-063abdfb5b87,2875a7b0-3c51-46d3-8192-0ecd51a5e94f,1.0


In [8]:
# Count how many applicant/job pairs fall on each smoothed strength value
count = interactions_full_df.groupby('eventStrength').size()
print(count)

eventStrength
1.000000    57791
1.584963      791
2.000000     2209
dtype: int64


In [9]:
# Model evaluation: hold out 20% of the interactions as a test set (fixed seed)
interactions_train_df, interactions_test_df = train_test_split(
    interactions_full_df,
    random_state=42,
    test_size=0.2,
)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 48632
# interactions on Test set: 12159


In [11]:
## Top-N Accuracy ##
# Index the full, train and test frames by applicant so that per-applicant
# lookups during evaluation go through the index.
(
    interactions_full_indexed_df,
    interactions_train_indexed_df,
    interactions_test_indexed_df,
) = (
    frame.set_index('applicant')
    for frame in (interactions_full_df, interactions_train_df, interactions_test_df)
)

In [12]:
def get_items_interacted(applicant, interactions_df):
    """Return the set of jobs found on an applicant's interaction rows.

    Args:
        applicant: Applicant id to look up in the frame's index.
        interactions_df: Interactions frame indexed by applicant, with a
            'job' column.

    Returns:
        set: Distinct values of 'job' on the applicant's rows.

    Raises:
        KeyError: If the applicant is not present in the index.
    """
    # A list key makes .loc return a Series even when only one row matches,
    # so the single-row case needs no special handling.
    jobs = interactions_df.loc[[applicant], 'job']
    return set(jobs)

In [37]:
## Collaborative Filtering ##

# Matrix Factorization

# Build the applicant x job table from the training interactions: applicants in
# rows, jobs in columns, eventStrength as the cell value. Pairs with no
# interaction are filled with 0, so the result is a dense frame.
applicant_job_pivot_matrix_df = (
    interactions_train_df
    .pivot(index='applicant', columns='job', values='eventStrength')
    .fillna(0)
)

applicant_job_pivot_matrix_df.shape

(5531, 6798)

In [38]:
# Plain NumPy view of the pivot table, used as the input to the factorization
applicant_job_pivot_matrix = applicant_job_pivot_matrix_df.to_numpy()
applicant_job_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
# Row labels of the pivot table, in matrix row order
applicant_ids = applicant_job_pivot_matrix_df.index.tolist()
applicant_ids[:10]

['000487e0-55f9-46a8-a25a-4bb9c13e575e',
 '00471d37-f357-412c-a06f-a1ee1dde7571',
 '0049339a-4620-4db6-b7b9-063abdfb5b87',
 '00508393-df8e-416e-8d94-07c4201bef73',
 '005a16c1-6bd7-4027-90a9-03b9194d7cb7',
 '00615048-808f-4ca1-8e0f-12de3d125556',
 '00618bcb-a5fd-474c-9abd-76beb0b88482',
 '006286eb-3027-423b-b560-d41bad44d623',
 '00683440-8d42-40a2-a85e-3934fc423983',
 '007d618c-361c-4693-a42f-06daffa5c779']

In [40]:
# Column labels of the pivot table, in matrix column order
job_ids = applicant_job_pivot_matrix_df.columns.tolist()
job_ids[:10]

['00012e5d-3b1a-43ab-8d1e-95518826fe32',
 '0003c633-23ef-4619-abd8-d12792273271',
 '0008d561-e5e2-483c-bc5a-f8fe59362346',
 '000ecf1e-c0f3-42c8-beb1-e245f837f03e',
 '00279eab-8ebd-4173-bce5-4ce484e60c02',
 '003485f8-c770-402a-8a63-8ee68a41412e',
 '003557ea-1870-4062-8211-6aae9b0a8815',
 '004616f3-1b65-4393-a45a-97a2f92c9ee7',
 '006c44a8-7abb-4724-8f5e-f851bba8c3ce',
 '006fe145-4a37-4b81-8899-3fc1b3e6cf61']

In [41]:
# Number of latent factors (singular values/vectors) to keep when factorizing
# the applicant-job matrix.
# NOTE(review): svds needs k smaller than both matrix dimensions — confirm
# against the scipy documentation if the data set shrinks.
NUMBER_OF_FACTORS_MF = 240
# Factorize the applicant-job matrix with a truncated SVD. The three results
# are unpacked as U, sigma and Vt; sigma is expanded with np.diag in a later
# cell before the factors are multiplied back together.
U, sigma, Vt = svds(applicant_job_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [35]:
# Shape check of the left factor: rows follow the pivot matrix rows
# (applicants), columns are the NUMBER_OF_FACTORS_MF latent factors.
U.shape

(5531, 240)

In [20]:
# Shape check of the right factor: rows are the NUMBER_OF_FACTORS_MF latent
# factors, columns follow the pivot matrix columns (jobs).
Vt.shape

(240, 6798)

In [21]:
# Expand the 1-D vector of singular values into a diagonal matrix for the
# reconstruction product. The ndim guard keeps this cell idempotent: np.diag
# on an already 2-D array extracts its diagonal back into a vector, so an
# unguarded re-run would undo the conversion and break the matrix product.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
sigma.shape

(240, 240)

In [22]:
# Multiply the three factors back together, (U . sigma) . Vt, to get a
# predicted score for every applicant/job cell of the pivot matrix.
all_user_predicted_ratings = U.dot(sigma).dot(Vt)
all_user_predicted_ratings

array([[-1.83962528e-04,  4.84382291e-03,  3.86597158e-03, ...,
        -2.00114781e-03, -9.77894907e-03, -6.01437908e-03],
       [ 3.42850903e-04, -3.35913865e-03, -7.59034747e-04, ...,
         4.62796959e-03,  3.13551232e-03,  6.15640645e-03],
       [ 1.55627523e-03, -8.96185918e-03, -1.43004237e-02, ...,
         1.88338449e-02, -1.23257728e-02,  2.50089539e-03],
       ...,
       [-3.21348002e-04, -1.07243505e-02, -1.15527716e-03, ...,
        -4.44681962e-03,  7.08869907e-03, -5.71578317e-04],
       [-1.68024235e-04, -1.42497989e-02, -4.88927944e-03, ...,
         3.56950491e-03,  1.17701958e-03,  1.02886419e-03],
       [-5.06363532e-04,  9.65282460e-01, -7.33419652e-04, ...,
        -5.08779697e-03,  1.60749958e-03, -1.36394115e-02]])

In [23]:
# Wrap the reconstructed matrix in a DataFrame labelled with applicant ids
# (rows) and job ids (columns)
cf_preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    index=applicant_ids,
    columns=job_ids,
)
cf_preds_df.head(10)

Unnamed: 0,00012e5d-3b1a-43ab-8d1e-95518826fe32,0003c633-23ef-4619-abd8-d12792273271,0008d561-e5e2-483c-bc5a-f8fe59362346,000ecf1e-c0f3-42c8-beb1-e245f837f03e,00279eab-8ebd-4173-bce5-4ce484e60c02,003485f8-c770-402a-8a63-8ee68a41412e,003557ea-1870-4062-8211-6aae9b0a8815,004616f3-1b65-4393-a45a-97a2f92c9ee7,006c44a8-7abb-4724-8f5e-f851bba8c3ce,006fe145-4a37-4b81-8899-3fc1b3e6cf61,...,ff982f10-9989-4dcd-a855-415a9fd290ad,ffa40787-8baa-4609-9dec-0aa578cb26f6,ffc5dfef-82c1-44d6-b6bf-cd53b7d0d12c,ffc6b005-9295-4579-b7b2-ef7d5ce3ad5f,ffc9c5d3-85f0-4f2b-a861-c2622a539bd9,ffd3f67e-86b7-4b3a-8861-74b1e315a978,ffde7b9b-0a19-40f0-ac72-823b38073d5d,ffe5e5ec-f16b-4d24-a2c0-133203d46ab5,fff06470-353a-4db2-a1d3-4b70351a37b6,fff302aa-68ad-464f-88d7-83d8ca4ed639
000487e0-55f9-46a8-a25a-4bb9c13e575e,-0.000184,0.004844,0.003866,-0.001255,0.011126,-0.000519,0.003334,-0.001425,-0.003753,0.007135,...,-0.0019,-0.005738,-0.007514,0.001228,0.00025,-0.000952,-0.003103,-0.002001,-0.009779,-0.006014
00471d37-f357-412c-a06f-a1ee1dde7571,0.000343,-0.003359,-0.000759,-5.7e-05,-0.001465,-0.000534,-0.000246,0.000932,-0.000867,0.008604,...,0.000964,-0.000149,-0.001708,-0.004411,0.013498,-0.000144,-0.000646,0.004628,0.003136,0.006156
0049339a-4620-4db6-b7b9-063abdfb5b87,0.001556,-0.008962,-0.0143,0.002335,-0.000318,-0.004355,0.003961,-0.005346,-0.004033,-0.002729,...,-0.005627,-0.01049,0.004622,-0.007047,-0.00969,0.004295,0.000258,0.018834,-0.012326,0.002501
00508393-df8e-416e-8d94-07c4201bef73,-0.005005,0.01755,0.010238,0.001211,-0.003676,0.004202,-0.01186,0.000243,0.002818,0.004415,...,-0.009956,0.022978,-0.002193,-0.015454,0.004518,-0.00216,0.001859,-0.007137,0.000727,-0.005371
005a16c1-6bd7-4027-90a9-03b9194d7cb7,0.000332,-0.00059,-0.001351,0.000117,-0.002361,0.001029,0.00441,-0.001908,0.000999,-0.000673,...,-0.000175,0.000293,-0.00065,-0.001143,-0.000556,-0.000763,-0.004628,-0.003471,-0.000847,-0.001117
00615048-808f-4ca1-8e0f-12de3d125556,-0.004228,-0.014011,0.00049,0.000158,-0.005321,-0.00035,-0.017102,0.010635,0.002173,0.007791,...,-0.002683,0.003822,-0.004035,-0.000831,-0.011321,-0.003572,-0.009486,0.021686,-0.005001,0.005577
00618bcb-a5fd-474c-9abd-76beb0b88482,-0.003543,-0.002801,-0.008189,0.000527,0.009981,-0.001053,-0.015164,-0.005904,0.006431,-3.3e-05,...,0.017167,0.020251,0.004559,0.000622,-0.034357,-0.003276,0.002083,-0.007914,0.084397,-0.008183
006286eb-3027-423b-b560-d41bad44d623,0.000102,-0.001022,-0.000427,-0.000119,-0.001201,0.000367,0.003195,-0.001998,0.001752,0.000566,...,-0.001385,-0.000403,0.000305,-0.001435,0.000447,-0.001542,-0.003679,0.005163,-0.002019,-0.000582
00683440-8d42-40a2-a85e-3934fc423983,-0.000281,-0.019674,0.001354,-0.000595,8.4e-05,0.001518,0.00159,0.001572,0.002411,0.002511,...,-0.002013,0.010549,0.000517,-0.00457,-0.002023,-0.000405,0.000495,-0.004335,-0.005817,0.0006
007d618c-361c-4693-a42f-06daffa5c779,0.000579,0.035497,0.054859,0.002125,-0.007606,-0.001256,-0.002344,-0.00394,0.000362,7.3e-05,...,-0.021686,-0.005104,0.005926,-0.001781,-0.026333,0.000405,-0.007022,0.015277,-0.057318,0.008071


In [24]:
# Unpivot the prediction matrix into long format: one row per applicant/job
# pair. reset_index exposes the applicant ids as a column named "index", which
# is renamed to 'applicant' by the column assignment below.
cf_preds_list = (
    cf_preds_df
    .reset_index()
    .melt(id_vars="index", var_name="job", value_name="predicted rating")
)
cf_preds_list.columns = ['applicant', 'job', 'predicted rating']

In [48]:
# Reduce the prediction list to a single applicant, best-rated jobs first
def get_single_applicant_pred(x, predictions=None):
    """Return one applicant's predicted ratings, highest first.

    Args:
        x: Applicant id to select.
        predictions: Long-format frame with 'applicant', 'job' and
            'predicted rating' columns. Defaults to the notebook-level
            ``cf_preds_list``, so existing one-argument calls keep working.

    Returns:
        pandas.DataFrame: The rows of ``predictions`` whose 'applicant' equals
        ``x``, sorted by 'predicted rating' in descending order with their
        original row labels. Empty if the applicant does not occur.
    """
    if predictions is None:
        # Resolved at call time, so the default follows the current global.
        predictions = cf_preds_list
    applicant_rows = predictions[predictions['applicant'] == x]
    return applicant_rows.sort_values(by=['predicted rating'], ascending=False)
# Example: the ten highest predicted ratings for one applicant
cf_preds_list_single = get_single_applicant_pred("000487e0-55f9-46a8-a25a-4bb9c13e575e")
cf_preds_list_single.head(10)

Unnamed: 0,applicant,job,predicted rating
35332028,000487e0-55f9-46a8-a25a-4bb9c13e575e,f00a7243-15a4-4995-b30a-cb915b8c707a,0.42764
7444726,000487e0-55f9-46a8-a25a-4bb9c13e575e,325b9956-aad9-42c8-934a-c8d486aa42ca,0.325707
31764533,000487e0-55f9-46a8-a25a-4bb9c13e575e,d81da1e6-34a2-431b-a4f8-fbce65b234a6,0.306029
4778784,000487e0-55f9-46a8-a25a-4bb9c13e575e,20bbca98-05c1-4b39-8466-d42efbe918ae,0.23335
34878486,000487e0-55f9-46a8-a25a-4bb9c13e575e,ed967b82-87c2-499e-8af0-ef43e68a0f3f,0.17877
586286,000487e0-55f9-46a8-a25a-4bb9c13e575e,04d77c5f-19c7-427e-bc95-fd7dcb996e09,0.178227
29607443,000487e0-55f9-46a8-a25a-4bb9c13e575e,ca532145-5d12-46bc-a5e0-6eb07bb2a88f,0.128559
22704755,000487e0-55f9-46a8-a25a-4bb9c13e575e,9db97e64-58d2-40f5-a05b-4855f83cdde3,0.108221
23705866,000487e0-55f9-46a8-a25a-4bb9c13e575e,a472254e-1385-48e1-819b-f42b53a464cd,0.107949
31637320,000487e0-55f9-46a8-a25a-4bb9c13e575e,d71b3612-57f4-410a-b50f-a961a1508ed4,0.097253


In [49]:
# Reduce the prediction list to a single job, best-rated applicants first
def get_single_job_pred(x, predictions=None):
    """Return one job's predicted ratings, highest first.

    Args:
        x: Job id to select.
        predictions: Long-format frame with 'applicant', 'job' and
            'predicted rating' columns. Defaults to the notebook-level
            ``cf_preds_list``, so existing one-argument calls keep working.

    Returns:
        pandas.DataFrame: The rows of ``predictions`` whose 'job' equals
        ``x``, sorted by 'predicted rating' in descending order with their
        original row labels, and with the columns ordered as
        'job', 'applicant', 'predicted rating'. Empty if the job does not occur.
    """
    if predictions is None:
        # Resolved at call time, so the default follows the current global.
        predictions = cf_preds_list
    job_rows = predictions[predictions['job'] == x]
    ranked = job_rows.sort_values(by=['predicted rating'], ascending=False)
    return ranked[['job', 'applicant', 'predicted rating']]
# Example: the ten highest predicted ratings for one job.
# NOTE(review): this rebinds cf_preds_list_single, which the previous cell used
# for the per-applicant result.
cf_preds_list_single = get_single_job_pred("00012e5d-3b1a-43ab-8d1e-95518826fe32")
cf_preds_list_single.head(10)

Unnamed: 0,job,applicant,predicted rating
413,00012e5d-3b1a-43ab-8d1e-95518826fe32,12fd2b58-f0b9-4745-89e9-186029e303f0,0.06763
2184,00012e5d-3b1a-43ab-8d1e-95518826fe32,63039ee5-6848-4e3b-9a86-c7954a9c6e34,0.057358
165,00012e5d-3b1a-43ab-8d1e-95518826fe32,07322344-d337-4b42-8b9e-6866cbb81c4a,0.046674
5409,00012e5d-3b1a-43ab-8d1e-95518826fe32,fad71da6-184f-489e-ba14-ea4d5615ca7d,0.043009
2082,00012e5d-3b1a-43ab-8d1e-95518826fe32,5dc34e9b-f7dd-4b43-aa65-326b3dc2f71c,0.039096
1426,00012e5d-3b1a-43ab-8d1e-95518826fe32,3ff0f41a-1869-4385-8868-3554953cca83,0.030611
920,00012e5d-3b1a-43ab-8d1e-95518826fe32,2953d6a6-dc38-4e44-b14e-d57cf4a1228c,0.030518
3126,00012e5d-3b1a-43ab-8d1e-95518826fe32,8e777947-ed9b-4958-bed4-f91f921fdc51,0.026047
4663,00012e5d-3b1a-43ab-8d1e-95518826fe32,d81a9570-3cd2-4fe0-9807-ec77f9ab2ddd,0.025026
1469,00012e5d-3b1a-43ab-8d1e-95518826fe32,41752b65-9b98-418b-90c6-4b6a2590f016,0.024627
