In [1]:
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
interactions = pd.read_csv("data_train_subset.csv")

In [3]:
interactions.head()

Unnamed: 0,users,jobs,event,timestamps
0,2531442,93409,click,1582436367
1,2038815,168399,click,1582597367
2,2678041,122067,click,1582407982
3,1073494,9017,click,1581885544
4,2727150,92678,bookmark,1582145466


# train

In [4]:
jobs = pd.Series(interactions.jobs).unique()
users = pd.Series(interactions.users).unique()
edge_list = list(zip(interactions['users'], interactions['jobs']))
unique_edges = list(set(edge_list))
len(unique_edges)

1000

In [5]:
def create_graph(jobs,users,unique_edges):
    G = nx.Graph()
    G.add_nodes_from(users, bipartite=0, label='users')
    G.add_nodes_from(jobs, bipartite=1, label='jobs')
    G.add_edges_from(unique_edges)
    return G

In [6]:
G=create_graph(jobs,users,unique_edges)

In [7]:
partition_0_nodes= users.tolist()
partition_0_node_count = len(partition_0_nodes)

partition_1_nodes= jobs.tolist()
partition_1_node_count = len(partition_1_nodes)

# Print the node counts
print("Partition 0 node count:", partition_0_node_count)
print("Partition 1 node count:", partition_1_node_count)

Partition 0 node count: 999
Partition 1 node count: 994


In [8]:
users_index = list(enumerate(partition_0_nodes, start=0))
jobs_index=list(enumerate(partition_1_nodes, start=0))

In [9]:
users_dict= dict(users_index)
users_dict= {value: key for key, value in users_dict.items()}
serial_to_job_dict= dict(jobs_index)
jobs_dict= {value: key for key, value in serial_to_job_dict.items()}

In [10]:
r_score = np.zeros((partition_1_node_count, partition_1_node_count))

In [11]:
def fill_matrix(node1,u_dash,node2,alpha,beta):
    p_idash_udash= 1/ abs(len(list(G.neighbors(node1))))**alpha
    p_udash_i= 1/ abs((abs(len(list(G.neighbors(u_dash))))**alpha) * (abs(len(list(G.neighbors(node2))))**beta) ) 
    return (p_idash_udash*p_udash_i)

In [12]:
def sim_score(node1,node2,alpha,beta):
    r_score=0;
    for u_dash in list(G.neighbors(node1)):
        if G.has_edge(u_dash, node2):
            r_score+=fill_matrix(node1,u_dash,node2,alpha,beta)
    return r_score

In [13]:
def item_sim_matrix(alpha,beta):
    for node1,x in jobs_dict.items():
        for node2,y in jobs_dict.items():
            r_score[x][y]+= sim_score(node1,node2,alpha,beta)
    print(r_score)

In [15]:
alpha= 0.61447198
beta=0.1443548
item_sim_matrix(alpha,beta)

[[2.        0.        0.        ... 0.        0.        0.       ]
 [0.        2.3639069 0.        ... 0.        0.        0.       ]
 [0.        0.        2.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 2.        0.        0.       ]
 [0.        0.        0.        ... 0.        2.        0.       ]
 [0.        0.        0.        ... 0.        0.        2.       ]]


In [16]:
def recommendations(job,r_score,n):
    job_idx= jobs_dict.get(job)
    user_scores = r_score[job_idx]
    sorted_scores_indices = np.argsort(-user_scores)
    top_n_recommendation_list = sorted_scores_indices[:n]
    top_n_recommendations = [serial_to_job_dict.get(x) for x in top_n_recommendation_list]
    return top_n_recommendations

In [17]:
def new_user_recommendations(n):
    # Calculate item popularity (counting the number of interactions)
    item_popularity = interactions['jobs'].value_counts().reset_index()
    item_popularity.columns = ['item', 'popularity']
    sorted_items = item_popularity.sort_values('popularity', ascending=False)
    top_n_recommendations = sorted_items['item'].head(n).tolist()
    return top_n_recommendations

In [18]:
recommendations(50672,r_score,10)

[50672, 93409, 172810, 100184, 75256, 128498, 139149, 20358, 57789, 51660]

In [19]:
# recommendations(2563823,r_score,10)
2563823 in users

False

In [20]:
def  realtime_recommendations(user,n):
    if not (user in users):
        key_list=list(new_user_recommendations(n))
        return key_list
    user_interactions = interactions[interactions['users'] == user]
    sorted_interactions = user_interactions.sort_values(by='timestamps', ascending=False)
    user_data = interactions[interactions['users'] == user]
    user_data= user_data['jobs'].nunique()
    if (user_data>=10):
        user_data=10
    most_recent_items = sorted_interactions.head(user_data)['jobs'].tolist()
    
    R={}
    for recent_item in most_recent_items:
        items_list=recommendations(recent_item,r_score,n)
        
        for similar_item in items_list:
            similarity_score= r_score[jobs_dict.get(recent_item)][jobs_dict.get(similar_item)]
            if similar_item in R:
                R[similar_item].append(similarity_score)
            else:
                R[similar_item] = [similarity_score]
            
    for similar_item, similarity_scores in R.items():
        R[similar_item] = sum(similarity_scores)
    
    sorted_R= sorted(R.items(), key=lambda x: x[1], reverse=True)
    top_n_items = dict(sorted_R[:n])
    
    key_list = list(top_n_items.keys())
    return key_list

In [21]:
realtime_recommendations(1073494,10)

[9017, 93409, 172810, 100184, 75256, 128498, 139149, 20358, 57789, 51660]

In [22]:
realtime_recommendations(2563823,10)

[168399, 115366, 146291, 54370, 50277, 33678, 50672, 139358, 17578, 136590]

# test

In [23]:
test_data = pd.read_csv("data_test_subset.csv")
test_data.head()

Unnamed: 0,users,jobs,event,timestamps
0,2563823,98021,click,1581994931
1,2448122,122656,click,1581526970
2,310204,107928,click,1581579765
3,1118737,120433,click,1582587666
4,427433,12303,click,1581531903


In [24]:
total_users = len(test_data['users'].unique())
precision_sum = 0
recall_sum = 0
total_users

80

In [25]:
  for user_id in test_data['users'].unique():
    true_positives = 0
    recommended_jobs = realtime_recommendations(user_id,50) 
    user_items = test_data.loc[test_data['users'] == user_id, 'jobs']
    
    for item in recommended_jobs:
        if item in user_items:
            true_positives += 1
    
    precision = true_positives / len(recommended_jobs)
    recall = true_positives / len(user_items)
    
    precision_sum += precision
    recall_sum += recall
    
# Calculate average precision and recall
average_precision = precision_sum / total_users
average_recall = recall_sum / total_users

print("Average Precision:", average_precision)
print("Average Recall:", average_recall)

Average Precision: 0.0
Average Recall: 0.0
