In [2]:
import pandas as pd

In [3]:
# Load datasets
# event might be 'view', 'transaction'. 'addtocart'
events = pd.read_csv('./input/events.csv')
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [4]:
# Now, we want to convert the datasets to the following format
# data = {
#     'userId': [1, 1, 1, 2, 2, 3],
#     'productId': ['A', 'B', 'C', 'A', 'B', 'C'],
#     'view_count': [5, 3, 7, 2, 8, 6],
# }

# filter events first for only view transaction, so that later to calculate view_count
events = events[events['event'].isin(['view'])]

# Use group by to count occurrences of each event type for each visitor and item
events_grouped = events.groupby(['visitorid', 'itemid', 'event']).size().unstack(fill_value=0).reset_index()

# Rename columns
events_grouped.columns = ['visitor_id', 'item_id', 'view_count']
top_1000_records = events_grouped.sort_values(by='view_count', ascending=False).head(10000)
events_grouped = top_1000_records
print(events_grouped)


         visitor_id  item_id  view_count
591991       388556   306289         308
1930131     1272794   413901         233
2075538     1369328   356339         212
1238811      816229   396064         180
1782370     1172198   414460         176
...             ...      ...         ...
577256       378812   328428           6
2042380     1347019   343731           6
637523       418408   235281           6
468693       308826   459936           6
1326555      873793   457717           6

[10000 rows x 3 columns]


In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
# Creating the user-item matrix
user_item_matrix = events_grouped.pivot_table(index='visitor_id', columns='item_id', values='view_count', fill_value=0)

# Calculating cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Function to recommend items based on similar users
def recommend_items_for_user(visitor_id, top_n=3):
    # Similar users
    similar_users = user_similarity_df[visitor_id].sort_values(ascending=False).index[1:]

    # Aggregating views from similar users
    similar_users_views = user_item_matrix.loc[similar_users].sum().sort_values(ascending=False)

    # Exclude items already viewed by the user
    viewed_items = user_item_matrix.loc[visitor_id]
    recommendations = similar_users_views[~similar_users_views.index.isin(viewed_items[viewed_items > 0].index)]

    return recommendations.head(top_n).index.tolist()

In [6]:
user_id = 816229
recommending = recommend_items_for_user(user_id, 10)
print(recommending)

[119736, 461686, 40870, 9877, 306289, 325310, 369447, 413901, 356339, 241555]
