Install necessary libraries

In [None]:
!pip install implicit
import logging
import multiprocessing
import os
import time

import gdown
import numpy as np
import pandas as pd
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


Load preprocessed event data

In [None]:
# Google Drive id of the preprocessed events file and the local name it is saved under.
file_id = "1VowGO9LARBtqE5pec3QsaDdruhGTER-J"
output = "processed_events.csv"

# Reuse a previously downloaded copy so that "Restart & Run All" does not
# fetch the (several hundred MB) file again on every run.
if not os.path.exists(output):
    gdown.download(f"https://drive.google.com/uc?id={file_id}", output, quiet=False)

# Load the events and show the first rows as a quick sanity check.
events = pd.read_csv(output)
print(events.head())

Downloading...
From (original): https://drive.google.com/uc?id=1VowGO9LARBtqE5pec3QsaDdruhGTER-J
From (redirected): https://drive.google.com/uc?id=1VowGO9LARBtqE5pec3QsaDdruhGTER-J&confirm=t&uuid=ba1352f1-e2c4-4bda-9753-dfe84b91b632
To: /content/processed_events.csv
100%|██████████| 335M/335M [00:05<00:00, 66.7MB/s]


                 timestamp  visitorid event  itemid  transactionid  \
0  2015-06-02 05:02:12.117     257597  view  355908            NaN   
1  2015-06-02 05:50:14.164     992329  view  248676            NaN   
2  2015-06-02 05:13:19.827     111016  view  318965            NaN   
3  2015-06-02 05:12:35.914     483717  view  253185            NaN   
4  2015-06-02 05:02:17.106     951259  view  367447            NaN   

   categoryid parent_categoryid  
0      1173.0             805.0  
1      1231.0             901.0  
2         NaN           Unknown  
3       914.0             226.0  
4      1613.0             250.0  


Load dataset efficiently

In [None]:
# Read the id columns as 32-bit integers instead of pandas' default 64-bit
# integers to reduce the memory footprint of the frame.
dtype_dict = dict(visitorid=np.int32, itemid=np.int32)
df = pd.read_csv(
    "processed_events.csv",
    dtype=dtype_dict,
)

Prepare user and item mappings

Define & Map event weights, and drop any faulty events

In [None]:
# Weight assigned to each known event type.
event_weights = {
    'view': 1,
    'addtocart': 4,
    'transaction': 15,
}

# Attach the weight as a new column; event types that are missing from
# `event_weights` map to NaN and those rows are dropped.
df = (
    df
    .assign(event_weight=df['event'].map(event_weights))
    .dropna(subset=['event_weight'])
)

Reverse mapping for itemid recovery

In [None]:
# Distinct visitor / item ids in order of first appearance.
unique_users = df['visitorid'].unique()
unique_items = df['itemid'].unique()

# Original id -> dense 0-based index (used as matrix row / column).
user_mapping = dict(zip(unique_users, range(len(unique_users))))
item_mapping = dict(zip(unique_items, range(len(unique_items))))

# Dense column index -> original item id, to translate model output back.
reverse_item_mapping = dict(enumerate(unique_items))

print(f"Number of unique users: {len(unique_users)}")
print(f"Number of unique items: {len(unique_items)}")

Number of unique users: 1407580
Number of unique items: 235061


In [None]:
# Add the dense row / column index of every event next to its original ids.
df = df.assign(
    user_index=df['visitorid'].map(user_mapping),
    item_index=df['itemid'].map(item_mapping),
)

Convert interactions to a memory-efficient sparse matrix

In [None]:
num_users = len(unique_users)
num_items = len(unique_items)

# One entry of 1.0 per event row; repeated (user, item) pairs are summed by
# scipy when the CSR matrix is built, so a cell holds the number of events.
# NOTE(review): df['event_weight'] is computed earlier but is not used here,
# so every event type contributes equally - confirm that this is intended.
row_idx = df['user_index'].to_numpy()
col_idx = df['item_index'].to_numpy()
values = np.ones(len(df), dtype=np.float32)

interaction_matrix_sparse = csr_matrix(
    (values, (row_idx, col_idx)),
    shape=(num_users, num_items),
)

print(f"Shape of interaction matrix: {interaction_matrix_sparse.shape}")

Shape of interaction matrix: (1407580, 235061)


Train Alternating Least Squares (ALS) Model while increasing confidence

In [None]:
# Flat multiplier applied to every interaction count before fitting.
confidence_factor = 15

# `random_state` pins the random factor initialisation so that re-running the
# notebook reproduces the same model (and therefore the same metrics).
als_model = AlternatingLeastSquares(
    factors=50,
    iterations=10,
    regularization=0.1,
    random_state=42,
)
als_model.fit(interaction_matrix_sparse * confidence_factor)

  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]

Train the Nearest Neighbors (nbrs) model on the L2-normalized interaction matrix

In [None]:
# Scale every user row to unit L2 norm before indexing it.
interaction_matrix_normalized = normalize(interaction_matrix_sparse, norm='l2')

# Brute-force cosine search over the user rows; 5 neighbours by default.
nbrs = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
nbrs.fit(interaction_matrix_normalized)

Define precision and recall as evaluation metrics

In [None]:
def precision_at_k(recommendations, actual_interactions, k=10):
    """Fraction of the top-``k`` slots filled with an item the user interacted with.

    Args:
        recommendations: Ranked recommended items; only the first ``k`` count.
        actual_interactions: Items the user actually interacted with.
        k: Cut-off. The denominator is always ``k``, even when fewer than
            ``k`` recommendations are supplied.

    Returns:
        Number of distinct hits divided by ``k``, or 0 when ``k`` is not positive.
    """
    top_k = recommendations[:k] if len(recommendations) > k else recommendations
    hits = set(top_k).intersection(actual_interactions)
    if k > 0:
        return len(hits) / k
    return 0

In [None]:
def recall_at_k(actual_interactions, recommendations, k=10):
    """Fraction of the user's items that appear in the top-``k`` recommendations.

    Args:
        actual_interactions: Items the user actually interacted with.
        recommendations: Ranked recommended items; only the first ``k`` count.
        k: Cut-off applied to ``recommendations``.

    Returns:
        Hits divided by the number of distinct actual items, or 0 when the
        user has no actual items.
    """
    truth = set(actual_interactions)
    top_k = set(recommendations[:k])
    if not truth:
        return 0
    return len(truth.intersection(top_k)) / len(truth)

ALS model recommendation function

In [None]:
def als_recommend(user_id, k=10):
    """Return up to ``k`` item ids with the highest ALS score for ``user_id``.

    Scores are the dot product of the user's latent factors with every item's
    latent factors. Items the user has already interacted with are not
    filtered out.

    Args:
        user_id: Original visitor id (a key of ``user_mapping``).
        k: Maximum number of items to return (defaults to 10).

    Returns:
        Original item ids as plain ``int`` values, best first. Empty when the
        user is unknown or ``k`` is not positive.
    """
    if user_id not in user_mapping or k <= 0:
        return []
    user_idx = user_mapping[user_id]
    user_factors = als_model.user_factors[user_idx]
    predicted_ratings = np.dot(user_factors, als_model.item_factors.T)
    # Highest score first, then keep the first k column indices.
    top_indices = np.argsort(predicted_ratings)[::-1][:k]
    # Translate matrix columns back to item ids; int() keeps the return type
    # consistent with nbrs_recommend.
    return [int(reverse_item_mapping[i]) for i in top_indices if i in reverse_item_mapping]

nbrs Model recommendation function:

In [None]:
def nbrs_recommend(user_id, k=10):
    """Return up to ``k`` item ids that the user's nearest neighbours interacted with.

    The user's row is looked up in the fitted ``nbrs`` index, the user itself
    is removed from the result, and the neighbours' interaction rows are
    summed to score items.

    Args:
        user_id: Original visitor id (a key of ``user_mapping``).
        k: Maximum number of items to return (defaults to 10).

    Returns:
        Original item ids as plain ``int`` values, highest neighbour score
        first. Only items at least one neighbour interacted with are
        returned, so the list may be shorter than ``k``. Empty when the user
        is unknown or ``k`` is not positive.
    """
    if user_id not in user_mapping or k <= 0:
        return []
    user_idx = user_mapping[user_id]

    n_query = 5
    _, indices = nbrs.kneighbors(interaction_matrix_sparse[user_idx], n_neighbors=n_query)
    neighbour_rows = indices.ravel()
    # Drop the query user by value rather than by position: when other users
    # tie at distance 0 the user is not guaranteed to be the first result.
    similar_users = neighbour_rows[neighbour_rows != user_idx][:n_query - 1]

    # Per-item total over the neighbours' rows, flattened to a 1-D array.
    item_scores = np.asarray(interaction_matrix_sparse[similar_users].sum(axis=0)).ravel()
    ranked = item_scores.argsort()[-k:][::-1]
    # Skip zero-score columns: no neighbour touched those items, so they
    # would only be arbitrary padding.
    return [
        int(reverse_item_mapping[i])
        for i in ranked
        if item_scores[i] > 0 and i in reverse_item_mapping
    ]

Optimize lookup for actual user interactions

In [None]:
# visitor id -> set of item ids that visitor interacted with (any event type).
actual_interactions_grouped = {}
for visitor, interacted_item in zip(df['visitorid'], df['itemid']):
    seen_items = actual_interactions_grouped.get(visitor)
    if seen_items is None:
        # First event of this visitor: create the set and register it.
        seen_items = actual_interactions_grouped[visitor] = set()
    seen_items.add(interacted_item)
print(f"Number of unique users with actual interactions: {len(actual_interactions_grouped)}")

Number of unique users with actual interactions: 1407580


Select a smaller sample of users for faster evaluation

In [None]:
# Draw the evaluation sample from a seeded generator so that the reported
# metrics are reproducible across kernel restarts.
_sample_rng = np.random.default_rng(42)
sample_users = _sample_rng.choice(
    list(actual_interactions_grouped.keys()),
    size=min(1000, len(actual_interactions_grouped)),  # at most 1000 users
    replace=False,  # every sampled user is distinct
)

print(f"Number of sampled users: {len(sample_users)}")

Number of sampled users: 1000


Function to make recommendations for a user and measure their precision and recall

In [None]:
def process_user(user_recommendation_tuple):
    """Score one user's recommendations with precision@10 and recall@10.

    Takes a single tuple argument so that it can be used directly with
    ``Pool.map``.

    Args:
        user_recommendation_tuple: ``(user, recommend_func)`` where
            ``recommend_func(user)`` returns the recommended item ids.

    Returns:
        ``(precision, recall)``, or ``None`` when the user received no
        recommendations or when scoring raised an exception.
    """
    user, recommend_func = user_recommendation_tuple
    try:
        recommended_items = recommend_func(user)
        actual_items = actual_interactions_grouped.get(user, set())
        if not recommended_items:
            return None
        precision = precision_at_k(recommended_items, actual_items, k=10)
        recall = recall_at_k(actual_items, recommended_items, k=10)
        return precision, recall
    except Exception:
        # Stay best-effort (one bad user must not abort the whole run), but
        # leave a trace instead of hiding the failure completely.
        logging.getLogger(__name__).warning(
            "Skipping user %r: evaluation failed", user, exc_info=True
        )
        return None

Use multiprocessing for faster evaluation

In [None]:
def evaluate_model(model_name, recommend_func):
    """Average precision@10 and recall@10 of one recommender over ``sample_users``.

    Users are scored by ``process_user`` in a pool of 4 worker processes.
    Users for which ``process_user`` returns ``None`` are left out of both
    averages.

    Args:
        model_name: Label stored under the ``"Model"`` key of the result.
        recommend_func: Callable mapping a user id to recommended item ids.

    Returns:
        Dict with the keys ``"Model"``, ``"Precision@10"`` and ``"Recall@10"``;
        both averages are 0 when no user could be scored.
    """
    print(f"Evaluating model: {model_name}")
    with multiprocessing.Pool(processes=4) as pool:
        tasks = [(user, recommend_func) for user in sample_users]
        results = pool.map(process_user, tasks)

    scored = [res for res in results if res is not None]
    if scored:
        avg_precision = sum(res[0] for res in scored) / len(scored)
        avg_recall = sum(res[1] for res in scored) / len(scored)
    else:
        avg_precision = 0
        avg_recall = 0

    return {"Model": model_name, "Precision@10": avg_precision, "Recall@10": avg_recall}

Start evaluation

In [None]:
# Display name -> recommendation function for every model under comparison.
models = dict(ALS=als_recommend, nbrs=nbrs_recommend)

print("Starting evaluation...")
# One result dict per model, in the order the models are listed above.
evaluation_results = [
    evaluate_model(name, func)
    for name, func in models.items()
]

Starting evaluation...
Evaluating model: ALS
Evaluating model: nbrs


Convert results to DataFrame and display

In [None]:
# One row per evaluated model, one column per key of the result dicts.
results_df = pd.DataFrame(data=evaluation_results)
print("Final Evaluation Results:", results_df, sep="\n")

Final Evaluation Results:
  Model  Precision@10  Recall@10
0   ALS        0.0185   0.119730
1  nbrs        0.1158   0.876148


# Based on the results:
**ALS** (Precision 0.019, Recall 0.120):

* High false positives (because almost all recommendations are irrelevant).
* High false negatives (because it's missing almost all relevant items).

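**nbrs** (Efficient) (Precision 0.116, Recall 0.876):
* Lower false positives (more accurate recommendations).
* Very low false negatives (capturing most of the relevant items).

Note: both models are scored against the same interactions they were fitted on (there is no held-out split), so these figures describe how well each model reproduces known interactions rather than how well it predicts new ones.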


# Example Usage

Example User ID


In [None]:
# Visitor id to generate neighbour-based recommendations for.
user_id = 121688
recommended_items = nbrs_recommend(user_id=user_id)
print(recommended_items)

[380775, 27090, 238865, 302943, 281164, 408846, 358385, 433504, 368193, 282778]


# Save preprocessed user data

In [None]:
#filtered_df = df[['visitorid', 'itemid', 'event', 'user_index', 'item_index', 'event_weight']]
#filtered_df.to_csv("filtered_events_updated.csv", index=False)