<a href="https://colab.research.google.com/github/Haaseth-Abhiram/E-commerce-Recommendation-Engine/blob/main/E_commerce_Recommendation_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# E-commerce Recommendation Engine
User-based and Item-based Collaborative Filtering using Surprise

In [None]:
!pip uninstall -y numpy scikit-surprise
!pip install numpy==1.26.4
!pip install scikit-surprise

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
[0mCollecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)


In [None]:
!pip install scikit-surprise

import pandas as pd
import numpy as np

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from collections import defaultdict


Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2555930 sha256=3b10d4f948b82bb25ef4b6e190d6b7d410d400014ae4d3fb80b132770efd3aff
  Stored in directory: /root/.cache/pip/wheels/75/fa/bc/739bc2cb1fbaab6061854e6cfbb81a0ae52c92a502a7fa454b
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [None]:
# Read the raw purchase log from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="purchase_data.csv")
df.head(n=5)


Unnamed: 0,date,customer_id,product_category,payment_method,value [USD],time_on_site [Minutes],clicks_in_site,Unnamed: 7
0,20/11/2018,37077,505,credit,49.53,12.0,8,
1,20/11/2018,59173,509,paypal,50.61,25.9,8,
2,20/11/2018,41066,507,credit,85.99,34.9,11,
3,20/11/2018,50741,506,credit,34.6,16.5,9,
4,20/11/2018,53639,515,paypal,266.27,43.1,30,


In [None]:
# Keep only customers that appear in at least two rows
user_counts = df['customer_id'].value_counts()
frequent_users = user_counts.loc[user_counts >= 2].index
df = df.loc[df['customer_id'].isin(frequent_users)]

# Then keep only product categories that still appear in at least two rows
# (counted after the customer filter above has been applied)
item_counts = df['product_category'].value_counts()
frequent_items = item_counts.loc[item_counts >= 2].index
df = df.loc[df['product_category'].isin(frequent_items)]

print("Cleaned data shape:", df.shape)

Cleaned data shape: (9701, 8)


In [None]:
# The purchase value is used as the "rating", so the rating scale spans the
# observed minimum and maximum of that column.
purchase_values = df['value [USD]']
reader = Reader(rating_scale=(purchase_values.min(), purchase_values.max()))

# Columns are passed in the order customer, product category, purchase value.
ratings_frame = df.loc[:, ['customer_id', 'product_category', 'value [USD]']]
data = Dataset.load_from_df(ratings_frame, reader)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
# User-based KNN: similarities are computed between users with the cosine measure.
user_sim_options = {'user_based': True, 'name': 'cosine'}
user_cf = KNNBasic(sim_options=user_sim_options)

user_cf.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x789572089370>

In [None]:
# Item-based KNN: same cosine measure, but similarities are computed between items.
item_sim_options = {'user_based': False, 'name': 'cosine'}
item_cf = KNNBasic(sim_options=item_sim_options)

item_cf.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x789571d1c7a0>

In [None]:
def precision_recall_at_k(predictions, k=5, threshold=3.5):
    """Return the mean precision@k and recall@k over all users.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``true_r`` and ``est`` are used.
        k: Number of highest-estimated items considered per user.
        threshold: An item counts as relevant when ``true_r >= threshold`` and
            as recommended when ``est >= threshold``. It must be on the same
            scale as the ratings; the default of 3.5 is only meaningful for
            ratings whose range straddles that value.

    Returns:
        ``(mean_precision, mean_recall)`` averaged over the users present in
        ``predictions``. A user with no recommended item in the top k gets
        precision 0, and a user with no relevant item gets recall 0.
        ``(0.0, 0.0)`` is returned when ``predictions`` is empty.
    """
    # Group (estimate, true rating) pairs by user.
    user_est_true = defaultdict(list)
    for uid, _iid, true_r, est, _details in predictions:
        user_est_true[uid].append((est, true_r))

    # Nothing to average over: avoid dividing by zero below.
    if not user_est_true:
        return 0.0, 0.0

    precisions = {}
    recalls = {}

    for uid, ratings in user_est_true.items():
        # Rank this user's items by estimated rating, best first.
        ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = ratings[:k]

        # Relevant items are counted over ALL of the user's items, not just the top k.
        n_rel = sum(true_r >= threshold for (_, true_r) in ratings)
        n_rec_k = sum(est >= threshold for (est, _) in top_k)
        n_rel_and_rec_k = sum(
            (true_r >= threshold and est >= threshold)
            for (est, true_r) in top_k
        )

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0.0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0.0

    return (
        sum(precisions.values()) / len(precisions),
        sum(recalls.values()) / len(recalls)
    )


In [None]:
user_preds = user_cf.test(testset)
item_preds = item_cf.test(testset)

# The ratings here are purchase values in USD, so the function's default
# cut-off of 3.5 would mark nearly every purchase as relevant and inflate both
# metrics. Use the median purchase value instead: "relevant" then means an
# above-median purchase. This is a modelling choice; adjust it if a different
# definition of relevance is wanted.
relevance_threshold = df['value [USD]'].median()

user_precision, user_recall = precision_recall_at_k(
    user_preds, k=5, threshold=relevance_threshold
)
item_precision, item_recall = precision_recall_at_k(
    item_preds, k=5, threshold=relevance_threshold
)

print("User-CF Precision@5:", user_precision)
print("User-CF Recall@5:", user_recall)

print("Item-CF Precision@5:", item_precision)
print("Item-CF Recall@5:", item_recall)


User-CF Precision@5: 0.9816004672897196
User-CF Recall@5: 0.9836448598130841
Item-CF Precision@5: 0.9611565420560748
Item-CF Recall@5: 0.9632009345794392


In [None]:
def get_top_n(model, user_id, n=5, data=None):
    """Return the ``n`` highest-scored product categories the user has not bought.

    Args:
        model: Object exposing ``predict(user_id, item)`` whose result has an
            ``est`` attribute (the estimated rating).
        user_id: Value looked up in the ``customer_id`` column.
        n: Maximum number of recommendations to return.
        data: DataFrame with ``customer_id`` and ``product_category`` columns.
            Defaults to the notebook-level ``df`` when omitted.

    Returns:
        List of ``(item, estimated_rating)`` tuples sorted by estimate,
        highest first, containing at most ``n`` entries.
    """
    frame = df if data is None else data

    items = frame['product_category'].unique()
    # A set gives constant-time membership tests instead of scanning an array
    # once per candidate item.
    rated_items = set(
        frame.loc[frame['customer_id'] == user_id, 'product_category']
    )

    predictions = [
        (item, model.predict(user_id, item).est)
        for item in items
        if item not in rated_items
    ]

    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return predictions[:n]

In [None]:
# Use the customer in the first remaining row as a demo user.
sample_user = df['customer_id'].iat[0]

# Score the same user with both models, then show the two lists together.
user_cf_top5 = get_top_n(user_cf, sample_user)
item_cf_top5 = get_top_n(item_cf, sample_user)

print("Top-5 (User-CF):", user_cf_top5)
print("Top-5 (Item-CF):", item_cf_top5)

Top-5 (User-CF): [(509, 250.9705), (508, 242.99724999999998), (506, 241.18625000000003), (514, 233.50125000000003), (504, 215.56825000000003)]
Top-5 (Item-CF): [(506, 140.15926763059923), (507, 138.47074517121317), (504, 135.4297002853697), (509, 134.0171360777906), (512, 133.54482238434363)]
