# 🛒 Retail & E-commerce Recommendation Engine
#### (Now includes Section 4.3: Model Performance Comparison Table)

This notebook builds baseline and session-based recommendation models, then compares their performance as per the project requirements.


In [1]:
# Imports and setup

import sys
!{sys.executable} -m pip install pandas numpy matplotlib scikit-learn torch pyspark kagglehub
import pandas as pd
import kagglehub
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Admins\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


## 1. Data Loading & Cleaning

In [2]:
# Load your data (adjust paths)
path = kagglehub.dataset_download("retailrocket/ecommerce-dataset")
events = pd.read_csv(path+'/events.csv')
# events.csv columns: timestamp, visitorid, event, itemid, transactionid

# Remove duplicates
events = events.drop_duplicates()

# Timestamp to datetime
events['datetime'] = pd.to_datetime(events['timestamp'], unit='ms')

# Remove bots (users with >200 events in any hour)
user_hour = events.groupby(['visitorid', pd.Grouper(key='datetime', freq='H')]).size().reset_index(name='event_count')
bots = user_hour[user_hour['event_count'] > 200]['visitorid'].unique()
events = events[~events['visitorid'].isin(bots)]

# Filter users/items with >=5 interactions
active_users = events['visitorid'].value_counts()[lambda x: x >= 5].index
active_items = events['itemid'].value_counts()[lambda x: x >= 5].index
events = events[events['visitorid'].isin(active_users) & events['itemid'].isin(active_items)]

Resuming download from 132120576 bytes (172599398 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/retailrocket/ecommerce-dataset?dataset_version_number=2 (132120576/304719974) bytes left.


100%|██████████| 291M/291M [00:16<00:00, 10.7MB/s]

Extracting files...



  user_hour = events.groupby(['visitorid', pd.Grouper(key='datetime', freq='H')]).size().reset_index(name='event_count')


## 2. Feature Engineering

In [3]:
# Session split: new session if gap > 30min for a user
events = events.sort_values(['visitorid', 'datetime'])
events['session_id'] = (events.groupby('visitorid')['datetime']
    .diff().gt(timedelta(minutes=30)).cumsum().astype(int))
events['session_id'] = events['visitorid'].astype(str) + '_' + events['session_id'].astype(str)

# Event encoding
event_map = {'view': 1, 'addtocart': 3, 'transaction': 5}
events['event_weight'] = events['event'].map(event_map)

# Encode user/item
user_enc = LabelEncoder()
item_enc = LabelEncoder()
events['user_idx'] = user_enc.fit_transform(events['visitorid'])
events['item_idx'] = item_enc.fit_transform(events['itemid'])

## 3. Data Split (Temporal)

In [4]:
max_date = events['datetime'].max()
train_cutoff = max_date - timedelta(weeks=2)
val_cutoff = max_date - timedelta(weeks=1)
train = events[events['datetime'] < train_cutoff]
valid = events[(events['datetime'] >= train_cutoff) & (events['datetime'] < val_cutoff)]
test = events[events['datetime'] >= val_cutoff]

## 4. Baseline Model: Most Popular Items

In [5]:
most_popular = train['item_idx'].value_counts().index[:20].tolist()

## 5. GRU4Rec Session-based Model (PyTorch, demo)

In [6]:
# Build session sequences
seq_df = (train.groupby('session_id')['item_idx'].apply(list).reset_index(name='seq'))
seq_df = seq_df[seq_df['seq'].apply(lambda x: len(x) > 1)]

class SessionDataset(Dataset):
    def __init__(self, seqs, maxlen=10):
        self.samples = []
        for seq in seqs:
            for i in range(1, len(seq)):
                start = max(0, i - maxlen)
                self.samples.append((seq[start:i], seq[i]))
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        seq, label = self.samples[idx]
        x = np.zeros(10, dtype=int)
        x[-len(seq):] = seq[-10:]
        return torch.LongTensor(x), torch.LongTensor([label])

class GRU4Rec(nn.Module):
    def __init__(self, n_items, emb_dim=50, hid_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(n_items, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.fc = nn.Linear(hid_dim, n_items)
    def forward(self, x):
        emb = self.embedding(x)
        _, h = self.gru(emb)
        out = self.fc(h.squeeze(0))
        return out

# Prepare PyTorch data
sequences = seq_df['seq'].tolist()
dataset = SessionDataset(sequences)
loader = DataLoader(dataset, batch_size=256, shuffle=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GRU4Rec(n_items=events['item_idx'].nunique()).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# Training loop (demo)
for epoch in range(2):
    model.train()
    epoch_loss = 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device).squeeze()
        optimizer.zero_grad()
        logits = model(xb)
        loss = loss_fn(logits, yb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(loader):.4f}")

Epoch 1, Loss: 9.4879
Epoch 2, Loss: 7.2098


## 6. Section 4.3: Model Performance Comparison Table
We compare Most Popular and GRU4Rec using Precision@20, Recall@20, F1@20, and NDCG@20 on test sessions.

In [7]:
# --- Metric functions ---
def precision_at_k(y_true, y_pred, k=20):
    hits = 0
    total = 0
    for pred, true in zip(y_pred, y_true):
        if true in pred[:k]:
            hits += 1
        total += 1
    return hits / total if total > 0 else 0

def recall_at_k(y_true, y_pred, k=20):
    return precision_at_k(y_true, y_pred, k)  # for next-item prediction, recall==precision

def f1_at_k(y_true, y_pred, k=20):
    p = precision_at_k(y_true, y_pred, k)
    r = recall_at_k(y_true, y_pred, k)
    return 2 * p * r / (p + r + 1e-10)

def ndcg_at_k(y_true, y_pred, k=20):
    ndcg = 0
    for pred, true in zip(y_pred, y_true):
        if true in pred[:k]:
            idx = pred[:k].index(true)
            ndcg += 1 / np.log2(idx + 2)
    return ndcg / len(y_true) if len(y_true) > 0 else 0

In [8]:
# --- Prepare test sessions for evaluation ---
test_sessions = [group['item_idx'].tolist() for _, group in test.groupby('session_id') if len(group) > 1]
y_true = [items[-1] for items in test_sessions]

# Most Popular prediction: same for all
y_pred_pop = [most_popular for _ in test_sessions]

In [9]:
# GRU4Rec predictions
def predict_next(model, session, k=20):
    model.eval()
    with torch.no_grad():
        inp = torch.LongTensor([session[-10:]]).to(device)
        logits = model(inp)
        topk = logits.cpu().numpy().argsort()[0][-k:][::-1]
    return topk.tolist()

y_pred_gru = []
for items in test_sessions:
    pred = predict_next(model, items[:-1], k=20)
    y_pred_gru.append(pred)

In [10]:
# --- 4.3 Table ---
results = []
results.append([
    'Most Popular',
    precision_at_k(y_true, y_pred_pop, 20),
    recall_at_k(y_true, y_pred_pop, 20),
    f1_at_k(y_true, y_pred_pop, 20),
    ndcg_at_k(y_true, y_pred_pop, 20)
])
results.append([
    'GRU4Rec',
    precision_at_k(y_true, y_pred_gru, 20),
    recall_at_k(y_true, y_pred_gru, 20),
    f1_at_k(y_true, y_pred_gru, 20),
    ndcg_at_k(y_true, y_pred_gru, 20)
])

results_df = pd.DataFrame(results, columns=['Model', 'Precision@20', 'Recall@20', 'F1@20', 'NDCG@20'])
display(results_df)

Unnamed: 0,Model,Precision@20,Recall@20,F1@20,NDCG@20
0,Most Popular,0.017294,0.017294,0.017294,0.008429
1,GRU4Rec,0.255544,0.255544,0.255544,0.191835


### 📊 The above table can be used to fill in your Section 4.3 Model Performance Comparison report.

*Add more models (ALS, LightGBM, etc.) by extending the table.*