In [1]:
# ====================================
# Notebook 8.1: BERT4Rec + Clustering (DBSCAN & HDBSCAN)
# Description:
# This notebook trains a BERT4Rec model on order sequences and evaluates it
# using DBSCAN and HDBSCAN cluster-based attention re-ranking.
# Evaluated using nDCG@20 and Recall@20.
# ====================================

In [2]:
import os

# === Clone GitHub repository ===
repo_dir = "My-BS-Thesis"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

!git clone https://github.com/Goshmar/My-BS-Thesis

Cloning into 'My-BS-Thesis'...
remote: Enumerating objects: 147, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (140/140), done.[K
remote: Total 147 (delta 50), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (147/147), 201.81 MiB | 22.17 MiB/s, done.
Resolving deltas: 100% (50/50), done.


In [26]:
# === Install dependencies from requirements.txt ===
!pip install -r My-BS-Thesis/requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m112.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.7/557.7 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.2/160.2 kB[0m [31m141.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m171.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.5/768.5 MB[0m [31m397.7 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m122.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# === Install dependencies ===
!pip install -q torch transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m116.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import os
import json
import pickle
import zipfile
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [6]:
# === Set paths ===
processed_data = os.path.join(repo_dir, "data", "processed")
interim_data = os.path.join(repo_dir, "data", "interim")

train_zip_path = os.path.join(processed_data, "train_df.zip")
test_zip_path = os.path.join(processed_data, "test_df.zip")
cluster_zip_path = os.path.join(interim_data, "labeled_products_by_behavior.zip")

In [7]:
# === Load data ===
with zipfile.ZipFile(train_zip_path, "r") as zipf:
    with zipf.open("train_df.csv") as f:
        train_df = pd.read_csv(f)

with zipfile.ZipFile(test_zip_path, "r") as zipf:
    with zipf.open("test_df.csv") as f:
        test_df = pd.read_csv(f)

with zipfile.ZipFile(cluster_zip_path, "r") as zipf:
    with zipf.open("labeled_products_by_behavior.csv") as f:
        cluster_df = pd.read_csv(f)

cluster_map = dict(zip(cluster_df["encoded_id"], cluster_df["dbscan_cluster"]))

In [8]:
# === Prepare item sequences ===
user_sequences = []
item_id_map, reverse_item_id_map = {}, {}
item_counter = 0

for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    try:
        items = list(eval(row["products"]).keys())
    except:
        continue
    encoded = []
    for item in items:
        if item not in item_id_map:
            item_id_map[item] = item_counter
            reverse_item_id_map[item_counter] = item
            item_counter += 1
        encoded.append(item_id_map[item])
    if len(encoded) > 1:
        user_sequences.append(encoded)

100%|██████████| 412999/412999 [00:43<00:00, 9573.33it/s] 


In [9]:
# === Build training dataset ===
max_len = 10
train_sequences, target_items = [], []

for seq in user_sequences:
    for i in range(1, len(seq)):
        train_sequences.append(seq[:i][-max_len:])
        target_items.append(seq[i])

pad_token = item_counter

class BERT4RecDataset(Dataset):
    def __init__(self, sequences, targets, pad_token, max_len):
        self.sequences = sequences
        self.targets = targets
        self.pad_token = pad_token
        self.max_len = max_len

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        padded = [self.pad_token] * (self.max_len - len(seq)) + seq
        return torch.tensor(padded), torch.tensor(self.targets[idx])

train_dataset = BERT4RecDataset(train_sequences, target_items, pad_token, max_len)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

In [10]:
# === Define model ===
class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=2, max_len=10):
        super().__init__()
        self.embed = nn.Embedding(vocab_size + 1, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x).permute(1, 0, 2)
        x = self.encoder(x)
        return self.output(x[-1])

# === Train model ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BERT4Rec(vocab_size=len(item_id_map)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()



In [11]:
for epoch in range(15):
    model.train()
    total_loss = 0
    for input_seq, target in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_seq, target = input_seq.to(device), target.to(device)
        optimizer.zero_grad()
        logits = model(input_seq)
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"📉 Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

Epoch 1: 100%|██████████| 5417/5417 [00:55<00:00, 98.00it/s] 


📉 Epoch 1 Loss: 6.2224


Epoch 2: 100%|██████████| 5417/5417 [00:51<00:00, 104.40it/s]


📉 Epoch 2 Loss: 5.6787


Epoch 3: 100%|██████████| 5417/5417 [00:51<00:00, 105.32it/s]


📉 Epoch 3 Loss: 5.5227


Epoch 4: 100%|██████████| 5417/5417 [00:55<00:00, 98.21it/s]


📉 Epoch 4 Loss: 5.4227


Epoch 5: 100%|██████████| 5417/5417 [00:52<00:00, 104.15it/s]


📉 Epoch 5 Loss: 5.3491


Epoch 6: 100%|██████████| 5417/5417 [00:50<00:00, 106.62it/s]


📉 Epoch 6 Loss: 5.2891


Epoch 7: 100%|██████████| 5417/5417 [00:50<00:00, 106.41it/s]


📉 Epoch 7 Loss: 5.2410


Epoch 8: 100%|██████████| 5417/5417 [00:50<00:00, 107.14it/s]


📉 Epoch 8 Loss: 5.1985


Epoch 9: 100%|██████████| 5417/5417 [00:50<00:00, 106.71it/s]


📉 Epoch 9 Loss: 5.1640


Epoch 10: 100%|██████████| 5417/5417 [00:50<00:00, 106.56it/s]


📉 Epoch 10 Loss: 5.1342


Epoch 11: 100%|██████████| 5417/5417 [00:50<00:00, 107.21it/s]


📉 Epoch 11 Loss: 5.1072


Epoch 12: 100%|██████████| 5417/5417 [00:50<00:00, 107.02it/s]


📉 Epoch 12 Loss: 5.0850


Epoch 13: 100%|██████████| 5417/5417 [00:50<00:00, 106.93it/s]


📉 Epoch 13 Loss: 5.0612


Epoch 14: 100%|██████████| 5417/5417 [00:51<00:00, 106.02it/s]


📉 Epoch 14 Loss: 5.0420


Epoch 15: 100%|██████████| 5417/5417 [00:50<00:00, 107.20it/s]

📉 Epoch 15 Loss: 5.0231





In [14]:
# === Inference with DBSCAN awareness ===
def recommend_bert(input_items, cluster_map, top_k=30):
    model.eval()
    seq = input_items[-max_len:]
    padded = [pad_token] * (max_len - len(seq)) + seq
    input_tensor = torch.tensor(padded).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(input_tensor).squeeze()
        scores = logits.cpu().numpy()

    cluster_ids = [cluster_map.get(reverse_item_id_map[i]) for i in seq if reverse_item_id_map.get(i) in cluster_map]
    if not cluster_ids:
        top_items = np.argsort(scores)[::-1][:top_k]
        return [reverse_item_id_map[i] for i in top_items if i in reverse_item_id_map]

    top_cluster = pd.Series(cluster_ids).value_counts().idxmax()
    ranked = [(i, s * (1.2 if cluster_map.get(reverse_item_id_map.get(i)) == top_cluster else 1.0))
              for i, s in enumerate(scores) if i in reverse_item_id_map]

    top_items = sorted(ranked, key=lambda x: x[1], reverse=True)[:top_k]
    return [reverse_item_id_map[i] for i, _ in top_items]

In [15]:
def ndcg_at_k(actual, predicted, k=20):
    dcg = sum(1 / np.log2(i + 2) for i, p in enumerate(predicted[:k]) if p in actual)
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(actual), k)))
    return dcg / idcg if idcg > 0 else 0.0

def recall_at_k(actual, predicted, k=20):
    return len(set(predicted[:k]) & set(actual)) / len(actual) if actual else 0.0

def mean_metric(metric_fn, actual_list, pred_list, k):
    return np.mean([metric_fn(a, p, k) for a, p in zip(actual_list, pred_list)])

In [None]:
# === Evaluate on test set ===
print("\n🔍 Evaluating BERT4Rec with DBSCAN re-ranking...")
actual_orders, predicted_orders = [], []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    try:
        items = list(eval(row["products"]).keys())
        if len(items) < 2:
            continue
        basket = items[:len(items)//2]
        actual = items[len(items)//2:]
        input_seq = [item_id_map[i] for i in basket if i in item_id_map]
        predicted = recommend_bert(input_seq, cluster_map)
        actual_orders.append(actual)
        predicted_orders.append(predicted)
    except:
        continue


🔍 Evaluating BERT4Rec with DBSCAN re-ranking...


100%|██████████| 80000/80000 [09:09<00:00, 145.53it/s]


In [None]:
# === Final metrics ===
ndcg_20 = mean_metric(ndcg_at_k, actual_orders, predicted_orders, k=20)
recall_20 = mean_metric(recall_at_k, actual_orders, predicted_orders, k=20)

print(f"\n📈 BERT4Rec + DBSCAN nDCG@20: {ndcg_20:.4f}")
print(f"📈 BERT4Rec + DBSCAN Recall@20: {recall_20:.4f}")


📈 BERT4Rec + DBSCAN nDCG@20: 0.2432
📈 BERT4Rec + DBSCAN Recall@20: 0.3795


In [16]:
# === Evaluate on test set ===
print("\n🔍 Evaluating BERT4Rec + HDBSCAN")
cluster_map = dict(zip(cluster_df["encoded_id"], cluster_df["hdbscan_cluster"]))
actual_orders, predicted_orders = [], []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    try:
        items = list(eval(row["products"]).keys())
        if len(items) < 2:
            continue
        basket = items[:len(items)//2]
        actual = items[len(items)//2:]
        input_seq = [item_id_map[i] for i in basket if i in item_id_map]
        predicted = recommend_bert(input_seq, cluster_map)
        actual_orders.append(actual)
        predicted_orders.append(predicted)
    except:
        continue


🔍 Evaluating BERT4Rec + HDBSCAN


100%|██████████| 80000/80000 [09:43<00:00, 137.18it/s]


In [17]:
ndcg_20_hdb = mean_metric(ndcg_at_k, actual_orders, predicted_orders, k=20)
recall_20_hdb = mean_metric(recall_at_k, actual_orders, predicted_orders, k=20)

print(f"\n📈 BERT4Rec + HDBSCAN nDCG@20: {ndcg_20_hdb:.4f}")
print(f"📈 BERT4Rec + HDBSCAN Recall@20: {recall_20_hdb:.4f}")


📈 BERT4Rec + HDBSCAN nDCG@20: 0.2473
📈 BERT4Rec + HDBSCAN Recall@20: 0.3790
