In [1]:
# ====================================
# Notebook 8.3: BERT4Rec + MiniLM + Clustering (DBSCAN & HDBSCAN)
# Description:
# This notebook trains a BERT4Rec model on order sequences and re-ranks
# using MiniLM embeddings and cluster attention (DBSCAN and HDBSCAN).
# Evaluated using nDCG@20 and Recall@20.
# ====================================

In [2]:
import os

# === Clone GitHub repository ===
repo_dir = "My-BS-Thesis"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

!git clone https://github.com/Goshmar/My-BS-Thesis

Cloning into 'My-BS-Thesis'...
remote: Enumerating objects: 153, done.[K
remote: Counting objects: 100% (153/153), done.[K
remote: Compressing objects: 100% (146/146), done.[K
remote: Total 153 (delta 54), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (153/153), 201.81 MiB | 23.99 MiB/s, done.
Resolving deltas: 100% (54/54), done.


In [3]:
# === Install dependencies ===
!pip install -q torch transformers scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# # === Install dependencies from requirements.txt ===
# !pip install -r My-BS-Thesis/requirements.txt -q

In [5]:
import os
import zipfile
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [6]:
# === Set paths ===
processed_data = os.path.join(repo_dir, "data", "processed")
interim_data = os.path.join(repo_dir, "data", "interim")
artifacts_dir = os.path.join(repo_dir, "artifacts")

train_zip_path = os.path.join(processed_data, "train_df.zip")
test_zip_path = os.path.join(processed_data, "test_df.zip")
cluster_zip_path = os.path.join(interim_data, "labeled_products_by_behavior.zip")

embedding_zip1 = os.path.join(artifacts_dir, "item_embeddings_minilm_part1.zip")
embedding_zip2 = os.path.join(artifacts_dir, "item_embeddings_minilm_part2.zip")

In [7]:
# === Load data ===
with zipfile.ZipFile(train_zip_path, "r") as zipf:
    with zipf.open("train_df.csv") as f:
        train_df = pd.read_csv(f)

with zipfile.ZipFile(test_zip_path, "r") as zipf:
    with zipf.open("test_df.csv") as f:
        test_df = pd.read_csv(f)

with zipfile.ZipFile(cluster_zip_path, "r") as zipf:
    with zipf.open("labeled_products_by_behavior.csv") as f:
        cluster_df = pd.read_csv(f)

# === Load MiniLM embeddings ===
def load_embeddings(zip_path):
    with zipfile.ZipFile(zip_path, "r") as zipf:
        name = zipf.namelist()[0]
        with zipf.open(name) as f:
            return pickle.load(f)

embeddings_1 = load_embeddings(embedding_zip1)
embeddings_2 = load_embeddings(embedding_zip2)
item_embeddings = {**embeddings_1, **embeddings_2}

cluster_map = dict(zip(cluster_df["encoded_id"], cluster_df["dbscan_cluster"]))

In [8]:
# === Prepare training sequences ===
user_sequences = []
item_id_map, reverse_item_id_map = {}, {}
item_counter = 0

for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    try:
        items = list(eval(row["products"]).keys())
    except:
        continue
    encoded = []
    for item in items:
        if item not in item_id_map:
            item_id_map[item] = item_counter
            reverse_item_id_map[item_counter] = item
            item_counter += 1
        encoded.append(item_id_map[item])
    if len(encoded) > 1:
        user_sequences.append(encoded)

100%|██████████| 412999/412999 [00:46<00:00, 8835.84it/s] 


In [9]:
# === Create training dataset ===
max_len = 10
train_sequences, target_items = [], []

for seq in user_sequences:
    for i in range(1, len(seq)):
        train_sequences.append(seq[:i][-max_len:])
        target_items.append(seq[i])

pad_token = item_counter

class BERT4RecDataset(Dataset):
    def __init__(self, sequences, targets, pad_token, max_len):
        self.sequences = sequences
        self.targets = targets
        self.pad_token = pad_token
        self.max_len = max_len

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        padded = [self.pad_token] * (self.max_len - len(seq)) + seq
        return torch.tensor(padded), torch.tensor(self.targets[idx])

train_dataset = BERT4RecDataset(train_sequences, target_items, pad_token, max_len)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

In [10]:
# === BERT4Rec model ===
class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=2, max_len=10):
        super().__init__()
        self.embed = nn.Embedding(vocab_size + 1, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x).permute(1, 0, 2)
        x = self.encoder(x)
        return self.output(x[-1])

# === Training ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERT4Rec(vocab_size=len(item_id_map)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()



In [11]:
for epoch in range(15):
    model.train()
    total_loss = 0
    for input_seq, target in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_seq, target = input_seq.to(device), target.to(device)
        optimizer.zero_grad()
        logits = model(input_seq)
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"📉 Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

Epoch 1: 100%|██████████| 5417/5417 [00:53<00:00, 102.06it/s]


📉 Epoch 1 Loss: 6.2007


Epoch 2: 100%|██████████| 5417/5417 [00:51<00:00, 104.29it/s]


📉 Epoch 2 Loss: 5.6647


Epoch 3: 100%|██████████| 5417/5417 [00:51<00:00, 104.61it/s]


📉 Epoch 3 Loss: 5.5100


Epoch 4: 100%|██████████| 5417/5417 [00:51<00:00, 105.93it/s]


📉 Epoch 4 Loss: 5.4118


Epoch 5: 100%|██████████| 5417/5417 [00:51<00:00, 106.08it/s]


📉 Epoch 5 Loss: 5.3374


Epoch 6: 100%|██████████| 5417/5417 [00:51<00:00, 104.60it/s]


📉 Epoch 6 Loss: 5.2789


Epoch 7: 100%|██████████| 5417/5417 [00:51<00:00, 104.39it/s]


📉 Epoch 7 Loss: 5.2313


Epoch 8: 100%|██████████| 5417/5417 [00:51<00:00, 105.44it/s]


📉 Epoch 8 Loss: 5.1907


Epoch 9: 100%|██████████| 5417/5417 [00:51<00:00, 105.54it/s]


📉 Epoch 9 Loss: 5.1547


Epoch 10: 100%|██████████| 5417/5417 [00:51<00:00, 105.89it/s]


📉 Epoch 10 Loss: 5.1244


Epoch 11: 100%|██████████| 5417/5417 [00:51<00:00, 105.20it/s]


📉 Epoch 11 Loss: 5.0975


Epoch 12: 100%|██████████| 5417/5417 [00:51<00:00, 106.03it/s]


📉 Epoch 12 Loss: 5.0735


Epoch 13: 100%|██████████| 5417/5417 [00:51<00:00, 105.83it/s]


📉 Epoch 13 Loss: 5.0531


Epoch 14: 100%|██████████| 5417/5417 [00:51<00:00, 106.00it/s]


📉 Epoch 14 Loss: 5.0328


Epoch 15: 100%|██████████| 5417/5417 [00:51<00:00, 104.84it/s]

📉 Epoch 15 Loss: 5.0136





In [12]:
# === Re-ranking with cluster attention and MiniLM ===
def recommend_bert(input_items, cluster_map, top_k=30):
    model.eval()
    seq = input_items[-max_len:]
    padded = [pad_token] * (max_len - len(seq)) + seq
    input_tensor = torch.tensor(padded).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(input_tensor).squeeze()
        scores = logits.cpu().numpy()

    candidate_indices = np.argsort(scores)[::-1][:100]
    candidate_items = [reverse_item_id_map[i] for i in candidate_indices if i in reverse_item_id_map]

    basket_embeds = [item_embeddings.get(reverse_item_id_map[i]) for i in seq if reverse_item_id_map.get(i) in item_embeddings]
    if not basket_embeds:
        return candidate_items[:top_k]
    basket_vector = np.mean(basket_embeds, axis=0).reshape(1, -1)

    scored_items = []
    cluster_ids = [cluster_map.get(reverse_item_id_map[i]) for i in seq if reverse_item_id_map.get(i) in cluster_map]
    top_cluster = pd.Series(cluster_ids).value_counts().idxmax() if cluster_ids else None

    for i in candidate_indices:
        prod_id = reverse_item_id_map.get(i)
        if prod_id not in item_embeddings:
            continue
        item_vector = item_embeddings[prod_id].reshape(1, -1)
        sim = cosine_similarity(basket_vector, item_vector)[0][0]
        prod_cluster = cluster_map.get(prod_id, -1)
        boost = 1.2 if prod_cluster == top_cluster else 1.0
        final_score = sim * boost
        scored_items.append((prod_id, final_score))

    scored_items.sort(key=lambda x: x[1], reverse=True)
    return [item for item, _ in scored_items[:top_k]]

In [13]:
def ndcg_at_k(actual, predicted, k=20):
    dcg = sum(1 / np.log2(i + 2) for i, p in enumerate(predicted[:k]) if p in actual)
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(actual), k)))
    return dcg / idcg if idcg > 0 else 0.0

def recall_at_k(actual, predicted, k=20):
    return len(set(predicted[:k]) & set(actual)) / len(actual) if actual else 0.0

def mean_metric(metric_fn, actual_list, pred_list, k):
    return np.mean([metric_fn(a, p, k) for a, p in zip(actual_list, pred_list)])

In [None]:
# === Evaluate on test set ===
print("\n🔍 Evaluating BERT4Rec with DBSCAN re-ranking...")
actual_orders, predicted_orders = [], []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    try:
        items = list(eval(row["products"]).keys())
        if len(items) < 2:
            continue
        basket = items[:len(items)//2]
        actual = items[len(items)//2:]
        input_seq = [item_id_map[i] for i in basket if i in item_id_map]
        predicted = recommend_bert(input_seq, cluster_map)
        actual_orders.append(actual)
        predicted_orders.append(predicted)
    except:
        continue


🔍 Evaluating BERT4Rec with DBSCAN re-ranking...


100%|██████████| 80000/80000 [31:33<00:00, 42.26it/s]


In [None]:
# === Final metrics ===
ndcg_20 = mean_metric(ndcg_at_k, actual_orders, predicted_orders, k=20)
recall_20 = mean_metric(recall_at_k, actual_orders, predicted_orders, k=20)

print(f"\n📈 BERT4Rec + DBSCAN nDCG@20: {ndcg_20:.4f}")
print(f"📈 BERT4Rec + DBSCAN Recall@20: {recall_20:.4f}")


📈 BERT4Rec + DBSCAN nDCG@20: 0.0744
📈 BERT4Rec + DBSCAN Recall@20: 0.1643


In [None]:
# === Evaluate on test set ===
print("\n🔍 Evaluating BERT4Rec + HDBSCAN")
cluster_map = dict(zip(cluster_df["encoded_id"], cluster_df["hdbscan_cluster"]))
actual_orders, predicted_orders = [], []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    try:
        items = list(eval(row["products"]).keys())
        if len(items) < 2:
            continue
        basket = items[:len(items)//2]
        actual = items[len(items)//2:]
        input_seq = [item_id_map[i] for i in basket if i in item_id_map]
        predicted = recommend_bert(input_seq, cluster_map)
        actual_orders.append(actual)
        predicted_orders.append(predicted)
    except:
        continue

In [None]:
ndcg_20_hdb = mean_metric(ndcg_at_k, actual_orders, predicted_orders, k=20)
recall_20_hdb = mean_metric(recall_at_k, actual_orders, predicted_orders, k=20)

print(f"\n📈 BERT4Rec + HDBSCAN nDCG@20: {ndcg_20_hdb:.4f}")
print(f"📈 BERT4Rec + HDBSCAN Recall@20: {recall_20_hdb:.4f}")