In [6]:
# ====================================
# Notebook 6.3: Item2Item + MiniLM + Clustering (DBSCAN / HDBSCAN)
# Description:
# This notebook uses an Item2Item model on the binary user-item matrix.
# Candidates are re-ranked by MiniLM semantic similarity, keeping and boosting items from the basket's most frequent DBSCAN / HDBSCAN clusters.
# Evaluated using nDCG@20 and Recall@20.
# ====================================

In [2]:
import os

# === Clone GitHub repository ===
# Name of the local checkout directory; later cells build data/artifact paths from it.
repo_dir = "My-BS-Thesis"

# Remove a stale checkout first so that the clone below does not fail on an
# existing, non-empty directory.
if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    # IPython shell escape; {repo_dir} is interpolated from the Python variable.
    !rm -r {repo_dir}

# No target directory is given, so git uses the repository name, which must
# stay equal to `repo_dir` above.
!git clone https://github.com/Goshmar/My-BS-Thesis

Cloning into 'My-BS-Thesis'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (133/133), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 133 (delta 43), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (133/133), 201.78 MiB | 21.00 MiB/s, done.
Resolving deltas: 100% (43/43), done.


In [22]:
# === Install dependencies from requirements.txt ===
!pip install -r My-BS-Thesis/requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.7/557.7 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.2/160.2 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.5/768.5 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
!pip install implicit -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import pickle
import zipfile
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import load_npz, csr_matrix
from implicit.nearest_neighbours import CosineRecommender

In [7]:
# === Set paths ===
# Base directories inside the cloned repository.
processed_data, interim_data, artifacts_dir = (
    os.path.join(repo_dir, *parts)
    for parts in (('data', 'processed'), ('data', 'interim'), ('artifacts',))
)

# Interaction matrix and zipped train/test splits.
binary_matrix_path, train_zip_path, test_zip_path = (
    os.path.join(processed_data, name)
    for name in ('user_item_binary_matrix.npz', 'train_df.zip', 'test_df.zip')
)

# Pickled id mappings and the two-part MiniLM embedding archive.
map_path = os.path.join(artifacts_dir, 'mapping.pkl')
embedding_zip1, embedding_zip2 = (
    os.path.join(artifacts_dir, name)
    for name in ('item_embeddings_minilm_part1.zip', 'item_embeddings_minilm_part2.zip')
)

# Per-product cluster labels.
cluster_zip_path = os.path.join(interim_data, 'labeled_products_by_behavior.zip')

In [8]:
# === Load data ===
# Sparse user-item matrix that the Item2Item model is fitted on.
user_item_matrix = load_npz(binary_matrix_path)

# Each split is stored as a single CSV member inside its zip archive.
with zipfile.ZipFile(train_zip_path, 'r') as train_archive, \
        train_archive.open("train_df.csv") as train_csv:
    train_df = pd.read_csv(train_csv)

with zipfile.ZipFile(test_zip_path, 'r') as test_archive, \
        test_archive.open("test_df.csv") as test_csv:
    test_df = pd.read_csv(test_csv)

# NOTE: unpickling can execute arbitrary code; only load mappings from a trusted source.
with open(map_path, 'rb') as mapping_file:
    mappings = pickle.load(mapping_file)

# Lookup tables between raw ids and matrix indices.
user_map, item_map, reverse_item_map = (
    mappings[key] for key in ('user_map', 'item_map', 'reverse_item_map')
)
print("✅ Binary matrix and data loaded.")

# === Load embeddings ===
def load_embeddings_from_zip(zip_path):
    """Unpickle and return the first member of the zip archive at ``zip_path``.

    Only the first entry listed in the archive is read; any further members
    are ignored. The archive must come from a trusted source because the
    payload is loaded with ``pickle``.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        first_member = archive.namelist()[0]
        with archive.open(first_member) as member:
            embeddings = pickle.load(member)
    return embeddings

# The embedding dictionary is shipped in two archives; load both parts.
embeddings_1, embeddings_2 = map(load_embeddings_from_zip, (embedding_zip1, embedding_zip2))

# Merge the parts; on a duplicate key the entry from part 2 wins.
item_embeddings = dict(embeddings_1)
item_embeddings.update(embeddings_2)
print(f"✅ MiniLM embeddings loaded: {len(item_embeddings)} items.")

# === Load clustering labels from zip ===
# The archive holds one CSV; the evaluation cells read its `encoded_id`,
# `dbscan_cluster` and `hdbscan_cluster` columns.
with zipfile.ZipFile(cluster_zip_path, 'r') as zip_ref:
    with zip_ref.open("labeled_products_by_behavior.csv") as f:
        cluster_df = pd.read_csv(f)
# The status message now starts with a Latin "C" (it previously used a
# look-alike Cyrillic letter, which breaks text search on the output).
print("✅ Clustering labels loaded.")

✅ Binary matrix and data loaded.
✅ MiniLM embeddings loaded: 12881 items.
✅ Сlustering labels loaded.


In [9]:
# === Train Item2Item model ===
# Number of nearest neighbours the cosine item-item model is built with.
N_NEIGHBORS = 10

item_model = CosineRecommender(K=N_NEIGHBORS)
item_model.fit(user_item_matrix)
print("✅ Item2Item model trained.")



  0%|          | 0/12881 [00:00<?, ?it/s]

✅ Item2Item model trained.


In [17]:
# === Recommendation functions ===
def make_coo_row(products):
    """Encode one basket as a 1 x len(item_map) sparse CSR row.

    Args:
        products: mapping of product id -> quantity, where a quantity is
            either a number or a list whose first element is the number.

    Returns:
        A ``csr_matrix`` with one row; products missing from the global
        ``item_map`` are ignored.
    """
    # Keep only products the model knows, translated to their column index.
    known = [(item_map[pid], qty) for pid, qty in products.items() if pid in item_map]
    col_idx = [col for col, _ in known]
    values = [float(qty[0] if isinstance(qty, list) else qty) for _, qty in known]
    row_idx = [0] * len(values)  # everything lives in the single row 0
    return csr_matrix((values, (row_idx, col_idx)), shape=(1, len(item_map)))

def recommend_items_with_clusters(products, item_embeddings, cluster_map, n=30):
    """Recommend items for a basket and re-rank them by embedding similarity and cluster.

    Candidates come from the global Item2Item model. They are then restricted
    to the (up to three) most frequent clusters among the basket items and
    sorted by ``dot(mean basket embedding, item embedding) * cluster boost``.

    Args:
        products: mapping of product id -> quantity describing the basket.
        item_embeddings: mapping of product id -> embedding vector.
        cluster_map: mapping of product id -> cluster label.
        n: number of candidates requested from the model and maximum number
            of items returned.

    Returns:
        List of product ids, best first. May contain fewer than ``n`` items
        because candidates without an embedding or outside the basket's top
        clusters are dropped. When the basket has no embeddings or no cluster
        labels at all, the collaborative-filtering order is returned as is.
    """
    row = make_coo_row(products).tocsr()
    recs = item_model.recommend(
        userid=0,
        user_items=row,
        N=n,
        filter_already_liked_items=True,
        recalculate_user=True
    )
    # recs[0] holds the recommended item indices; map them back to product ids.
    candidate_items = [reverse_item_map[idx] for idx in recs[0]]

    basket_items = list(products.keys())
    basket_embeddings = [item_embeddings[item] for item in basket_items if item in item_embeddings]
    if not basket_embeddings:
        # No semantic signal for this basket: keep the model's own ranking.
        return candidate_items[:n]

    basket_clusters = [cluster_map[item] for item in basket_items if item in cluster_map]
    top_clusters = (
        pd.Series(basket_clusters).value_counts().head(3).index.tolist()
        if basket_clusters else []
    )
    if not top_clusters:
        # Without any cluster label the filter below would reject every
        # candidate and return an empty list; fall back to the model's ranking
        # instead, mirroring the no-embedding case above.
        return candidate_items[:n]

    basket_embedding = np.mean(basket_embeddings, axis=0)

    # Boost by cluster rank in the basket: most frequent 1.2, then 1.1, then 1.05.
    boost_by_cluster = dict(zip(top_clusters, (1.2, 1.1, 1.05)))

    similarities = []
    for item in candidate_items:
        if item not in item_embeddings:
            continue
        cluster_boost = boost_by_cluster.get(cluster_map.get(item))
        if cluster_boost is None:
            # Unlabelled item, or a cluster outside the basket's top clusters.
            continue
        # NOTE(review): plain dot product; equals cosine similarity only if the
        # embeddings are unit-normalised - confirm.
        sim = np.dot(basket_embedding, item_embeddings[item])
        similarities.append((item, sim * cluster_boost))

    ranked_items = sorted(similarities, key=lambda pair: pair[1], reverse=True)
    return [item for item, _ in ranked_items[:n]]

In [11]:
def ndcg_at_k(actual, predicted, k=20):
    """Binary-relevance nDCG@k of one ranked list.

    Args:
        actual: collection of relevant items.
        predicted: ranked list of recommended items, best first.
        k: cut-off rank.

    Returns:
        DCG of the first ``k`` predictions divided by the ideal DCG for
        ``min(len(actual), k)`` hits, or 0.0 when ``actual`` is empty.
    """
    gain = 0
    for rank, item in enumerate(predicted[:k]):
        if item in actual:
            gain += 1 / np.log2(rank + 2)

    n_ideal_hits = min(len(actual), k)
    best_gain = sum(1 / np.log2(rank + 2) for rank in range(n_ideal_hits))

    if best_gain > 0:
        return gain / best_gain
    return 0.0

def mean_ndcg_at_k(actual_list, predicted_list, k=20):
    """Average nDCG@k over paired lists of relevant and predicted items."""
    per_order_scores = []
    for actual, predicted in zip(actual_list, predicted_list):
        per_order_scores.append(ndcg_at_k(actual, predicted, k))
    return np.mean(per_order_scores)

def recall_at_k(actual, predicted, k=20):
    """Recall@k of one ranked list.

    Returns the number of distinct relevant items found in the first ``k``
    predictions divided by ``len(actual)``, or 0.0 when ``actual`` is empty.
    """
    if not actual:
        return 0.0
    hits = set(actual).intersection(predicted[:k])
    return len(hits) / len(actual)

def mean_recall_at_k(actual_list, predicted_list, k=20):
    """Average Recall@k over paired lists of relevant and predicted items."""
    per_order_scores = []
    for actual, predicted in zip(actual_list, predicted_list):
        per_order_scores.append(recall_at_k(actual, predicted, k))
    return np.mean(per_order_scores)

In [18]:
# === Evaluate on test set (DBSCAN cluster labels) ===
cluster_map = dict(zip(cluster_df["encoded_id"], cluster_df["dbscan_cluster"]))

actual_orders = []
predicted_orders = []
skipped_orders = 0  # orders dropped because parsing or recommendation raised

for products_str in tqdm(test_df["products"], total=len(test_df), desc="Evaluating with DBSCAN"):
    try:
        # NOTE(review): eval() executes arbitrary code from the CSV. Switch to
        # ast.literal_eval once the column is confirmed to hold plain literals.
        products_dict = eval(products_str)
        product_ids = list(products_dict.keys())
        if len(product_ids) < 2:
            # Need at least one item for the basket and one to predict.
            continue
        # First half of the order is the observed basket, the rest is held out.
        split_at = len(product_ids) // 2
        target = product_ids[split_at:]
        basket_dict = {item: products_dict[item] for item in product_ids[:split_at]}
        predicted = recommend_items_with_clusters(basket_dict, item_embeddings, cluster_map)
    except Exception:
        # Best-effort evaluation: skip the order but keep count. Unlike a bare
        # `except`, this does not swallow KeyboardInterrupt / SystemExit.
        skipped_orders += 1
        continue
    actual_orders.append(target)
    predicted_orders.append(predicted)

if skipped_orders:
    print(f"⚠️ Skipped {skipped_orders} orders because of errors.")

Evaluating with DBSCAN: 100%|██████████| 80000/80000 [00:57<00:00, 1380.12it/s]


In [19]:
# === Final metrics ===
# Mean nDCG@20 / Recall@20 over the orders evaluated in the previous cell.
ndcg_db = mean_ndcg_at_k(actual_orders, predicted_orders)
recall_db = mean_recall_at_k(actual_orders, predicted_orders)

print(
    f"\n📈 Item2Item + MiniLM + DBSCAN nDCG@20: {ndcg_db:.4f}",
    f"📈 Item2Item + MiniLM + DBSCAN Recall@20: {recall_db:.4f}",
    sep="\n",
)


📈 Item2Item + MiniLM + DBSCAN nDCG@20: 0.0936
📈 Item2Item + MiniLM + DBSCAN Recall@20: 0.1991


In [20]:
# === Evaluate on test set (HDBSCAN cluster labels) ===
cluster_map = dict(zip(cluster_df["encoded_id"], cluster_df["hdbscan_cluster"]))

actual_orders = []
predicted_orders = []
skipped_orders = 0  # orders dropped because parsing or recommendation raised

for products_str in tqdm(test_df["products"], total=len(test_df), desc="Evaluating with HDBSCAN"):
    try:
        # NOTE(review): eval() executes arbitrary code from the CSV. Switch to
        # ast.literal_eval once the column is confirmed to hold plain literals.
        products_dict = eval(products_str)
        product_ids = list(products_dict.keys())
        if len(product_ids) < 2:
            # Need at least one item for the basket and one to predict.
            continue
        # First half of the order is the observed basket, the rest is held out.
        split_at = len(product_ids) // 2
        target = product_ids[split_at:]
        basket_dict = {item: products_dict[item] for item in product_ids[:split_at]}
        predicted = recommend_items_with_clusters(basket_dict, item_embeddings, cluster_map)
    except Exception:
        # Best-effort evaluation: skip the order but keep count. Unlike a bare
        # `except`, this does not swallow KeyboardInterrupt / SystemExit.
        skipped_orders += 1
        continue
    actual_orders.append(target)
    predicted_orders.append(predicted)

if skipped_orders:
    print(f"⚠️ Skipped {skipped_orders} orders because of errors.")

Evaluating with HDBSCAN: 100%|██████████| 80000/80000 [01:05<00:00, 1217.01it/s]


In [21]:
# === Final metrics ===
# Mean nDCG@20 / Recall@20 over the orders evaluated in the previous cell.
ndcg_hdb = mean_ndcg_at_k(actual_orders, predicted_orders)
recall_hdb = mean_recall_at_k(actual_orders, predicted_orders)

print(
    f"\n📈 Item2Item + MiniLM + HDBSCAN nDCG@20: {ndcg_hdb:.4f}",
    f"📈 Item2Item + MiniLM + HDBSCAN Recall@20: {recall_hdb:.4f}",
    sep="\n",
)


📈 Item2Item + MiniLM + HDBSCAN nDCG@20: 0.1049
📈 Item2Item + MiniLM + HDBSCAN Recall@20: 0.2307
