In [None]:
# ====================================
# Notebook 6.2: Item2Item + MiniLM/SBERT re-ranking
# Description:
# This notebook trains a cosine-similarity Item2Item model on a binary user-item matrix
# and re-ranks its recommendations by semantic similarity computed from MiniLM and SBERT embeddings.
# Both re-ranked variants are evaluated with nDCG@20 and Recall@20.
# ====================================

In [None]:
import os

# === Clone GitHub repository ===
repo_dir = "My-BS-Thesis"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

!git clone https://github.com/Goshmar/My-BS-Thesis

Cloning into 'My-BS-Thesis'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (133/133), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 133 (delta 43), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (133/133), 201.78 MiB | 13.61 MiB/s, done.
Resolving deltas: 100% (43/43), done.


In [None]:
# === Install dependencies from requirements.txt ===
!pip install -r My-BS-Thesis/requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.7/557.7 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.2/160.2 kB[0m [31m117.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m107.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.5/768.5 MB[0m [31m756.2 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install implicit -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pickle
import zipfile
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import load_npz, csr_matrix
from implicit.nearest_neighbours import CosineRecommender
from sklearn.preprocessing import normalize

In [None]:
# === Set paths ===
# Base folders inside the cloned repository.
processed_data, artifacts_dir = (
    os.path.join(repo_dir, *parts)
    for parts in (('data', 'processed'), ('artifacts',))
)

# Files read from the processed-data folder.
binary_matrix_path, train_zip_path, test_zip_path = (
    os.path.join(processed_data, name)
    for name in ('user_item_binary_matrix.npz', 'train_df.zip', 'test_df.zip')
)

# Files read from the artifacts folder: the pickled mappings and the
# embedding archives, each of which is split into two parts.
map_path = os.path.join(artifacts_dir, 'mapping.pkl')
minilm_zip1, minilm_zip2, sbert_zip1, sbert_zip2 = (
    os.path.join(artifacts_dir, name)
    for name in (
        'item_embeddings_minilm_part1.zip',
        'item_embeddings_minilm_part2.zip',
        'item_embeddings_sbert_part1.zip',
        'item_embeddings_sbert_part2.zip',
    )
)

In [None]:
# === Load datasets ===
user_item_binary_matrix = load_npz(binary_matrix_path)

# Read each CSV member directly from its archive (nothing is extracted to disk).
with zipfile.ZipFile(train_zip_path, 'r') as train_archive, \
        train_archive.open("train_df.csv") as train_csv:
    train_df = pd.read_csv(train_csv)

with zipfile.ZipFile(test_zip_path, 'r') as test_archive, \
        test_archive.open("test_df.csv") as test_csv:
    test_df = pd.read_csv(test_csv)

# NOTE(review): pickle.load can execute arbitrary code from the file;
# only load mapping files that come from a trusted source.
with open(map_path, 'rb') as mapping_file:
    mappings = pickle.load(mapping_file)

# Unpack the three lookup tables stored in the pickle.
user_map = mappings['user_map']
item_map = mappings['item_map']
reverse_item_map = mappings['reverse_item_map']

print("✅ Binary matrix and data loaded.")

def load_embeddings_from_zip(zip_path):
    """Unpickle the first file stored inside a zip archive.

    Directory entries (e.g. a leading ``folder/`` record created when a whole
    folder was zipped) are skipped, so the first *real* file is loaded.

    Args:
        zip_path: Path to the zip archive.

    Returns:
        The object produced by ``pickle.load`` on the selected member.

    Raises:
        ValueError: If the archive contains no file entries.
    """
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        file_members = [info for info in zipf.infolist() if not info.is_dir()]
        if not file_members:
            raise ValueError(f"No file found inside archive: {zip_path}")
        with zipf.open(file_members[0]) as f:
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file; only use this on archives from a trusted source.
            return pickle.load(f)

# MiniLM embeddings are stored in two archives; merge them into one mapping.
emb_minilm_1 = load_embeddings_from_zip(minilm_zip1)
emb_minilm_2 = load_embeddings_from_zip(minilm_zip2)
item_embeddings_minilm = dict(emb_minilm_1)
item_embeddings_minilm.update(emb_minilm_2)  # part 2 wins on duplicate keys
print(f"✅ MiniLM embeddings loaded: {len(item_embeddings_minilm)} items.")

# Same two-part layout for the SBERT embeddings.
emb_sbert_1 = load_embeddings_from_zip(sbert_zip1)
emb_sbert_2 = load_embeddings_from_zip(sbert_zip2)
item_embeddings_sbert = dict(emb_sbert_1)
item_embeddings_sbert.update(emb_sbert_2)  # part 2 wins on duplicate keys
print(f"✅ SBERT embeddings loaded: {len(item_embeddings_sbert)} items.")

✅ Binary matrix and data loaded.
✅ MiniLM embeddings loaded: 12881 items.
✅ SBERT embeddings loaded: 12881 items.


In [None]:
# === Train Item2Item model ===
# Value passed as CosineRecommender's K argument
# (presumably the number of neighbours kept per item - see the implicit docs).
neighbours_per_item = 10

item_model = CosineRecommender(K=neighbours_per_item)
item_model.fit(user_item_binary_matrix)
print("✅ Item2Item model trained.")



  0%|          | 0/12881 [00:00<?, ?it/s]

✅ Item2Item model trained.


In [None]:
# === Recommendation functions ===
def make_coo_row(products):
    """Turn a basket into a single-row sparse matrix over the known items.

    Args:
        products: Mapping of item id -> quantity. A quantity given as a list
            contributes only its first element.

    Returns:
        A ``csr_matrix`` of shape ``(1, len(item_map))`` holding the quantity
        (as float) in the column ``item_map`` assigns to each item. Items that
        are not present in ``item_map`` are ignored.
    """
    values, columns = [], []
    for item, quantity in products.items():
        if item not in item_map:
            continue  # item unknown to the model
        if isinstance(quantity, list):
            q = quantity[0]
        else:
            q = quantity
        columns.append(item_map[item])
        values.append(float(q))

    row_index = [0] * len(values)  # every entry lives in the single row 0
    return csr_matrix((values, (row_index, columns)), shape=(1, len(item_map)))

def recommend_with_embeddings(products, embeddings, n=30):
    """Recommend items for a basket and re-rank them by embedding similarity.

    The item-to-item model (global ``item_model``) produces up to ``n``
    candidates for the basket. The candidates are then ordered by the dot
    product between their embedding and the mean embedding of the basket items.

    Args:
        products: Mapping of item id -> quantity describing the basket.
        embeddings: Mapping of item id -> embedding vector.
        n: Number of candidates requested from the model and maximum number
            of items returned.

    Returns:
        A list of item ids (values of ``reverse_item_map``). If no basket item
        has an embedding, the candidates are returned in the model's order.
    """
    # make_coo_row already returns a CSR matrix, so no conversion is needed.
    row = make_coo_row(products)
    recs = item_model.recommend(
        userid=0,
        user_items=row,
        N=n,
        filter_already_liked_items=True,
        recalculate_user=True
    )
    # recs[0] is iterated as the sequence of recommended item indices.
    candidate_items = [reverse_item_map[idx] for idx in recs[0]]

    basket_embeddings = [embeddings[item] for item in products if item in embeddings]
    if not basket_embeddings:
        return candidate_items[:n]

    basket_embedding = np.mean(basket_embeddings, axis=0)

    def semantic_score(item):
        """Dot-product similarity to the basket; -inf when no embedding exists."""
        if item in embeddings:
            return np.dot(basket_embedding, embeddings[item])
        # -inf (rather than -1) guarantees items without an embedding rank
        # last even if real dot products fall below -1 (unnormalised vectors).
        return float("-inf")

    # sorted() is stable, so ties keep the order produced by the model.
    ranked_items = sorted(candidate_items, key=semantic_score, reverse=True)
    return ranked_items[:n]

In [None]:
def ndcg_at_k(actual, predicted, k=20):
    """Compute binary-relevance nDCG over the first ``k`` predictions.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of recommended items.
        k: Cut-off rank.

    Returns:
        DCG of the hits among ``predicted[:k]`` divided by the DCG of an ideal
        ranking with ``min(len(actual), k)`` hits, or 0.0 when that ideal DCG
        is zero.
    """
    dcg = 0
    for rank, item in enumerate(predicted[:k]):
        if item in actual:
            dcg += 1 / np.log2(rank + 2)  # rank is 0-based, hence the +2

    ideal_dcg = 0
    for rank in range(min(len(actual), k)):
        ideal_dcg += 1 / np.log2(rank + 2)

    if ideal_dcg > 0:
        return dcg / ideal_dcg
    return 0.0

def mean_ndcg_at_k(actual_list, predicted_list, k=20):
    """Average ``ndcg_at_k`` over paired ground-truth and predicted lists.

    Args:
        actual_list: One collection of relevant items per evaluated sample.
        predicted_list: One ranked recommendation list per evaluated sample,
            in the same order as ``actual_list``.
        k: Cut-off rank forwarded to ``ndcg_at_k``.

    Returns:
        The mean of the per-sample nDCG@k values.

    Raises:
        ValueError: If the two inputs differ in length. ``zip`` would
            otherwise silently drop the surplus entries and average over
            misaligned pairs.
    """
    actual_list, predicted_list = list(actual_list), list(predicted_list)
    if len(actual_list) != len(predicted_list):
        raise ValueError(
            "actual_list and predicted_list must have the same length: "
            f"got {len(actual_list)} and {len(predicted_list)}"
        )
    return np.mean([ndcg_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)])

def recall_at_k(actual, predicted, k=20):
    """Compute the share of relevant items found in the first ``k`` predictions.

    Args:
        actual: List of relevant items.
        predicted: Ranked list of recommended items.
        k: Cut-off rank.

    Returns:
        Number of distinct items shared by ``predicted[:k]`` and ``actual``
        divided by ``len(actual)``; 0.0 when ``actual`` is empty.
    """
    if not actual:
        return 0.0
    hits = set(predicted[:k]).intersection(set(actual))
    return len(hits) / len(actual)

def mean_recall_at_k(actual_list, predicted_list, k=20):
    """Average ``recall_at_k`` over paired ground-truth and predicted lists.

    Args:
        actual_list: One list of relevant items per evaluated sample.
        predicted_list: One ranked recommendation list per evaluated sample,
            in the same order as ``actual_list``.
        k: Cut-off rank forwarded to ``recall_at_k``.

    Returns:
        The mean of the per-sample Recall@k values.

    Raises:
        ValueError: If the two inputs differ in length. ``zip`` would
            otherwise silently drop the surplus entries and average over
            misaligned pairs.
    """
    actual_list, predicted_list = list(actual_list), list(predicted_list)
    if len(actual_list) != len(predicted_list):
        raise ValueError(
            "actual_list and predicted_list must have the same length: "
            f"got {len(actual_list)} and {len(predicted_list)}"
        )
    return np.mean([recall_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)])

In [None]:
# === Evaluate on test set ===
# For every test order, the first half of its product ids is used as the
# basket and the second half as the ground truth to be predicted.
actual_orders = []
predicted_minilm = []
skipped_minilm = 0  # rows dropped because parsing or recommendation failed

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="📊 Evaluating Item2Item + MiniLM"):
    try:
        # NOTE(security): eval() executes whatever code the "products" string
        # contains. Only run this on a trusted dataset; if the column holds
        # plain literals, ast.literal_eval would be the safe replacement.
        products_dict = eval(row["products"])
        product_ids = list(products_dict.keys())
        if len(product_ids) < 2:
            continue  # cannot split a single-item order into basket + target
        split_at = len(product_ids) // 2
        basket = product_ids[:split_at]
        target = product_ids[split_at:]
        predicted = recommend_with_embeddings({item: products_dict[item] for item in basket}, item_embeddings_minilm)
    except Exception:
        # Best-effort evaluation: skip rows that fail, but count them.
        # `except Exception` (not a bare except) keeps kernel interrupts working.
        skipped_minilm += 1
        continue
    else:
        actual_orders.append(target)
        predicted_minilm.append(predicted)

if skipped_minilm:
    print(f"⚠️ Skipped {skipped_minilm} rows because of errors.")

📊 Evaluating Item2Item + MiniLM: 100%|██████████| 80000/80000 [00:28<00:00, 2776.61it/s]


In [None]:
# === Final metrics ===
# k is passed explicitly so the cut-off visibly matches the "@20" labels.
ndcg_20_minilm = mean_ndcg_at_k(actual_orders, predicted_minilm, k=20)
recall_20_minilm = mean_recall_at_k(actual_orders, predicted_minilm, k=20)

# Report both metrics with four decimals.
print(f"\n📈 Item2Item + MiniLM nDCG@20: {ndcg_20_minilm:.4f}")
print(f"📈 Item2Item + MiniLM Recall@20: {recall_20_minilm:.4f}")


📈 Item2Item + MiniLM nDCG@20: 0.1086
📈 Item2Item + MiniLM Recall@20: 0.2410


In [None]:
# === Evaluate on test set ===
# Same split as the MiniLM run: first half of each order is the basket,
# second half is the ground truth. The predictions are later scored against
# `actual_orders` from the MiniLM run, so both runs must keep the same rows.
predicted_sbert = []
sbert_targets = []  # ground truth of the rows kept here, used for the alignment check
skipped_sbert = 0   # rows dropped because parsing or recommendation failed

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="📊 Evaluating Item2Item + SBERT"):
    try:
        # NOTE(security): eval() executes whatever code the "products" string
        # contains. Only run this on a trusted dataset; if the column holds
        # plain literals, ast.literal_eval would be the safe replacement.
        products_dict = eval(row["products"])
        product_ids = list(products_dict.keys())
        if len(product_ids) < 2:
            continue  # cannot split a single-item order into basket + target
        split_at = len(product_ids) // 2
        basket = product_ids[:split_at]
        target = product_ids[split_at:]
        predicted = recommend_with_embeddings({item: products_dict[item] for item in basket}, item_embeddings_sbert)
    except Exception:
        # Best-effort evaluation: skip rows that fail, but count them.
        # `except Exception` (not a bare except) keeps kernel interrupts working.
        skipped_sbert += 1
        continue
    else:
        sbert_targets.append(target)
        predicted_sbert.append(predicted)

if skipped_sbert:
    print(f"⚠️ Skipped {skipped_sbert} rows because of errors.")

# Fail loudly instead of computing metrics on misaligned (target, prediction) pairs.
if sbert_targets != actual_orders:
    raise RuntimeError(
        "SBERT predictions are not aligned with actual_orders from the MiniLM run: "
        f"{len(sbert_targets)} rows evaluated here vs {len(actual_orders)} there."
    )

📊 Evaluating Item2Item + SBERT: 100%|██████████| 80000/80000 [00:30<00:00, 2623.50it/s]


In [None]:
# === Final metrics ===
# k is passed explicitly so the cut-off visibly matches the "@20" labels.
ndcg_20_sbert = mean_ndcg_at_k(actual_orders, predicted_sbert, k=20)
recall_20_sbert = mean_recall_at_k(actual_orders, predicted_sbert, k=20)

# Report both metrics with four decimals.
print(f"\n📈 Item2Item + SBERT nDCG@20: {ndcg_20_sbert:.4f}")
print(f"📈 Item2Item + SBERT Recall@20: {recall_20_sbert:.4f}")


📈 Item2Item + SBERT nDCG@20: 0.1081
📈 Item2Item + SBERT Recall@20: 0.2378
