In [None]:
# ====================================
# Notebook 5.2: ALS + MiniLM re-ranking
# Description:
# This notebook trains an ALS model on the binary user-item matrix and
# re-ranks its recommendations using MiniLM semantic similarity.
# Evaluated using nDCG@20 and Recall@20.
# ====================================

In [None]:
import os

# === Clone GitHub repository ===
repo_dir = "My-BS-Thesis"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

!git clone https://github.com/Goshmar/My-BS-Thesis

Cloning into 'My-BS-Thesis'...
remote: Enumerating objects: 131, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (126/126), done.[K
remote: Total 131 (delta 42), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (131/131), 201.78 MiB | 25.48 MiB/s, done.
Resolving deltas: 100% (42/42), done.


In [None]:
# === Install dependencies from requirements.txt ===
!pip install -r My-BS-Thesis/requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m107.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.7/557.7 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.2/160.2 kB[0m [31m121.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m125.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m75.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install implicit -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import ast
import json
import pickle
import random
import zipfile

import numpy as np
import pandas as pd
from implicit.als import AlternatingLeastSquares
from scipy.sparse import load_npz, csr_matrix
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [None]:
# === Set paths ===
# Two base folders inside the cloned repository.
artifacts_dir = os.path.join(repo_dir, 'artifacts')
processed_data = os.path.join(repo_dir, 'data', 'processed')

# Files read from data/processed: the binary user-item matrix and the
# zipped train/test tables.
binary_matrix_path, train_zip_path, test_zip_path = (
    os.path.join(processed_data, file_name)
    for file_name in ('user_item_binary_matrix.npz', 'train_df.zip', 'test_df.zip')
)

# Files read from artifacts: the id mappings and the MiniLM embeddings,
# which are split into 2 zip parts.
map_path, embedding_zip1, embedding_zip2 = (
    os.path.join(artifacts_dir, file_name)
    for file_name in (
        'mapping.pkl',
        'item_embeddings_minilm_part1.zip',
        'item_embeddings_minilm_part2.zip',
    )
)

In [None]:
# === Load matrices and data ===
user_item_binary_matrix = load_npz(binary_matrix_path)
print("✅ Binary matrix loaded.")

# Each split is a single CSV member inside its archive; it is read directly
# from the zip without extracting it to disk.
with zipfile.ZipFile(train_zip_path, 'r') as train_zip, train_zip.open("train_df.csv") as train_csv:
    train_df = pd.read_csv(train_csv)

with zipfile.ZipFile(test_zip_path, 'r') as test_zip, test_zip.open("test_df.csv") as test_csv:
    test_df = pd.read_csv(test_csv)
print("✅ Train and test data loaded.")

# NOTE: unpickling can execute arbitrary code, so this is only safe because
# the file comes from the repository cloned above.
with open(map_path, 'rb') as map_file:
    mappings = pickle.load(map_file)

# Unpack the three lookup tables stored in the mapping file.
user_map, item_map, reverse_item_map = (
    mappings[key] for key in ('user_map', 'item_map', 'reverse_item_map')
)

# === Load and merge MiniLM embeddings ===
def load_embeddings_from_zip(zip_path):
    """Unpickle the first file stored in a zip archive.

    Directory entries (names ending in "/") are skipped, so an archive that
    was created from a folder still resolves to the actual pickle file
    instead of trying to unpickle an empty directory record.

    Args:
        zip_path: Path to the zip archive.

    Returns:
        The object stored in the first file member of the archive.

    Raises:
        ValueError: If the archive contains no file members.
    """
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        file_members = [info for info in zipf.infolist() if not info.is_dir()]
        if not file_members:
            raise ValueError(f"No files found in archive: {zip_path}")
        with zipf.open(file_members[0]) as f:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # use this on archives from a trusted source.
            return pickle.load(f)

# Read both archive parts, then fold them into one lookup table.
embeddings_part1, embeddings_part2 = (
    load_embeddings_from_zip(part_path)
    for part_path in (embedding_zip1, embedding_zip2)
)

# If a key appears in both parts, the entry from part 2 overrides part 1.
item_embeddings = dict(embeddings_part1)
item_embeddings.update(embeddings_part2)
print(f"✅ MiniLM embeddings loaded: {len(item_embeddings)} items.")

✅ Binary matrix loaded.
✅ Train and test data loaded.
✅ MiniLM embeddings loaded: 12881 items.


In [None]:
# === Train ALS model ===
print("\n🔧 Training ALS model...")

# Fixed seed for the model's random number generator, so repeated runs of the
# notebook train the same model and report the same metrics.
ALS_SEED = 42

als_model = AlternatingLeastSquares(
    factors=16,
    iterations=8,
    regularization=0.0,
    random_state=ALS_SEED,
)
als_model.fit(user_item_binary_matrix)
print("✅ ALS model trained.")


🔧 Training ALS model...


  check_blas_config()


  0%|          | 0/8 [00:00<?, ?it/s]

✅ ALS model trained.


In [None]:
# === Recommendation functions ===
def make_coo_row(products):
    """Turn a basket into a single-row sparse matrix over all mapped items.

    Args:
        products: Dict of item id -> quantity. A quantity given as a list
            contributes its first element.

    Returns:
        A CSR matrix of shape (1, len(item_map)) whose non-zero columns are
        the `item_map` indices of the basket items. Items that are not in
        `item_map` are left out.
    """
    # Keep only the items that have a column index, paired with their quantity.
    known = [(item_map[item], qty) for item, qty in products.items() if item in item_map]

    col_idx = [col for col, _ in known]
    values = [float(qty[0] if isinstance(qty, list) else qty) for _, qty in known]
    row_idx = [0] * len(values)  # every entry belongs to the single row 0

    return csr_matrix((values, (row_idx, col_idx)), shape=(1, len(item_map)))

def recommend_items(products, n=30, n_candidates=None):
    """Recommend items with ALS and re-rank them by MiniLM embedding similarity.

    Args:
        products: Dict of item id -> quantity describing the current basket.
        n: Number of items to return.
        n_candidates: Number of ALS candidates to retrieve before re-ranking.
            Defaults to `n`, in which case re-ranking only reorders the ALS
            top-n. A larger value lets the semantic score promote items from
            further down the ALS list. Values below `n` are raised to `n`.

    Returns:
        A list of at most `n` item ids, ordered by decreasing dot product
        between the mean basket embedding and the candidate embedding. If no
        basket item has an embedding, the ALS order is returned unchanged.
    """
    pool_size = n if n_candidates is None else max(n, n_candidates)

    # make_coo_row already returns a CSR matrix, so no conversion is needed.
    user_row = make_coo_row(products)
    raw_recs = als_model.recommend(
        userid=0,
        user_items=user_row,
        N=pool_size,
        filter_already_liked_items=True,
        recalculate_user=True,
    )
    # raw_recs[0] holds the recommended item indices; map them back to ids.
    candidate_items = [reverse_item_map[idx] for idx in raw_recs[0]]

    basket_embeddings = [item_embeddings[item] for item in products if item in item_embeddings]
    if not basket_embeddings:
        # Nothing to compare against: fall back to the ALS ranking.
        return candidate_items[:n]

    basket_embedding = np.mean(basket_embeddings, axis=0)

    def similarity(item):
        # NOTE(review): this is a raw dot product; it equals cosine similarity
        # only if the stored embeddings are unit-normalised — confirm.
        if item in item_embeddings:
            return np.dot(basket_embedding, item_embeddings[item])
        # Candidates without an embedding get a fixed score of -1.
        return -1

    # sorted() is stable, so candidates with equal scores keep their ALS order.
    ranked_items = sorted(candidate_items, key=similarity, reverse=True)
    return ranked_items[:n]

In [None]:
def ndcg_at_k(actual, predicted, k=20):
    """Compute binary-relevance nDCG@k for one ranked list.

    Each relevant item is credited once, at the first position where it
    appears in `predicted[:k]`, and the ideal DCG is based on the number of
    distinct relevant items. This keeps the score within [0, 1] even when
    either list contains duplicates.

    Args:
        actual: Iterable of relevant (hashable) item ids.
        predicted: Ranked sequence of predicted item ids, best first.
        k: Cutoff rank.

    Returns:
        nDCG@k as a float, or 0.0 when there are no relevant items.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0

    credited = set()
    dcg = 0.0
    for rank, item in enumerate(predicted[:k]):
        if item in relevant and item not in credited:
            credited.add(item)
            dcg += 1 / np.log2(rank + 2)

    # Best achievable DCG: every distinct relevant item at the top ranks.
    ideal_dcg = sum(1 / np.log2(rank + 2) for rank in range(min(len(relevant), k)))
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def mean_ndcg_at_k(actual_list, predicted_list, k=20):
    """Average nDCG@k over paired lists of relevant and predicted items.

    Args:
        actual_list: Sequence of relevant-item collections, one per order.
        predicted_list: Sequence of ranked predictions, aligned with
            `actual_list`.
        k: Cutoff rank passed to `ndcg_at_k`.

    Returns:
        The mean nDCG@k, or 0.0 when there are no pairs to evaluate (instead
        of the nan and RuntimeWarning that np.mean gives for an empty list).
    """
    scores = [ndcg_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)]
    return np.mean(scores) if scores else 0.0

def recall_at_k(actual, predicted, k=20):
    """Compute Recall@k for one ranked list.

    Both the hits and the denominator are counted over distinct relevant
    items, so duplicates in `actual` cannot keep a perfect prediction from
    reaching 1.0.

    Args:
        actual: Iterable of relevant (hashable) item ids.
        predicted: Ranked sequence of predicted item ids, best first.
        k: Cutoff rank.

    Returns:
        The fraction of distinct relevant items found in `predicted[:k]`,
        or 0.0 when there are no relevant items.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0
    hits = relevant.intersection(predicted[:k])
    return len(hits) / len(relevant)

def mean_recall_at_k(actual_list, predicted_list, k=20):
    """Average Recall@k over paired lists of relevant and predicted items.

    Args:
        actual_list: Sequence of relevant-item collections, one per order.
        predicted_list: Sequence of ranked predictions, aligned with
            `actual_list`.
        k: Cutoff rank passed to `recall_at_k`.

    Returns:
        The mean Recall@k, or 0.0 when there are no pairs to evaluate (instead
        of the nan and RuntimeWarning that np.mean gives for an empty list).
    """
    scores = [recall_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)]
    return np.mean(scores) if scores else 0.0

In [None]:
# === Evaluate on test set ===
# Protocol: the first half of each order is the observed basket and the
# second half is the hidden target that the recommender should recover.
actual_orders = []
predicted_orders = []
skipped_rows = 0  # rows whose "products" cell could not be parsed into a dict

for raw_products in tqdm(test_df["products"], total=len(test_df), desc="📊 Evaluating ALS+MiniLM"):
    try:
        # literal_eval only accepts Python literals, so text from the data
        # file can never run code the way eval() would.
        products_dict = ast.literal_eval(raw_products)
        product_ids = list(products_dict.keys())
    except (ValueError, TypeError, SyntaxError, AttributeError):
        # Malformed or non-dict cell: skip it, but keep count so the loss
        # of rows is visible instead of silent.
        skipped_rows += 1
        continue

    # At least one basket item and one target item are required.
    if len(product_ids) < 2:
        continue

    split_at = len(product_ids) // 2
    basket = product_ids[:split_at]
    target = product_ids[split_at:]
    predicted = recommend_items({item: products_dict[item] for item in basket})

    actual_orders.append(target)
    predicted_orders.append(predicted)

if skipped_rows:
    print(f"⚠️ Skipped {skipped_rows} rows with unparseable 'products' values.")

📊 Evaluating ALS+MiniLM: 100%|██████████| 80000/80000 [00:53<00:00, 1492.12it/s]


In [None]:
# === Final metrics ===
# Both helpers are called with their default cutoff (k=20) and average the
# per-order score over every evaluated order.
ndcg_20, recall_20 = (
    metric(actual_orders, predicted_orders)
    for metric in (mean_ndcg_at_k, mean_recall_at_k)
)

print(f"\n📈 ALS + MiniLM nDCG@20: {ndcg_20:.4f}")
print(f"📈 ALS + MiniLM Recall@20: {recall_20:.4f}")


📈 ALS + MiniLM nDCG@20: 0.0852
📈 ALS + MiniLM Recall@20: 0.1831
