In [1]:
# ====================================
# Notebook 6.1: Item2Item based on Binary / Quantity Matrix
# Description:
# This notebook trains cosine similarity-based item-item models on binary and quantity user-item matrices.
# The models are evaluated using nDCG@20 and Recall@20.
# ====================================

In [18]:
import os

# === Clone GitHub repository ===
repo_dir = "My-BS-Thesis"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

!git clone https://github.com/Goshmar/My-BS-Thesis

My-BS-Thesis already exists. Removing it...

Cloning into 'My-BS-Thesis'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (133/133), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 133 (delta 43), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (133/133), 201.78 MiB | 19.72 MiB/s, done.
Resolving deltas: 100% (43/43), done.


In [19]:
# === Install dependencies from requirements.txt ===
!pip install -r My-BS-Thesis/requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.7/557.7 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.2/160.2 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m113.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.5/768.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install implicit -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import pickle
import zipfile
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import load_npz, csr_matrix
from implicit.nearest_neighbours import CosineRecommender

In [5]:
# === Set paths ===
processed_data = os.path.join(repo_dir, 'data', 'processed')
artifacts_dir = os.path.join(repo_dir, 'artifacts')

binary_matrix_path = os.path.join(processed_data, 'user_item_binary_matrix.npz')
quantity_matrix_path = os.path.join(processed_data, 'user_item_quantity_matrix.npz')
train_zip_path = os.path.join(processed_data, 'train_df.zip')
test_zip_path = os.path.join(processed_data, 'test_df.zip')
map_path = os.path.join(artifacts_dir, 'mapping.pkl')

In [6]:
# === Load data ===
user_item_binary_matrix = load_npz(binary_matrix_path)
user_item_quantity_matrix = load_npz(quantity_matrix_path)
print("✅ User-item matrices loaded.")

with zipfile.ZipFile(train_zip_path, 'r') as zip_ref:
    with zip_ref.open("train_df.csv") as f:
        train_df = pd.read_csv(f)

with zipfile.ZipFile(test_zip_path, 'r') as zip_ref:
    with zip_ref.open("test_df.csv") as f:
        test_df = pd.read_csv(f)
print("✅ Train and test data loaded.")

with open(map_path, 'rb') as f:
    mappings = pickle.load(f)
user_map = mappings['user_map']
item_map = mappings['item_map']
reverse_item_map = mappings['reverse_item_map']

✅ User-item matrices loaded.
✅ Train and test data loaded.


In [10]:
# === Train and evaluate on BINARY matrix ===
print("\n🔧 Training Item2Item model on BINARY matrix...")
item_model = CosineRecommender(K=10)
item_model.fit(user_item_binary_matrix)
print("✅ Item2Item model trained.")


🔧 Training Item2Item model on BINARY matrix...




  0%|          | 0/12881 [00:00<?, ?it/s]

✅ Item2Item model trained.


In [11]:
# === Recommended functions ===
def make_coo_row(products):
    row_data, row_cols = [], []
    for item, quantity in products.items():
        if item in item_map:
            q = quantity[0] if isinstance(quantity, list) else quantity
            row_cols.append(item_map[item])
            row_data.append(float(q))
    return csr_matrix((row_data, ([0]*len(row_data), row_cols)), shape=(1, len(item_map)))

def recommend_items(model, products, n=30):
    row = make_coo_row(products).tocsr()
    recs = model.recommend(
        userid=0,
        user_items=row,
        N=n,
        filter_already_liked_items=True,
        recalculate_user=True
    )
    return [reverse_item_map[idx] for idx in recs[0]]

In [12]:
def ndcg_at_k(actual, predicted, k=20):
    dcg = sum(1 / np.log2(i + 2) for i, p in enumerate(predicted[:k]) if p in actual)
    ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(actual), k)))
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def mean_ndcg_at_k(actual_list, predicted_list, k=20):
    return np.mean([ndcg_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)])

def recall_at_k(actual, predicted, k=20):
    return len(set(predicted[:k]) & set(actual)) / len(actual) if actual else 0.0

def mean_recall_at_k(actual_list, predicted_list, k=20):
    return np.mean([recall_at_k(a, p, k) for a, p in zip(actual_list, predicted_list)])

In [13]:
# === Evaluate on test set ===
actual_orders = []
predicted_orders = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="📊 Evaluating BINARY model"):
    try:
        products_dict = eval(row["products"])
        product_ids = list(products_dict.keys())
    except:
        continue

    if len(product_ids) < 2:
        continue

    basket = product_ids[:len(product_ids)//2]
    target = product_ids[len(product_ids)//2:]

    predicted = recommend_items(item_model, {item: products_dict[item] for item in basket})
    actual_orders.append(target)
    predicted_orders.append(predicted)

📊 Evaluating BINARY model: 100%|██████████| 80000/80000 [00:27<00:00, 2961.93it/s]


In [14]:
# === Final metrics ===
ndcg_20_binary = mean_ndcg_at_k(actual_orders, predicted_orders)
recall_20_binary = mean_recall_at_k(actual_orders, predicted_orders)

print(f"\n📈 Item2Item (BINARY) nDCG@20: {ndcg_20_binary:.4f}")
print(f"📈 Item2Item (BINARY) Recall@20: {recall_20_binary:.4f}")


📈 Item2Item (BINARY) nDCG@20: 0.1754
📈 Item2Item (BINARY) Recall@20: 0.2481


In [15]:
# === Train and evaluate on QUANTITY matrix ===
print("\n🔧 Training Item2Item model on QUANTITY matrix...")
item_model = CosineRecommender(K=10)
item_model.fit(user_item_quantity_matrix)
print("✅ Item2Item model trained.")


🔧 Training Item2Item model on QUANTITY matrix...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]


  0%|          | 0/12881 [00:00<?, ?it/s]

✅ Item2Item model trained.


In [16]:
# === Evaluate on test set ===
actual_orders = []
predicted_orders = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="📊 Evaluating QUANTITY model"):
    try:
        products_dict = eval(row["products"])
        product_ids = list(products_dict.keys())
    except:
        continue

    if len(product_ids) < 2:
        continue

    basket = product_ids[:len(product_ids)//2]
    target = product_ids[len(product_ids)//2:]

    predicted = recommend_items(item_model, {item: products_dict[item] for item in basket})
    actual_orders.append(target)
    predicted_orders.append(predicted)

📊 Evaluating QUANTITY model: 100%|██████████| 80000/80000 [00:31<00:00, 2574.55it/s]


In [17]:
# === Final metrics ===
ndcg_20_quantity = mean_ndcg_at_k(actual_orders, predicted_orders)
recall_20_quantity = mean_recall_at_k(actual_orders, predicted_orders)

print(f"\n📈 Item2Item (QUANTITY) nDCG@20: {ndcg_20_quantity:.4f}")
print(f"📈 Item2Item (QUANTITY) Recall@20: {recall_20_quantity:.4f}")


📈 Item2Item (QUANTITY) nDCG@20: 0.1069
📈 Item2Item (QUANTITY) Recall@20: 0.1530
