In [6]:
# ====================================
# Notebook 4: Global Top Recommender (Baseline)
# Description:
# This notebook implements a baseline recommender that always returns
# the globally most popular items. It is evaluated using nDCG@20 and Recall@20.
# ====================================

In [7]:
import os

# === Clone GitHub repository ===
repo_dir = "My-BS-Thesis"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

!git clone https://github.com/Goshmar/My-BS-Thesis

My-BS-Thesis already exists. Removing it...

Cloning into 'My-BS-Thesis'...
remote: Enumerating objects: 131, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (126/126), done.[K
remote: Total 131 (delta 42), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (131/131), 201.78 MiB | 25.25 MiB/s, done.
Resolving deltas: 100% (42/42), done.


In [30]:
# # === Install dependencies from requirements.txt ===
!pip install -r My-BS-Thesis/requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.7/557.7 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.2/160.2 kB[0m [31m104.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.5/768.5 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [13]:
import json
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm

# === Set paths ===
# Both data folders live under <repo_dir>/data; build them in one pass.
data_interim, data_raw = (
    os.path.join(repo_dir, 'data', subfolder) for subfolder in ('interim', 'raw')
)

# Archive holding the pre-filtered orders table that is loaded below.
zip_path = os.path.join(repo_dir, 'data', 'interim', "filtered_orders.zip")

In [14]:
# === Load data ===
import zipfile

# Read the CSV member straight out of the archive; nothing is extracted to disk.
with zipfile.ZipFile(zip_path) as archive, archive.open("filtered_orders.csv") as csv_file:
    df = pd.read_csv(csv_file)

In [17]:
# === Split into train/test by shard ===
# The shard is the remainder of the order id modulo 10:
# shards 0-7 form the training set, shards 8-9 the test set.
df["shard"] = df["id"].mod(10)
train_df = df.loc[df["shard"].lt(8)]
test_df = df.loc[df["shard"].ge(8)]

In [20]:
# === Compute top-N popular items from training data ===
item_counter = Counter()

# Only the "products" column is needed, so iterate over it directly instead of
# building a full row Series per order with iterrows().
for products_repr in tqdm(train_df["products"], total=len(train_df), desc="Counting item frequencies"):
    # NOTE(review): eval() executes whatever expression the CSV cell contains.
    # If the cells are plain dict literals, ast.literal_eval would be the safe
    # replacement -- confirm the stored format before switching.
    # Only the dict keys are counted: each item adds 1 per order it appears in,
    # the dict values are ignored.
    item_counter.update(eval(products_repr).keys())

# Item ids of the 30 most frequent items, most frequent first.
top_30_items = [item for item, _ in item_counter.most_common(30)]

Counting item frequencies: 100%|██████████| 394329/394329 [00:45<00:00, 8686.25it/s]


In [24]:
def recommend_top_items(n=30):
    """Return the globally most popular items, most frequent first.

    The result is a new list sliced from the module-level ``top_30_items``,
    so at most ``len(top_30_items)`` items come back even for a larger ``n``.

    Args:
        n: Maximum number of items to return.

    Returns:
        list: The first ``n`` entries of ``top_30_items``.
    """
    recommendations = top_30_items[:n]
    return recommendations

In [25]:
# Show the 30 most frequent training items together with their counts
# (a bare last expression is rendered as the cell's output).
item_counter.most_common(30)

[('1c', 17757),
 ('2a', 16309),
 ('61', 15405),
 ('1d', 13728),
 ('da3', 11562),
 ('142', 10297),
 ('63', 8159),
 ('1e', 7186),
 ('4', 6944),
 ('67', 6402),
 ('19', 6351),
 ('1cd', 5355),
 ('b', 4906),
 ('fe', 4893),
 ('e', 4794),
 ('8', 4711),
 ('1117', 4659),
 ('bb', 4624),
 ('40', 4550),
 ('15a4', 4364),
 ('a', 4106),
 ('16', 4087),
 ('b6', 3691),
 ('44', 3610),
 ('16b', 3528),
 ('41', 3527),
 ('1ce', 3509),
 ('38', 3504),
 ('190', 3317),
 ('df', 3279)]

In [26]:
# === Define evaluation metrics ===
def ndcg_at_k(actual, predicted, k=20):
    """Compute binary-relevance nDCG@k for a single ranked list.

    Each relevant item contributes ``1 / log2(rank + 1)`` (1-based rank) the
    first time it appears in the top ``k`` predictions. The sum is divided by
    the DCG of an ideal ranking that puts ``min(#relevant, k)`` relevant items
    first.

    A relevant item repeated in ``predicted`` is credited only once, so the
    score always stays within ``[0, 1]``. Duplicates in ``actual`` are
    collapsed as well.

    Args:
        actual: Iterable of relevant (ground-truth) items; items must be hashable.
        predicted: Ranked sequence of recommended items, best first.
        k: Cut-off rank.

    Returns:
        float: nDCG@k, or 0.0 when there are no relevant items or ``k`` is 0.
    """
    relevant = set(actual)  # O(1) membership tests; also de-duplicates
    if not relevant:
        return 0.0

    dcg = 0.0
    credited = set()  # relevant items that already earned gain
    for rank, item in enumerate(predicted[:k]):
        if item in relevant and item not in credited:
            credited.add(item)
            dcg += 1 / np.log2(rank + 2)  # rank is 0-based, hence +2

    ideal_dcg = sum(1 / np.log2(rank + 2) for rank in range(min(len(relevant), k)))
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def mean_ndcg_at_k(actual_list, predicted_list, k=20):
    """Average ``ndcg_at_k`` over paired ground-truth / prediction lists.

    The two inputs are paired positionally with ``zip``, so any surplus
    entries in the longer one are ignored.

    Args:
        actual_list: Iterable of ground-truth item collections, one per order.
        predicted_list: Iterable of ranked recommendation lists, one per order.
        k: Cut-off rank forwarded to ``ndcg_at_k``.

    Returns:
        The ``np.mean`` of the per-order nDCG@k scores.
    """
    per_order_scores = []
    for relevant, ranked in zip(actual_list, predicted_list):
        per_order_scores.append(ndcg_at_k(relevant, ranked, k))
    return np.mean(per_order_scores)

def recall_at_k(actual, predicted, k=20):
    """Compute Recall@k for a single ranked list.

    Recall is the share of distinct relevant items that appear among the top
    ``k`` predictions. Duplicates in ``actual`` are collapsed before counting,
    so numerator and denominator both count distinct items and a perfect
    recommendation always scores 1.0.

    Args:
        actual: Iterable of relevant (ground-truth) items; items must be hashable.
        predicted: Ranked sequence of recommended items, best first.
        k: Cut-off rank.

    Returns:
        float: Recall@k in ``[0, 1]``; 0.0 when there are no relevant items.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0
    hits = relevant.intersection(predicted[:k])
    return len(hits) / len(relevant)

def mean_recall_at_k(actual_list, predicted_list, k=20):
    """Average ``recall_at_k`` over paired ground-truth / prediction lists.

    The two inputs are paired positionally with ``zip``, so any surplus
    entries in the longer one are ignored.

    Args:
        actual_list: Iterable of ground-truth item collections, one per order.
        predicted_list: Iterable of ranked recommendation lists, one per order.
        k: Cut-off rank forwarded to ``recall_at_k``.

    Returns:
        The ``np.mean`` of the per-order Recall@k scores.
    """
    per_order_scores = []
    for relevant, ranked in zip(actual_list, predicted_list):
        per_order_scores.append(recall_at_k(relevant, ranked, k))
    return np.mean(per_order_scores)

In [27]:
# === Evaluate baseline on test set using order splitting ===
actual_orders = []
recommended_orders = []

skipped_orders = 0      # orders with fewer than two items (nothing to split)
unparsable_orders = 0   # orders whose "products" cell could not be evaluated

# The popularity baseline does not look at the basket, so the recommendation
# list is the same for every order: build it once instead of once per row.
# The same list object is shared by all entries of recommended_orders; the
# metric functions only read it.
predicted = recommend_top_items(n=30)

# Only the "products" column is needed, so iterate over it directly instead of
# building a full row Series per order with iterrows().
for products_repr in tqdm(test_df["products"], total=len(test_df), desc="🔍 Evaluating baseline model"):
    try:
        # NOTE(review): eval() executes whatever expression the CSV cell
        # contains. If the cells are plain dict literals, ast.literal_eval
        # would be the safe replacement -- confirm the stored format first.
        full_order = list(eval(products_repr).keys())
    except Exception:
        # Still best-effort (bad rows are dropped), but no longer a bare
        # `except:` that would also swallow KeyboardInterrupt / SystemExit,
        # and the dropped rows are now counted.
        unparsable_orders += 1
        continue

    if len(full_order) <= 1:
        skipped_orders += 1
        continue

    # First half of the order plays the role of the observed basket, the
    # second half is the hold-out target the recommendations are scored on.
    split_point = len(full_order) // 2
    basket = full_order[:split_point]  # not used by this baseline
    target = full_order[split_point:]

    actual_orders.append(target)
    recommended_orders.append(predicted)

🔍 Evaluating baseline model: 100%|██████████| 98670/98670 [00:11<00:00, 8679.23it/s]


In [28]:
# === Final metrics ===
# Score the shared recommendation list against every hold-out target at k=20.
ndcg_20, recall_20 = (
    metric(actual_orders, recommended_orders, k=20)
    for metric in (mean_ndcg_at_k, mean_recall_at_k)
)

print(f"\n📈 nDCG@20: {ndcg_20:.4f}", f"📈 Recall@20: {recall_20:.4f}", sep="\n")


📈 nDCG@20: 0.1146
📈 Recall@20: 0.1908
