In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Load the cleaned training data from a parquet file into a DataFrame.
all_data = pd.read_parquet("train_cleaned.parquet")

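Build an implicit-feedback interaction matrix, then split the data into train and test sets by search session (`srch_id`), so that every row of a session ends up on the same side of the split.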

In [None]:
from scipy.sparse import csr_matrix
# Implicit feedback strength per row: a click contributes 1 and a booking
# contributes 5 (so a clicked-and-booked row scores 6, assuming 0/1 flags).
all_data['interaction'] = all_data['booking_bool'] * 5 + all_data['click_bool']

# Map srch_id / prop_id to 0-based category codes that can serve as
# matrix row / column indexes.
user_ids = all_data['srch_id'].astype("category").cat.codes
item_ids = all_data['prop_id'].astype("category").cat.codes
all_data['user_index'], all_data['item_index'] = user_ids, item_ids

# Sparse user x item matrix holding the interaction strengths.
interaction_matrix = csr_matrix(
    (
        all_data['interaction'],
        (all_data['user_index'], all_data['item_index']),
    )
)

In [None]:
from sklearn.model_selection import train_test_split

# Split by user (search session) rather than by row, so that all rows of one
# session land on the same side of the split.
unique_users = all_data['user_index'].unique()
train_users, test_users = train_test_split(unique_users, test_size=0.2, random_state=42)

train_mask = all_data['user_index'].isin(train_users)
test_mask = all_data['user_index'].isin(test_users)

train_data = all_data[train_mask]
test_data = all_data[test_mask]

# Every user belongs to exactly one side, so no row may be lost or duplicated.
assert len(train_data) + len(test_data) == len(all_data)

# Pin both matrices to the full (n_users, n_items) shape. Without an explicit
# shape, csr_matrix infers the dimensions from the largest index present in
# each subset, so train and test could end up with different column counts
# and a test row could not be used as a query against a model fit on
# train_matrix.
train_matrix = csr_matrix(
    (train_data['interaction'], (train_data['user_index'], train_data['item_index'])),
    shape=interaction_matrix.shape,
)
test_matrix = csr_matrix(
    (test_data['interaction'], (test_data['user_index'], test_data['item_index'])),
    shape=interaction_matrix.shape,
)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the rows of train_matrix: cosine distance,
# brute-force search, 10 neighbours per query, all available cores.
knn = NearestNeighbors(
    n_neighbors=10,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn.fit(train_matrix)


In [None]:
# 6. Define NDCG@5 function
def ndcg_at_k(recommended, relevant, k=5):
    """Compute binary-relevance NDCG@k for a ranked list of item ids.

    Args:
        recommended: Ranked sequence of item ids, best first. Only the first
            ``k`` entries are considered.
        relevant: Iterable of item ids counted as relevant. Duplicates are
            ignored.
        k: Cut-off rank.

    Returns:
        DCG of the top-``k`` recommendations divided by the ideal DCG, in
        ``[0, 1]``. Returns ``0.0`` when there are no relevant items or
        ``k`` is not positive.
    """
    # A set gives O(1) membership tests and collapses duplicate ids, which
    # would otherwise inflate the ideal DCG.
    relevant_set = set(relevant)
    if k <= 0 or not relevant_set:
        return 0.0

    dcg = 0.0
    credited = set()
    for rank, item in enumerate(recommended[:k]):
        # Credit each relevant item once, so a repeated recommendation
        # cannot push the score above 1.
        if item in relevant_set and item not in credited:
            credited.add(item)
            dcg += 1.0 / np.log2(rank + 2)

    # Ideal ranking: every relevant item (up to k) at the top positions.
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(min(len(relevant_set), k)))
    return dcg / idcg

# 7. Collect the distinct test users to score
test_user_indices = pd.unique(test_data['user_index'])

def compute_ndcg(user_idx):
    """Recommend items for one test user from its nearest training rows and score them.

    The user's row of ``test_matrix`` is used as the query for ``knn``; the
    interaction rows of the returned neighbours in ``train_matrix`` are summed
    per item and the five highest-scoring items are the recommendation.

    Args:
        user_idx: Row index of the user (search session) in ``test_matrix``.

    Returns:
        NDCG@5 of the recommendation against the items with a non-zero
        interaction in the user's test row, or ``None`` when the user has no
        such item (nothing to evaluate against).
    """
    user_vector = test_matrix[user_idx]

    # Ground truth: items with a non-zero interaction in the test row. Read it
    # from the sparse row instead of filtering the whole test DataFrame on
    # every call. (Matches ``interaction > 0`` as long as interactions are
    # never negative - they are click + 5 * booking; assumes 0/1 flags.)
    relevant_items = user_vector.nonzero()[1]

    # ``nnz`` also counts explicitly stored zeros (rows shown but neither
    # clicked nor booked), so test for genuinely non-zero entries instead.
    if relevant_items.size == 0:
        return None  # skip users with no positive test interaction

    neighbor_idxs = knn.kneighbors(user_vector, return_distance=False)
    neighbor_vectors = train_matrix[neighbor_idxs.ravel()]

    # Per-item score = summed interaction strength over the neighbours.
    # np.asarray(...).ravel() works for both np.matrix and ndarray results.
    scores = np.asarray(neighbor_vectors.sum(axis=0)).ravel()

    # Do NOT zero out the user's own test items here: they are exactly the
    # ground-truth items, so masking them forces NDCG to 0 for every user.
    # NOTE(review): the query vector is built from the same interactions that
    # serve as ground truth, so this score is optimistic. A stricter protocol
    # would hold out part of each test user's interactions - confirm the
    # intended evaluation design.
    recommended_items = np.argsort(scores)[::-1][:5]

    return ndcg_at_k(recommended_items, relevant_items, k=5)

In [None]:
from joblib import Parallel, delayed
# Score every test user in parallel worker processes
# (n_jobs=-1 uses all cores; tqdm tracks how many tasks have been dispatched).
results = Parallel(backend='loky', n_jobs=-1)(
    delayed(compute_ndcg)(uid) for uid in tqdm(test_user_indices)
)

# Drop users that compute_ndcg skipped (it returns None for them)
ndcg_scores = list(filter(lambda value: value is not None, results))

In [None]:
# Print every NDCG score that is neither 0 nor None
for score in ndcg_scores:
    if score is None or score == 0:
        continue
    print(score)
