In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

In [3]:
DATA_DIR = "./prepared_data"

In [4]:
train_data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [6]:
val_data_tr = pd.read_csv(os.path.join(DATA_DIR, 'validation_tr.csv'))
val_data_te = pd.read_csv(os.path.join(DATA_DIR, 'validation_te.csv'))

In [128]:
test_data_tr = pd.read_csv(os.path.join(DATA_DIR, 'test_tr.csv'))
test_data_te = pd.read_csv(os.path.join(DATA_DIR, 'test_te.csv'))

In [129]:
user_item = pd.concat([train_data, val_data_tr, test_data_tr])

In [130]:
# the user_item_matrix will look like this
# |        | item 1 | ... | item m |
# |--------|--------|-----|--------|
# | user 1 | 3      | 0   | 0      |
# | ...    | 0      | 4   | 5      |
# | user n | 2      | 0   | 0      |

users, items = user_item['uid'], user_item['sid']
unique_u, unique_i = pd.unique(users), pd.unique(items)
user_item_matrix = csr_matrix((np.ones_like(users), (users, items)), shape=(unique_u.shape[0], unique_i.shape[0]), dtype=int)

In [131]:
def train(n_latent_factors, epsilon=1e-9):
    # generate item lantent features
    item_svd = TruncatedSVD(n_components = n_latent_factors)
    item_features = item_svd.fit_transform(user_item_matrix.transpose()) + epsilon

    # generate user latent features
    user_svd = TruncatedSVD(n_components = n_latent_factors)
    user_features = user_svd.fit_transform(user_item_matrix) + epsilon

    return user_features, item_features

In [132]:
def predict(user_features, item_features, users, user_viewed_items, k=100):
    rating_u = np.mean(user_item_matrix) + np.dot(user_features[users], item_features.T)
    for i in range(users.shape[0]):
        rating_u[i][user_viewed_items[users[i]]] = 0
    top_k_i = np.argsort(-rating_u, axis=1)[:,:k]
    return top_k_i

In [133]:
def ndcg(true, pred, k=100):
        '''
        normalized discounted cumulative gain@k for binary relevance
        ASSUMPTIONS: all the 0's in true indicate 0 relevance
        '''
        # build the discount template
        tp = 1. / np.log2(np.arange(2, k + 2))
        DCG = (np.take_along_axis(true, pred, axis=1) * tp).sum(axis=1)
        IDCG = np.array([(tp[:min(int(n), k)]).sum()
                             for n in (true != 0).sum(axis=1)])
        return DCG / IDCG

In [134]:
unique_val_users = pd.unique(val_data_te['uid'])
val_user_items_to_remove = val_data_tr.groupby('uid')['sid'].apply(list)
val_users, val_items = val_data_te['uid'], val_data_te['sid']
val_user_item_matrix = csr_matrix((np.ones_like(val_users), (val_users, val_items)), shape=(unique_u.shape[0], unique_i.shape[0]), dtype=int)

In [135]:
best_feats = (None, None)
best_score = 0
best_n = 0
for n in [8, 16, 32, 64, 128, 256]:
    user_feats, item_feats = train(n)
    val_pred = predict(user_feats, item_feats, unique_val_users, val_user_items_to_remove)
    ndcg_score = ndcg(val_user_item_matrix[unique_val_users].toarray(), val_pred).mean()
    print(f"NDCG@100 for {n} latent factors is {ndcg_score}")
    if ndcg_score > best_score:
        best_score = ndcg_score
        best_feats = (user_feats, item_feats)
        best_n = n
print(f"Best NDCG@100 is {best_score} for {best_n} features")

NDCG@100 for 8 latent factors is 0.10507154822498767
NDCG@100 for 16 latent factors is 0.10201432434861857
NDCG@100 for 32 latent factors is 0.10406248834038938
NDCG@100 for 64 latent factors is 0.1052579980649808
NDCG@100 for 128 latent factors is 0.10436894471836142
NDCG@100 for 256 latent factors is 0.10620641444878592
Best NDCG@100 is 0.10620641444878592 for 256 features


In [136]:
unique_test_users = pd.unique(test_data_te['uid'])
test_user_items_to_remove = test_data_tr.groupby('uid')['sid'].apply(list)
test_users, test_items = test_data_te['uid'], test_data_te['sid']
test_user_item_matrix = csr_matrix((np.ones_like(test_users), (test_users, test_items)), shape=(unique_u.shape[0], unique_i.shape[0]), dtype=int)

In [137]:
test_pred = predict(*best_feats, unique_test_users, test_user_items_to_remove)
ndcg_score = ndcg(test_user_item_matrix[unique_test_users].toarray(), test_pred).mean()
print(ndcg_score)

0.10664642803962227
