In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

In [None]:
# Seed Python's and NumPy's global RNGs so repeated runs of the notebook
# draw the same random numbers.
import random

my_seed = 0
np.random.seed(my_seed)
random.seed(my_seed)

In [None]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Generate train-test dataset
# Sort the impression data (ground truth) by timestamp and use the first 80% of
# the impressions as the training set.

# Note: to prevent data leakage, we should set a cut-off timestamp for our data
# when splitting the train-test set.

In [None]:
# Folder holding the ZhihuRec-1M CSV files. Defaults to the shared Colab Drive
# location; set the ZHIHUREC_DIR environment variable to run somewhere else.
DIR = os.environ.get(
    "ZHIHUREC_DIR",
    "/content/drive/Shareddrives/SI650_Final_Project/ZhihuRec-1M",
)
# Fraction of the time-ordered impressions used for training.
TRAIN_RATIO = 0.8

In [None]:
# Load the impression log. A click timestamp of 0 is treated as "not clicked".
# Rows are ordered by impression time so positional slices are chronological.
impr = (
    pd.read_csv(os.path.join(DIR, "impression.csv"), index_col=0)
    .assign(is_clicked=lambda frame: frame["click_ts"] != 0)
    .sort_values("imp_ts")
)

In [None]:
# `impr` was already sorted by imp_ts when it was loaded, so slice it directly
# instead of re-sorting the whole frame once per split.
n_train_rows = int(impr.shape[0] * TRAIN_RATIO)
y_train = impr.iloc[:n_train_rows, :]
y_test = impr.iloc[n_train_rows:, :]

In [None]:
# Impression timestamp of the last training row; used as the train/test cut-off.
ts_cutoff = y_train["imp_ts"].iloc[-1]

In [None]:
# User table; the first CSV column becomes the row index.
user_csv_path = os.path.join(DIR, "user.csv")
user = pd.read_csv(user_csv_path, index_col=0)

In [None]:
# Item (answer) table; the first CSV column becomes the row index.
answer_csv_path = os.path.join(DIR, "answer.csv")
item = pd.read_csv(answer_csv_path, index_col=0)

In [None]:
# `impr` is already ordered by imp_ts, so preview it without sorting again.
impr.head()

Unnamed: 0,userID,answerID,imp_ts,click_ts,is_clicked
588135,4678,1242,1525279527,0,False
588137,4678,8719,1525279533,0,False
588136,4678,2173,1525279533,1525279534,True
588139,4678,5131,1525279737,1525279784,True
588138,4678,61142,1525279737,1525279740,True
...,...,...,...,...,...
823291,6585,12660,1526144255,0,False
823292,6585,6325,1526144256,0,False
823293,6585,9707,1526144257,0,False
823295,6585,2656,1526144258,0,False


In [None]:
# Time-based split: impressions strictly before the cut-off are training data,
# everything from the cut-off onwards is test data. Later cells (the sparse
# user-item matrix and `evaluate_rec`) read both frames, so they must be
# defined here for the notebook to run top to bottom.
impr_train = impr[impr.imp_ts < ts_cutoff]
impr_test = impr[impr.imp_ts >= ts_cutoff]
# Number of leading rows of the time-sorted `impr` that belong to training.
train_size = impr_train.shape[0]

## Pop

In [None]:
# K = 10
# pop_K = impr_train[impr_train.is_clicked].groupby("answerID").count().sort_values("is_clicked", ascending=False).index[:K].values

In [None]:
# impr_train[impr_train.is_clicked].groupby("answerID").count().sort_values("is_clicked", ascending=False).index[:K].values

array([  24, 1771,  664, 1827, 1083, 3564,   88,  797, 1789, 1777])

In [None]:
# from sklearn.metrics import ndcg_score

# # [n_user, K]
# pred_rel = np.array([[1] * K for _ in range(len(impr_test.userID.unique()))])
# true_rel = []
# for userid in impr_test.userID.unique():
#   t = impr_test[(impr_test.userID == userid) & (impr_test.is_clicked)].answerID
#   # shape = [1, K]
#   rel = np.isin(pop_K, t).astype(int)
#   true_rel.append(rel)

In [None]:
# ndcg_score(true_rel, pred_rel)

0.03353281305128657

In [None]:
# def HR(y_true, y_pred):
#   filt_y = np.sum(y_true, axis=1) > 0
#   return np.sum(filt_y) / len(y_true)

In [None]:
# HR(true_rel, pred_rel)

0.07126948775055679

## Baseline (surprise)

In [None]:
from surprise import BaselineOnly, Dataset, Reader
from surprise.prediction_algorithms.random_pred import NormalPredictor
from surprise.model_selection import cross_validate
from surprise import accuracy

# Clicks are binary, so ratings live on a 0-1 scale.
reader = Reader(rating_scale=(0, 1))

# Hand surprise a three-column frame: user id, item id, rating (the click flag).
ratings_df = impr[["userID", "answerID", "is_clicked"]].rename(
    columns={"answerID": "itemID", "is_clicked": "rating"}
)
data = Dataset.load_from_df(ratings_df, reader)

# `impr` is time-sorted, so the first `train_size` ratings form set A (train)
# and the remaining ones form set B (test).
raw_ratings = data.raw_ratings
A_raw_ratings, B_raw_ratings = raw_ratings[:train_size], raw_ratings[train_size:]

# train_data = impr_train[["userID", "answerID", "is_clicked"]]
# train_data.is_clicked = train_data.is_clicked.astype(int)
# train_data.columns = ["userID", "itemID", "rating"]
# train_data = Dataset.load_from_df(train_data, reader)

# test_data = impr_test[["userID", "answerID", "is_clicked"]]
# test_data.is_clicked = test_data.is_clicked.astype(int)
# test_data.columns = ["userID", "itemID", "rating"]
# test_data = Dataset.load_from_df(test_data, reader)

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each entry unpacks as ``(uid, iid, true_r, est, details)``; only
        ``uid``, ``true_r`` and ``est`` are used.
    k : int
        Number of highest-estimated items considered per user.
    threshold : float
        An item is "relevant" when ``true_r >= threshold`` and "recommended"
        when ``est >= threshold``. The default of 3.5 suits a 1-5 rating
        scale; pass an explicit value (e.g. 0.5) for the 0-1 click ratings
        used in this notebook.

    Returns
    -------
    (dict, dict)
        ``precisions`` and ``recalls``, both keyed by ``uid``. A metric whose
        denominator is zero is reported as 0.
    """
    from collections import defaultdict

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]
        # Number of relevant items (over everything shown to the user)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in top_k
        )
        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

# def dcg_at_k(r, k, method=1):
#     r = np.asfarray(r)[:k]
#     if r.size:
#         if method == 0:
#             return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
#         elif method == 1:
#             return np.sum(r / np.log2(np.arange(2, r.size + 2)))
#         else:
#             raise ValueError('method must be 0 or 1.')
#     return 0

# def ndcg_at_k(predictions, k=10, method=1):
#     from collections import defaultdict
#     user_est_true = defaultdict(list)
#     for uid, _, true_r, est, _ in predictions:
#         user_est_true[uid].append((est, true_r))
    
#     ndcg = dict()
#     for uid, user_ratings in user_est_true.items():
#         _, true_rel = zip(*user_est_true[uid])
#         dcg_max = dcg_at_k(sorted(true_rel, reverse=True), k, method)
#         if not dcg_max:
#             ndcg[uid] = 0
#         else:
#             ndcg[uid] = dcg_at_k(true_rel, k, method) / dcg_max
#     return ndcg

def ndcg_at_k(surprise_predictions, k=None):
    """Average per-user NDCG (normalized discounted cumulative gain).

    Predictions are grouped by user, each user's items are ranked by their
    estimated rating (highest first, ties keep input order), and the DCG of
    the true ratings in that order is divided by the DCG of the ideal order.
    Gains are the true ratings themselves and position ``i`` (1-based) is
    discounted by ``log2(i + 1)``.

    Only users and items that actually occur in ``surprise_predictions``
    take part, so ids may be of any hashable type, users without predictions
    do not pull the mean towards zero, and no dense user x item matrix is
    allocated.

    Parameters
    ----------
    surprise_predictions : iterable of 5-tuples
        Each entry unpacks as ``(uid, iid, r_ui, est, details)``, the same
        shape the other metric helpers in this notebook consume.
    k : positive int or None
        Only the ``k`` highest-ranked items per user are scored. ``None``
        uses every predicted item.

    Returns
    -------
    float
        Mean of the per-user NDCG values. A user whose ideal DCG is 0 (no
        positive true rating) contributes 0. Returns 0.0 when there are no
        predictions at all.
    """
    from collections import defaultdict

    # Group (estimate, true rating) pairs per user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in surprise_predictions:
        user_est_true[uid].append((est, true_r))

    if not user_est_true:
        return 0.0

    def _dcg(gains):
        # Slicing with k=None keeps the whole list.
        top = np.asarray(gains[:k], dtype=float)
        discounts = np.log2(np.arange(2, top.size + 2))
        return float(np.sum(top / discounts))

    per_user_ndcg = []
    for user_ratings in user_est_true.values():
        ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
        gains_by_est = [true_r for _, true_r in ranked]
        ideal_dcg = _dcg(sorted(gains_by_est, reverse=True))
        if ideal_dcg > 0:
            per_user_ndcg.append(_dcg(gains_by_est) / ideal_dcg)
        else:
            per_user_ndcg.append(0.0)

    return float(np.mean(per_user_ndcg))

def hr_at_k(predictions, k=10):
    """Hit rate at k: share of users with a positive true rating in their top k.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each entry unpacks as ``(uid, iid, true_r, est, details)``; only
        ``uid``, ``true_r`` and ``est`` are used.
    k : int
        Number of highest-estimated items inspected per user.

    Returns
    -------
    float
        Fraction of users whose true ratings over their top-``k`` items sum
        to more than 0. Returns 0.0 when ``predictions`` is empty.
    """
    from collections import defaultdict

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    n_users = len(user_est_true)
    if n_users == 0:
        # No users at all: avoid dividing by zero.
        return 0.0

    n_at_least_one_click = 0
    for user_ratings in user_est_true.values():
        # Take the top-k recommendations by estimated rating.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # Built-in sum: calling np.sum on a generator is deprecated in NumPy.
        n_clicked = sum(true_r for _, true_r in user_ratings[:k])
        if n_clicked > 0:
            n_at_least_one_click += 1

    return n_at_least_one_click / n_users

In [None]:
# Fit the random-prediction baseline (NormalPredictor) on set A and report its
# RMSE on A (seen data) and on B (held-out data).
data.raw_ratings = A_raw_ratings  # data is now the set A

# train on the whole set A
trainset = data.build_full_trainset()

algo_normal = NormalPredictor()
algo_normal.fit(trainset)

# Compute biased accuracy on A ("biased" because the model was fit on A).
predictions = algo_normal.test(trainset.build_testset())
print("Biased accuracy on A,", end="   ")
# accuracy.rmse prints the "RMSE: ..." text that completes the line above.
accuracy.rmse(predictions)

# Compute unbiased accuracy on B
# `testset` and the final `predictions` are reused by the following cells.
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions = algo_normal.test(testset)
print("Unbiased accuracy on B,", end=" ")
accuracy.rmse(predictions)

Biased accuracy on A,   RMSE: 0.5527
Unbiased accuracy on B, RMSE: 0.5484


0.5484347355121982

In [None]:
# Ranking metrics for the NormalPredictor predictions on set B.
# A rating of at least 0.5 counts as a click.
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=.5)

# Per-user precision and recall are averaged over all users.
mean_precision = sum(precisions.values()) / len(precisions)
print(f"Precision@10: {mean_precision}")
mean_recall = sum(recalls.values()) / len(recalls)
print(f"Recall@10: {mean_recall}")

ndcg = ndcg_at_k(predictions, k=10)
print(f"NDCG@10: {ndcg}")

hr = hr_at_k(predictions, k=10)
print(f"HR@10: {hr}")

Precision@10: 0.2783160108883958
Recall@10: 0.1650021341835805


In [None]:
# Fit surprise's BaselineOnly model on set A (the `trainset` built earlier)
# and report its RMSE on A and on the held-out set B.
algo_baseline = BaselineOnly()
algo_baseline.fit(trainset)

# Compute biased accuracy on A ("biased" because the model was fit on A).
predictions = algo_baseline.test(trainset.build_testset())
print("Biased accuracy on A,", end="   ")
# accuracy.rmse prints the "RMSE: ..." text that completes the line above.
accuracy.rmse(predictions)

# Compute unbiased accuracy on B
# The final `predictions` (on B) are what the next metrics cell evaluates.
predictions = algo_baseline.test(testset)
print("Unbiased accuracy on B,", end=" ")
accuracy.rmse(predictions)

Estimating biases using als...
Biased accuracy on A,   RMSE: 0.4107
Unbiased accuracy on B, RMSE: 0.4292


0.42922992751905176

In [None]:
# Ranking metrics for the BaselineOnly predictions on set B.
# A rating of at least 0.5 counts as a click.
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=.5)

# Per-user precision and recall are averaged over all users.
mean_precision = sum(precisions.values()) / len(precisions)
print(f"Precision@10: {mean_precision}")
mean_recall = sum(recalls.values()) / len(recalls)
print(f"Recall@10: {mean_recall}")

ndcg = ndcg_at_k(predictions, k=10)
print(f"NDCG@10: {ndcg}")

hr = hr_at_k(predictions, k=10)
print(f"HR@10: {hr}")

Precision@10: 0.18673374695089615
Recall@10: 0.046641171084694066
NDCG@10: 0.3821941235234947
HR@10: 0.9272457312546399




In [None]:
from surprise.prediction_algorithms.matrix_factorization import SVDpp

# SVD++ fitted on set A. It is seeded from the notebook-wide `my_seed` rather
# than a separate literal, so a single value controls every seeded step.
algo_svd = SVDpp(random_state=my_seed, verbose=True)
algo_svd.fit(trainset)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
Biased accuracy on A,   RMSE: 0.4107
Unbiased accuracy on B, RMSE: 0.4292


0.42922992751905176

In [None]:
# Compute biased accuracy on A
predictions = algo_svd.test(trainset.build_testset())
print("Biased accuracy on A,", end="   ")
accuracy.rmse(predictions)

# Compute unbiased accuracy on B
predictions = algo_svd.test(testset)
print("Unbiased accuracy on B,", end=" ")
accuracy.rmse(predictions)

Biased accuracy on A,   RMSE: 0.3940
Unbiased accuracy on B, RMSE: 0.4331


0.43312757359088166

In [None]:
import surprise

# Persist the fitted SVD++ model together with its predictions on set B.
svd_dump_path = "/content/drive/Shareddrives/SI650_Final_Project/svd.pth"
surprise.dump.dump(svd_dump_path, predictions=predictions, algo=algo_svd, verbose=0)

In [None]:
# Ranking metrics for the SVD++ predictions on set B.
# A rating of at least 0.5 counts as a click.
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=.5)

# Per-user precision and recall are averaged over all users.
mean_precision = sum(precisions.values()) / len(precisions)
print(f"Precision@10: {mean_precision}")
mean_recall = sum(recalls.values()) / len(recalls)
print(f"Recall@10: {mean_recall}")

ndcg = ndcg_at_k(predictions, k=10)
print(f"NDCG@10: {ndcg}")

hr = hr_at_k(predictions, k=10)
print(f"HR@10: {hr}")

Precision@10: 0.24092619106538954
Recall@10: 0.0705696906997781
NDCG@10: 0.3821941235234947
HR@10: 0.9324424647364514




In [None]:
from scipy.sparse import csr_matrix

# Sparse user x item click matrix built from the training impressions.
# The values are kept in `click_values` rather than `data`, so the surprise
# Dataset bound to `data` earlier in the notebook is not overwritten.
row = impr_train.userID
col = impr_train.answerID
click_values = impr_train.is_clicked.astype(int)

# NOTE(review): the shape assumes userID / answerID are contiguous 0-based
# ids (largest id < number of distinct ids) -- TODO confirm against the data.
n_user_rows = len(user.userID.unique())
n_item_cols = len(item.answerID.unique())
user_item = csr_matrix((click_values, (row, col)), shape=(n_user_rows, n_item_cols))

In [None]:
# Quick check of the class of the sparse matrix built above.
type(user_item)

scipy.sparse.csr.csr_matrix

In [None]:
# from implicit.nearest_neighbours import bm25_weight
# weight the matrix, both to reduce impact of users that have played the same artist thousands of times
# and to reduce the weight given to popular items
# user_item = bm25_weight(user_item, K1=100, B=0.8).tocsr()

In [None]:
from implicit.als import AlternatingLeastSquares

# Implicit-feedback ALS on the user x item click matrix. `random_state` is set
# from the notebook-wide seed so the fit is reproducible like the other models.
model = AlternatingLeastSquares(
    factors=64,
    regularization=0.05,
    alpha=2.0,
    random_state=my_seed,
)
model.fit(user_item)

  f"CUDA extension is built, but disabling GPU support because of '{e}'",
  "OpenBLAS detected. Its highly recommend to set the environment variable "


  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Preview the held-out impressions instead of rendering the whole frame.
impr_test.head()

Unnamed: 0,userID,answerID,imp_ts,click_ts,is_clicked
142,0,140,1526124926,1526124940,True
143,0,141,1526125012,1526125015,True
368,2,358,1526097085,0,False
369,2,359,1526097085,0,False
370,2,360,1526097134,1526097183,True
...,...,...,...,...,...
999337,7968,20785,1526140245,1526140251,True
999338,7968,5269,1526140266,0,False
999339,7968,26344,1526140266,0,False
999340,7968,29099,1526140266,0,False


In [None]:
def evaluate_rec(model, userids, impr_test, N):
  """Attach the model's recommendation scores to each user's test impressions.

  For every distinct id in ``userids`` the model is asked for its top ``N``
  items, using that user's row of the notebook-level ``user_item`` matrix.
  The scores are then left-joined onto the user's rows of ``impr_test`` via
  ``answerID``.

  Parameters
  ----------
  model : fitted recommender whose ``recommend(userid, user_items, N=...,
      filter_already_liked_items=...)`` returns ``(ids, scores)``.
  userids : iterable of user ids; repeated ids are scored only once.
  impr_test : DataFrame with at least ``userID`` and ``answerID`` columns.
  N : int, number of recommendations requested per user.

  Returns
  -------
  DataFrame with the columns of ``impr_test`` plus ``score``. Rows are grouped
  by user (in first-seen order of ``userids``) and sorted by descending score
  within each user; impressions outside the user's top ``N`` have a NaN score
  and come last within their group.
  """
  from tqdm.notebook import tqdm

  # De-duplicate so a user is not re-scored once per impression.
  unique_userids = pd.Series(userids).unique()

  per_user_frames = []
  for userid in tqdm(unique_userids):
    ids, scores = model.recommend(userid, user_item[userid], N=N, filter_already_liked_items=True)
    recommend_result = pd.DataFrame(data={"score": scores, "answerID": ids})
    # Keep only what the user was actually shown; the left join leaves score
    # as NaN for impressions the model did not put in its top N.
    impr_u = impr_test[impr_test.userID == userid]
    impr_u = impr_u.merge(recommend_result, how="left", on="answerID").sort_values("score", ascending=False)
    per_user_frames.append(impr_u)

  if not per_user_frames:
    # pd.concat rejects an empty list; return an empty frame with the same columns.
    return pd.DataFrame(columns=list(impr_test.columns) + ["score"])

  # Concatenate DataFrames (not raw .values arrays) so column names survive.
  return pd.concat(per_user_frames, ignore_index=True)

In [None]:
# `impr_test.userID` repeats a user once per impression, so pass the distinct
# ids to score each user a single time. The result is kept in `rec_eval`.
rec_eval = evaluate_rec(model, impr_test.userID.unique(), impr_test, 500)
rec_eval.head()

  0%|          | 0/199995 [00:00<?, ?it/s]

TypeError: ignored