In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from collections import defaultdict
from collections import Counter
import surprise
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [2]:
def precision_recall_at_k(predictions, k, threshold):
    '''Return mean precision@k, mean recall@k and the F-score of those two means.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: number of top-ranked items (by estimated rating) examined per user.
        threshold: rating at or above which an item counts as relevant
            (true rating) or as recommended (estimated rating).

    Returns:
        ``(precision, recall, f_score)``. Per-user precision is defined as 1
        when nothing is recommended in the top k, and per-user recall as 1
        when the user has no relevant item. With no predictions at all the
        result is ``(0.0, 0.0, 0.0)``; the F-score is 0.0 when both means
        are 0.
    '''
    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    # Nothing to evaluate: avoid dividing by the number of users below.
    if not user_est_true:
        return 0.0, 0.0, 0.0

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Relevant items are counted over ALL of the user's items, not just the top k.
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # Precision@K: proportion of recommended items that are relevant.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        # Recall@K: proportion of relevant items that are recommended.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    # Aggregate once, after every user has been scored.
    precision = sum(precisions.values()) / len(precisions)
    recall = sum(recalls.values()) / len(recalls)

    # Harmonic mean; defined as 0 when both components are 0.
    if precision + recall != 0:
        f_score = (2*precision*recall)/(precision+recall)
    else:
        f_score = 0.0

    return precision, recall, f_score


def ndcg_at_k(predictions, k):
    '''Return NDCG@k pooled over all users.

    For every user the DCG of the top ``k`` items ranked by estimated rating
    and the ideal DCG (top ``k`` items ranked by true rating) are computed
    with gain ``2**true_r - 1`` and discount ``log2(rank + 1)``. The result
    is the sum of all users' DCG divided by the sum of all users' ideal DCG
    (not the mean of per-user ratios).

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: ranking cut-off.

    Returns:
        The pooled NDCG, or 0.0 when the ideal DCG is 0 (e.g. no predictions).
    '''
    def _dcg(true_ratings):
        # Discounted cumulative gain of true ratings listed in rank order.
        discounts = np.log2(np.arange(1, len(true_ratings) + 1) + 1)
        gains = [np.power(2, true_r) - 1 for true_r in true_ratings]
        return sum(gains / discounts)

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    dcg = 0
    idcg = 0
    for user_ratings in user_est_true.values():
        # Achieved ranking: order by estimated rating, best first.
        by_estimate = sorted(user_ratings, key=lambda x: x[0], reverse=True)
        dcg += _dcg([true_r for (_, true_r) in by_estimate[:k]])

        # Ideal ranking: order by true rating, best first.
        ideal = sorted((true_r for (_, true_r) in user_ratings), reverse=True)
        idcg += _dcg(ideal[:k])

    # Guard the ratio: no predictions (or k == 0) leaves both sums at 0.
    return dcg/idcg if idcg != 0 else 0.0


def user_space_coverage(predictions, k, n_user, threshold):
    '''Return the share of ``n_user`` users that can be given ``k`` recommendations.

    A user is covered when at least ``k`` of their predictions have an
    estimated rating of ``threshold`` or more.
    '''
    # Count, per user, the predictions whose estimate reaches the threshold.
    qualifying = defaultdict(int)
    for uid, _, _, est, _ in predictions:
        if est >= threshold:
            qualifying[uid] += 1

    covered_users = 0
    for count in qualifying.values():
        if count >= k:
            covered_users += 1
    return covered_users/n_user

def get_top_n(predictions, n=10):
    '''Return, for each user, their ``n`` predictions with the highest estimate.

    The result maps ``uid`` to a list of ``(iid, est)`` pairs ordered by
    ``est`` descending; ties keep the order in which they appear in
    ``predictions``.
    '''
    # Group (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Rank each user's pairs and truncate to the best n.
    for uid, pairs in per_user.items():
        ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

def item_space_coverage(predictions, k, n_items, threshold):
    '''Return the share of ``n_items`` items that get recommended to anyone.

    An item counts when it appears in some user's top ``k`` (see
    ``get_top_n``) with an estimated rating of ``threshold`` or more.
    '''
    recommended_items = {
        iid
        for user_ratings in get_top_n(predictions, k).values()
        for (iid, rtg) in user_ratings
        if rtg >= threshold
    }
    return len(recommended_items)/n_items

# Items recommended to each user, derived from the predictions.
def recommendation_list(predictions, k, threshold):
    '''Return ``uid -> [iid, ...]`` for the items actually recommended.

    An item is recommended to a user when it is in that user's top ``k``
    (see ``get_top_n``) and its estimated rating is ``threshold`` or more.
    Users without any such item do not appear in the result.
    '''
    recom_list = defaultdict(list)
    for uid, user_ratings in get_top_n(predictions, k).items():
        kept = [iid for (iid, rating) in user_ratings if rating >= threshold]
        if kept:
            recom_list[uid] = kept
    return recom_list

# Popularity of each item: number of ratings it received divided by n_users.
def item_popularity(ratings, n_users):
    '''Return ``iid -> popularity`` for every item found in ``ratings``.

    ``ratings`` is an iterable of ``(uid, iid, rating)`` triples. The result
    is a defaultdict, so looking up an item that was never rated yields 0.
    '''
    popularity = defaultdict(lambda: 0)
    for _uid, iid, _rtg in ratings:
        popularity[iid] = popularity[iid] + 1

    # Turn the raw counts into fractions of the user base (keys are unchanged).
    for iid in popularity:
        popularity[iid] = float(popularity[iid]/n_users)
    return popularity

def novelty(predictions, k, item_pop, threshold):
    '''Return the mean per-user novelty of the recommended items.

    The novelty of an item is its self-information ``-log2(popularity)``.
    A user's novelty is the sum over their recommended items (see
    ``recommendation_list``) divided by the number of recommended items;
    items whose popularity is 0 or unknown add nothing to the sum but still
    count in the denominator.

    Args:
        predictions: iterable of ``(uid, iid, true_r, est, details)`` tuples.
        k: size of each user's top list.
        item_pop: mapping ``iid -> popularity``. It is only read: missing
            items are looked up with ``.get`` so no keys are inserted into a
            caller-owned defaultdict.
        threshold: minimum estimated rating for an item to be recommended.

    Returns:
        The average novelty over users with at least one recommendation, or
        0.0 when nobody receives a recommendation.
    '''
    recom_list = recommendation_list(predictions, k, threshold)

    # novelties: the novelty metric for each user
    novelties = dict()
    for uid, items in recom_list.items():
        self_info = 0
        for iid in items:
            pop = item_pop.get(iid, 0)
            # log2(0) is undefined, so items with no recorded popularity are skipped.
            if pop != 0:
                self_info += -math.log2(pop)
        novelties[uid] = float(self_info/len(items))

    # No user cleared the threshold: report 0.0 instead of dividing by zero.
    if not novelties:
        return 0.0

    return sum(novelties.values()) / len(novelties)

def primitive_list(ratings, k):
    '''Return the ids of the ``k`` items with the highest mean rating.

    ``ratings`` is an iterable of ``(uid, iid, rating)`` triples. Items are
    returned best first; ``k`` is converted with ``int``.
    '''
    rating_sums = defaultdict(lambda: 0)
    rating_counts = defaultdict(lambda: 0)
    for _uid, iid, rtg in ratings:
        rating_sums[iid] += rtg
        rating_counts[iid] += 1

    # Mean rating per item, kept in a Counter so most_common can rank it.
    mean_ratings = Counter()
    for iid, total in rating_sums.items():
        mean_ratings[iid] = float(total/rating_counts[iid])

    return [iid for iid, _mean in mean_ratings.most_common(int(k))]

def serendipity(predictions, primitive_list, k, threshold):
    '''Return the mean per-user share of unexpected recommendations.

    A recommended item (see ``recommendation_list``) is unexpected when it is
    not contained in ``primitive_list``. A user's serendipity is the number
    of unexpected items divided by the number of recommended items.

    Args:
        predictions: iterable of ``(uid, iid, true_r, est, details)`` tuples.
        primitive_list: iterable of item ids regarded as obvious
            recommendations; the ids must be hashable.
        k: size of each user's top list.
        threshold: minimum estimated rating for an item to be recommended.

    Returns:
        The average serendipity over users with at least one recommendation,
        or 0.0 when nobody receives a recommendation.
    '''
    recom_list = recommendation_list(predictions, k, threshold)

    # Build the lookup once; testing against the list would scan it per item.
    expected_items = set(primitive_list)

    # serendipities: the serendipity metric for each user
    serendipities = dict()
    for uid, items in recom_list.items():
        n_unexpected = sum(1 for iid in items if iid not in expected_items)
        serendipities[uid] = float(n_unexpected/len(items))

    # No user cleared the threshold: report 0.0 instead of dividing by zero.
    if not serendipities:
        return 0.0

    return sum(serendipities.values()) / len(serendipities)

In [3]:
# Switching hybrid: KNN where it had enough neighbours, content-based elsewhere.
def hybrid_switching_knn(predictions_knn, predictions_cb, threshold_knn, item_based = True):
    '''Switch between KNN and content-based predictions.

    A KNN prediction is kept when its ``details`` dict does not flag
    ``was_impossible`` and reports ``actual_k >= threshold_knn``. Content-based
    predictions are then appended for every item (``item_based=True``) or
    user (``item_based=False``) that has no kept KNN prediction.

    Args:
        predictions_knn: sequence of ``(uid, iid, r_ui, est, details)``
            records where ``details`` is a dict.
        predictions_cb: iterable of ``(uid, iid, r_ui, est, details)`` records.
        threshold_knn: minimum number of neighbours actually used.
        item_based: True to switch per item, False to switch per user.

    Returns:
        A list of 5-tuples, KNN predictions first. For any other value of
        ``item_based`` an error is printed and 0 is returned.
    '''
    if item_based == True:
        key = 'iid'
    elif item_based == False:
        key = 'uid'
    else:
        print('Error: input for item_based')
        return 0

    # Name the columns explicitly so plain tuples work as well as namedtuples.
    df_knn_pred = pd.DataFrame(predictions_knn, columns=['uid', 'iid', 'r_ui', 'est', 'details'])

    def _is_reliable(details):
        # Predictions flagged was_impossible carry no 'actual_k' entry, so read
        # both keys defensively instead of indexing.
        return (not details.get('was_impossible', False)
                and details.get('actual_k', 0) >= threshold_knn)

    # astype(bool) keeps the mask boolean even when the frame is empty.
    reliable = df_knn_pred['details'].map(_is_reliable).astype(bool)
    sub_knn_pred = df_knn_pred[reliable]
    predictions = [tuple(x) for x in sub_knn_pred.values]

    # Ids already served by KNN; a set makes each membership test O(1).
    knn_keys = set(sub_knn_pred[key])
    if key == 'iid':
        predictions.extend([(uid, iid, r_ui, est, details)
                            for uid, iid, r_ui, est, details in predictions_cb
                            if iid not in knn_keys])
    else:
        predictions.extend([(uid, iid, r_ui, est, details)
                            for uid, iid, r_ui, est, details in predictions_cb
                            if uid not in knn_keys])
    return predictions
    

In [4]:
def hybrid_switching_svd(predictions_svd, predictions_cb, threshold_svd):
    '''Switch between matrix-factorisation and content-based predictions per user.

    Users with at least ``threshold_svd`` entries in ``predictions_svd`` keep
    their SVD predictions; for every other user the content-based predictions
    are used instead.

    Args:
        predictions_svd: sequence of ``(uid, iid, r_ui, est, details)`` records
            (iterated twice, so it must not be a one-shot iterator).
        predictions_cb: iterable of ``(uid, iid, r_ui, est, details)`` records.
        threshold_svd: minimum number of SVD predictions a user needs.

    Returns:
        A list of 5-tuples: kept SVD predictions followed by the
        content-based predictions of the remaining users.
    '''
    # Count predictions per user without going through pandas, so the records
    # may be plain tuples as well as namedtuples.
    n_per_user = {}
    for uid, _, _, _, _ in predictions_svd:
        n_per_user[uid] = n_per_user.get(uid, 0) + 1

    # Users served by matrix factorisation; a set keeps the lookups below O(1).
    uid_svd = {uid for uid, count in n_per_user.items() if count >= threshold_svd}

    predictions = [(uid, iid, r_ui, est, details)
                   for uid, iid, r_ui, est, details in predictions_svd
                   if uid in uid_svd]
    predictions.extend([(uid, iid, r_ui, est, details)
                        for uid, iid, r_ui, est, details in predictions_cb
                        if uid not in uid_svd])
    return predictions

In [5]:
def hybrid_mixed(predictions_cf, predictions_cb, n_cf, n_cb):
    '''Mix each user's best collaborative-filtering and content-based predictions.

    For every ``uid`` the ``n_cf`` rows of ``predictions_cf`` and the ``n_cb``
    rows of ``predictions_cb`` with the largest ``est`` are selected, the two
    selections are outer-merged, and the merged rows are returned as a list
    of tuples whose last field (``details``) is the placeholder 0.

    Both inputs are handed directly to ``pd.DataFrame`` and then addressed
    through the column names ``uid`` and ``est``.
    NOTE(review): that presumably requires namedtuple-like records carrying
    those field names (plain tuples would get integer labels) - confirm
    against the callers.
    '''
    # Top n_cf estimated ratings per user from predictions_cf.
    # NOTE(review): newer pandas releases change whether groupby.apply passes the
    # grouping column to the function; confirm that `uid` is still a column after
    # reset_index(drop=True) on the installed pandas version.
    sub_cf_pred = pd.DataFrame(predictions_cf).groupby('uid').apply(lambda x: x.nlargest(n_cf, 'est')).reset_index(drop=True)
    # Top n_cb estimated ratings per user from predictions_cb (same caveat as above).
    sub_cb_pred = pd.DataFrame(predictions_cb).groupby('uid').apply(lambda x: x.nlargest(n_cb, 'est')).reset_index(drop=True)
    # Only the first four columns of each selection take part in the merge
    # (assumed to be uid, iid, r_ui, est - TODO confirm). No `on=` is given, so
    # pandas chooses the join keys itself; presumably every shared column,
    # in which case one (uid, iid) pair with two different estimates yields
    # two rows - verify.
    predictions_df = sub_cf_pred.iloc[:,0:4].merge(sub_cb_pred.iloc[:,0:4], how='outer')
    # The per-model details are not carried over; every row gets 0 instead.
    predictions_df['details'] = 0
    predictions = [tuple(x) for x in predictions_df.values]
    return predictions
    # Note: the metrics may be affected, since each user now has at most n_cf+n_cb predictions.

In [6]:
def hybrid_weighted(predictions_cf, predictions_cb, alpha):
    '''Blend collaborative-filtering and content-based estimates linearly.

    Predictions are matched on ``(uid, iid, r_ui)``; only pairs present in
    both inputs survive. The blended estimate is
    ``alpha * est_cf + (1 - alpha) * est_cb``. Returns a list of
    ``(uid, iid, r_ui, est, details)`` tuples with ``details`` set to 0.
    '''
    key_cols = ['uid', 'iid', 'r_ui']
    cf_df = pd.DataFrame(predictions_cf, columns = key_cols + ['est_cf', 'details_cf'])
    cb_df = pd.DataFrame(predictions_cb, columns = key_cols + ['est_cb', 'details_cb'])
    merged = cf_df.merge(cb_df, how='inner', on=key_cols)

    # Assemble the output frame column by column: keys, blended estimate, placeholder.
    blended = merged[key_cols].copy()
    blended['est'] = alpha*merged['est_cf'] + (1-alpha)*merged['est_cb']
    blended['details'] = 0
    return [tuple(row) for row in blended.values]

In [7]:
# --- Training data ---------------------------------------------------------
# Read the training ratings and count the distinct items and users.
train = pd.read_csv('train.csv')
n_items_train = len(np.unique(train['ISBN']))
n_users_train = len(np.unique(train['User-ID']))

# Wrap the ratings (declared scale 1-10) in a Surprise dataset and build a
# trainset from every row.
reader = surprise.Reader(rating_scale=(1, 10))
data = surprise.Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']], reader)
trainset= data.build_full_trainset()
# item_popularity and primitive_list unpack each entry as (uid, iid, rating);
# build_testset() is called once for each of them.
item_pop_train = item_popularity(trainset.build_testset(), n_users_train)
# The 200 best-rated training items act as the "expected" list for serendipity.
primitive_list_train = primitive_list(trainset.build_testset(), 200)

# --- Test data -------------------------------------------------------------
test = pd.read_csv('test.csv')
n_items_test = len(np.unique(test['ISBN']))
n_users_test = len(np.unique(test['User-ID']))
# t: test ratings as (User-ID, ISBN, Book-Rating) tuples; later cells pass it to algo.test().
t = [tuple(x) for x in test[['User-ID', 'ISBN', 'Book-Rating']].values]
item_pop_test = item_popularity(t, n_users_test)
# The 50 best-rated test items act as the "expected" list for serendipity.
primitive_list_test = primitive_list(t, 50)

In [8]:
# Baseline model: surprise.BaselineOnly
algo_baseline = surprise.BaselineOnly()
# Fit on the whole training set. `fit` is Surprise's supported training entry
# point; the older `train` method was deprecated and later removed.
algo_baseline.fit(trainset)

# Metrics on the training set (biased: the model was fitted on these ratings).
predictions_base_train = algo_baseline.test(trainset.build_testset())
precision_base_train, recall_base_train, f_base_train = precision_recall_at_k(
    predictions_base_train, k=10, threshold=7)
print("\n Baseline Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_base_train, recall_base_train, f_base_train,
    ndcg_at_k(predictions_base_train, k=10),
    item_space_coverage(predictions_base_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_base_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_base_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_base_train, primitive_list=primitive_list_train, k=10, threshold=7)))

# Metrics on the held-out test set (unbiased).
predictions_base_test = algo_baseline.test(t)
precision_base_test, recall_base_test, f_base_test = precision_recall_at_k(
    predictions_base_test, k=10, threshold=7)
print("\n Baseline Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_base_test, recall_base_test, f_base_test,
    ndcg_at_k(predictions_base_test, k=10),
    item_space_coverage(predictions_base_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_base_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_base_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_base_test, primitive_list=primitive_list_test, k=10, threshold=7)))


Estimating biases using als...

 Baseline Training Set:
 Precision:0.7564715005685758
 Recall:0.9801762822359228
 F-Score:0.8539157224469934
 NDCG:0.9501551494909518
 Item-space coverage:0.8200587084148728
 User-space coverage:0.06189063162356931
 Novelty:11.84878890502456
 Serendipity:0.9959239620900209


 Baseline Test Set:
 Precision:0.7567175847474973
 Recall:0.98956254621546
 F-Score:0.8576165606555032
 NDCG:0.9344864959697291
 Item-space coverage:0.9199113643558088
 User-space coverage:0.007220216606498195
 Novelty:10.677697484759817
 Serendipity:0.9943203071983051



In [9]:
# Predictions from the content-based model were saved beforehand to
# predictions_cb_train.csv and predictions_cb_test.csv. Each file is read,
# its columns are renamed to the prediction fields, and the rows are turned
# into a list of tuples.
predictions_cb_train = pd.read_csv('predictions_cb_train.csv')
predictions_cb_train.columns = ['uid', 'iid', 'r_ui', 'est', 'details']
predictions_cb_train = list(predictions_cb_train.itertuples(index = False))

predictions_cb_test = pd.read_csv('predictions_cb_test.csv')
predictions_cb_test.columns = ['uid', 'iid', 'r_ui', 'est', 'details']
predictions_cb_test = list(predictions_cb_test.itertuples(index = False))

In [10]:
# Content-based model: metrics on the training set.
precision_cb_train, recall_cb_train, f_cb_train = precision_recall_at_k(
    predictions_cb_train, k=10, threshold=7)
print("\n Content Based on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_cb_train, recall_cb_train, f_cb_train,
    ndcg_at_k(predictions_cb_train, k=10),
    item_space_coverage(predictions_cb_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_cb_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_cb_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_cb_train, primitive_list=primitive_list_train, k=10, threshold=7)))

# Content-based model: metrics on the test set.
precision_cb_test, recall_cb_test, f_cb_test = precision_recall_at_k(
    predictions_cb_test, k=10, threshold=7)
print("\n Content Based on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_cb_test, recall_cb_test, f_cb_test,
    ndcg_at_k(predictions_cb_test, k=10),
    item_space_coverage(predictions_cb_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_cb_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_cb_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_cb_test, primitive_list=primitive_list_test, k=10, threshold=7)))


 Content Based on Training Set:
 Precision:0.8681187212768389
 Recall:0.4802840602502648
 F-Score:0.6184258738504054
 NDCG:0.8398460490949666
 Item-space coverage:0.3311154598825832
 User-space coverage:0.0038151759220008477
 Novelty:12.117384069477904
 Serendipity:0.9921778711484593


 Content Based on Test Set:
 Precision:0.8411380436651194
 Recall:0.5642715270422853
 F-Score:0.6754333515935501
 NDCG:0.9074555518057316
 Item-space coverage:0.4491927825261159
 User-space coverage:0.0015471892728210418
 Novelty:10.840945843761158
 Serendipity:0.9940549225825299



In [11]:
# KNN: item-based KNNBasic with Pearson similarity and up to 5 neighbours.
sim_options = {'name': 'pearson',
               'user_based': False
               }
algo_knn = surprise.KNNBasic(k=5, sim_options=sim_options)
# Fit on the whole training set. `fit` is Surprise's supported training entry
# point; the older `train` method was deprecated and later removed.
algo_knn.fit(trainset)

# Metrics on the training set (biased: the model was fitted on these ratings).
predictions_knn_train = algo_knn.test(trainset.build_testset())
precision_knn_train, recall_knn_train, f_knn_train = precision_recall_at_k(
    predictions_knn_train, k=10, threshold=7)
print("\n KNN on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_knn_train, recall_knn_train, f_knn_train,
    ndcg_at_k(predictions_knn_train, k=10),
    item_space_coverage(predictions_knn_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_knn_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_knn_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_knn_train, primitive_list=primitive_list_train, k=10, threshold=7)))

# Metrics on the held-out test set (unbiased).
predictions_knn_test = algo_knn.test(t)
precision_knn_test, recall_knn_test, f_knn_test = precision_recall_at_k(
    predictions_knn_test, k=10, threshold=7)
print("\n KNN on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_knn_test, recall_knn_test, f_knn_test,
    ndcg_at_k(predictions_knn_test, k=10),
    item_space_coverage(predictions_knn_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_knn_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_knn_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_knn_test, primitive_list=primitive_list_test, k=10, threshold=7)))

Computing the pearson similarity matrix...
Done computing similarity matrix.

 KNN on Training Set:
 Precision:0.999725637040177
 Recall:0.9848647121843231
 F-Score:0.9922395341402358
 NDCG:0.9990520972268696
 Item-space coverage:0.6845401174168297
 User-space coverage:0.04980924120389996
 Novelty:11.770722106683836
 Serendipity:0.9944185829602498


 KNN on Test Set:
 Precision:0.7469111471303325
 Recall:0.9977161491339762
 F-Score:0.8542859727757216
 NDCG:0.9181076071711439
 Item-space coverage:0.9487179487179487
 User-space coverage:0.008767405879319236
 Novelty:10.675132378262889
 Serendipity:0.9941755282136922



In [12]:
# Switching hybrid (KNN / content-based): a KNN prediction is used only when
# it relied on at least threshold_knn neighbours.
threshold_knn = 2

predictions_knn_hs_train = hybrid_switching_knn(
    predictions_knn_train, predictions_cb_train, threshold_knn, item_based = True)
precision_knn_hs_train, recall_knn_hs_train, f_knn_hs_train = precision_recall_at_k(
    predictions_knn_hs_train, k=10, threshold=7)
print("\n HS KNN on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_knn_hs_train, recall_knn_hs_train, f_knn_hs_train,
    ndcg_at_k(predictions_knn_hs_train, k=10),
    item_space_coverage(predictions_knn_hs_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_knn_hs_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_knn_hs_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_knn_hs_train, primitive_list=primitive_list_train, k=10, threshold=7)))

predictions_knn_hs_test = hybrid_switching_knn(
    predictions_knn_test, predictions_cb_test, threshold_knn, item_based = True)
precision_knn_hs_test, recall_knn_hs_test, f_knn_hs_test = precision_recall_at_k(
    predictions_knn_hs_test, k=10, threshold=7)
print("\n HS KNN on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_knn_hs_test, recall_knn_hs_test, f_knn_hs_test,
    ndcg_at_k(predictions_knn_hs_test, k=10),
    item_space_coverage(predictions_knn_hs_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_knn_hs_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_knn_hs_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_knn_hs_test, primitive_list=primitive_list_test, k=10, threshold=7)))




 HS KNN on Training Set:
 Precision:0.8690834032177125
 Recall:0.4947138815160033
 F-Score:0.63051544181791
 NDCG:0.8533649385716962
 Item-space coverage:0.3519569471624266
 User-space coverage:0.00635862653666808
 Novelty:12.014299395848248
 Serendipity:0.9940647799482232


 HS KNN on Test Set:
 Precision:0.8411380436651194
 Recall:0.5642822714122353
 F-Score:0.6754410488572925
 NDCG:0.907610663855474
 Item-space coverage:0.4491927825261159
 User-space coverage:0.0015471892728210418
 Novelty:10.840945843761158
 Serendipity:0.9940549225825299



In [13]:
# Weighted hybrid (KNN / content-based).
# alpha_knn is the weight given to the KNN estimate. A tuning loop over the
# candidates 0.2, 0.4, 0.6 and 0.8 is currently disabled; a single value is used.
alpha_knn = 0.5

predictions_knn_hw_train = hybrid_weighted(
    predictions_knn_train, predictions_cb_train, alpha_knn)
precision_knn_hw_train, recall_knn_hw_train, f_knn_hw_train = precision_recall_at_k(
    predictions_knn_hw_train, k=10, threshold=7)
print("\n HW KNN on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_knn_hw_train, recall_knn_hw_train, f_knn_hw_train,
    ndcg_at_k(predictions_knn_hw_train, k=10),
    item_space_coverage(predictions_knn_hw_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_knn_hw_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_knn_hw_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_knn_hw_train, primitive_list=primitive_list_train, k=10, threshold=7)))

predictions_knn_hw_test = hybrid_weighted(
    predictions_knn_test, predictions_cb_test, alpha_knn)
precision_knn_hw_test, recall_knn_hw_test, f_knn_hw_test = precision_recall_at_k(
    predictions_knn_hw_test, k=10, threshold=7)
print("\n HW KNN on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_knn_hw_test, recall_knn_hw_test, f_knn_hw_test,
    ndcg_at_k(predictions_knn_hw_test, k=10),
    item_space_coverage(predictions_knn_hw_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_knn_hw_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_knn_hw_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_knn_hw_test, primitive_list=primitive_list_test, k=10, threshold=7)))




 HW KNN on Training Set:
 Precision:0.9222149888639924
 Recall:0.6170098819819179
 F-Score:0.7393536476943934
 NDCG:0.8943515670097424
 Item-space coverage:0.43228962818003913
 User-space coverage:0.0072064434082238235
 Novelty:12.094779824399536
 Serendipity:0.9895595560261993


 HW KNN on Test Set:
 Precision:0.818009610583102
 Recall:0.6278214133487472
 F-Score:0.710406598486918
 NDCG:0.907505276522854
 Item-space coverage:0.5254827477049699
 User-space coverage:0.0020629190304280558
 Novelty:10.831169571028939
 Serendipity:0.9953944020356235



In [14]:
# Mixed hybrid (KNN / content-based): number of top predictions taken from
# each model per user.
n_knn = 10
n_cb = 10

predictions_knn_hm_train = hybrid_mixed(
    predictions_knn_train, predictions_cb_train, n_knn, n_cb)
precision_knn_hm_train, recall_knn_hm_train, f_knn_hm_train = precision_recall_at_k(
    predictions_knn_hm_train, k=10, threshold=7)
print("\n HM KNN on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_knn_hm_train, recall_knn_hm_train, f_knn_hm_train,
    ndcg_at_k(predictions_knn_hm_train, k=10),
    item_space_coverage(predictions_knn_hm_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_knn_hm_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_knn_hm_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_knn_hm_train, primitive_list=primitive_list_train, k=10, threshold=7)))

predictions_knn_hm_test = hybrid_mixed(
    predictions_knn_test, predictions_cb_test, n_knn, n_cb)
precision_knn_hm_test, recall_knn_hm_test, f_knn_hm_test = precision_recall_at_k(
    predictions_knn_hm_test, k=10, threshold=7)
print("\n HM KNN on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_knn_hm_test, recall_knn_hm_test, f_knn_hm_test,
    ndcg_at_k(predictions_knn_hm_test, k=10),
    item_space_coverage(predictions_knn_hm_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_knn_hm_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_knn_hm_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_knn_hm_test, primitive_list=primitive_list_test, k=10, threshold=7)))



 HM KNN on Training Set:
 Precision:0.8941008599285417
 Recall:0.7353715391945402
 F-Score:0.8070051704031961
 NDCG:0.8939839049790624
 Item-space coverage:0.7562622309197652
 User-space coverage:0.06782534972445951
 Novelty:11.851056722625783
 Serendipity:0.9948425084319445


 HM KNN on Test Set:
 Precision:0.746691348019352
 Recall:0.7797254728684226
 F-Score:0.7628509545414067
 NDCG:0.8991725008366419
 Item-space coverage:0.9392212725546059
 User-space coverage:0.024239298607529654
 Novelty:10.684740657283989
 Serendipity:0.9945956842424094



In [15]:
# SVD: matrix factorisation with 10 factors.
algo_svd = surprise.SVD(n_factors = 10, lr_all= 0.001, reg_all =1)
# Fit on the whole training set. `fit` is Surprise's supported training entry
# point; the older `train` method was deprecated and later removed.
algo_svd.fit(trainset)

# Metrics on the training set (biased: the model was fitted on these ratings).
predictions_svd_train = algo_svd.test(trainset.build_testset())
precision_svd_train, recall_svd_train, f_svd_train = precision_recall_at_k(
    predictions_svd_train, k=10, threshold=7)
print("\n SVD on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_svd_train, recall_svd_train, f_svd_train,
    ndcg_at_k(predictions_svd_train, k=10),
    item_space_coverage(predictions_svd_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_svd_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_svd_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_svd_train, primitive_list=primitive_list_train, k=10, threshold=7)))

# Metrics on the held-out test set (unbiased).
predictions_svd_test = algo_svd.test(t)
precision_svd_test, recall_svd_test, f_svd_test = precision_recall_at_k(
    predictions_svd_test, k=10, threshold=7)
print("\n SVD on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_svd_test, recall_svd_test, f_svd_test,
    ndcg_at_k(predictions_svd_test, k=10),
    item_space_coverage(predictions_svd_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_svd_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_svd_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_svd_test, primitive_list=primitive_list_test, k=10, threshold=7)))



 SVD on Training Set:
 Precision:0.7406566879966627
 Recall:0.983723350513273
 F-Score:0.8450588181544213
 NDCG:0.9396202946133264
 Item-space coverage:0.8423679060665362
 User-space coverage:0.06718948707079271
 Novelty:11.850614597497001
 Serendipity:0.9962721468287626


 SVD on Test Set:
 Precision:0.7479426066455465
 Recall:0.9969018509584953
 F-Score:0.8546611312271732
 NDCG:0.9316886810628824
 Item-space coverage:0.9414371636593859
 User-space coverage:0.007735946364105209
 Novelty:10.674805360622406
 Serendipity:0.994376019208234



In [16]:
# Switching hybrid (SVD / content-based): users with fewer than threshold_svd
# SVD predictions fall back to the content-based model.
threshold_svd = 5

predictions_svd_hs_train = hybrid_switching_svd(
    predictions_svd_train, predictions_cb_train, threshold_svd)
precision_svd_hs_train, recall_svd_hs_train, f_svd_hs_train = precision_recall_at_k(
    predictions_svd_hs_train, k=10, threshold=7)
print("\n HS SVD on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_svd_hs_train, recall_svd_hs_train, f_svd_hs_train,
    ndcg_at_k(predictions_svd_hs_train, k=10),
    item_space_coverage(predictions_svd_hs_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_svd_hs_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_svd_hs_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_svd_hs_train, primitive_list=primitive_list_train, k=10, threshold=7)))

predictions_svd_hs_test = hybrid_switching_svd(
    predictions_svd_test, predictions_cb_test, threshold_svd)
precision_svd_hs_test, recall_svd_hs_test, f_svd_hs_test = precision_recall_at_k(
    predictions_svd_hs_test, k=10, threshold=7)
print("\n HS SVD on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_svd_hs_test, recall_svd_hs_test, f_svd_hs_test,
    ndcg_at_k(predictions_svd_hs_test, k=10),
    item_space_coverage(predictions_svd_hs_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_svd_hs_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_svd_hs_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_svd_hs_test, primitive_list=primitive_list_test, k=10, threshold=7)))


 HS SVD on Training Set:
 Precision:0.8676349273636257
 Recall:0.6124502121778201
 F-Score:0.7180440924112936
 NDCG:0.9277473849231519
 Item-space coverage:0.5935420743639922
 User-space coverage:0.06718948707079271
 Novelty:11.759606538468821
 Serendipity:0.9937816877202029


 HS SVD on Test Set:
 Precision:0.8410748053734128
 Recall:0.6018528566315228
 F-Score:0.7016336128055448
 NDCG:0.9174858216387138
 Item-space coverage:0.5837290281734726
 User-space coverage:0.007735946364105209
 Novelty:10.813965303749805
 Serendipity:0.993298365891601



In [17]:
# Weighted hybrid (SVD / content-based): alpha_svd is the weight given to the
# SVD estimate.
alpha_svd = 0.5

predictions_svd_hw_train = hybrid_weighted(
    predictions_svd_train, predictions_cb_train, alpha_svd)
precision_svd_hw_train, recall_svd_hw_train, f_svd_hw_train = precision_recall_at_k(
    predictions_svd_hw_train, k=10, threshold=7)
print("\n HW SVD on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_svd_hw_train, recall_svd_hw_train, f_svd_hw_train,
    ndcg_at_k(predictions_svd_hw_train, k=10),
    item_space_coverage(predictions_svd_hw_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_svd_hw_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_svd_hw_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_svd_hw_train, primitive_list=primitive_list_train, k=10, threshold=7)))

predictions_svd_hw_test = hybrid_weighted(
    predictions_svd_test, predictions_cb_test, alpha_svd)
precision_svd_hw_test, recall_svd_hw_test, f_svd_hw_test = precision_recall_at_k(
    predictions_svd_hw_test, k=10, threshold=7)
print("\n HW SVD on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_svd_hw_test, recall_svd_hw_test, f_svd_hw_test,
    ndcg_at_k(predictions_svd_hw_test, k=10),
    item_space_coverage(predictions_svd_hw_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_svd_hw_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_svd_hw_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_svd_hw_test, primitive_list=primitive_list_test, k=10, threshold=7)))


 HW SVD on Training Set:
 Precision:0.8460363720166602
 Recall:0.5507498869051338
 F-Score:0.6671807275158684
 NDCG:0.842561781798418
 Item-space coverage:0.40821917808219177
 User-space coverage:0.006146672318779143
 Novelty:12.115399648619691
 Serendipity:0.9921132357570034


 HW SVD on Test Set:
 Precision:0.8178708547197457
 Recall:0.6265320889547296
 F-Score:0.7095282342739908
 NDCG:0.90868545844522
 Item-space coverage:0.5245330800886356
 User-space coverage:0.0020629190304280558
 Novelty:10.829375558201301
 Serendipity:0.9954185809086269



In [18]:
# Mixed hybrid (SVD / content-based): number of top predictions taken from
# each model per user.
n_svd = 10
n_cb = 10

predictions_svd_hm_train = hybrid_mixed(
    predictions_svd_train, predictions_cb_train, n_svd, n_cb)
precision_svd_hm_train, recall_svd_hm_train, f_svd_hm_train = precision_recall_at_k(
    predictions_svd_hm_train, k=10, threshold=7)
print("\n HM SVD on Training Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_svd_hm_train, recall_svd_hm_train, f_svd_hm_train,
    ndcg_at_k(predictions_svd_hm_train, k=10),
    item_space_coverage(predictions_svd_hm_train, k=10, n_items=n_items_train, threshold=7),
    user_space_coverage(predictions_svd_hm_train, k=10, n_user=n_users_train, threshold=7),
    novelty(predictions_svd_hm_train, k=10, item_pop=item_pop_train, threshold=7),
    serendipity(predictions_svd_hm_train, primitive_list=primitive_list_train, k=10, threshold=7)))

predictions_svd_hm_test = hybrid_mixed(
    predictions_svd_test, predictions_cb_test, n_svd, n_cb)
precision_svd_hm_test, recall_svd_hm_test, f_svd_hm_test = precision_recall_at_k(
    predictions_svd_hm_test, k=10, threshold=7)
print("\n HM SVD on Test Set:\n Precision:{}\n Recall:{}\n F-Score:{}\n NDCG:{}\n Item-space coverage:{}\n User-space coverage:{}\n Novelty:{}\n Serendipity:{}\n".format(
    precision_svd_hm_test, recall_svd_hm_test, f_svd_hm_test,
    ndcg_at_k(predictions_svd_hm_test, k=10),
    item_space_coverage(predictions_svd_hm_test, k=10, n_items=n_items_test, threshold=7),
    user_space_coverage(predictions_svd_hm_test, k=10, n_user=n_users_test, threshold=7),
    novelty(predictions_svd_hm_test, k=10, item_pop=item_pop_test, threshold=7),
    serendipity(predictions_svd_hm_test, primitive_list=primitive_list_test, k=10, threshold=7)))


 HM SVD on Training Set:
 Precision:0.7392111938741871
 Recall:0.7350142711061448
 F-Score:0.7371067584512081
 NDCG:0.8523596050344521
 Item-space coverage:0.8495107632093933
 User-space coverage:0.08986858838490885
 Novelty:11.881162758033005
 Serendipity:0.9959211429378879


 HM SVD on Test Set:
 Precision:0.7467601119870328
 Recall:0.779774684907214
 F-Score:0.7629103931475
 NDCG:0.9047600886328183
 Item-space coverage:0.9398543842988287
 User-space coverage:0.02372356884992264
 Novelty:10.685523351542422
 Serendipity:0.9945956842424094

