In [1]:
import numpy as np
import pandas as pd
from IPython.core.display import display, HTML, clear_output
display(HTML('<style>.container { width:80% !important; }</style>'))
display(HTML('<style>.prompt { min-width:10ex !important; }</style>'))
display(HTML('<style>div#notebook { font-size:12px !important; }</style>'))

from preprocessing import leave_last_out, transform_indices, preprocessing,\
reindex_data, generate_interactions_matrix, get_interaction_matrix
from datetime import datetime
from scipy.sparse import csr_matrix,diags
from scipy.sparse.linalg import svds
from evaluation import topn_recommendations

  from IPython.core.display import display, HTML, clear_output
  from IPython.core.display import display, HTML, clear_output


In [2]:
from src.baselines.SVD.SVD import SVD_baseline

In [3]:
topn = 20

In [4]:
user_df = pd.read_csv('./dataset/users_processed.csv')
item_df = pd.read_csv('./dataset/items_processed.csv')
interaction_df = pd.read_csv('./dataset/interactions_processed.csv', parse_dates=['last_watch_dt'])
#submission = pd.read_csv('sample_submission.csv')

In [5]:
def build_ssvd_model(config, data, data_description):
    source_matrix = generate_interactions_matrix(data, data_description, rebase_users = False)
    scaled_matrix, scaling_weights = rescale_matrix(source_matrix, config['scaling'])
    
    _, s, vt = svds(scaled_matrix, k=config['rank'], return_singular_vectors='vh')
    singular_values = s[::-1]
    item_factors = np.ascontiguousarray(vt[::-1, :].T)
    return item_factors, scaling_weights 

def rescale_matrix(matrix, scaling_factor):
    frequencies = matrix.getnnz(axis=0)
    scaling_weights = np.power(frequencies, 0.5*(scaling_factor-1))
    return matrix.dot(diags(scaling_weights)),scaling_weights

def ssvd_model_scoring(params, data, data_description):
    item_factors, scaling_weights = params
    
    test_matrix = generate_interactions_matrix(data, data_description, rebase_users = True)
    #scaling_weights = np.power(frequencies, 0.5*(scaling_factor-1))
    scores = test_matrix.dot(item_factors) @ item_factors.T
    downvote_seen_items(scores, data, data_description)
    return scores

In [6]:
def ohe(features : list, df, items = False):
    if items:
        ohe_df = df.item_id
    else:
        ohe_df = df.user_id
    for feat in features:
        ohe_feat_df = pd.get_dummies(df[feat], prefix = feat)
        ohe_df = pd.concat([ohe_df,ohe_feat_df],axis=1)
    return ohe_df

In [7]:
# One-hot encoding of cathegorical features
user_cat_features = ['age','income','sex','kids_flg']

user_ohe_df  =  ohe(user_cat_features,user_df) #.drop(columns='user_id')

item_cat_feats = ['content_type','release_year_cat','for_kids','age_rating','studios','countries','directors']
item_ohe_df = ohe(item_cat_feats,item_df, items = True) #.drop(columns='item_id')

In [8]:
train_val, data_description, train_matrix, train_matrix_indices, cold_users_matrix, cold_start_matrix_indices = \
preprocessing(interaction_df,user_ohe_df,item_ohe_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df['last_watch_dt_ts'] = interactions_df['last_watch_dt'].apply(lambda x: int(x.timestamp()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['uid'] = data[userid].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['uid'] = data['uid'].cat.codes
A value is tryin

In [9]:
ranks = [b*2**n for n in range(3,9) for b in [2,3]]
scalings = [0.2,0.4,0.6,0.8]

In [10]:
config = {'rank': 90,
         'scaling': 0.8}

In [11]:
SVD_ex = SVD_baseline(train_matrix,data_description, config)

In [12]:
holdout = data_description['holdout_standard'][data_description['holdout_standard'].user_id.isin(train_val.user_id)]

# Standard scenario

In [31]:
# grid search
configs = {}
for rank in ranks:
    for scaling in scalings:
        iter_config = {'rank':rank,
                      'scaling': scaling}
        SVD_ex = SVD_baseline(train_matrix,data_description, iter_config)
        hr, mrr, cov, ndcg = SVD_ex.svd_evaluate(holdout = holdout)
        
        config[(rank,scaling)] = {'hr':hr,
                                 'mrr':mrr,
                                 'cov':cov,
                                 'ndcg':ndcg}

0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
2000
4000
6000
7000
10000
0
1000
200

In [83]:
config_hr = {}
config_mrr = {}
config_cov = {}
config_ndcg = {}

In [84]:
for i in config.items():
    try:
#         config_f[i[0]] = {'hr':i[1]['hr'].mean(),
#                       'mrr':i[1]['mrr'].mean(),
#                       'cov':i[1]['cov'].mean(),
#                       'ndcg':i[1]['ndcg'].mean()
                     
#                      }
        config_mrr[i[0]] = i[1]['mrr'].mean()
        config_hr[i[0]] = i[1]['hr'].mean()
        config_cov[i[0]] = i[1]['cov'].mean()
        config_ndcg[i[0]] = i[1]['ndcg'].mean()
    except:
        pass

In [85]:
best_config = dict(
zip(['rank','scaling'],
   pd.Series(config_mrr).idxmax())
)



In [92]:
(
pd.Series(config_cov)
.unstack(level=1)
.style
.format('{:.4f}')
.background_gradient(high=0.2, axis = None)

)

Unnamed: 0,0.2,0.4,0.6,0.8
16,0.0274,0.0303,0.0232,0.0202
24,0.0495,0.0454,0.0308,0.0281
32,0.0629,0.0584,0.0372,0.0319
48,0.0942,0.0723,0.0463,0.0393
64,0.1249,0.0878,0.0536,0.0454
96,0.1683,0.1113,0.0685,0.0549
128,0.2072,0.1316,0.0797,0.0662
192,0.2677,0.1705,0.1031,0.0806
256,0.3081,0.1966,0.1187,0.0952
384,0.3666,0.2422,0.15,0.1207


In [89]:
(
pd.Series(config_mrr)
.unstack(level=1)
.style
.format('{:.4f}')
.background_gradient(high=0.2, axis = None)

)

Unnamed: 0,0.2,0.4,0.6,0.8
16,0.0635,0.048,0.035,0.0336
24,0.0565,0.0415,0.0297,0.0281
32,0.0533,0.0384,0.0277,0.0268
48,0.0495,0.0313,0.0249,0.0238
64,0.0457,0.027,0.0225,0.0217
96,0.0435,0.0227,0.0203,0.0199
128,0.0413,0.0214,0.019,0.019
192,0.034,0.019,0.0178,0.018
256,0.0304,0.0178,0.0163,0.0169
384,0.0262,0.0164,0.0152,0.0151


In [14]:
print('HR mean:',hr.mean())
print('MRR mean:',mrr.mean())
print('Coverage mean:',cov.mean())
print('NDCG mean:',ndcg.mean())

HR mean: 0.05479207567374621
MRR mean: 0.02005130235521103
Coverage mean: 0.0545479247333741
NDCG mean: 2.7914775049185005e-06


# Coldstart

In [13]:
holdout_cs = data_description['holdout_cs']

In [14]:
holdout_cs = holdout_cs[holdout_cs.user_id.isin(cold_start_matrix_indices['uid_to_user_id'].values())]

In [15]:
#user_factors, item_factors = SVD_ex.build_svd_model()

In [16]:
def user_evaluate(user_id, preds, holdout,data_description):
        n_items = data_description['n_items']
        holdout_user = holdout[holdout.user_id == user_id]
        predictions = preds[user_id]
        hits_mask = predictions == holdout_user.item_id.values
        # HR calculation
        hr = np.mean(hits_mask.any(axis=0))
        # MRR calculation
        n_test_users = preds.shape[0]
        hit_rank = np.where(hits_mask)[0] + 1.0

        mrr = np.sum(1 / hit_rank)
        # coverage calculation
        cov = np.unique(preds).size / n_items
        # ndcg
        # NDCG
        ndcg_per_user = 1.0 / np.log2(hit_rank + 1)
        ndcg = np.sum(ndcg_per_user) / n_test_users

        return hr, mrr, cov, ndcg

In [31]:
# grid search
config_hr = {}
config_mrr = {}
config_cov = {}
config_ndcg = {}

hr_full = []
mrr_full = []
cov_full = []
ndcg_full = []
for rank in ranks:
    for scaling in scalings:
        iter_config = {'rank':rank,
                      'scaling': scaling}
        SVD_ex = SVD_baseline(train_matrix,data_description, iter_config)
        user_factors, item_factors = SVD_ex.build_svd_model()
        cs_scores = cold_users_matrix[:20000,:].dot(item_factors) @ item_factors.T
        scores_topn = topn_recommendations(cs_scores[:20000,:],topn)
        
        for n,i in enumerate(holdout_cs.user_id.sort_values()):
            if n == 9000:
                break
            hr, mrr, cov, ndcg = user_evaluate(i,scores_topn, holdout_cs,data_description)
            hr_full.append(hr)
            mrr_full.append(mrr)
            cov_full.append(cov)
            ndcg_full.append(ndcg)
        
        config_ndcg[(rank,scaling)] = np.array(ndcg_full).mean()
        config_hr[(rank,scaling)] = np.array(hr).mean()
        config_mrr[(rank,scaling)] = np.array(mrr).mean()
        config_cov[(rank,scaling)] = np.array(cov).mean()
        
        print(config_mrr[(rank,scaling)],config_cov[(rank,scaling)])
#         config[(rank,scaling)] = {'hr':hr.mean(),
#                                  'mrr':mrr.mean(),
#                                  'cov':cov.mean(),
#                                  'ndcg':ndcg.mean()}

IndexError: index 20002 is out of bounds for axis 0 with size 20000

In [22]:
scores_topn = topn_recommendations(cs_scores[:20000,:],topn)

In [25]:
hr, mrr, cov, ndcg = user_evaluate(10,scores_topn, holdout_cs,data_description)

In [111]:
(
pd.Series(config_mrr)
.unstack(level=1)
.style
.format('{:.4f}')
.background_gradient(high=0.2, axis = None)

)

Unnamed: 0,0.2,0.4,0.6,0.8
16,0.0667,0.25,1.0,1.0
24,0.0,0.25,1.0,1.0
32,0.0,0.0,0.2,0.3333
48,0.0,1.0,0.2,0.125
64,0.0,0.25,0.1667,0.0
96,0.0,1.0,0.0,0.0
128,0.0,0.1667,0.0,0.0
192,0.0,0.0,0.0,0.0
256,0.0,0.0,0.0,0.0
384,0.0,0.0,0.0,0.0


In [29]:
hr = np.array(hr_full)
mrr = np.array(mrr_full)
cov = np.array(cov_full) 
ndcg = np.array(ndcg_full) 

In [30]:
print('HR mean:',hr.mean())
print('MRR mean:',mrr.mean())
print('Coverage mean:',cov.mean())
print('NDCG mean:',ndcg.mean())

HR mean: 0.03508616095221176
MRR mean: 0.007450015843163309
Coverage mean: 0.043950818558521836
NDCG mean: 6.685945875394102e-07
