In [1]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

#cimport numpy as np # noqa
import numpy as np

from surprise import Reader, AlgoBase, PredictionImpossible
from surprise import Dataset, SVD
from surprise.model_selection import cross_validate
from surprise.utils import get_rng
from surprise.model_selection import train_test_split
from surprise import accuracy

import pandas as pd
import os
import time

import math
import implicit

from tqdm import tqdm
from itertools import product

from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, precision_score, recall_score
from math import sqrt
import scipy.sparse as sparse
from surprise import CoSVD, NMF, SVDpp

from joblib import Parallel
from joblib import delayed
from surprise.model_selection.validation import fit_and_score, print_summary

from datetime import datetime

import warnings
warnings.filterwarnings('ignore')
#import matrices_generation as mg
#%reload_ext Cython
%load_ext Cython

In [2]:
def convert_time(seconds): 
    seconds = seconds % (24 * 3600) 
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
      
    return "%d:%02d:%02d" % (hour, minutes, seconds) 

In [17]:
#data_source = 'ml-latest-small' # 100k MovieLens dataset 2016
#data_source = 'mlsmall' # 100k MovieLens dataset 2018
data_source = 'ml-10M100K' # 10M MovieLens dataset

reader = Reader()
path = os.path.join('../','Data',data_source)

rate = pd.read_csv(path+'/ratings.csv', encoding='utf-8')
data = Dataset.load_from_df(rate[['userId', 'movieId', 'rating']], reader)
raw_tags = pd.read_csv(path+'/tags.csv', encoding='utf-8')

In [18]:
full_trainset = data.build_full_trainset()

In [44]:
# 510 365 382 322 988 98 742 17 595 106
rs = 742
percentile_threshold = 99.5
n_factors = 32
alpha = 40

In [45]:
### Rated + Item Average
#temp_data = rate.groupby('movieId')['userId'].apply(lambda s: s.sample(100, random_state=rs) if len(s) > 100 else s).reset_index()
temp_data = rate.groupby('userId')['movieId'].apply(lambda s: s.sample(10, random_state=rs)).reset_index()
temp = rate[rate.index.isin(temp_data.level_1)]
temp['user_id'] = temp['userId'].astype("category").cat.codes
temp['item_id'] = temp['movieId'].astype("category").cat.codes
temp['rate'] = 1

users = list(np.sort(temp.user_id.unique()))
items = list(np.sort(temp.item_id.unique()))
rated = list(temp.rate)
cols = temp.user_id.astype(int)
rows = temp.item_id.astype(int)

#sparsity = len(temp) / (len(items)*len(users))
sparsity = alpha

data_sparse = sparse.csr_matrix((rated, (rows, cols)), shape=(len(items), len(users)))

### iALS model train and score
model = implicit.als.AlternatingLeastSquares(factors=n_factors, random_state=123)

model.fit((data_sparse*sparsity).astype('double'))
user_vecs = model.user_factors
item_vecs = model.item_factors
implicit_feedback = user_vecs.dot(item_vecs.T)

### Retrieve Implicit User Item pairs
result = np.where(implicit_feedback >= np.percentile(implicit_feedback, percentile_threshold))
rating_pairs = list(temp[['user_id', 'item_id']].apply(tuple, axis=1))
implicit_pairs = list(zip(result[0], result[1]))
uipairs = list(set(implicit_pairs)-set(rating_pairs))

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [46]:
weightage = 1.25

#user_avg_rate = temp[['movieId', 'item_id', 'rating']].groupby(['movieId', 'item_id']).mean().reset_index()
user_avg_rate = temp[['userId', 'user_id', 'rating']].groupby(['userId', 'user_id']).mean().reset_index()
user_avg_rate['rating'] = user_avg_rate['rating'].map(lambda x: 5 if x * weightage > 5 else x * weightage).tolist()

pairs = pd.DataFrame(uipairs, columns=["user_id", "item_id"])
#pairs= pd.merge(pairs, user_avg_rate, how='left', on='item_id')
pairs= pd.merge(pairs, user_avg_rate, how='left', on='user_id')
#pairs = pd.merge(pairs, temp[['userId', 'user_id']].drop_duplicates(), how='left', on='user_id')
pairs = pd.merge(pairs, temp[['movieId', 'item_id']].drop_duplicates(), how='left', on='item_id')
pairs = pairs[['userId', 'movieId', 'rating']]
pairs['timestamp'] = None
extra_rate = [tuple(x) for x in pairs.values]

temp2 = np.arange(len(data.raw_ratings))
temp2 = np.delete(temp2, temp_data.level_1)

raw_trainset = [data.raw_ratings[i] for i in temp_data.level_1]
raw_testset = [data.raw_ratings[i] for i in temp2]

trainset = data.construct_trainset(raw_trainset + extra_rate)
testset = data.construct_testset(raw_testset)

In [47]:
start = time.time()

algo = SVDpp(verbose=False, n_epochs=45, lr_all=0.0012, reg_all=0.0012, n_factors=20, random_state=123)
algo.fit(trainset)
predictions = algo.test(testset)

print("SVDpp User Average " + str(rs) + " " + str(weightage))

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
prec_5 = accuracy.prec_5(predictions, full_trainset)
rec_5 = accuracy.rec_5(predictions, full_trainset)
ndcg_5 = accuracy.ndcg_5(predictions)

print(convert_time(time.time() - start))

SVDpp User Average 742 1.25
RMSE: 1.0553
MAE:  0.8078
Precision@K:  0.6502
Recall@K:  0.1508
nDCG@5:  0.6339
0:53:34


In [48]:
start = time.time()

algo = SVD(verbose=False, n_epochs=60, lr_all=0.008, reg_all=0.091, n_factors=30, random_state=123)
algo.fit(trainset)
predictions = algo.test(testset)

print("SVD User Average " + str(rs) + " " + str(weightage))

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
prec_5 = accuracy.prec_5(predictions, full_trainset)
rec_5 = accuracy.rec_5(predictions, full_trainset)
ndcg_5 = accuracy.ndcg_5(predictions)

print(convert_time(time.time() - start))

SVD User Average 742 1.25
RMSE: 1.0766
MAE:  0.8256
Precision@K:  0.6906
Recall@K:  0.1607
nDCG@5:  0.6805
0:04:43


In [49]:
start = time.time()

algo = CoSVD(verbose=False, n_epochs=65, lr_all=0.0028, n_factors=40, tags=raw_tags, random_state=123)
algo.fit(trainset)
predictions = algo.test(testset)

print("CoSVD User Average " + str(rs) + " " + str(weightage))

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
prec_5 = accuracy.prec_5(predictions, full_trainset)
rec_5 = accuracy.rec_5(predictions, full_trainset)
ndcg_5 = accuracy.ndcg_5(predictions)

print(convert_time(time.time() - start))

CoSVD User Average 742 1.25
RMSE: 1.0738
MAE:  0.8234
Precision@K:  0.6907
Recall@K:  0.1597
nDCG@5:  0.6888
5:25:30
