In [1]:
from numpy.ma import MaskedArray
import sklearn.utils.fixes
import warnings
warnings.filterwarnings("ignore")

sklearn.utils.fixes.MaskedArray = MaskedArray

import os
os.system(r"run_compile_all_cython.py")

run_compile_all_cython: Found 10 Cython files in 4 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/opt/conda/bin/python'
Compiling [1/10]: MatrixFactorization_Cython_Epoch.pyx... 
In file included from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarraytypes.h:1969[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/arrayobject.h:4[m[K,
                 from [01m[KMatrixFactorization_Cython_Epoch.c:746[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorization_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_pf_32MatrixFactorization_Cython_Epoch_32MatrixFactorization_Cython_Epoch_10epochIteration_Cython_ASY_SVD_SGD[m[K’:
 8669 |         [01;35m[Kfor[m[K (__pyx_t_19 = __pyx_v_start_pos_seen_items; __pyx_t_19 < 

In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sps

sklearn.utils.fixes.MaskedArray = MaskedArray

# Load the data
URM_path = 'Data/interactions_and_impressions.csv'
URM_all_dataframe = pd.read_csv(filepath_or_buffer=URM_path,
                                sep=",",
                                header=0, engine='python')
URM_all_dataframe.columns = ["UserID", "ItemID", "Impressions", "Data"]

print("The number of interactions is {}".format(len(URM_all_dataframe)))

userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)
print("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print("Average interactions per user {:.2f}".format(n_interactions / n_users))
print("Average interactions per item {:.2f}\n".format(n_interactions / n_items))

print("Sparsity {:.2f} %".format((1 - float(n_interactions) / (n_items * n_users)) * 100))

# Build the URM: I turn every kind of interaction as a 1, so I first eliminate all the duplicate and then turn every
# value remained in the Data of the iteractions_and_impressions into a 1
URM_all_dataframe = URM_all_dataframe.drop_duplicates(['UserID', 'ItemID'], keep='first')

URM_all = sps.coo_matrix((np.ones(len(URM_all_dataframe["Data"].values)),
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values)))
URM_all = URM_all.tocsr()  # to obtain fast access to rows (users)

from Data_manager.split_functions.split_train_validation_random_holdout import \
    split_train_in_two_percentage_global_sample

# split data into train and validation data
URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.80)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train_validation, train_percentage=0.80)

from Evaluation.Evaluator import EvaluatorHoldout

# create an evaluator object to evaluate validation set
# will use it for hyperparameter tuning
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

The number of interactions is 5826506
Number of items	 24507, Number of users	 41629
Max ID items	 24506, Max Id users	 41628

Average interactions per user 139.96
Average interactions per item 237.75

Sparsity 99.43 %
EvaluatorHoldout: Ignoring 768 ( 1.8%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 332 ( 0.8%) Users that have less than 1 test interactions


In [3]:
import numpy as np
import scipy.sparse as sps
from Recommenders.Recommender_utils import check_matrix
from sklearn.linear_model import ElasticNet
from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit
import time, sys
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning


class SLIMElasticNetRecommender(BaseItemSimilarityMatrixRecommender):
    RECOMMENDER_NAME = "SLIMElasticNetRecommender"

    def __init__(self, URM_train, verbose = True):
        super(SLIMElasticNetRecommender, self).__init__(URM_train, verbose = verbose)

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, l1_ratio=0.1, alpha = 1.0, positive_only=True, topK = 100,**earlystopping_kwargs):
        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)
        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK


        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000
        
        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)
        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficient of the ElasticNet model
            # let's keep only the non-zero values
            
            # Select topK values
            # Sorting is done in three steps. Faster then plain np.argsort for higher number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data
            
            local_topK = min(len(nonzero_model_coef_value)-1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))
                    
                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)


            if time.time() - start_time_printBatch > 300 or currentItem == n_items-1:
                self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}".format(
                    currentItem+1,
                    100.0* float(currentItem+1)/n_items,
                    new_time_value,
                    new_time_unit,
                    float(currentItem)/elapsed_time))
                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(n_items, n_items), dtype=np.float32)

In [4]:
recommender_SLIMElasticNet = SLIMElasticNetRecommender(URM_train)
recommender_SLIMElasticNet.fit(epochs = 700, l1_ratio=0.049999999999999996, alpha = 0.001, positive_only = True, topK = 1000)

SLIMElasticNetRecommender: Processed 6319 (25.8%) in 5.00 min. Items per second: 21.06
SLIMElasticNetRecommender: Processed 14083 (57.5%) in 10.00 min. Items per second: 23.47
SLIMElasticNetRecommender: Processed 22561 (92.1%) in 15.00 min. Items per second: 25.06
SLIMElasticNetRecommender: Processed 24507 (100.0%) in 16.17 min. Items per second: 25.25


In [5]:
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
recommender_RP3beta = RP3betaRecommender(URM_train)
recommender_RP3beta.fit(alpha = 0.6466715570981898, beta = 0.2703618471526261, topK = 80, implicit = True)

RP3betaRecommender: Similarity column 24507 (100.0%), 2291.17 column/sec. Elapsed time 10.70 sec


In [6]:
from Recommenders.EASE_R.EASE_R_Recommender import EASE_R_Recommender
recommender_EASE_R = EASE_R_Recommender(URM_train)
recommender_EASE_R.fit(topK = None, normalize_matrix = False, l2_norm = 93.68456224396647)

EASE_R_Recommender: Fitting model... 
EASE_R_Recommender: Fitting model... done in 10.51 min


In [7]:
from numpy import linalg as LA
from Recommenders.BaseRecommender import BaseRecommender

class DifferentLossScoresHybridRecommender(BaseRecommender):

    RECOMMENDER_NAME = "DifferentLossScoresHybridRecommender"


    def __init__(self, URM_train, recommender_1, recommender_2, recommender_3):
        super(DifferentLossScoresHybridRecommender, self).__init__(URM_train)

        self.URM_train = sps.csr_matrix(URM_train)
        self.recommender_1 = recommender_1
        self.recommender_2 = recommender_2
        self.recommender_3 = recommender_3
        
        
        
    def fit(self, norm, alpha = 0.5, beta = 0.5):

        self.alpha = alpha
        self.beta = beta
        self.norm = norm


    def _compute_item_score(self, user_id_array, items_to_compute):
        item_weights_1 = self.recommender_1._compute_item_score(user_id_array)
        item_weights_2 = self.recommender_2._compute_item_score(user_id_array)
        item_weights_3 = self.recommender_3._compute_item_score(user_id_array)

        norm_item_weights_1 = LA.norm(item_weights_1, self.norm)
        norm_item_weights_2 = LA.norm(item_weights_2, self.norm)
        norm_item_weights_3 = LA.norm(item_weights_3, self.norm)
        
        
        if norm_item_weights_1 == 0:
            raise ValueError("Norm {} of item weights for recommender 1 is zero. Avoiding division by zero".format(self.norm))
        
        if norm_item_weights_2 == 0:
            raise ValueError("Norm {} of item weights for recommender 2 is zero. Avoiding division by zero".format(self.norm))
            
        if norm_item_weights_3 == 0:
            raise ValueError("Norm {} of item weights for recommender 3 is zero. Avoiding division by zero".format(self.norm))
        
        item_weights = item_weights_1 / norm_item_weights_1 * self.alpha + item_weights_2 / norm_item_weights_2 * self.beta + item_weights_3 / norm_item_weights_3 * (1-self.alpha-self.beta)

        return item_weights

In [8]:
#find the best model
recommender_object = DifferentLossScoresHybridRecommender(URM_train, recommender_SLIMElasticNet, recommender_RP3beta, recommender_EASE_R)

best_model = {
    "MAP" : 0,
    "alpha" : 0,
    "beta" : 0,
    "norm" : 0
}

norm = 1
for alpha in np.arange(0.0, 1.1, 0.1):
    for beta in np.arange(0.0, 1.1, 0.1):

        #truncate digits since np.arange sometimes doesn't
        alpha = round(alpha,1)
        beta = round(beta,1)


        #discard cases in which the sum is greater than 1 
        if ( (alpha+beta) <= 1): 
            theta = round(1-alpha-beta,1)

            print("----")
            recommender_object.fit(norm, alpha, beta)
            result_df, _ = evaluator_validation.evaluateRecommender(recommender_object)
            print("Norm: {}, Alpha: {}, Beta: {}, Theta: {}, Result: {}".format(norm, alpha, beta, 1-alpha-beta, result_df.loc[10]["MAP"]))

        if result_df.loc[10]["MAP"] > best_model["MAP"]:
            best_model["MAP"] = result_df.loc[10]["MAP"]
            best_model["alpha"] = alpha
            best_model["beta"] = beta

print("----")
print("Best model has MAP: {} with alpha: {}, beta: {}".format(best_model["MAP"], best_model["alpha"], best_model["beta"]))

----
EvaluatorHoldout: Processed 40861 (100.0%) in 1.40 min. Users per second: 488
Norm: 1, Alpha: 0.0, Beta: 0.0, Theta: 1.0, Result: 0.02141616001286537
----
EvaluatorHoldout: Processed 40861 (100.0%) in 1.31 min. Users per second: 520
Norm: 1, Alpha: 0.0, Beta: 0.1, Theta: 0.9, Result: 0.021979239139427977
----
EvaluatorHoldout: Processed 40861 (100.0%) in 1.31 min. Users per second: 520
Norm: 1, Alpha: 0.0, Beta: 0.2, Theta: 0.8, Result: 0.02232625377635229
----
EvaluatorHoldout: Processed 40861 (100.0%) in 1.31 min. Users per second: 518
Norm: 1, Alpha: 0.0, Beta: 0.3, Theta: 0.7, Result: 0.022438746070203304
----
EvaluatorHoldout: Processed 40861 (100.0%) in 1.32 min. Users per second: 516
Norm: 1, Alpha: 0.0, Beta: 0.4, Theta: 0.6, Result: 0.02246332611179229
----
EvaluatorHoldout: Processed 40861 (100.0%) in 1.33 min. Users per second: 512
Norm: 1, Alpha: 0.0, Beta: 0.5, Theta: 0.5, Result: 0.022409475329251482
----
EvaluatorHoldout: Processed 40861 (100.0%) in 1.33 min. Users 

In [9]:
recommender_SLIMElasticNet = SLIMElasticNetRecommender(URM_all)
recommender_SLIMElasticNet.fit(epochs = 700, l1_ratio=0.049999999999999996, alpha = 0.001, positive_only = True, topK = 1000)

from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
recommender_RP3beta = RP3betaRecommender(URM_all)
recommender_RP3beta.fit(alpha = 0.6466715570981898, beta = 0.2703618471526261, topK = 80, implicit = True)

from Recommenders.EASE_R.EASE_R_Recommender import EASE_R_Recommender
recommender_EASE_R = EASE_R_Recommender(URM_all)
recommender_EASE_R.fit(topK = None, normalize_matrix = False, l2_norm = 93.68456224396647)

# final hybrid recommender which combine the 3 models with the best norm, alpha and beta found
recommender = DifferentLossScoresHybridRecommender(URM_all, recommender_SLIMElasticNet, recommender_RP3beta, recommender_EASE_R)
recommender.fit(norm = 1, alpha = 0.6, beta = 0.2)

SLIMElasticNetRecommender: Processed 2632 (10.7%) in 5.00 min. Items per second: 8.77
SLIMElasticNetRecommender: Processed 5963 (24.3%) in 10.00 min. Items per second: 9.94
SLIMElasticNetRecommender: Processed 8942 (36.5%) in 15.00 min. Items per second: 9.93
SLIMElasticNetRecommender: Processed 12185 (49.7%) in 20.00 min. Items per second: 10.15
SLIMElasticNetRecommender: Processed 15558 (63.5%) in 25.00 min. Items per second: 10.37
SLIMElasticNetRecommender: Processed 19072 (77.8%) in 30.00 min. Items per second: 10.59
SLIMElasticNetRecommender: Processed 22570 (92.1%) in 35.00 min. Items per second: 10.75
SLIMElasticNetRecommender: Processed 24507 (100.0%) in 37.80 min. Items per second: 10.81
RP3betaRecommender: Similarity column 24507 (100.0%), 1877.99 column/sec. Elapsed time 13.05 sec
EASE_R_Recommender: Fitting model... 
EASE_R_Recommender: Fitting model... done in 11.30 min


In [10]:
test_users = pd.read_csv('Data/data_target_users_test.csv')

user_id = test_users['user_id']
recommendations = []
for user in user_id:
    recommendations.append(recommender.recommend(user,cutoff = 10))
for index in range(len(recommendations)):
    recommendations[index]=np.array(recommendations[index])
    
test_users['item_list']= recommendations
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])
test_users.to_csv('submission.csv', index=False)