In [3]:
import pandas as pd
import numpy as np
import scipy.sparse as sps
import random as rnd

from scipy.sparse import *

In [4]:
urm_path = '../content/data_train.csv'
urm_all_df = pd.read_csv(filepath_or_buffer=urm_path,
                                sep=",",
                                header=0,
                                dtype={0:int, 1:int, 2:float},
                                engine='python')

urm_all_df.columns = ["UserID", "ItemID", "Interaction"]

In [5]:
urm_all_df.head(10)

Unnamed: 0,UserID,ItemID,Interaction
0,1,7,1.0
1,1,15,1.0
2,1,16,1.0
3,1,133,1.0
4,1,161,1.0
5,1,187,1.0
6,1,205,1.0
7,1,222,1.0
8,1,237,1.0
9,1,354,1.0


In [6]:
print ("The number of interactions is {}".format(len(urm_all_df)))

The number of interactions is 478730


In [7]:
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(urm_all_df)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


# Remove empty profiles

In [8]:
# Remove empty IDs
mapped_id, original_id = pd.factorize(urm_all_df["UserID"].unique())
user_original_ID_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(urm_all_df["ItemID"].unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)



# Let's create the mapping from items indices to original item_ids
item_index_to_original_ID = pd.Series(item_original_ID_to_index.index,index = item_original_ID_to_index.values)



original_item_ID = 125
print("New index for item {} is {}".format(original_item_ID, item_original_ID_to_index[original_item_ID]))


urm_all_df["UserID"] = urm_all_df["UserID"].map(user_original_ID_to_index)
urm_all_df["ItemID"] = urm_all_df["ItemID"].map(item_original_ID_to_index)
urm_all_df.head(n=10)

New index for item 125 is 93


Unnamed: 0,UserID,ItemID,Interaction
0,0,0,1.0
1,0,1,1.0
2,0,2,1.0
3,0,3,1.0
4,0,4,1.0
5,0,5,1.0
6,0,6,1.0
7,0,7,1.0
8,0,8,1.0
9,0,9,1.0


In [9]:
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(urm_all_df)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 22222, Number of users	 12638
Max ID items	 22221, Max Id users	 12637

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


In [10]:
urm_all = sps.coo_matrix((urm_all_df["Interaction"].values,
                          (urm_all_df["UserID"].values, urm_all_df["ItemID"].values)))

urm_all

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [None]:
urm_all = urm_all.tocsr()
urm_all

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [11]:
train_test_split = 0.80

n_interactions = urm_all.nnz

train_mask = np.random.choice([True,False], n_interactions, p=[train_test_split, 1-train_test_split])

urm_train = sps.csr_matrix((urm_all.data[train_mask],
                            (urm_all.row[train_mask], urm_all.col[train_mask])))

val_mask = np.logical_not(train_mask)

urm_val = sps.csr_matrix((urm_all.data[val_mask],
                            (urm_all.row[val_mask], urm_all.col[val_mask])))

# SLIM with Cython

# Preparing for training

In [12]:
urm_train

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 382970 stored elements in Compressed Sparse Row format>

In [13]:
train_n_interactions = urm_train.nnz
train_n_interactions

382970

In [14]:
n_users, n_items = urm_train.shape
print(f"The num of users is {n_users}")
print(f"The num of items is {n_items}")

The num of users is 12638
The num of items is 22222


In [11]:
%load_ext Cython

In [12]:
%%cython

import numpy as np
import time

from libc.stdlib cimport rand, srand, RAND_MAX

def train_multiple_epochs(URM_train, learning_rate_input, regularization_2_input,lr_epochs_patience,min_lr, n_epochs):

    URM_train_coo = URM_train.tocoo()
    cdef int n_items = URM_train.shape[1]
    cdef int n_interactions = URM_train.nnz
    cdef int[:] URM_train_coo_row = URM_train_coo.row
    cdef int[:] URM_train_coo_col = URM_train_coo.col
    cdef double[:] URM_train_coo_data = URM_train_coo.data
    cdef int[:] URM_train_indices = URM_train.indices
    cdef int[:] URM_train_indptr = URM_train.indptr
    cdef double[:] URM_train_data = URM_train.data

    cdef double[:,:] item_item_S = np.zeros((n_items, n_items), dtype = float)
    cdef double learning_rate = learning_rate_input
    cdef double regularization_2 = regularization_2_input
    cdef double min_learning_rate = min_lr
    cdef int lr_patience_thr = lr_epochs_patience
    cdef int lr_patience = 0
    cdef double loss
    cdef double best_loss
    cdef long start_time
    cdef double true_rating, predicted_rating, prediction_error, profile_rating
    cdef int start_profile, end_profile
    cdef int index, sample_num, user_id, item_id, profile_item_id

    best_loss = 2.0
    lr_patience = 0
    loss = 2.0

    for n_epoch in range(n_epochs):

        if loss/(sample_num+1) < best_loss and n_epoch != 0:
          best_loss = loss/(sample_num+1)
          lr_patience = 0
        elif loss/(sample_num+1) > best_loss:
          lr_patience += 1
          print(f"LR patience is incremented and is {lr_patience}")

        if lr_patience > lr_patience_thr and learning_rate > min_learning_rate:
          print(f"LR gets reduced by 0.1")
          learning_rate *= 0.1
          lr_patience = 0


        loss = 0.0
        start_time = time.time()

        if n_epoch % 50 == 0:
          np.save("/content/item_item_similarity.npy",np.array(item_item_S))

        for sample_num in range(n_interactions):

            # Randomly pick sample
            index = rand() % n_interactions


            user_id = URM_train_coo_row[index]
            item_id = URM_train_coo_col[index]
            true_rating = URM_train_coo_data[index]

            # Compute prediction
            start_profile = URM_train_indptr[user_id]
            end_profile = URM_train_indptr[user_id+1]
            predicted_rating = 0.0

            for idx in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[idx]
                profile_rating = URM_train_data[idx]
                predicted_rating += item_item_S[profile_item_id,item_id] * profile_rating

            # Compute prediction error, or gradient
            prediction_error = true_rating - predicted_rating
            loss += prediction_error**2

            # Update model, in this case the similarity
            for idx in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[idx]
                profile_rating = URM_train_data[idx]
                item_item_S[profile_item_id,item_id] += learning_rate * (prediction_error * profile_rating -
                                                                         regularization_2 * item_item_S[profile_item_id,item_id])

            # Ensure diagonal is always zero
            item_item_S[item_id,item_id] = 0.0

#             if sample_num % 1000000 == 0:
#                 print("Epoch {}: {:.2f}%".format(n_epoch+1, sample_num/n_interactions*100))


        elapsed_time = time.time() - start_time
        samples_per_second = (sample_num+1)/elapsed_time

        print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}".format(n_epoch+1, time.time() - start_time, loss/(sample_num+1), samples_per_second))

    return np.array(item_item_S), loss/(sample_num+1), samples_per_second

In [13]:
n_items = urm_train.shape[1]
learning_rate = 1e-3
regularization_2 = 1
lr_patience = 5
min_learning_r = 1e-6
num_epochs = 200

item_item_S, loss, samples_per_second = train_multiple_epochs(urm_train, learning_rate, regularization_2,lr_epochs_patience = lr_patience,
                                                              min_lr = min_learning_r,n_epochs=num_epochs)

Epoch 1 complete in in 54.04 seconds, loss is 7.728E-01. Samples per second 7087.18
Epoch 2 complete in in 2.97 seconds, loss is 5.541E-01. Samples per second 128877.93
Epoch 3 complete in in 3.11 seconds, loss is 4.489E-01. Samples per second 123340.97
Epoch 4 complete in in 2.33 seconds, loss is 3.808E-01. Samples per second 164427.59
Epoch 5 complete in in 2.55 seconds, loss is 3.331E-01. Samples per second 150369.82
Epoch 6 complete in in 2.75 seconds, loss is 2.980E-01. Samples per second 139249.81
Epoch 7 complete in in 3.79 seconds, loss is 2.697E-01. Samples per second 100984.85
Epoch 8 complete in in 3.18 seconds, loss is 2.469E-01. Samples per second 120257.65
Epoch 9 complete in in 2.33 seconds, loss is 2.281E-01. Samples per second 164676.44
Epoch 10 complete in in 2.53 seconds, loss is 2.113E-01. Samples per second 151420.10
Epoch 11 complete in in 2.72 seconds, loss is 1.981E-01. Samples per second 140989.51
Epoch 12 complete in in 3.64 seconds, loss is 1.866E-01. Samples

In [14]:
item_item_S

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.06860921, ...,  0.        ,
         0.00478075,  0.        ],
       [ 0.        ,  0.07527323,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.00321752,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

# Evaluate the trained model


In [21]:
def precision(recommended_items, relevant_items):

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score

def recall(recommended_items, relevant_items):

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score

def AP(recommended_items, relevant_items):

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    ap_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return ap_score

def evaluate_algorithm(URM_test, recommender_object, at=10):

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0


    for user_id in range(URM_test.shape[0]):

        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        if len(relevant_items)>0:

            recommended_items = recommender_object.recommend(user_id, at=at,exclude_seen=True)
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_AP += AP(recommended_items, relevant_items)

    mean_precision = cumulative_precision / num_eval
    mean_recall = cumulative_recall / num_eval
    MAP = cumulative_AP / num_eval

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, MAP))
    return MAP, mean_precision, mean_recall

In [16]:
class SLIMRecommender(object):

    def __init__(self, URM,item_item_similarity):
        self.URM = URM
        self.W_sparse = sps.csr_matrix(item_item_similarity)


    def recommend(self, user_id, at=None, exclude_seen=True, users_not_in_train=[]):
        # Check if user_id is a valid index
        if user_id < 0 or user_id >= self.URM.shape[0]:
            print(f"Invalid user_id: {user_id}")
            return

        # Check if user_id not in train use the topRec
        if user_id in users_not_in_train:
            return ["517 189 44 0 284 808 285 1 557 1266"]

        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]


    def filter_seen(self, user_id, scores):

        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id+1]

        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [17]:
recommender_slim = SLIMRecommender(urm_train,item_item_S)

In [18]:
map, mp,mr = evaluate_algorithm(urm_val,recommender_slim)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 352.5000, Recall = 473.7796, MAP = 0.0211
MAP@10 on val is 0.021069865160591048


# SLIM ElasticNet

In [None]:
!unzip /content/Recommenders.zip

In [None]:
!unzip /content/Utils.zip

In [15]:
from Recommenders.Recommender_utils import check_matrix
from sklearn.linear_model import ElasticNet
from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Recommenders.Similarity.Compute_Similarity_Python import Incremental_Similarity_Builder
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit
from tqdm import tqdm
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
import sys
import time


class SLIMElasticNetRecommender(BaseItemSimilarityMatrixRecommender):
    """
    Train a Sparse Linear Methods (SLIM) item similarity model.
    NOTE: ElasticNet solver is parallel, a single intance of SLIM_ElasticNet will
          make use of half the cores available

    See:
        Efficient Top-N Recommendation by Linear Regression,
        M. Levy and K. Jack, LSRS workshop at RecSys 2013.

        SLIM: Sparse linear methods for top-n recommender systems,
        X. Ning and G. Karypis, ICDM 2011.
        http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf
    """

    RECOMMENDER_NAME = "SLIMElasticNetRecommender"

    def __init__(self, URM_train, verbose = True):
        super(SLIMElasticNetRecommender, self).__init__(URM_train, verbose = verbose)

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, l1_ratio=0.1, alpha = 1.0, positive_only=True, topK = 100):

        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)
        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK


        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        similarity_builder = Incremental_Similarity_Builder(self.n_items, initial_data_block=self.n_items*self.topK, dtype = np.float32)

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficient of the ElasticNet model
            # let's keep only the non-zero values
            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            # Check if there are more data points than topK, if so, extract the set of K best values
            if len(nonzero_model_coef_value) > self.topK:
                # Partition the data because this operation does not require to fully sort the data
                relevant_items_partition = np.argpartition(-np.abs(nonzero_model_coef_value), self.topK-1, axis=0)[0:self.topK]
                nonzero_model_coef_index = nonzero_model_coef_index[relevant_items_partition]
                nonzero_model_coef_value = nonzero_model_coef_value[relevant_items_partition]

            similarity_builder.add_data_lists(row_list_to_add=nonzero_model_coef_index,
                                              col_list_to_add=np.ones(len(nonzero_model_coef_index), dtype = int) * currentItem,
                                              data_list_to_add=nonzero_model_coef_value)


            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)


            if time.time() - start_time_printBatch > 300 or currentItem == n_items-1:
                self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}".format(
                    currentItem+1,
                    100.0* float(currentItem+1)/n_items,
                    new_time_value,
                    new_time_unit,
                    float(currentItem)/elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        self.W_sparse = similarity_builder.get_SparseMatrix()

In [57]:
slim_elastic_recommender = SLIMElasticNetRecommender(urm_train)
slim_elastic_recommender.fit(alpha = 0.0015746723778813712, l1_ratio = 0.005,topK = 100)

SLIMElasticNetRecommender: URM Detected 198 ( 1.6%) users with no interactions.
SLIMElasticNetRecommender: URM Detected 117 ( 0.5%) items with no interactions.
SLIMElasticNetRecommender: Processed 8129 (36.6%) in 5.00 min. Items per second: 27.09
SLIMElasticNetRecommender: Processed 16534 (74.4%) in 10.00 min. Items per second: 27.55
SLIMElasticNetRecommender: Processed 22222 (100.0%) in 13.57 min. Items per second: 27.30


In [58]:
usr_id = 3
slim_elastic_recommender.recommend(usr_id,cutoff = 10)

[589, 227, 46, 50, 678, 3708, 109, 1478, 818, 812]

For the BaseRecommenderClass i needed to slightly change the evalutation function

In [59]:
def evaluate_SLIMelasticnet(URM_test, recommender_object, at=10):

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0


    for user_id in range(URM_test.shape[0]):

        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        if len(relevant_items)>0:

            recommended_items = recommender_object.recommend(user_id, cutoff=at,remove_seen_flag=True)
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_AP += AP(recommended_items, relevant_items)

    mean_precision = cumulative_precision / num_eval
    mean_recall = cumulative_recall / num_eval
    MAP = cumulative_AP / num_eval

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, MAP))
    return MAP, mean_precision, mean_recall


map, mp,mr = evaluate_SLIMelasticnet(urm_val,slim_elastic_recommender)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 978.7000, Recall = 1362.0845, MAP = 0.0822
MAP@10 on val is 0.08221288820563075


# Predict for the test data

In [62]:
urm_path = '/content/data_target_users_test.csv'

urm_pred_df = pd.read_csv(filepath_or_buffer=urm_path,
                                sep=",",
                                header=0,
                                dtype={0:int},
                                engine='python')

urm_pred_df.columns = ["UserID"]
len(urm_pred_df['UserID'])

10882

In [60]:
Recommender = SLIMElasticNetRecommender(urm_all)
Recommender.fit(alpha = 0.0015746723778813712, l1_ratio = 0.005,topK = 100)

SLIMElasticNetRecommender: Processed 6703 (30.2%) in 5.00 min. Items per second: 22.34
SLIMElasticNetRecommender: Processed 14040 (63.2%) in 10.00 min. Items per second: 23.40
SLIMElasticNetRecommender: Processed 21570 (97.1%) in 15.00 min. Items per second: 23.96
SLIMElasticNetRecommender: Processed 22222 (100.0%) in 15.42 min. Items per second: 24.02


**Do the predictions, but bear in mind that before feeding the user_id to the model you need to translate it to the model user indices space. Also, you have to translate the recommended item IDs into the original IDs space form the model item IDs space**

In [70]:
pred_df = pd.DataFrame(columns = ['user_id','item_list'])

for userid in urm_pred_df['UserID']:
  if userid in user_original_ID_to_index.keys():
    # Map user ID to the index of model user space
    usr_idx = user_original_ID_to_index[userid]
    recomendatoins = Recommender.recommend(usr_idx, cutoff=10,remove_seen_flag=True)
    # Map item indices from model item space to original item ID
    recomendatoins = [item_index_to_original_ID[idx] for idx in recomendatoins]
  else:
    recomendatoins = ["517 189 44 0 284 808 285 1 557 1266"]

  recomendatoins = " ".join(str(item) for item in recomendatoins)
  pred_df.loc[len(pred_df)] = [userid,recomendatoins]

In [71]:
pred_df

Unnamed: 0,user_id,item_list
0,1,101 123 36 506 694 515 52 1546 403 592
1,2,1095 47 12 50 656 1522 196 949 3176 131
2,3,59 857 4 2172 4252 1281 2748 648 536 259
3,4,249 28 50 139 314 146 171 128 254 136
4,5,1570 170 95 471 77 1511 5138 1220 131 826
...,...,...
10877,13020,6450 6198 6452 6749 7395 105 7394 1191 627 345
10878,13021,6179 133 6451 7027 6426 6720 13621 17942 7395 13
10879,13022,1668 1411 809 1446 1674 4688 8 118 33 143
10880,13023,1124 329 1290 706 1107 138 1534 208 96 1532


In [72]:
pred_df.to_csv('/content/predSLIM_elastic_Max.csv',index=False)