In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sps

from scipy.sparse import *

# P3Alpha

In [None]:
# Load the training interactions (three columns per row) into a DataFrame.
urm_path = '../content/data_train.csv'
urm_all_df = pd.read_csv(filepath_or_buffer=urm_path,
                                sep=",",
                                header=0,  # the first CSV line is a header row
                                dtype={0:int, 1:int, 2:float},  # keyed by column position
                                engine='python')

# Replace the header names from the file with the names used in the rest of the notebook.
urm_all_df.columns = ["UserID", "ItemID", "Interaction"]

In [None]:
# Preview the first 10 interactions.
urm_all_df.head(10)

Unnamed: 0,UserID,ItemID,Interaction
0,1,7,1.0
1,1,15,1.0
2,1,16,1.0
3,1,133,1.0
4,1,161,1.0
5,1,187,1.0
6,1,205,1.0
7,1,222,1.0
8,1,237,1.0
9,1,354,1.0


In [None]:
# Each DataFrame row is one interaction, so the row count is the number of interactions.
print ("The number of interactions is {}".format(len(urm_all_df)))

The number of interactions is 478730


In [None]:
# Distinct user / item ids that appear in at least one interaction.
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

n_interactions = len(urm_all_df)
n_users = len(userID_unique)
n_items = len(itemID_unique)

# Counts vs. max ids: a max id larger than the count means some ids never occur.
print(f"Number of items\t {n_items}, Number of users\t {n_users}")
print(f"Max ID items\t {itemID_unique.max()}, Max Id users\t {userID_unique.max()}\n")
print(f"Average interactions per user {n_interactions / n_users:.2f}")
print(f"Average interactions per item {n_interactions / n_items:.2f}\n")

# Share of empty cells in the (observed users) x (observed items) grid.
print(f"Sparsity {(1 - float(n_interactions) / (n_items * n_users)) * 100:.2f} %")

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


In [None]:
# Build the user-item matrix (URM) in COO format: rows are UserIDs, columns are ItemIDs.
# No explicit shape is passed, so scipy infers it from the largest row / column index.
urm_all = sps.coo_matrix((urm_all_df["Interaction"].values,
                          (urm_all_df["UserID"].values, urm_all_df["ItemID"].values)))

urm_all

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [None]:
# Convert to CSR. From here on `urm_all` exposes `indptr` / `indices` / `data`
# but no longer the COO-only `row` / `col` attributes.
urm_all = urm_all.tocsr()
urm_all

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

# Some useful imports for the algorithm

In [None]:
import scipy.sparse as sps

from sklearn.preprocessing import normalize
from Recommenders.Recommender_utils import check_matrix, similarityMatrixTopK
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit


from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Recommenders.Similarity.Compute_Similarity_Python import Incremental_Similarity_Builder
import time, sys


In [None]:
def precision(recommended_items, relevant_items):
    """Fraction of the recommended items that are relevant.

    Args:
        recommended_items: 1-D sequence of recommended item ids (no duplicates).
        relevant_items: 1-D sequence of the user's relevant item ids (no duplicates).

    Returns:
        Number of hits divided by the number of recommended items, or 0.0 when
        nothing was recommended (instead of a 0/0 = nan).
    """
    if len(recommended_items) == 0:
        return 0.0

    # np.isin replaces the deprecated np.in1d; both inputs are id lists without
    # duplicates, so assume_unique=True is safe and faster.
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score

def recall(recommended_items, relevant_items):
    """Fraction of the relevant items that were recommended.

    Args:
        recommended_items: 1-D sequence of recommended item ids (no duplicates).
        relevant_items: 1-D numpy array of the user's relevant item ids (no duplicates).

    Returns:
        Number of hits divided by the number of relevant items, or 0.0 when the
        user has no relevant items (instead of a 0/0 = nan).
    """
    n_relevant = relevant_items.shape[0]
    if n_relevant == 0:
        return 0.0

    # np.isin replaces the deprecated np.in1d.
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    recall_score = np.sum(is_relevant, dtype=np.float32) / n_relevant

    return recall_score

def AP(recommended_items, relevant_items):
    """Average precision of a ranked recommendation list.

    Args:
        recommended_items: 1-D sequence of recommended item ids, best first (no duplicates).
        relevant_items: 1-D numpy array of the user's relevant item ids (no duplicates).

    Returns:
        Sum of precision@k over the positions k holding a relevant item, divided
        by min(number of relevant items, list length). Returns 0.0 when that
        denominator is zero (empty list or no relevant items).
    """
    # np.isin replaces the deprecated np.in1d.
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    denominator = min(relevant_items.shape[0], is_relevant.shape[0])
    if denominator == 0:
        return 0.0

    # Cumulative sum of hits / position = precision at 1, at 2, at 3 ...
    # Multiplying by is_relevant keeps only the positions where a hit occurs.
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    ap_score = np.sum(p_at_k) / denominator

    return ap_score

def evaluate_algorithm(URM_test, recommender_object, at=10):
    """Evaluate a recommender on a held-out URM and print the averaged metrics.

    Only users with at least one interaction in ``URM_test`` are evaluated.

    Args:
        URM_test: scipy sparse user-item matrix with the held-out interactions.
        recommender_object: object exposing ``recommend(user_id, at=...)`` that
            returns a ranked sequence of item ids.
        at: length of the recommendation list requested for every user.

    Returns:
        Tuple ``(MAP, mean_precision, mean_recall)`` averaged over the evaluated
        users; all three are 0.0 when no user has test interactions.
    """
    # The loop reads indptr / indices directly, which only exist in CSR format.
    URM_test = URM_test.tocsr()

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0

    for user_id in range(URM_test.shape[0]):

        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id + 1]]

        # Users without held-out interactions cannot be scored.
        if len(relevant_items) == 0:
            continue

        recommended_items = recommender_object.recommend(user_id, at=at)
        num_eval += 1

        cumulative_precision += precision(recommended_items, relevant_items)
        cumulative_recall += recall(recommended_items, relevant_items)
        cumulative_AP += AP(recommended_items, relevant_items)

    if num_eval > 0:
        mean_precision = cumulative_precision / num_eval
        mean_recall = cumulative_recall / num_eval
        MAP = cumulative_AP / num_eval
    else:
        # Nothing to evaluate: report zeros instead of dividing by zero.
        mean_precision = mean_recall = MAP = 0.0

    # Report the per-user means (not the running sums).
    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        mean_precision, mean_recall, MAP))
    return MAP, mean_precision, mean_recall

# Algorithm

## Class for P3alpha

In [None]:
# Extract the helper archives into the working directory.
# NOTE(review): presumably these provide the `Recommenders` and `Utils` packages imported
# in an earlier cell; if so, this cell has to be executed before that import cell.
!unzip /content/Recommenders.zip
!unzip /content/Utils.zip

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Cesare Bernardis
"""

# import scipy.sparse as sps

# from sklearn.preprocessing import normalize
# from Recommenders.Recommender_utils import check_matrix, similarityMatrixTopK
# from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit

# from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
# from Recommenders.Similarity.Compute_Similarity_Python import Incremental_Similarity_Builder
# import time, sys




class P3alphaRecommender(BaseItemSimilarityMatrixRecommender):
    """P3alpha recommender.

    ``fit`` derives an item-item matrix ``W_sparse`` from the training URM and
    ``recommend`` ranks the items of one user with ``Pui[user] . W_sparse``.
    """

    RECOMMENDER_NAME = "P3alphaRecommender"

    # Hard-coded fallback item ids used for cold-start users and to pad short
    # rankings. NOTE(review): presumably the most popular items of the challenge
    # dataset - confirm, they are not derived from URM_train.
    TOP_POP_RECOMMENDATIONS = [517, 189, 44, 0, 284, 808, 285, 1, 557, 1266]

    def __init__(self, URM_train, verbose = True):
        super(P3alphaRecommender, self).__init__(URM_train, verbose = verbose)


    def __str__(self):
        # Reads the hyper-parameters stored by fit(), so it is only usable after fit().
        return "P3alpha(alpha={}, min_rating={}, topk={}, implicit={}, normalize_similarity={})".format(self.alpha,
                                                                            self.min_rating, self.topK, self.implicit,
                                                                            self.normalize_similarity)


    def fit(self, topK=100, alpha=1., min_rating=0, implicit=False, normalize_similarity=False):
        """Build the item-item matrix ``self.W_sparse``.

        Args:
            topK: number of largest entries kept for every row of the similarity.
            alpha: exponent applied element-wise to both transition matrices
                (skipped when equal to 1).
            min_rating: when > 0, interactions below this value are removed from
                ``self.URM_train`` (in place).
            implicit: when True, the remaining interactions are set to 1. This is
                only applied inside the ``min_rating > 0`` branch.
            normalize_similarity: when True, rows of ``W_sparse`` are L1-normalized.

        Side effects:
            Stores the hyper-parameters, ``self.Pui`` and ``self.Piu`` (both before
            the alpha power is applied) and ``self.W_sparse`` on the instance.
        """

        self.topK = topK
        self.alpha = alpha
        self.min_rating = min_rating
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        # In our case this shouldn't apply: we're using IMPLICIT RATINGS
        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

        # Pui is the row-normalized urm
        Pui = normalize(self.URM_train, norm='l1', axis=1)
        self.Pui = Pui

        # Piu is the column-normalized, "boolean" urm transposed
        X_bool = self.URM_train.transpose(copy=True)
        X_bool.data = np.ones(X_bool.data.size, np.float32)
        # ATTENTION: axis is still 1 because the matrix was transposed before the normalization
        Piu = normalize(X_bool, norm='l1', axis=1)
        del(X_bool)
        self.Piu = Piu

        # Alpha power (self.Pui / self.Piu keep the un-powered matrices)
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        # W = Piu * Pui is computed here block by block for memory usage reasons;
        # the leading Pui factor of Pui * Piu * Pui is applied per user in recommend().
        block_dim = 200
        d_t = Piu

        similarity_builder = Incremental_Similarity_Builder(Pui.shape[1], initial_data_block=Pui.shape[1]*self.topK, dtype = np.float32)

        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, Pui.shape[1], block_dim):

            # The last block may be shorter than block_dim.
            if current_block_start_row + block_dim > Pui.shape[1]:
                block_dim = Pui.shape[1] - current_block_start_row

            similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
            similarity_block = similarity_block.toarray()

            for row_in_block in range(block_dim):
                row_data = similarity_block[row_in_block, :]
                # An item is never similar to itself: zero the diagonal entry.
                row_data[current_block_start_row + row_in_block] = 0

                # Indices of the topK largest values of the row (unordered).
                relevant_items_partition = np.argpartition(-row_data, self.topK-1, axis=0)[:self.topK]
                row_data = row_data[relevant_items_partition]

                # Incrementally build sparse matrix, do not add zeros
                if np.any(row_data == 0.0):
                    non_zero_mask = row_data != 0.0
                    relevant_items_partition = relevant_items_partition[non_zero_mask]
                    row_data = row_data[non_zero_mask]

                similarity_builder.add_data_lists(row_list_to_add=np.ones(len(row_data), dtype = int) * (current_block_start_row + row_in_block), col_list_to_add=relevant_items_partition, data_list_to_add=row_data)


            # Progress report every 300 seconds and once more at the final block.
            if time.time() - start_time_printBatch > 300 or current_block_start_row + block_dim == Pui.shape[1]:
                new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)

                self._print("Similarity column {} ({:4.1f}%), {:.2f} column/sec. Elapsed time {:.2f} {}".format(
                    current_block_start_row + block_dim,
                    100.0 * float( current_block_start_row + block_dim) / Pui.shape[1],
                    float( current_block_start_row + block_dim) / (time.time() - start_time),
                    new_time_value, new_time_unit))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()


        self.W_sparse = similarity_builder.get_SparseMatrix()


        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)


        if self.topK != False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')

    # RECOMMENDATIONS
    def recommend(self, user_id, at=10, users_not_in_train=None, remove_seen = True):
        """Return up to ``at`` recommended item ids for one user, best first.

        Args:
            user_id: row index of the user in ``self.URM_train``.
            at: maximum number of items to return.
            users_not_in_train: optional collection of user ids without training
                data; these users receive the fallback list directly.
            remove_seen: when True, items the user already interacted with are
                never recommended.

        Returns:
            A list (cold-start users) or a numpy integer array (all other users)
            with at most ``at`` item ids.
        """
        fallback_items = self.TOP_POP_RECOMMENDATIONS

        # Cold-start users: nothing to score, fall back to the fixed list.
        if users_not_in_train is not None and user_id in users_not_in_train:
            return fallback_items[:at]

        # Scores for all items. The sparse .dot method is used because np.dot
        # is not aware of scipy sparse matrices.
        predicted_scores = self.Pui[user_id, :].dot(self.W_sparse).toarray().ravel()

        # Items the user has already interacted with (CSR row slice).
        profile_start = self.URM_train.indptr[user_id]
        profile_end = self.URM_train.indptr[user_id + 1]
        seen_items = self.URM_train.indices[profile_start:profile_end]

        if remove_seen:
            # `indices` is an integer array, so it can be used to index directly.
            predicted_scores[seen_items] = -np.inf

        # Sort in descending score order and drop the masked (-inf) items so
        # that they can never be returned.
        ranking = predicted_scores.argsort()[::-1]
        ranking = ranking[~np.isneginf(predicted_scores[ranking])]

        # Pad with fallback items when fewer than `at` candidates are left.
        if len(ranking) < at:
            n_items = predicted_scores.shape[0]
            excluded = set(ranking.tolist())
            excluded.update(seen_items.tolist())
            # Skip fallback ids that are outside the catalogue of this model.
            filler = [item for item in fallback_items if item < n_items and item not in excluded]
            filler = filler[:at - len(ranking)]
            # Explicit dtype: concatenating with an empty list would upcast the ids to float.
            ranking = np.concatenate([ranking, np.array(filler, dtype=ranking.dtype)])

        # Return the top N recommendations
        return ranking[:at]






# Evaluate algorithm


Split the data into train/val


In [None]:
# Random hold-out split of the interactions: ~80% train, ~20% validation.
train_test_split = 0.80
split_seed = 42  # fixed seed so that the split (and the reported MAP) is reproducible

# `urm_all` was converted to CSR above; the per-interaction row / col arrays
# only exist in COO format, so convert back before masking.
urm_all_coo = urm_all.tocoo()

n_interactions = urm_all_coo.nnz

split_rng = np.random.default_rng(split_seed)
train_mask = split_rng.choice([True, False], n_interactions, p=[train_test_split, 1-train_test_split])

# Pass the full shape explicitly: otherwise each matrix would be sized by the
# largest user / item id that happened to be sampled into it.
urm_train = sps.csr_matrix((urm_all_coo.data[train_mask],
                            (urm_all_coo.row[train_mask], urm_all_coo.col[train_mask])),
                           shape=urm_all_coo.shape)

val_mask = np.logical_not(train_mask)

urm_val = sps.csr_matrix((urm_all_coo.data[val_mask],
                          (urm_all_coo.row[val_mask], urm_all_coo.col[val_mask])),
                         shape=urm_all_coo.shape)

In [None]:
# Fit on the training split and score on the validation split.
recommender = P3alphaRecommender(urm_train)
recommender.fit(topK=250,alpha=0.2, min_rating=0.1, implicit=True, normalize_similarity=False)

# `map` would shadow the Python builtin, hence the explicit metric names.
map_score, mean_precision, mean_recall = evaluate_algorithm(urm_val,recommender)
print(f'MAP - {map_score}')

P3alphaRecommender: URM Detected 622 ( 4.8%) users with no interactions.
P3alphaRecommender: URM Detected 244 ( 1.1%) items with no interactions.
P3alphaRecommender: Similarity column 22348 (100.0%), 1670.02 column/sec. Elapsed time 13.38 sec
Recommender results are: Precision = 586.2000, Recall = 1004.4931, MAP = 0.0479
MAP - 0.047867463701126704


## Let's create the matrices

In [None]:
# Create an instance of the P3alphaRecommender class
p3alpha_recommender = P3alphaRecommender(urm_all)

# Fit the recommender (you can customize the fit parameters if needed)
p3alpha_recommender.fit(topK=250,alpha=0.2, min_rating=0.1, implicit=True, normalize_similarity=False)

P3alphaRecommender: URM Detected 387 ( 3.0%) users with no interactions.
P3alphaRecommender: URM Detected 126 ( 0.6%) items with no interactions.
P3alphaRecommender: Similarity column 22348 (100.0%), 1737.21 column/sec. Elapsed time 12.86 sec


We can see that W_sparse is an item-item matrix: M rows × M columns, where M is the number of items. We can see the items are with the proper ID since they go up to the true max ID, 22347. Moreover, W_sparse is not empty!

# Predictions

In [None]:
# Target users for which a recommendation list has to be produced.
# (A commented-out download URL that embedded a personal access token was removed:
# credentials must not be stored in the notebook.)
urm_pred_path = '../content/data_target_users_test.csv'

urm_pred_df = pd.read_csv(filepath_or_buffer=urm_pred_path,
                                sep=",",
                                header=0,
                                dtype={0:int},
                                engine='python')

urm_pred_df.columns = ["UserID"]
print('Unique user id to predict:', urm_pred_df['UserID'].nunique())

Unique user id to predict: 10882


In [None]:
# Target users that never appear in the training interactions (cold-start users).
is_cold_user = ~urm_pred_df['UserID'].isin(urm_all_df['UserID'])
cold_users_df = urm_pred_df[is_cold_user]

print("Users in urm_pred_df but not in urm_all_orgdf:")
print(cold_users_df)
print(len(cold_users_df))

# Plain array of ids: this is what recommend() receives as `users_not_in_train`.
users_not_in_train = cold_users_df['UserID'].to_numpy()

Users in urm_pred_df but not in urm_all_orgdf:
       UserID
54         60
58         65
147       168
223       261
272       316
...       ...
10682   12775
10699   12798
10729   12837
10802   12921
10856   12992

[221 rows x 1 columns]
221


In [None]:
# Try for one specific user
user_id = 61
print(f'Predicting for user - {user_id}')
prediction = p3alpha_recommender.recommend(user_id,users_not_in_train = users_not_in_train)
print(f"The prediction is {prediction}")

Predicting for user - 61
The prediction is [  6   7  51  15  87 107  88  81 202  29]


In [None]:
# Try for some specific user
for user_id in range(10):
  print("--------")
  print(f'Predicting for user - {user_id}')
  prediction = p3alpha_recommender.recommend(user_id,users_not_in_train = users_not_in_train)
  print(f"The prediction is {prediction}")

--------
Predicting for user - 0
The prediction is [22347  7451  7443  7444  7445  7446  7447  7448  7449  7450]
--------
Predicting for user - 1
The prediction is [  3  36   2   1 101   8   4   6  25  31]
--------
Predicting for user - 2
The prediction is [ 2  4  3  6  8 11  9 19  5 14]
--------
Predicting for user - 3
The prediction is [ 59 259 536   9   1   2 648 584   3 414]
--------
Predicting for user - 4
The prediction is [ 1  2  4  6  7  3 28  9  8  5]
--------
Predicting for user - 5
The prediction is [ 4  2  6  8  7 20  3 14 19 77]
--------
Predicting for user - 6
The prediction is [ 2  6  3  7  8 14 19 20  9 22]
--------
Predicting for user - 7
The prediction is [  4   1   7   3   2 192  88   6  15   8]
--------
Predicting for user - 8
The prediction is [ 4  2  6  3  8  7 17 11  9 80]
--------
Predicting for user - 9
The prediction is [ 859  554 9018 6197 3071 5724 7056 2821 2282 6450]


In [None]:
# Collect one (user_id, "item item ...") record per target user and build the
# DataFrame once at the end: appending with .loc[len(df)] re-allocates the
# frame on every iteration and is quadratic in the number of users.
submission_rows = []

for userid in urm_pred_df['UserID']:
    recommendations = p3alpha_recommender.recommend(userid, at=10,remove_seen = True, users_not_in_train=users_not_in_train)
    # Submission format: the recommended item ids separated by a single space.
    recommendations = " ".join(str(item) for item in recommendations)
    submission_rows.append((userid, recommendations))

pred_df = pd.DataFrame(submission_rows, columns = ['user_id','item_list'])

In [None]:
# Inspect the submission table before writing it to disk.
pred_df

Unnamed: 0,user_id,item_list
0,1,3 36 2 1 101 8 4 6 25 31
1,2,2 4 3 6 8 11 9 19 5 14
2,3,59 259 536 9 1 2 648 584 3 414
3,4,1 2 4 6 7 3 28 9 8 5
4,5,4 2 6 8 7 20 3 14 19 77
...,...,...
10877,13020,1 2 7 4 3 6 51 34 8 85
10878,13021,2 1 3 8 6 133 20 9 13 32
10879,13022,809 1411 1446 1668 1674 3 39 148 20 41
10880,13023,32 3 4 2 7 1 96 20 138 19


In [None]:
# Write the submission file; index=False keeps only the user_id and item_list columns.
pred_df.to_csv('PredP3alphaMax.csv',index=False)

# Example to test code

In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

# Example User-Item Interaction Matrix: 3 users (rows) x 3 items (columns)
URM_train = csr_matrix([[1, 2, 0], [0, 1, 0], [2, 0, 1]], dtype=np.float32)

# Normalize along axis 1 (row-wise) with L1 norm: every non-empty row sums to 1
Pui = normalize(URM_train, norm='l1', axis=1)

print("Original URM_train:")
print(URM_train.toarray())
print("\nNormalized Pui:")
print(Pui.toarray())

Original URM_train:
[[1. 2. 0.]
 [0. 1. 0.]
 [2. 0. 1.]]

Normalized Pui:
[[0.33333334 0.6666667  0.        ]
 [0.         1.         0.        ]
 [0.6666667  0.         0.33333334]]


In [None]:
# Transpose URM_train and set non-zero values to 1 (boolean)
X_bool = URM_train.transpose(copy=True)
X_bool.data = np.ones(X_bool.data.size, np.float32)

# Normalize along axis 1 (column-wise) with L1 norm
Piu = normalize(X_bool, norm='l1', axis=1)

print("Original URM_train:")
print(URM_train.toarray())
print("\nTransposed and Boolean X_bool:")
print(X_bool.toarray())
print("\nNormalized Piu:")
print(Piu.toarray())


Original URM_train:
[[1. 2. 0.]
 [0. 1. 0.]
 [2. 0. 1.]]

Transposed and Boolean X_bool:
[[1. 0. 1.]
 [1. 1. 0.]
 [0. 0. 1.]]

Normalized Piu:
[[0.5 0.  0.5]
 [0.5 0.5 0. ]
 [0.  0.  1. ]]


In [None]:
# Create an instance of the P3alphaRecommender class
p3alpha_recommender = P3alphaRecommender(URM_train)

# Fit the recommender (you can customize the fit parameters if needed)
p3alpha_recommender.fit(topK=3, alpha=1.0, min_rating=0, implicit=False, normalize_similarity=False)

P3alphaRecommender: Similarity column 3 (100.0%), 2966.27 column/sec. Elapsed time 0.00 sec


In [None]:
# Dense view of the learned item-item matrix (3 x 3 for the toy example).
p3alpha_recommender.W_sparse.toarray()

array([[0.        , 0.33333334, 0.16666667],
       [0.16666667, 0.        , 0.        ],
       [0.6666667 , 0.        , 0.        ]], dtype=float32)

As you can see, the matrix is not symmetric.

To get the recommendations we need to multiply Pui and W_sparse, obtaining:

In [None]:
# Both operands are scipy sparse matrices, so `*` is a matrix product (users x items scores).
recommender_matrix = Pui*p3alpha_recommender.W_sparse
recommender_matrix.toarray()

array([[0.11111112, 0.11111112, 0.05555556],
       [0.16666667, 0.        , 0.        ],
       [0.22222224, 0.22222224, 0.11111112]], dtype=float32)

For user 1 we will recommend items 1 and 2 and then 3.

Now we need to make the predictions

In [None]:
# user_id = 0
print( p3alpha_recommender.recommend((0)) )

[517, 189, 44, 0, 284, 808, 285, 1, 557, 1266]


In [None]:
# Iterate over the users that actually exist in the toy URM (3 rows):
# a hard-coded range(4) asks for user 3, which is outside the matrix.
for user_id in range(URM_train.shape[0]):
  recommendations = p3alpha_recommender.recommend(user_id)
  print(recommendations)


[517, 189, 44, 0, 284, 808, 285, 1, 557, 1266]
[0, 517, 189, 44, 0, 284, 808, 285, 1, 557]
[517, 189, 44, 0, 284, 808, 285, 1, 557, 1266]
[517, 189, 44, 0, 284, 808, 285, 1, 557, 1266]
