# Import Libraries and Dataset

In [1]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import random
import pandas as pd
from scipy import stats
from scipy.optimize import fmin
import scipy.sparse as sps
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
# Numpy cries because it is an old code so we monkey patch it
np.int = int
np.bool = bool
np.float = float


In [2]:
URM_all_dataframe = pd.read_csv('data_train.csv', 
                                sep=",", 
                                header= 0, 
                                dtype={0:int, 1:int, 2:float},
                                engine='python')

URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]


# Build the COO sparse matrix associated with the URM
URM_all = sps.coo_matrix((URM_all_dataframe["Interaction"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values))) ## .values --> numpy array, df[..] --> pd series



### Split the dataset 80/20

In [3]:
URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train, train_percentage = 0.80)
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 2913 (22.4%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2573 (19.8%) Users that have less than 1 test interactions


## Build SLIMElasticNetRecommender

In [4]:
from Recommenders.GraphBased.P3alphaRecommender import P3alphaRecommender
recommender = P3alphaRecommender(URM_train, verbose = True)

P3alphaRecommender: URM Detected 861 ( 6.6%) users with no interactions.
P3alphaRecommender: URM Detected 455 ( 2.0%) items with no interactions.


In [14]:
%load_ext Cython

In [15]:
%%cython

import numpy as np
import time
import scipy.sparse as sps

from libc.stdlib cimport rand, srand, RAND_MAX

def train_multiple_epochs(URM_train, learning_rate_input, regularization_2_input, n_epochs,evaluator_validation, recommender_object,SM):

    URM_train_coo = URM_train.tocoo()
    cdef int n_items = URM_train.shape[1]
    cdef int n_interactions = URM_train.nnz
    cdef int[:] URM_train_coo_row = URM_train_coo.row
    cdef int[:] URM_train_coo_col = URM_train_coo.col
    cdef double[:] URM_train_coo_data = URM_train_coo.data
    cdef int[:] URM_train_indices = URM_train.indices
    cdef int[:] URM_train_indptr = URM_train.indptr
    cdef double[:] URM_train_data = URM_train.data

    cdef double[:,:] item_item_S = SM
    cdef double learning_rate = learning_rate_input
    cdef double regularization_2 = regularization_2_input
    cdef double loss = 0.0
    cdef long start_time
    cdef double true_rating, predicted_rating, prediction_error, profile_rating
    cdef int start_profile, end_profile
    cdef int index, sample_num, user_id, item_id, profile_item_id
    
    for n_epoch in range(n_epochs):
        
        loss = 0.0
        start_time = time.time()
        
        for sample_num in range(n_interactions):

            # Randomly pick sample
            index = rand() % n_interactions

            user_id = URM_train_coo_row[index]
            item_id = URM_train_coo_col[index]
            true_rating = URM_train_coo_data[index]

            # Compute prediction
            start_profile = URM_train_indptr[user_id]
            end_profile = URM_train_indptr[user_id+1]
            predicted_rating = 0.0

            for index in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[index]
                profile_rating = URM_train_data[index]
                predicted_rating += item_item_S[profile_item_id,item_id] * profile_rating

            # Compute prediction error, or gradient
            prediction_error = true_rating - predicted_rating
            loss += prediction_error**2

            # Update model, in this case the similarity
            for index in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[index]
                profile_rating = URM_train_data[index]
                item_item_S[profile_item_id,item_id] += learning_rate * (prediction_error * profile_rating - 
                                                                         regularization_2 * item_item_S[profile_item_id,item_id])

            # Ensure diagonal is always zero
            item_item_S[item_id,item_id] = 0.0
        
#             if sample_num % 1000000 == 0:
#                 print("Epoch {}: {:.2f}%".format(n_epoch+1, sample_num/n_interactions*100))
            
        #result_df, _ = evaluator_validation.evaluateRecommender(recommender_object)
        #val_MAP = result_df.loc[10]["MAP"]
        elapsed_time = time.time() - start_time
        samples_per_second = (sample_num+1)/elapsed_time

        if(n_epoch < 1000 ):
            print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}.".format(n_epoch+1, time.time() - start_time, loss/(sample_num+1), samples_per_second))
        else:
            sparse_similarity = sps.csr_matrix(item_item_S)
            recommender_object.set_similarity_matrix(sparse_similarity)
            result_df, _ = evaluator_validation.evaluateRecommender(recommender_object)
            val_MAP = result_df.loc[10]["MAP"]
            print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}. Validation MAP {:.4f}".format(n_epoch+1, time.time() - start_time, loss/(sample_num+1), samples_per_second,val_MAP))

    return np.array(item_item_S), loss/(sample_num+1), val_MAP

Content of stdout:
_cython_magic_7dfaf4401a5ee9adcf9cfcc073561b3c96f72579.c
   Creazione della libreria C:\Users\Tommaso\.ipython\cython\Users\Tommaso\.ipython\cython\_cython_magic_7dfaf4401a5ee9adcf9cfcc073561b3c96f72579.cp311-win_amd64.lib e dell'oggetto C:\Users\Tommaso\.ipython\cython\Users\Tommaso\.ipython\cython\_cython_magic_7dfaf4401a5ee9adcf9cfcc073561b3c96f72579.cp311-win_amd64.exp
Generazione codice in corso...
Generazione codice terminata

### Train the model

In [28]:
regularizations_array = np.logspace(-2.5, 1, num=5)

In [29]:
sim_matrix_list = []
loss_list = []
val_map_list = []
learning_rate = 1e-4

In [None]:
for i in range(len(regularizations_array)):
    # Initializing the similarity matrix
    n_items = URM_train.shape[1]
    item_item_S = np.zeros((n_items, n_items), dtype = float)
    item_item_S, loss, map = train_multiple_epochs(URM_train, learning_rate, regularizations_array[i],1001, evaluator_validation, recommender,item_item_S)
    sim_matrix_list.append(item_item_S)
    loss_list.append(loss)
    val_map_list.append(map)

### Setting the Similarity matrix in a recommender_object instance

In [31]:
val_map_list

[0.010403661497753383,
 0.010081089892359028,
 0.008689386834901055,
 0.007378339506348257,
 0.008551618877111238]

In [5]:
item_item_S = np.load('W_sparse.npz')

In [6]:
indices = item_item_S['indices']
indptr = item_item_S['indptr']
format = item_item_S['format']
shape = item_item_S['shape']
data = item_item_S['data']

In [7]:
from scipy.sparse import csr_matrix

In [8]:
sparse_similarity = csr_matrix((data, indices, indptr), shape=shape)

In [9]:
recommender.set_similarity_matrix(sparse_similarity)

### Evaluation

In [52]:
result_df, _ = evaluator_validation.evaluateRecommender(recommender)
result_df.loc[10]

EvaluatorHoldout: Processed 10054 (100.0%) in 6.73 sec. Users per second: 1493


PRECISION                      0.128218
PRECISION_RECALL_MIN_DEN       0.264678
RECALL                         0.241776
MAP                            0.065091
MAP_MIN_DEN                    0.133504
MRR                            0.328312
NDCG                            0.22052
F1                              0.16757
HIT_RATE                       0.638651
ARHR_ALL_HITS                  0.455674
NOVELTY                        0.005334
AVERAGE_POPULARITY             0.177179
DIVERSITY_MEAN_INTER_LIST      0.980729
DIVERSITY_HERFINDAHL           0.998063
COVERAGE_ITEM                  0.468409
COVERAGE_ITEM_HIT              0.248568
ITEMS_IN_GT                     0.73756
COVERAGE_USER                    0.7719
COVERAGE_USER_HIT              0.492975
USERS_IN_GT                      0.7719
DIVERSITY_GINI                  0.09824
SHANNON_ENTROPY               10.774877
RATIO_DIVERSITY_HERFINDAHL     0.998424
RATIO_DIVERSITY_GINI           0.286529
RATIO_SHANNON_ENTROPY          0.830663


In [53]:
result_df

Unnamed: 0_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.128218,0.264678,0.241776,0.065091,0.133504,0.328312,0.22052,0.16757,0.638651,0.455674,...,0.7719,0.492975,0.7719,0.09824,10.774877,0.998424,0.286529,0.830663,1.49395,0.390637


In [58]:
type(recommender.recommend(41)[:10])

list

### Write the predictions on submission.csv


In [10]:
def write_predictions(recommender_object, at=10):
    prediction_df = pd.read_csv('data_target_users_test.csv', sep= ",",
                                header=0, 
                                dtype={0:int},
                                engine='python')
    #prediction_df['mapped user'] = prediction_df['user_id'].map(user_original_ID_to_index).fillna(-1).astype(int)
    #prediction_df['item_list'] = prediction_df['mapped user'].apply(recommender_object.recommend)
    prediction_df['item_list'] = prediction_df['user_id'].apply(lambda user_id: recommender_object.recommend(user_id)[:at])
    #def map_items(item_list):
    #    return [index_to_item_original.get(item, item) for item in item_list]

    #prediction_df['item_list'] = prediction_df['item_list'].apply(map_items)
    def transform_items_to_string(item_list):
        return ' '.join(map(str, item_list))

    prediction_df['item_list'] = prediction_df['item_list'].apply(transform_items_to_string)
    print(prediction_df.head(10))
    #del prediction_df['mapped user']
    prediction_df.to_csv('submission.csv',index=False)

write_predictions(recommender)

   user_id                                        item_list
0        1            36 101 354 1 123 139 254 4034 146 318
1        2                  11 50 47 1227 1095 28 102 2 3 1
2        3            59 2 648 259 584 450 414 536 4252 399
3        4               11 50 47 249 28 252 353 129 51 133
4        5           3063 202 766 256 270 192 301 471 4 447
5        6                35 9 692 88 14 168 56 886 104 395
6        8          210 682 451 778 760 366 443 480 722 600
7        9  2821 2282 8743 341 9018 1460 227 1366 2508 1206
8       10     2145 1816 1618 561 1411 3721 4267 31 3905 67
9       11                   31 67 39 40 99 955 58 93 32 34


In [64]:
prediction_df = pd.read_csv('submission.csv')