#Connect to drive

In [1]:
!git clone https://github.com/MaurizioFD/RecSys_Course_AT_PoliMi
%cd RecSys_Course_AT_PoliMi
!pip install -r requirements.txt
!python run_compile_all_cython.py

Cloning into 'RecSys_Course_AT_PoliMi'...
remote: Enumerating objects: 1493, done.[K
remote: Counting objects: 100% (238/238), done.[K
remote: Compressing objects: 100% (139/139), done.[K
remote: Total 1493 (delta 110), reused 225 (delta 97), pack-reused 1255[K
Receiving objects: 100% (1493/1493), 50.31 MiB | 25.30 MiB/s, done.
Resolving deltas: 100% (841/841), done.
/content/RecSys_Course_AT_PoliMi
Collecting Cython==0.29.23 (from -r requirements.txt (line 1))
  Downloading Cython-0.29.23-py2.py3-none-any.whl (978 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.0/978.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h5py==3.1.0 (from -r requirements.txt (line 2))
  Downloading h5py-3.1.0.tar.gz (371 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m371.4/371.4 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


#Import Library

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from typing import Tuple, Callable, Dict, Optional, List
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from Recommenders.Similarity.Compute_Similarity_Python import Compute_Similarity_Python
from scipy.stats import loguniform
!pip install optuna
import optuna



#Load Data

In [9]:
%cd /gdrive/MyDrive/RECSYS
data_train = pd.read_csv('data_train.csv')
data_target_users_test = pd.read_csv('data_target_users_test.csv')

/gdrive/MyDrive/RECSYS


In [10]:
data_train.columns = ["user_id", "item_id", "Interaction"]

In [11]:
print ("The number of interactions is {}".format(len(data_train)))

The number of interactions is 478730


In [12]:
userID_unique = data_train["user_id"].unique()
itemID_unique = data_train["item_id"].unique()

In [13]:
n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(data_train)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024



#Splitting Data

In [14]:
def preprocess_data(interactions: pd.DataFrame):
    unique_users = interactions.user_id.unique()
    unique_items = interactions.item_id.unique()

    num_users, min_user_id, max_user_id = unique_users.size, unique_users.min(), unique_users.max()
    num_items, min_item_id, max_item_id = unique_items.size, unique_items.min(), unique_items.max()

    print(num_users, min_user_id, max_user_id)
    print(num_items, min_item_id, max_item_id)

    mapping_user_id = pd.DataFrame({"mapped_user_id": np.arange(num_users), "user_id": unique_users})
    mapping_item_id = pd.DataFrame({"mapped_item_id": np.arange(num_items), "item_id": unique_items})

    interactions = pd.merge(left=interactions,
                       right=mapping_user_id,
                       how="inner",
                       on="user_id")

    interactions = pd.merge(left=interactions,
                       right=mapping_item_id,
                       how="inner",
                       on="item_id")

    return interactions


In [15]:
interactions = preprocess_data(data_train)

12638 1 13024
22222 1 22347


In [16]:
interactions

Unnamed: 0,user_id,item_id,Interaction,mapped_user_id,mapped_item_id
0,1,7,1.0,0,0
1,2,7,1.0,1,0
2,26,7,1.0,24,0
3,36,7,1.0,34,0
4,41,7,1.0,39,0
...,...,...,...,...,...
478725,12962,20368,1.0,12579,22217
478726,12985,21058,1.0,12601,22218
478727,12989,22317,1.0,12605,22219
478728,13009,22339,1.0,12624,22220


In [17]:
def dataset_splits(interactions, num_users, num_items, validation_percentage: float, testing_percentage: float):
    seed = 1234

    (user_ids_training, user_ids_test,
     item_ids_training, item_ids_test,
     interactions_training, interactions_test) = train_test_split(interactions.mapped_user_id,
                                                        interactions.mapped_item_id,
                                                        interactions.Interaction,
                                                        test_size=testing_percentage,
                                                        shuffle=True,
                                                        random_state=seed)

    (user_ids_training, user_ids_validation,
     item_ids_training, item_ids_validation,
     interactions_training, interactions_validation) = train_test_split(user_ids_training,
                                                              item_ids_training,
                                                              interactions_training,
                                                              test_size=validation_percentage,
                                                             )

    urm_train = sp.csr_matrix((interactions_training, (user_ids_training, item_ids_training)),
                              shape=(num_users, num_items))

    urm_validation = sp.csr_matrix((interactions_validation, (user_ids_validation, item_ids_validation)),
                              shape=(num_users, num_items))

    urm_test = sp.csr_matrix((interactions_test, (user_ids_test, item_ids_test)),
                              shape=(num_users, num_items))



    return urm_train, urm_validation, urm_test



In [30]:
urm_train, urm_validation, urm_test = dataset_splits(interactions,
                                                     num_users=12638,
                                                     num_items=22222,
                                                     validation_percentage=0.15,
                                                     testing_percentage=0.15)

In [19]:
urm_train

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 344685 stored elements in Compressed Sparse Row format>

In [20]:
urm_validation

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 38299 stored elements in Compressed Sparse Row format>

In [21]:
urm_test

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 95746 stored elements in Compressed Sparse Row format>

 # Collaborative Filtering


##Item Item Similarity

In [None]:
def vector_similarity(urm: sp.csc_matrix, shrink: int):
    item_weights = np.sqrt(
        np.sum(urm.power(2), axis=0)
    ).A.flatten()

    num_items = urm.shape[1]
    urm_t = urm.T
    weights = np.empty(shape=(num_items, num_items))
    for item_id in range(num_items):
        numerator = urm_t.dot(urm[:, item_id]).A.flatten()
        denominator = item_weights[item_id] * item_weights + shrink + 1e-6

        weights[item_id] = numerator / denominator
    np.fill_diagonal(weights, 0.0)
    return weights

In [None]:
urm_csc = urm_train.tocsc()
shrink = 5
slice_size = 100

In [None]:
vector_weights = vector_similarity(urm_csc[:slice_size,:slice_size], shrink)
vector_weights

#

#Build the Recommendation System

In [66]:
class CFItemKNN(object):
    def __init__(self, urm: int):
        self.urm = urm
        self.weights = None


    def fit(self, urm_train: sp.csc_matrix, topK, shrink, normalize, similarity):
        if not sp.isspmatrix_csc(urm_train):
            raise TypeError(f"We expected a CSC matrix, we got {type(urm_train)}")

        similarity_object = Compute_Similarity_Python(self.urm, shrink=shrink,
                                                  topK=topK, normalize=normalize,
                                                  similarity = similarity)

        self.weights = similarity_object.compute_similarity()

    def recommend(self, user_id: int, urm_train: sp.csr_matrix, at: Optional[int] = None, remove_seen: bool = True):
        user_profile = urm_train[user_id]

        ranking = user_profile.dot(self.weights).flatten()

        if remove_seen:
            user_profile_start = urm_train.indptr[user_id]
            user_profile_end = urm_train.indptr[user_id+1]

            seen_items = urm_train.indices[user_profile_start:user_profile_end]

            ranking[seen_items] = -np.inf

        ranking = np.flip(np.argsort(ranking))
        return ranking[:at]

In [None]:
itemknn_recommender = CFItemKNN(shrink=50)
itemknn_recommender

In [None]:
itemknn_recommender.fit(urm_train.tocsc(), vector_similarity)

In [None]:
for user_id in range(10):
    print(itemknn_recommender.recommend(user_id=user_id, urm_train=urm_train, at=10, remove_seen=True))

## Top popular for the users without info

In [47]:
class TopPopRecommender(object):

    def fit(self, URM_train):

        item_popularity = np.ediff1d(URM_train.tocsc().indptr)

        # We are not interested in sorting the popularity value,
        # but to order the items according to it
        self.popular_items = np.argsort(item_popularity)
        self.popular_items = np.flip(self.popular_items, axis = 0)


    def recommend(self, user_id, at=5):

        recommended_items = self.popular_items[0:at]

        return recommended_items

#Hyperparameter Tuning


*   Number of neighbors

*   Shrinking

*   Similarity Type



EvaluatorHoldout: Ignoring 2697 (21.3%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2152 (17.0%) Users that have less than 1 test interactions


In [70]:
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
def objective_function(optuna_trial):

    recommender_instance = ItemKNNCFRecommender(urm_train)
    recommender_instance.fit(topK = optuna_trial.suggest_int("topK", 5, 1000),
                             shrink = optuna_trial.suggest_int("shrink", 0, 1000),
                             similarity =  optuna_trial.suggest_categorical("similarity", ["cosine", "jaccard", "adjusted", "asymmetric","pearson",  "dice" , "tversky",  "tanimoto"]),
                             normalize = optuna_trial.suggest_categorical("normalize", [True, False])
                            )

    result_df, _ = evaluator_validation.evaluateRecommender(recommender_instance)

    return result_df.loc[10]["MAP"]

In [62]:
class SaveResults(object):

    def __init__(self):
        self.results_df = pd.DataFrame()

    def __call__(self, optuna_study, optuna_trial):
        hyperparam_dict = optuna_trial.params.copy()
        hyperparam_dict["result"] = optuna_trial.values[0]

        self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)

In [71]:
optuna_study = optuna.create_study(direction="maximize")

save_results = SaveResults()

optuna_study.optimize(objective_function,
                      callbacks=[save_results],
                      n_trials = 50)

[I 2023-11-24 15:15:31,893] A new study created in memory with name: no-name-02eef95d-7ab6-4ffa-8887-01dc35f65f2f


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.
Similarity column 22222 (100.0%), 2876.32 column/sec. Elapsed time 7.73 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.96 sec. Users per second: 767


[I 2023-11-24 15:15:53,131] Trial 0 finished with value: 0.017172029258338415 and parameters: {'topK': 569, 'shrink': 528, 'similarity': 'asymmetric', 'normalize': True}. Best is trial 0 with value: 0.017172029258338415.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3996.72 column/sec. Elapsed time 5.56 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 5.12 sec. Users per second: 1943


[I 2023-11-24 15:16:05,660] Trial 1 finished with value: 4.8783856499378085e-05 and parameters: {'topK': 158, 'shrink': 507, 'similarity': 'pearson', 'normalize': False}. Best is trial 0 with value: 0.017172029258338415.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 2972.59 column/sec. Elapsed time 7.48 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 13.39 sec. Users per second: 742


[I 2023-11-24 15:16:27,127] Trial 2 finished with value: 0.020821074498269972 and parameters: {'topK': 799, 'shrink': 314, 'similarity': 'dice', 'normalize': False}. Best is trial 2 with value: 0.020821074498269972.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3589.02 column/sec. Elapsed time 6.19 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 10.27 sec. Users per second: 968


[I 2023-11-24 15:16:43,825] Trial 3 finished with value: 0.02307785378175687 and parameters: {'topK': 125, 'shrink': 711, 'similarity': 'dice', 'normalize': False}. Best is trial 3 with value: 0.02307785378175687.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3652.08 column/sec. Elapsed time 6.08 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 5.22 sec. Users per second: 1906


[I 2023-11-24 15:16:56,454] Trial 4 finished with value: 4.8783856499378085e-05 and parameters: {'topK': 989, 'shrink': 670, 'similarity': 'pearson', 'normalize': True}. Best is trial 3 with value: 0.02307785378175687.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4290.21 column/sec. Elapsed time 5.18 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 10.94 sec. Users per second: 909


[I 2023-11-24 15:17:12,864] Trial 5 finished with value: 0.025600195917819854 and parameters: {'topK': 217, 'shrink': 108, 'similarity': 'dice', 'normalize': True}. Best is trial 5 with value: 0.025600195917819854.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5060.01 column/sec. Elapsed time 4.39 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 14.70 sec. Users per second: 676


[I 2023-11-24 15:17:32,590] Trial 6 finished with value: 0.014091013966529481 and parameters: {'topK': 812, 'shrink': 26, 'similarity': 'cosine', 'normalize': False}. Best is trial 5 with value: 0.025600195917819854.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3173.16 column/sec. Elapsed time 7.00 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 13.40 sec. Users per second: 742


[I 2023-11-24 15:17:53,689] Trial 7 finished with value: 0.016018042163047627 and parameters: {'topK': 749, 'shrink': 775, 'similarity': 'cosine', 'normalize': True}. Best is trial 5 with value: 0.025600195917819854.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3332.98 column/sec. Elapsed time 6.67 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.03 sec. Users per second: 826


[I 2023-11-24 15:18:12,699] Trial 8 finished with value: 0.02380722053768032 and parameters: {'topK': 240, 'shrink': 300, 'similarity': 'jaccard', 'normalize': False}. Best is trial 5 with value: 0.025600195917819854.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3548.60 column/sec. Elapsed time 6.26 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 14.55 sec. Users per second: 683


[I 2023-11-24 15:18:34,031] Trial 9 finished with value: 0.016005040852138656 and parameters: {'topK': 855, 'shrink': 743, 'similarity': 'cosine', 'normalize': True}. Best is trial 5 with value: 0.025600195917819854.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4711.51 column/sec. Elapsed time 4.72 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.12 sec. Users per second: 820


[I 2023-11-24 15:18:51,288] Trial 10 finished with value: 0.02593304065414522 and parameters: {'topK': 394, 'shrink': 6, 'similarity': 'tversky', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4667.22 column/sec. Elapsed time 4.76 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 11.71 sec. Users per second: 849


[I 2023-11-24 15:19:08,166] Trial 11 finished with value: 0.024231896762326382 and parameters: {'topK': 368, 'shrink': 0, 'similarity': 'tversky', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5615.35 column/sec. Elapsed time 3.96 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 5.27 sec. Users per second: 1886


[I 2023-11-24 15:19:18,515] Trial 12 finished with value: 4.8783856499378085e-05 and parameters: {'topK': 8, 'shrink': 182, 'similarity': 'adjusted', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 2719.02 column/sec. Elapsed time 8.17 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.44 sec. Users per second: 799


[I 2023-11-24 15:19:39,530] Trial 13 finished with value: 0.019197184020642452 and parameters: {'topK': 480, 'shrink': 993, 'similarity': 'tversky', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3587.71 column/sec. Elapsed time 6.19 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 11.96 sec. Users per second: 831


[I 2023-11-24 15:19:58,080] Trial 14 finished with value: 0.024522228768783502 and parameters: {'topK': 342, 'shrink': 161, 'similarity': 'tanimoto', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3598.17 column/sec. Elapsed time 6.18 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 13.26 sec. Users per second: 750


[I 2023-11-24 15:20:17,999] Trial 15 finished with value: 0.023403209890736257 and parameters: {'topK': 509, 'shrink': 153, 'similarity': 'tversky', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4821.06 column/sec. Elapsed time 4.61 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 13.42 sec. Users per second: 741


[I 2023-11-24 15:20:36,422] Trial 16 finished with value: 0.02260270117502792 and parameters: {'topK': 357, 'shrink': 344, 'similarity': 'dice', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5136.49 column/sec. Elapsed time 4.33 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 14.12 sec. Users per second: 704


[I 2023-11-24 15:20:55,381] Trial 17 finished with value: 0.02345636876619678 and parameters: {'topK': 625, 'shrink': 98, 'similarity': 'jaccard', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4977.45 column/sec. Elapsed time 4.46 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.97 sec. Users per second: 766


[I 2023-11-24 15:21:13,079] Trial 18 finished with value: 0.020600076163651233 and parameters: {'topK': 241, 'shrink': 261, 'similarity': 'asymmetric', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5495.47 column/sec. Elapsed time 4.04 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 6.06 sec. Users per second: 1641


[I 2023-11-24 15:21:23,904] Trial 19 finished with value: 4.8783856499378085e-05 and parameters: {'topK': 13, 'shrink': 392, 'similarity': 'adjusted', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3346.48 column/sec. Elapsed time 6.64 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 11.74 sec. Users per second: 847


[I 2023-11-24 15:21:42,724] Trial 20 finished with value: 0.02526876827249019 and parameters: {'topK': 417, 'shrink': 69, 'similarity': 'tanimoto', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3674.44 column/sec. Elapsed time 6.05 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.11 sec. Users per second: 821


[I 2023-11-24 15:22:01,327] Trial 21 finished with value: 0.025061214339204472 and parameters: {'topK': 428, 'shrink': 76, 'similarity': 'tanimoto', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4109.22 column/sec. Elapsed time 5.41 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 11.90 sec. Users per second: 835


[I 2023-11-24 15:22:19,004] Trial 22 finished with value: 0.02464731598973626 and parameters: {'topK': 264, 'shrink': 195, 'similarity': 'tanimoto', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4102.33 column/sec. Elapsed time 5.42 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 13.90 sec. Users per second: 715


[I 2023-11-24 15:22:38,887] Trial 23 finished with value: 0.02337209775772299 and parameters: {'topK': 659, 'shrink': 70, 'similarity': 'dice', 'normalize': True}. Best is trial 10 with value: 0.02593304065414522.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5050.57 column/sec. Elapsed time 4.40 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 8.67 sec. Users per second: 1147


[I 2023-11-24 15:22:52,240] Trial 24 finished with value: 0.02761583421552252 and parameters: {'topK': 139, 'shrink': 14, 'similarity': 'tanimoto', 'normalize': True}. Best is trial 24 with value: 0.02761583421552252.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3353.44 column/sec. Elapsed time 6.63 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 9.67 sec. Users per second: 1028


[I 2023-11-24 15:23:08,805] Trial 25 finished with value: 0.02530825521369738 and parameters: {'topK': 125, 'shrink': 242, 'similarity': 'tversky', 'normalize': True}. Best is trial 24 with value: 0.02761583421552252.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3255.72 column/sec. Elapsed time 6.83 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 10.87 sec. Users per second: 915


[I 2023-11-24 15:23:26,829] Trial 26 finished with value: 0.023744940386374844 and parameters: {'topK': 186, 'shrink': 402, 'similarity': 'tanimoto', 'normalize': False}. Best is trial 24 with value: 0.02761583421552252.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3196.31 column/sec. Elapsed time 6.95 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 7.54 sec. Users per second: 1319


[I 2023-11-24 15:23:41,569] Trial 27 finished with value: 0.0278971447412753 and parameters: {'topK': 104, 'shrink': 8, 'similarity': 'tversky', 'normalize': True}. Best is trial 27 with value: 0.0278971447412753.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 2749.53 column/sec. Elapsed time 8.08 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 7.44 sec. Users per second: 1337


[I 2023-11-24 15:23:57,263] Trial 28 finished with value: 0.028057196187666136 and parameters: {'topK': 73, 'shrink': 36, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3614.18 column/sec. Elapsed time 6.15 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 10.24 sec. Users per second: 971


[I 2023-11-24 15:24:13,800] Trial 29 finished with value: 0.022801046012745036 and parameters: {'topK': 64, 'shrink': 578, 'similarity': 'asymmetric', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3681.20 column/sec. Elapsed time 6.04 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 10.65 sec. Users per second: 934


[I 2023-11-24 15:24:30,680] Trial 30 finished with value: 0.02314109196641132 and parameters: {'topK': 81, 'shrink': 866, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4177.58 column/sec. Elapsed time 5.32 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 9.54 sec. Users per second: 1042


[I 2023-11-24 15:24:45,744] Trial 31 finished with value: 0.02803506960910655 and parameters: {'topK': 93, 'shrink': 13, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4824.11 column/sec. Elapsed time 4.61 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.44 sec. Users per second: 799


[I 2023-11-24 15:25:03,123] Trial 32 finished with value: 0.02508362433596315 and parameters: {'topK': 298, 'shrink': 135, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4879.29 column/sec. Elapsed time 4.55 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 9.67 sec. Users per second: 1028


[I 2023-11-24 15:25:17,514] Trial 33 finished with value: 0.025299644888972028 and parameters: {'topK': 80, 'shrink': 0, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5995.97 column/sec. Elapsed time 3.71 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 5.25 sec. Users per second: 1892


[I 2023-11-24 15:25:27,212] Trial 34 finished with value: 4.8783856499378085e-05 and parameters: {'topK': 158, 'shrink': 224, 'similarity': 'pearson', 'normalize': False}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 2842.35 column/sec. Elapsed time 7.82 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 8.07 sec. Users per second: 1231


[I 2023-11-24 15:25:43,375] Trial 35 finished with value: 0.027102939086642984 and parameters: {'topK': 128, 'shrink': 54, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 2883.57 column/sec. Elapsed time 7.71 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 8.00 sec. Users per second: 1243


[I 2023-11-24 15:25:59,247] Trial 36 finished with value: 0.026816095918298956 and parameters: {'topK': 55, 'shrink': 122, 'similarity': 'tversky', 'normalize': False}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3970.14 column/sec. Elapsed time 5.60 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 6.78 sec. Users per second: 1467


[I 2023-11-24 15:26:12,362] Trial 37 finished with value: 4.8783856499378085e-05 and parameters: {'topK': 191, 'shrink': 609, 'similarity': 'pearson', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4732.53 column/sec. Elapsed time 4.70 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 11.99 sec. Users per second: 829


[I 2023-11-24 15:26:29,356] Trial 38 finished with value: 0.026148047288526148 and parameters: {'topK': 297, 'shrink': 51, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5403.49 column/sec. Elapsed time 4.11 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 10.51 sec. Users per second: 945


[I 2023-11-24 15:26:44,198] Trial 39 finished with value: 0.02622623877703848 and parameters: {'topK': 129, 'shrink': 116, 'similarity': 'jaccard', 'normalize': False}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5647.18 column/sec. Elapsed time 3.94 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 5.25 sec. Users per second: 1893


[I 2023-11-24 15:26:54,240] Trial 40 finished with value: 4.8783856499378085e-05 and parameters: {'topK': 43, 'shrink': 504, 'similarity': 'adjusted', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 2744.64 column/sec. Elapsed time 8.10 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 8.14 sec. Users per second: 1221


[I 2023-11-24 15:27:10,697] Trial 41 finished with value: 0.026903728027106024 and parameters: {'topK': 139, 'shrink': 60, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3006.98 column/sec. Elapsed time 7.39 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 8.28 sec. Users per second: 1200


[I 2023-11-24 15:27:26,579] Trial 42 finished with value: 0.02745561910510113 and parameters: {'topK': 109, 'shrink': 40, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3836.59 column/sec. Elapsed time 5.79 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 9.64 sec. Users per second: 1031


[I 2023-11-24 15:27:42,294] Trial 43 finished with value: 0.026697858635792503 and parameters: {'topK': 181, 'shrink': 2, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5218.03 column/sec. Elapsed time 4.26 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 11.47 sec. Users per second: 867


[I 2023-11-24 15:27:58,160] Trial 44 finished with value: 0.024479676120859075 and parameters: {'topK': 85, 'shrink': 202, 'similarity': 'cosine', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 4838.77 column/sec. Elapsed time 4.59 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 11.28 sec. Users per second: 881


[I 2023-11-24 15:28:14,297] Trial 45 finished with value: 0.026886990386135408 and parameters: {'topK': 216, 'shrink': 38, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5080.77 column/sec. Elapsed time 4.37 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 11.78 sec. Users per second: 844


[I 2023-11-24 15:28:30,605] Trial 46 finished with value: 0.024709129099784 and parameters: {'topK': 107, 'shrink': 113, 'similarity': 'asymmetric', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 5463.76 column/sec. Elapsed time 4.07 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 7.06 sec. Users per second: 1407


[I 2023-11-24 15:28:41,862] Trial 47 finished with value: 0.025427226988438145 and parameters: {'topK': 5, 'shrink': 152, 'similarity': 'tanimoto', 'normalize': False}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3273.25 column/sec. Elapsed time 6.79 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.24 sec. Users per second: 812


[I 2023-11-24 15:29:01,251] Trial 48 finished with value: 0.02263621637502538 and parameters: {'topK': 281, 'shrink': 450, 'similarity': 'tversky', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.


ItemKNNCFRecommender: URM Detected 454 ( 3.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 348 ( 1.6%) items with no interactions.


  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


Similarity column 22222 (100.0%), 3948.49 column/sec. Elapsed time 5.63 sec
EvaluatorHoldout: Processed 9941 (100.0%) in 12.14 sec. Users per second: 819


[I 2023-11-24 15:29:19,319] Trial 49 finished with value: 0.021037490240035272 and parameters: {'topK': 220, 'shrink': 297, 'similarity': 'cosine', 'normalize': True}. Best is trial 28 with value: 0.028057196187666136.
  self.results_df = self.results_df.append(hyperparam_dict, ignore_index=True)


In [72]:
optuna_study.best_trial.params

{'topK': 73, 'shrink': 36, 'similarity': 'tversky', 'normalize': True}

In [73]:
recommender_instance = ItemKNNCFRecommender(urm_train + urm_validation)
recommender_instance.fit(**optuna_study.best_trial.params)

result_df, _ = evaluator_test.evaluateRecommender(recommender_instance)
result_df

ItemKNNCFRecommender: URM Detected 230 ( 1.8%) users with no interactions.
ItemKNNCFRecommender: URM Detected 93 ( 0.4%) items with no interactions.
Similarity column 22222 (100.0%), 3723.03 column/sec. Elapsed time 5.97 sec
EvaluatorHoldout: Processed 10486 (100.0%) in 10.01 sec. Users per second: 1048


Unnamed: 0_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,NOVELTY,AVERAGE_POPULARITY,DIVERSITY_MEAN_INTER_LIST,DIVERSITY_HERFINDAHL,COVERAGE_ITEM,COVERAGE_ITEM_HIT,ITEMS_IN_GT,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
10,0.082348,0.146702,0.123929,0.041817,0.07281,0.242712,0.132591,0.098947,0.475682,0.315258,0.004948,0.265236,0.960099,0.996001,0.307668,0.076636,0.801548,0.82972,0.394683,0.82972,0.047566,9.711469,0.996361,0.13656,0.748157,2.082066,0.279549


#Evaluate the Recommendation System

In [None]:
def recall(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)

    recall_score = np.sum(is_relevant) / relevant_items.shape[0]

    return recall_score


def precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)

    precision_score = np.sum(is_relevant) / recommendations.shape[0]

    return precision_score

def mean_average_precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)

    precision_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    map_score = np.sum(precision_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

def AP(recommended_items, relevant_items):

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    ap_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return ap_score

In [74]:
def evaluator(recommender: object, urm_train: sp.csr_matrix, urm_test: sp.csr_matrix):
    recommendation_length = 10
    accum_precision = 0
    accum_recall = 0
    accum_map = 0

    num_users = urm_train.shape[0]

    num_users_evaluated = 0
    num_users_skipped = 0
    for user_id in range(num_users):
        user_profile_start = urm_test.indptr[user_id]
        user_profile_end = urm_test.indptr[user_id+1]

        relevant_items = urm_test.indices[user_profile_start:user_profile_end]

        if relevant_items.size == 0:
            num_users_skipped += 1
            continue

        recommendations = recommender.recommend(user_id=user_id,
                                               at=recommendation_length,
                                               urm_train=urm_train,
                                               remove_seen=True)

        accum_precision += precision(recommendations, relevant_items)
        accum_recall += recall(recommendations, relevant_items)
        accum_map += mean_average_precision(recommendations, relevant_items)

        num_users_evaluated += 1


    accum_precision /= max(num_users_evaluated, 1)
    accum_recall /= max(num_users_evaluated, 1)
    accum_map /=  max(num_users_evaluated, 1)

    return accum_precision, accum_recall, accum_map, num_users_evaluated, num_users_skipped

In [75]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(itemknn_recommender,
                                                                                            urm_train,
                                                                                            urm_test)

NameError: ignored

In [None]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

#Prepare Submission

In [76]:
#forse ho sbagliato ad agggiungere anche il test set qui
best_shrink = 5
urm_train_validation = urm_train + urm_validation + urm_test
urm_train_validation

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [77]:
best_recommender = ItemKNNCFRecommender(urm_train_validation)
best_recommender.fit(**optuna_study.best_trial.params)

Similarity column 22222 (100.0%), 3139.52 column/sec. Elapsed time 7.08 sec


##Top popular for those users without info

In [78]:
toppop_recommender = TopPopRecommender()
toppop_recommender.fit(urm_train_validation)

In [79]:
users_to_recommend = np.array(data_target_users_test["user_id"])
len(users_to_recommend)

10882

In [80]:
mapping_to_item_id = dict(zip(interactions.mapped_item_id, interactions.item_id))

In [97]:
def prepare_submission(interactions: pd.DataFrame, users_to_recommend: np.array, urm_train: sp.csr_matrix, recommender: object, toppop_recommender: object):
    users_ids_and_mappings = interactions[interactions.user_id.isin(users_to_recommend)][['user_id', 'mapped_user_id']].drop_duplicates()
    items_ids_and_mappings = interactions[["item_id", "mapped_item_id"]].drop_duplicates()

    mapping_to_item_id = dict(zip(interactions.mapped_item_id, interactions.item_id))


    recommendation_length = 10
    submission = []
    for idx, row in users_ids_and_mappings.iterrows():
        user_id = row.user_id
        mapped_user_id = row.mapped_user_id

        recommendations = recommender.recommend(user_id_array=mapped_user_id, cutoff=10)

        submission.append((user_id, [mapping_to_item_id[item_id] for item_id in recommendations]))

    #assign top popular to the user without interactions
    users_without_info = np.setdiff1d(users_to_recommend,np.array(interactions[interactions.user_id.isin(users_to_recommend)][['user_id', 'mapped_user_id']].drop_duplicates().user_id))
    for user_id in users_without_info:
      recommendations = toppop_recommender.recommend(user_id, at = 10)
      submission.append((user_id, [mapping_to_item_id[item_id] for item_id in recommendations]))

    return submission

In [98]:
submission = prepare_submission(interactions, users_to_recommend, urm_train_validation, best_recommender, toppop_recommender)

In [99]:
submission

[(1, [101, 506, 403, 515, 36, 1546, 637, 977, 922, 869]),
 (2, [11, 47, 28, 50, 3, 2, 6, 4, 12, 5]),
 (26, [1089, 139, 254, 314, 1256, 1005, 474, 1620, 269, 41]),
 (36, [4, 3, 6, 15, 16, 131, 14, 20, 107, 48]),
 (41, [31, 55, 58, 486, 20, 4, 592, 1, 33, 9]),
 (47, [96, 214, 344, 211, 38, 154, 1635, 85, 15, 72]),
 (54, [19, 2, 6, 4, 1, 8, 20, 238, 24, 95]),
 (73, [21, 125, 85, 51, 154, 34, 109, 3, 32, 81]),
 (88, [99, 346, 1089, 139, 4, 3, 318, 563, 60, 2204]),
 (89, [138, 324, 344, 154, 10, 59, 433, 12673, 12672, 15014]),
 (95, [44, 4, 20, 26, 16, 6, 75, 88, 84, 15]),
 (100, [313, 764, 636, 99, 119, 11, 252, 3, 63, 108]),
 (101, [837, 2314, 754, 624, 1290, 828, 282, 1051, 1207, 1477]),
 (102, [2904, 2843, 2283, 2284, 3208, 2585, 2958, 7055, 1795, 5724]),
 (104, [3, 9, 4, 119, 2, 299, 1, 252, 8, 41]),
 (114, [17, 27, 50, 37, 5, 111, 42, 433, 102, 293]),
 (124, [346, 1342, 1198, 715, 34, 94, 2590, 1387, 2963, 1704]),
 (127, [41, 40, 618, 39, 55, 11, 42, 2, 47, 50]),
 (138, [11, 28, 50, 1

In [100]:
def write_submission(submissions):
    with open("./submission.csv", "w") as f:
        f.write("user_id,item_list\n")
        for user_id, items in submissions:
            f.write(f"{user_id},{' '.join([str(item) for item in items])}\n")


In [101]:
write_submission(submission)