In [1]:
import os
os.chdir( "../")

In [2]:
import pandas as pd
import numpy as np

from Recommenders.Recommender_import_list import *
from Recommenders.Hybrid.GeneralizedPipelineHybridRecommender import GeneralizedPipelineHybridRecommender
from Evaluation.Evaluator import EvaluatorHoldout

path = "Dataset/data_train.csv"
df = pd.read_csv(filepath_or_buffer=path,
                               sep=",",
                               header=1,
                               engine='python',
                               names=['UserID', 'ItemID', 'Interaction'])


df



Unnamed: 0,UserID,ItemID,Interaction
0,1,15,1.0
1,1,16,1.0
2,1,133,1.0
3,1,161,1.0
4,1,187,1.0
...,...,...,...
478724,13024,13605,1.0
478725,13024,13823,1.0
478726,13024,15122,1.0
478727,13024,18185,1.0


In [3]:
df.Interaction.value_counts()

1.0    478729
Name: Interaction, dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478729 entries, 0 to 478728
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   UserID       478729 non-null  int64  
 1   ItemID       478729 non-null  int64  
 2   Interaction  478729 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 11.0 MB


In [5]:
user_ids = df["UserID"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
item_ids = df["ItemID"].unique().tolist()
item2item_encoded = {x: i for i, x in enumerate(item_ids)}
item_encoded2item = {i: x for i, x in enumerate(item_ids)}
df["User"] = df["UserID"].map(user2user_encoded)
df["Item"] = df["ItemID"].map(item2item_encoded)

num_users = len(user2user_encoded)
num_items = len(item_encoded2item)
df["Interaction"] = df["Interaction"].values.astype(np.float32)

# min and max ratings will be used to normalize the ratings later
min_rating = 0.0
max_rating = max(df["Interaction"])

print(
    "Number of users: {}, Number of Items: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_items, min_rating, max_rating
    )
)

Number of users: 12638, Number of Items: 22222, Min rating: 0.0, Max rating: 1.0


In [6]:
df.head()

Unnamed: 0,UserID,ItemID,Interaction,User,Item
0,1,15,1.0,0,0
1,1,16,1.0,0,1
2,1,133,1.0,0,2
3,1,161,1.0,0,3
4,1,187,1.0,0,4


In [7]:
userId_unique = df["UserID"].unique()
itemId_unique = df["ItemID"].unique()

In [8]:
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.sparse as sps
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample



urm_all = sps.coo_matrix((df["Interaction"].values, 
                          (df["User"].values, df["Item"].values)))

urm_train_validation, urm_test = split_train_in_two_percentage_global_sample(urm_all, train_percentage = 0.80)
urm_train, urm_validation = split_train_in_two_percentage_global_sample(urm_train_validation, train_percentage = 0.80)



In [9]:
num_users = len(userId_unique)
num_items = len(itemId_unique)

In [10]:

evaluator_validation = EvaluatorHoldout(urm_validation, cutoff_list=[10], ignore_users=[])

EvaluatorHoldout: Ignoring 2602 (20.6%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 0 Users


**CREATING HYBRID**


## Insert model here

In [11]:
SLIMEN_best_params = {'topK': 7693, 'l1_ratio': 0.08013148517795793, 'alpha': 0.0012244028139782985}
RP3Beta_best_params = {'topK': 41,
 'alpha': 0.24025759098180052,
 'beta': 0.21463311953617964,
 'normalize_similarity': True}
EASE_best_params = {'topK':None, 'normalize_matrix':False,'l2_norm':84.03422929536671}
ItemKNN_best_params = {'topK': 23, 'shrink': 18, 'similarity': 'tversky', 'normalize': False}
IALS_best_params = {'num_factors': 184, 'epochs': 110, 'confidence_scaling': 'linear', 'alpha': 13.161328184474756, 'epsilon': 0.2917133297273583, 'reg': 0.0005872701636540686}
SLIMBPR_best_params = {'topK': 5, 'epochs': 60, 'symmetric': False, 'sgd_mode': 'adagrad', 'lambda_i': 1e-05, 'lambda_j': 1e-05, 'learning_rate': 0.1}

In [12]:
model = GeneralizedPipelineHybridRecommender

#def __init__(self, URM_train, KNN_recommender, recommenders: list, verbose=True, KNN_Under_Interactions=2):

In [13]:
pipeline_rec = model(urm_train, RP3betaRecommender, RP3Beta_best_params, SLIMElasticNetRecommender, SLIMEN_best_params, threshold=2)

RP3betaRecommender_SLIMElasticNetRecommenderPipelineHybridRecommender: URM Detected 472 ( 3.7%) users with no interactions.
RP3betaRecommender_SLIMElasticNetRecommenderPipelineHybridRecommender: URM Detected 341 ( 1.5%) items with no interactions.


In [14]:
pipeline_rec.fit()

URM Train Sparsity of the CSR matrix: 0.9989090435294288, #non null: 306386
RP3betaRecommender: URM Detected 472 ( 3.7%) users with no interactions.
RP3betaRecommender: URM Detected 341 ( 1.5%) items with no interactions.
RP3betaRecommender: Similarity column 22222 (100.0%), 4325.95 column/sec. Elapsed time 5.14 sec
Ended training of the first model
331662
URM Pipeline Sparsity of the CSR matrix: 0.9988190426294198, #non null: 331662
SLIMElasticNetRecommender: URM Detected 341 ( 1.5%) items with no interactions.
Start training of the second model
SLIMElasticNetRecommender: Processed 13826 (62.2%) in 5.00 min. Items per second: 46.08
SLIMElasticNetRecommender: Processed 22222 (100.0%) in 7.95 min. Items per second: 46.60
Ended training of the second model


In [17]:
evaluator_validation.evaluateRecommender(pipeline_rec)

EvaluatorHoldout: Processed 10036 (100.0%) in 7.57 sec. Users per second: 1326


(       PRECISION PRECISION_RECALL_MIN_DEN    RECALL       MAP MAP_MIN_DEN  \
 cutoff                                                                      
 10      0.067617                 0.136327  0.122704  0.030829    0.062304   
 
             MRR      NDCG        F1  HIT_RATE ARHR_ALL_HITS  ...  \
 cutoff                                                       ...   
 10      0.20499  0.115492  0.087188  0.443105      0.250207  ...   
 
        COVERAGE_USER COVERAGE_USER_HIT USERS_IN_GT DIVERSITY_GINI  \
 cutoff                                                              
 10          0.794113          0.351875    0.794113       0.041126   
 
        SHANNON_ENTROPY RATIO_DIVERSITY_HERFINDAHL RATIO_DIVERSITY_GINI  \
 cutoff                                                                   
 10            9.697152                   0.996415              0.11916   
 
        RATIO_SHANNON_ENTROPY RATIO_AVERAGE_POPULARITY RATIO_NOVELTY  
 cutoff                                      

In [20]:
pipeline_rec.recommend(0)[:10]

[674, 587, 818, 2023, 3, 1478, 1449, 1585, 677, 1118]