In [35]:
#%cd /home/dado/recsys/RepoJ/RecSys-Challenge/

In [36]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import scipy.sparse as sps
from xgboost import XGBRanker, plot_importance
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from sklearn.model_selection import train_test_split
from Recommenders.Recommender_import_list import *
from Evaluation.Evaluator import EvaluatorHoldout
from xgboost import plot_importance

from Recommenders.MatrixFactorization.ImplicitIALSRecommender import ImplicitALSRecommender


In [37]:
path = "Dataset/data_train.csv"
df = pd.read_csv(filepath_or_buffer=path,
                               sep=",",
                               header=1,
                               engine='python',
                               names=['UserID', 'ItemID', 'Interaction'])

target_users = pd.read_csv("Dataset/data_target_users_test.csv")
target_users.columns = ["UserID"]
tar_users = target_users["UserID"].astype(int)

In [38]:
user_ids = df["UserID"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
item_ids = df["ItemID"].unique().tolist()
item2item_encoded = {x: i for i, x in enumerate(item_ids)}
item_encoded2item = {i: x for i, x in enumerate(item_ids)}
df["User"] = df["UserID"].map(user2user_encoded)
df["Item"] = df["ItemID"].map(item2item_encoded)

num_users = len(user2user_encoded)
num_items = len(item_encoded2item)
df["Interaction"] = df["Interaction"].values.astype(np.float32)

# min and max ratings will be used to normalize the ratings later
min_rating = 0.0
max_rating = max(df["Interaction"])

print(
    "Number of users: {}, Number of Items: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_items, min_rating, max_rating
    )
)

userId_unique = df["UserID"].unique()
itemId_unique = df["ItemID"].unique()

Number of users: 12638, Number of Items: 22222, Min rating: 0.0, Max rating: 1.0


In [39]:
URM_all = sps.coo_matrix((df["Interaction"].values, 
                          (df["User"].values, df["Item"].values)))

URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train_validation, train_percentage = 0.80)



In [40]:
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10], ignore_users=[])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10], ignore_users=[])

EvaluatorHoldout: Ignoring 2602 (20.6%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 0 Users
EvaluatorHoldout: Ignoring 2128 (16.8%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 0 Users


In [34]:
from Recommenders.SLIM.SLIMElasticNetRecommender import MultiThreadSLIM_SLIMElasticNetRecommender

topPop = TopPop(URM_train)
topPop.fit()

knn = ItemKNNCFRecommender(URM_train)
knn.fit(topK=19, shrink=28, similarity="tversky", normalize=True, tversky_alpha=0.0, tversky_beta=1.3672226785339947)

rp3beta = RP3betaRecommender(URM_train, verbose=True)
rp3beta.fit(topK=29, alpha=0.33723025040409343, beta=0.15542352567862933, normalize_similarity=True)

slimen = MultiThreadSLIM_SLIMElasticNetRecommender(URM_train,verbose=True)
slimen.fit(workers=8,topK=638, l1_ratio=0.026639245044358113, alpha=0.001535792379351249)


ials = ImplicitALSRecommender(URM_train, verbose=True)
ials.fit(use_gpu=True, num_threads=1, num_factors=128, epochs=478, confidence_scaling='linear', alpha=3.358295366834473, epsilon=7.295986722921583, reg=6.292360640597378e-05)

candidate_generator_recommender = GeneralizedLinearHybridRecommenderCold(URM_train, knn, [rp3beta, slimen, ials],True,1)
candidate_generator_recommender.fit([0.6464483146524337, 0.7533801103113421, 0.032576355148477586])

candidate_generator_recommender = slimen

other_algorithms = {
    #"TopPop": topPop,
    "KNN": knn,
    "RP3beta": rp3beta,
    "SLIMEN": slimen,
    "IALS": ials,
    "Hybrid": candidate_generator_recommender
}

TopPopRecommender: URM Detected 472 ( 3.7%) users with no interactions.
TopPopRecommender: URM Detected 341 ( 1.5%) items with no interactions.
ItemKNNCFRecommender: URM Detected 472 ( 3.7%) users with no interactions.
ItemKNNCFRecommender: URM Detected 341 ( 1.5%) items with no interactions.
Similarity column 22222 (100.0%), 14040.53 column/sec. Elapsed time 1.58 sec
RP3betaRecommender: URM Detected 472 ( 3.7%) users with no interactions.
RP3betaRecommender: URM Detected 341 ( 1.5%) items with no interactions.
RP3betaRecommender: Similarity column 22222 (100.0%), 9746.98 column/sec. Elapsed time 2.28 sec
SLIMElasticNetRecommender: URM Detected 472 ( 3.7%) users with no interactions.
SLIMElasticNetRecommender: URM Detected 341 ( 1.5%) items with no interactions.


100%|█████████▉| 22216/22222 [00:43<00:00, 511.01it/s]

ImplicitALSRecommender: URM Detected 472 ( 3.7%) users with no interactions.
ImplicitALSRecommender: URM Detected 341 ( 1.5%) items with no interactions.
Using gpu: True





  0%|          | 0/478 [00:00<?, ?it/s]

RP3betaSLIMElasticNetImplicitALSHybridRecommender: URM Detected 472 ( 3.7%) users with no interactions.
RP3betaSLIMElasticNetImplicitALSHybridRecommender: URM Detected 341 ( 1.5%) items with no interactions.


In [41]:
n_users, n_items = URM_train.shape

training_dataframe = pd.DataFrame(index=range(0,n_users), columns = ["ItemID"])
training_dataframe.index.name='UserID'

training_dataframe

Unnamed: 0_level_0,ItemID
UserID,Unnamed: 1_level_1
0,
1,
2,
3,
4,
...,...
12633,
12634,
12635,
12636,


In [42]:
cutoff = 30

for user_id in tqdm(range(n_users)):    
    recommendations = candidate_generator_recommender.recommend(user_id, cutoff = cutoff)
    training_dataframe.at[user_id, "ItemID"] = recommendations  # Encapsulate recommendations in a list

100%|██████████| 12638/12638 [00:04<00:00, 3135.64it/s]


In [43]:
training_dataframe

Unnamed: 0_level_0,ItemID
UserID,Unnamed: 1_level_1
0,"[674, 587, 1585, 2023, 1478, 3, 1118, 818, 144..."
1,"[2548, 44, 84, 43, 3641, 841, 89, 1166, 897, 9..."
2,"[2229, 842, 841, 2545, 8304, 8303, 468, 4506, ..."
3,"[589, 89, 227, 812, 88, 284, 677, 5596, 819, 4..."
4,"[4709, 323, 950, 4316, 189, 285, 954, 952, 238..."
...,...
12633,"[2809, 2818, 2815, 587, 2811, 2807, 2814, 2813..."
12634,"[950, 954, 2807, 3963, 3948, 3120, 3958, 3932,..."
12635,"[248, 472, 258, 470, 259, 890, 986, 881, 4664,..."
12636,"[92, 2236, 689, 1559, 600, 332, 331, 1558, 681..."


In [44]:
training_dataframe = training_dataframe.explode("ItemID")
training_dataframe

Unnamed: 0_level_0,ItemID
UserID,Unnamed: 1_level_1
0,674
0,587
0,1585
0,2023
0,1478
...,...
12637,16591
12637,16593
12637,809
12637,10368


In [45]:
URM_validation_coo = sps.coo_matrix(URM_validation)

correct_recommendations = pd.DataFrame({"UserID": URM_validation_coo.row,
                                        "ItemID": URM_validation_coo.col})
correct_recommendations



Unnamed: 0,UserID,ItemID
0,0,8
1,0,15
2,0,16
3,0,19
4,1,43
...,...,...
76592,12637,11115
76593,12637,13074
76594,12637,13249
76595,12637,16341


In [46]:
training_dataframe = pd.merge(training_dataframe, correct_recommendations, on=['UserID','ItemID'], how='left', indicator='Exist')
training_dataframe

Unnamed: 0,UserID,ItemID,Exist
0,0,674,left_only
1,0,587,left_only
2,0,1585,left_only
3,0,2023,left_only
4,0,1478,left_only
...,...,...,...
379135,12637,16591,left_only
379136,12637,16593,left_only
379137,12637,809,left_only
379138,12637,10368,left_only


In [47]:
training_dataframe["Label"] = training_dataframe["Exist"] == "both"
training_dataframe.drop(columns = ['Exist'], inplace=True)
training_dataframe

Unnamed: 0,UserID,ItemID,Label
0,0,674,False
1,0,587,False
2,0,1585,False
3,0,2023,False
4,0,1478,False
...,...,...,...
379135,12637,16591,False
379136,12637,16593,False
379137,12637,809,False
379138,12637,10368,False


In [48]:
n_users, n_items = URM_all.shape

In [49]:
training_dataframe = training_dataframe.set_index('UserID')

for user_id in tqdm(range(n_users)):  
    for rec_label, rec_instance in other_algorithms.items():
        item_list = training_dataframe.loc[user_id, "ItemID"].values.tolist()
        
        all_item_scores = rec_instance._compute_item_score([user_id], items_to_compute = item_list)

        training_dataframe.loc[user_id, rec_label] = all_item_scores[0, item_list] 

training_dataframe = training_dataframe.reset_index()
training_dataframe = training_dataframe.rename(columns = {"index": "UserID"})
training_dataframe

100%|██████████| 12638/12638 [01:00<00:00, 209.34it/s]


Unnamed: 0,UserID,ItemID,Label,KNN,RP3beta,SLIMEN,IALS,Hybrid
0,0,674,False,0.811934,0.516371,0.372236,0.708303,0.372236
1,0,587,False,0.690037,0.302430,0.324214,0.579643,0.324214
2,0,1585,False,0.406154,0.214042,0.219835,0.384224,0.219835
3,0,2023,False,0.416344,0.151520,0.217326,0.301951,0.217326
4,0,1478,False,0.383737,0.164780,0.151011,0.247132,0.151011
...,...,...,...,...,...,...,...,...
379135,12637,16591,False,0.235171,0.145367,0.146674,0.074360,0.146674
379136,12637,16593,False,0.313323,0.164826,0.142931,0.244196,0.142931
379137,12637,809,False,0.282516,0.105856,0.140656,0.217648,0.140656
379138,12637,10368,False,0.245752,0.077340,0.139678,0.205239,0.139678


In [50]:
item_popularity = np.ediff1d(sps.csc_matrix(URM_train).indptr)

#training_dataframe['item_popularity'] = item_popularity[training_dataframe["ItemID"].values.astype(int)]
training_dataframe

Unnamed: 0,UserID,ItemID,Label,KNN,RP3beta,SLIMEN,IALS,Hybrid
0,0,674,False,0.811934,0.516371,0.372236,0.708303,0.372236
1,0,587,False,0.690037,0.302430,0.324214,0.579643,0.324214
2,0,1585,False,0.406154,0.214042,0.219835,0.384224,0.219835
3,0,2023,False,0.416344,0.151520,0.217326,0.301951,0.217326
4,0,1478,False,0.383737,0.164780,0.151011,0.247132,0.151011
...,...,...,...,...,...,...,...,...
379135,12637,16591,False,0.235171,0.145367,0.146674,0.074360,0.146674
379136,12637,16593,False,0.313323,0.164826,0.142931,0.244196,0.142931
379137,12637,809,False,0.282516,0.105856,0.140656,0.217648,0.140656
379138,12637,10368,False,0.245752,0.077340,0.139678,0.205239,0.139678


In [51]:
user_popularity = np.ediff1d(sps.csr_matrix(URM_train).indptr)

#training_dataframe['user_profile_len'] = user_popularity[training_dataframe["UserID"].values.astype(int)]
training_dataframe

Unnamed: 0,UserID,ItemID,Label,KNN,RP3beta,SLIMEN,IALS,Hybrid
0,0,674,False,0.811934,0.516371,0.372236,0.708303,0.372236
1,0,587,False,0.690037,0.302430,0.324214,0.579643,0.324214
2,0,1585,False,0.406154,0.214042,0.219835,0.384224,0.219835
3,0,2023,False,0.416344,0.151520,0.217326,0.301951,0.217326
4,0,1478,False,0.383737,0.164780,0.151011,0.247132,0.151011
...,...,...,...,...,...,...,...,...
379135,12637,16591,False,0.235171,0.145367,0.146674,0.074360,0.146674
379136,12637,16593,False,0.313323,0.164826,0.142931,0.244196,0.142931
379137,12637,809,False,0.282516,0.105856,0.140656,0.217648,0.140656
379138,12637,10368,False,0.245752,0.077340,0.139678,0.205239,0.139678


In [52]:


training_dataframe = training_dataframe.sort_values("UserID").reset_index()
training_dataframe.drop(columns = ['index'], inplace=True)
training_dataframe



Unnamed: 0,UserID,ItemID,Label,KNN,RP3beta,SLIMEN,IALS,Hybrid
0,0,674,False,0.811934,0.516371,0.372236,0.708303,0.372236
1,0,5586,False,0.072794,0.067193,0.070869,0.045354,0.070869
2,0,99,False,0.260970,0.098048,0.075955,0.417567,0.075955
3,0,401,False,0.098476,0.058922,0.076273,0.093326,0.076273
4,0,4427,False,0.076524,0.051042,0.076434,0.023911,0.076434
...,...,...,...,...,...,...,...,...
379135,12637,821,False,0.657416,0.278838,0.362668,0.376657,0.362668
379136,12637,338,True,1.118350,0.303830,0.433514,0.777902,0.433514
379137,12637,10368,False,0.245752,0.077340,0.139678,0.205239,0.139678
379138,12637,10906,False,0.583820,0.140867,0.185072,0.277011,0.185072


In [53]:
candidate_generator_recommender._compute_item_score([0])[0][674]

0.3722358

In [54]:
groups = training_dataframe.groupby("UserID").size().values
groups

array([30, 30, 30, ..., 30, 30, 30])

In [55]:
training_dataframe = training_dataframe.sort_values("UserID").reset_index()
training_dataframe.drop(columns = ['index'], inplace=True)
training_dataframe

Unnamed: 0,UserID,ItemID,Label,KNN,RP3beta,SLIMEN,IALS,Hybrid
0,0,674,False,0.811934,0.516371,0.372236,0.708303,0.372236
1,0,818,False,0.325339,0.120968,0.146128,0.328306,0.146128
2,0,710,False,0.168699,0.059478,0.109793,0.091757,0.109793
3,0,3708,False,0.232157,0.090953,0.112669,0.236288,0.112669
4,0,1270,False,0.146309,0.070214,0.119442,-0.018066,0.119442
...,...,...,...,...,...,...,...,...
379135,12637,3016,False,0.327518,0.100932,0.173775,0.404188,0.173775
379136,12637,371,False,0.471733,0.052784,0.178152,0.454080,0.178152
379137,12637,10906,False,0.583820,0.140867,0.185072,0.277011,0.185072
379138,12637,617,False,0.414747,0.202662,0.183385,0.140538,0.183385


In [56]:
y_train = training_dataframe["Label"]
X_train = training_dataframe.drop(columns=["Label"])
#X_train = training_dataframe.drop(columns=["TopPop"])

In [57]:
X_train["UserID"] = X_train["UserID"].astype("category")
X_train["ItemID"] = X_train["ItemID"].astype("category")

In [58]:
print(X_train.dtypes)

UserID     category
ItemID     category
KNN         float64
RP3beta     float64
SLIMEN      float64
IALS        float64
Hybrid      float64
dtype: object


In [59]:
from Recommenders.BaseRecommender import BaseRecommender

class XGBoostRecommender(BaseRecommender):

    RECOMMENDER_NAME = "XGBoostRecommender"

    def __init__(self,
                URM_train,
                recs,
                generator,
                X_train,
                y_train,
                verbose = True,
                n_estimators = 50,
                learning_rate = 1e-1,
                reg_alpha = 1e-1,
                reg_lambda = 1e-1,
                max_depth = 20,
                max_leaves = 0,
                grow_policy = "depthwise",
                objective = "pairwise",
                booster = "gbtree",
                random_seed = None,
                ):

        super(XGBoostRecommender, self).__init__(URM_train, verbose=verbose)

        self.XGB_model = XGBRanker(objective='rank:{}'.format(objective),
                                n_estimators = int(n_estimators),
                                random_state = random_seed,
                                learning_rate = learning_rate,
                                reg_alpha = reg_alpha,
                                reg_lambda = reg_lambda,
                                max_depth = int(max_depth),
                                max_leaves = int(max_leaves),
                                grow_policy = grow_policy,
                                verbosity = 0, # 2 if self.verbose else 0,
                                booster = booster,
                                enable_categorical = True,
                            )
        self.recommenders = recs
        self.main_recommender = generator
        self.X_train = X_train
        self.y_train = y_train
        
    def fit(self):
        self.XGB_model.fit(self.X_train,
                    self.y_train,
                    group=groups,
                    verbose=True)
    
    def get_URM_train(self):
        return self.URM_train

    def _compute_item_score(self, user_id_array, items_to_compute):
        item_weights = np.zeros((len(user_id_array), self.n_items))
        for i, user in enumerate(user_id_array):
            if user in target_users["UserID"].values-1:
                X_to_predict = self.X_train[self.X_train["UserID"] == user]
                scores = self.XGB_model.predict(X_to_predict)
                scores = (scores - scores.min())/(scores.max() - scores.min() + 1e-12)
                item_weights[i][X_to_predict["ItemID"]] = scores

        return item_weights

In [60]:
main_recommender = "Hybrid"
XGBoostRecommender = XGBoostRecommender(URM_train, other_algorithms, candidate_generator_recommender, X_train, y_train)

XGBoostRecommender: URM Detected 472 ( 3.7%) users with no interactions.
XGBoostRecommender: URM Detected 341 ( 1.5%) items with no interactions.


In [61]:
XGBoostRecommender.fit()

In [62]:
evaluation_res = evaluator_validation.evaluateRecommender(XGBoostRecommender)
evaluation_res

EvaluatorHoldout: Processed 10036 (100.0%) in 26.75 sec. Users per second: 375


(       PRECISION PRECISION_RECALL_MIN_DEN    RECALL       MAP MAP_MIN_DEN  \
 cutoff                                                                      
 10      0.101714                 0.178883  0.153885  0.100924    0.177886   
 
              MRR      NDCG        F1 HIT_RATE ARHR_ALL_HITS  ...  \
 cutoff                                                       ...   
 10      0.471081  0.239446  0.122475   0.4723      0.675258  ...   
 
        COVERAGE_USER COVERAGE_USER_HIT USERS_IN_GT DIVERSITY_GINI  \
 cutoff                                                              
 10          0.794113          0.375059    0.794113       0.050257   
 
        SHANNON_ENTROPY RATIO_DIVERSITY_HERFINDAHL RATIO_DIVERSITY_GINI  \
 cutoff                                                                   
 10            9.728164                   0.996178             0.145616   
 
        RATIO_SHANNON_ENTROPY RATIO_AVERAGE_POPULARITY RATIO_NOVELTY  
 cutoff                                      

# **Recreating predictions**

In [63]:
item_popularity_encoded = np.ediff1d(URM_all.tocsc().indptr)
item_popularity_encoded = np.sort(item_popularity_encoded)

target_users = pd.read_csv("Dataset/data_target_users_test.csv")
target_users.columns = ["UserID"]

tar_users = target_users["UserID"].astype(int)
topPop_encoded = item_popularity_encoded[-10:]

In [64]:
submission = []
for index, user in tqdm(enumerate(tar_users)):
    if (user not in df["UserID"].values):
        item_list_encoded = topPop_encoded
    else:
        item_list_encoded = XGBoostRecommender.recommend(user2user_encoded[user])[:10]
    item_list = []
    for item_encoded in item_list_encoded:
        item_list.append(item_encoded2item[item_encoded])
    submission.append((user, item_list))

10882it [00:37, 287.78it/s]


In [65]:

def write_submission(submissions):
    with open("./submission_xgboost_potente.csv", "w") as f:
        f.write("user_id,item_list\n")
        for user_id, items in submissions:
            f.write(f"{user_id},{' '.join([str(item) for item in items])}\n")
            
write_submission(submission)