In [1]:
import numpy as np
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
from Utils.recsys2022DataReader import *

  from pandas import MultiIndex, Int64Index


In [2]:
URM_init = load_BinURMTrainInit()
URM_train = load_1K_BinURMTrain()[0]
URM_validation = load_1K_BinURMValid()[0]
URM_test = load_BinURMTest()

In [3]:
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender

candidate_generator_recommender = RP3betaRecommender(URM_train)
candidate_generator_recommender.fit(topK=50, alpha=0.7179592867915304, beta=0.3290994149353332)

RP3betaRecommender: Similarity column 24507 (100.0%), 677.34 column/sec. Elapsed time 36.18 sec


## Building dataframe

In [4]:
import pandas as pd
from tqdm import tqdm
import scipy.sparse as sps
import numpy as np
from xgboost import XGBRanker

n_users, n_items = URM_train.shape

training_dataframe = pd.DataFrame(index=range(0,n_users), columns = ["ItemID"])
training_dataframe.index.name='UserID'

In [5]:
training_dataframe

Unnamed: 0_level_0,ItemID
UserID,Unnamed: 1_level_1
0,
1,
2,
3,
4,
...,...
41624,
41625,
41626,
41627,


In [6]:
cutoff = 30

for user_id in tqdm(range(n_users)):
    recommendations = candidate_generator_recommender.recommend(user_id, cutoff = cutoff)
    training_dataframe.loc[user_id, "ItemID"] = recommendations

100%|██████████| 41629/41629 [00:33<00:00, 1246.24it/s]


In [7]:
training_dataframe

Unnamed: 0_level_0,ItemID
UserID,Unnamed: 1_level_1
0,"[5735, 20, 391, 23, 828, 29, 2495, 752, 25, 61..."
1,"[6874, 20379, 665, 249, 13968, 9636, 14363, 11..."
2,"[453, 121, 7864, 617, 126, 676, 391, 15454, 67..."
3,"[20, 58, 352, 269, 22, 393, 1746, 1967, 61, 59..."
4,"[23, 25, 124, 53, 21, 353, 80, 71, 436, 67, 68..."
...,...
41624,"[14400, 22403, 7864, 10221, 14809, 18939, 1155..."
41625,"[56, 53, 1032, 17589, 21, 1026, 23, 1449, 1965..."
41626,"[56, 24064, 15229, 1693, 14551, 3535, 61, 2254..."
41627,"[1214, 1514, 1859, 4577, 1754, 19705, 5070, 50..."


In [8]:
training_dataframe = training_dataframe.explode("ItemID")
training_dataframe

Unnamed: 0_level_0,ItemID
UserID,Unnamed: 1_level_1
0,5735
0,20
0,391
0,23
0,828
...,...
41628,6703
41628,21722
41628,19424
41628,22181


## Add validation test

In [9]:
URM_validation_coo = sps.coo_matrix(URM_validation)

correct_recommendations = pd.DataFrame({"UserID": URM_validation_coo.row,
                                        "ItemID": URM_validation_coo.col})
correct_recommendations

Unnamed: 0,UserID,ItemID
0,0,575
1,0,4337
2,0,6351
3,0,10682
4,0,16868
...,...,...
198212,41628,5878
198213,41628,6219
198214,41628,9000
198215,41628,14567


In [10]:
training_dataframe = pd.merge(training_dataframe, correct_recommendations, on=['UserID','ItemID'], how='left', indicator='Exist')
training_dataframe

Unnamed: 0,UserID,ItemID,Exist
0,0,5735,left_only
1,0,20,left_only
2,0,391,left_only
3,0,23,left_only
4,0,828,left_only
...,...,...,...
1248865,41628,6703,left_only
1248866,41628,21722,left_only
1248867,41628,19424,left_only
1248868,41628,22181,left_only


In [11]:
training_dataframe["Label"] = training_dataframe["Exist"] == "both"
training_dataframe.drop(columns = ['Exist'], inplace=True)
training_dataframe

Unnamed: 0,UserID,ItemID,Label
0,0,5735,False
1,0,20,False
2,0,391,False
3,0,23,False
4,0,828,False
...,...,...,...
1248865,41628,6703,False
1248866,41628,21722,False
1248867,41628,19424,False
1248868,41628,22181,False


## Now we add some features like different recommenders predictions

In [12]:
from Recommenders.KNN.ItemKNNCFRecommenderPLUS import ItemKNNCFRecommender
from Recommenders.GraphBased.P3alphaRecommender import P3alphaRecommender
from Recommenders.Implicit.ImplicitALSRecommender import ImplicitALSRecommender

ICM = createSmallICM()

ItemKNN = ItemKNNCFRecommender(URM_train)
ItemKNN.fit(ICM=ICM, topK=584, shrink=919, similarity="dice", normalization="bm25")

P3alpha = P3alphaRecommender(URM_train)
P3alpha.fit(topK=116, alpha=0.8763131065621229)

IALS = ImplicitALSRecommender(URM_train)
IALS.fit(factors=110, alpha=7, iterations=57, regularization=0.0008866558623568822)

other_algorithms = {
    "ItemKNN": ItemKNN,
    "P3alpha": P3alpha,
    "IALS": IALS,
}

Done: 100%|██████████| 24507/24507 [00:03<00:00, 7276.62it/s]            


P3alphaRecommender: Similarity column 24507 (100.0%), 1352.64 column/sec. Elapsed time 18.12 sec


  0%|          | 0/57 [00:00<?, ?it/s]

In [13]:
training_dataframe = training_dataframe.set_index('UserID')

for user_id in tqdm(range(n_users)):
    for rec_label, rec_instance in other_algorithms.items():

        item_list = training_dataframe.loc[user_id, "ItemID"].values.tolist()

        all_item_scores = rec_instance._compute_item_score([user_id], items_to_compute = item_list)

        training_dataframe.loc[user_id, rec_label] = all_item_scores[0, item_list]

training_dataframe = training_dataframe.reset_index()
training_dataframe = training_dataframe.rename(columns = {"index": "UserID"})
training_dataframe

100%|██████████| 41629/41629 [03:28<00:00, 199.18it/s]


Unnamed: 0,UserID,ItemID,Label,ItemKNN,P3alpha,IALS
0,0,5735,False,1.886544,0.038573,0.411152
1,0,20,False,2.157014,0.032121,0.427384
2,0,391,False,1.810423,0.032799,0.355383
3,0,23,False,2.124116,0.050202,0.671742
4,0,828,False,1.957499,0.024736,0.328565
...,...,...,...,...,...,...
1248865,41628,6703,False,0.391042,0.008965,-0.006666
1248866,41628,21722,False,0.394461,0.008719,0.006177
1248867,41628,19424,False,0.508864,0.008857,-0.001170
1248868,41628,22181,False,0.463089,0.010560,0.009168


## We add the item popularity and profile length

In [14]:
item_popularity = np.ediff1d(sps.csc_matrix(URM_train).indptr)

training_dataframe['item_popularity'] = item_popularity[training_dataframe["ItemID"].values.astype(int)]
training_dataframe

Unnamed: 0,UserID,ItemID,Label,ItemKNN,P3alpha,IALS,item_popularity
0,0,5735,False,1.886544,0.038573,0.411152,324
1,0,20,False,2.157014,0.032121,0.427384,3598
2,0,391,False,1.810423,0.032799,0.355383,279
3,0,23,False,2.124116,0.050202,0.671742,3121
4,0,828,False,1.957499,0.024736,0.328565,825
...,...,...,...,...,...,...,...
1248865,41628,6703,False,0.391042,0.008965,-0.006666,24
1248866,41628,21722,False,0.394461,0.008719,0.006177,23
1248867,41628,19424,False,0.508864,0.008857,-0.001170,16
1248868,41628,22181,False,0.463089,0.010560,0.009168,21


In [15]:
user_popularity = np.ediff1d(sps.csr_matrix(URM_train).indptr)

training_dataframe['user_profile_len'] = user_popularity[training_dataframe["UserID"].values.astype(int)]
training_dataframe

Unnamed: 0,UserID,ItemID,Label,ItemKNN,P3alpha,IALS,item_popularity,user_profile_len
0,0,5735,False,1.886544,0.038573,0.411152,324,47
1,0,20,False,2.157014,0.032121,0.427384,3598,47
2,0,391,False,1.810423,0.032799,0.355383,279,47
3,0,23,False,2.124116,0.050202,0.671742,3121,47
4,0,828,False,1.957499,0.024736,0.328565,825,47
...,...,...,...,...,...,...,...,...
1248865,41628,6703,False,0.391042,0.008965,-0.006666,24,14
1248866,41628,21722,False,0.394461,0.008719,0.006177,23,14
1248867,41628,19424,False,0.508864,0.008857,-0.001170,16,14
1248868,41628,22181,False,0.463089,0.010560,0.009168,21,14


In [16]:
training_dataframe = training_dataframe.sort_values("UserID").reset_index()
training_dataframe.drop(columns = ['index'], inplace=True)
training_dataframe

Unnamed: 0,UserID,ItemID,Label,ItemKNN,P3alpha,IALS,item_popularity,user_profile_len
0,0,5735,False,1.886544,0.038573,0.411152,324,47
1,0,3391,False,0.927965,0.021100,0.200136,212,47
2,0,1648,False,1.105758,0.000000,-0.171304,1370,47
3,0,4577,False,0.934707,0.022408,0.198725,223,47
4,0,5070,False,0.870980,0.021523,0.221970,195,47
...,...,...,...,...,...,...,...,...
1248865,41628,2904,False,1.639046,0.019774,0.186203,88,14
1248866,41628,769,False,2.373099,0.034845,0.266561,117,14
1248867,41628,22181,False,0.463089,0.010560,0.009168,21,14
1248868,41628,897,False,0.569218,0.007158,0.177831,101,14


## Train XGBoost

In [17]:
groups = training_dataframe.groupby("UserID").size().values
groups

array([30, 30, 30, ..., 30, 30, 30], dtype=int64)

In [24]:
from xgboost import XGBRanker

In [25]:
y_train = training_dataframe["Label"]
X_train = training_dataframe.drop(columns=["Label"])

X_train["UserID"] = X_train["UserID"].astype("category")
X_train["ItemID"] = X_train["ItemID"].astype("category")

In [28]:
n_estimators = 50
learning_rate = 1e-1
reg_alpha = 1e-1
reg_lambda = 1e-1
max_depth = 5
max_leaves = 0
grow_policy = "depthwise"
objective = "pairwise"
booster = "gbtree"
use_user_profile = False
random_seed = None

XGB_model = XGBRanker(objective='rank:{}'.format(objective),
                      n_estimators = int(n_estimators),
                      random_state = random_seed,
                      learning_rate = learning_rate,
                      reg_alpha = reg_alpha,
                      reg_lambda = reg_lambda,
                      max_depth = int(max_depth),
                      max_leaves = int(max_leaves),
                      grow_policy = grow_policy,
                      verbosity = 0, # 2 if self.verbose else 0,
                      booster = booster,
                      enable_categorical = True,
                      tree_method = "hist"  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
                      )

XGB_model.fit(X_train,
          y_train,
          group=groups,
          verbose=True)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


ValueError: Experimental support for categorical data is not implemented for current tree method yet.