In [2]:
#!pip install -r requirements.txt
!pip install catboost

Collecting catboost
  Using cached catboost-1.1.1-cp37-none-manylinux1_x86_64.whl (76.6 MB)
Collecting graphviz
  Using cached graphviz-0.20.1-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.1.1 graphviz-0.20.1
[0m

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from matplotlib import pyplot
import recmetrics
import sys
import pprint
import tqdm
import catboost

from prediction import LtrPrediction
from processing import DataPreprocessing
import ltr_utils as ut

## Data Sampling

In [4]:
dv_train = pd.read_csv('recsys_data/train.csv')
x = dv_train[dv_train['response'] == 1].groupby('ip').size().reset_index(name='num_clicks').sort_values(by=['num_clicks'], ascending=False)['ip'].head(100)
df_ip = pd.DataFrame({'ip':x})
dv_train = pd.merge(dv_train,df_ip,on='ip')

In [5]:
dv_train.nunique()

ip            100
event_date     10
tcm_id         66
response        2
dtype: int64

In [6]:
len(dv_train)

8090

In [7]:
dv_test = pd.read_csv('recsys_data/test_all.csv')
dv_test = pd.merge(dv_test,df_ip,on='ip')

In [8]:
dv_test = dv_test.drop(['Unnamed: 0'], axis=1)
dv_test.nunique()

ip            100
event_date      1
tcm_id         66
response        2
dtype: int64

In [9]:
len(dv_test)

6600

In [10]:
dv_test = dv_test.head(1000)
dv_train = dv_train.head(1000)

In [11]:
dv_train.to_csv('outputs/sample_train.csv')
dv_test.to_csv('outputs/sample_test.csv')

## Data Preprocessing

In [12]:
data_dict = {
    'train' : 'outputs/sample_train.csv',
    'test' : 'outputs/sample_test.csv',
    'articles' : 'recsys_data/articles.csv',
    'users' : 'recsys_data/users.csv'
}

In [13]:
DataPrep = DataPreprocessing(data_dict)

(458, 19) (869, 19)


In [14]:
DataPrep.init_KG('outputs/KGs/KG_dep_parsing_100terms.txt')

In [15]:
DataPrep.init_SentenceTransformer('all-MiniLM-L6-v2')

In [16]:
train_data_df, test_data_df = DataPrep.fit_data_pipeline()

100%|██████████| 869/869 [00:31<00:00, 27.98it/s]
100%|██████████| 869/869 [00:06<00:00, 140.50it/s]
100%|██████████| 458/458 [00:14<00:00, 32.27it/s]
100%|██████████| 458/458 [00:03<00:00, 131.22it/s]


In [17]:
processed_data_dict = DataPrep.get_train_test_split(['art','usr','kg','st'])

(869, 151) (869, 1) (458, 151) (458, 1)


In [18]:
X_train, y_train, groups_train, qid_train, X_test, y_test, qid_test, groups_test = processed_data_dict['train_test']
test_data = processed_data_dict['test_data']

## Model Training

In [21]:
train = catboost.Pool(
        data=X_train,
        label=y_train,
        group_id=qid_train
    )

test = catboost.Pool(
    data=X_test,
    label=y_test,
    group_id=qid_test
)

parameters  = {
    "task_type":"GPU",
    "has_time":True,
    "save_snapshot":False,
    "use_best_model":True, # requires eval set to be set
    "loss_function":'PairLogitPairwise',
    'custom_metric': ['MAP:top=10'],

} 

model = catboost.CatBoostRanker(**parameters)
model.fit(train, eval_set=test, plot=True,verbose=False)

test_score = np.mean(model.get_evals_result()["learn"]['MAP:top=10'])

print('default Test  :  ',test_score)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because MAP is/are not implemented for GPU
Metric MAP:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


default Test  :   0.9235080549632788


## Hyperparameter Tuning

In [22]:
def objective(trial):

        metric = "map@10"

        param = {
                "task_type":"GPU",
                "has_time":True,
                "save_snapshot":False,
                "use_best_model":True, # requires eval set to be set
                "loss_function":'PairLogitPairwise',
                'custom_metric': ['MAP:top=10'],
                "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e0),
                "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 1e0),
                "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
                "depth": trial.suggest_int("depth", 1, 8),
                # "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
            }
        
        gbm = catboost.CatBoostRanker(**param)

        # pruning_callback = optuna.integration.CatBoostPruningCallback(trial, "MAP:top=10")
        gbm.fit(train, 
            eval_set=test,
            verbose=0,
            early_stopping_rounds=100,
            # callbacks=[pruning_callback],
        )


        test_score = np.mean(gbm.get_evals_result()["learn"]['MAP:top=10'])

        return test_score

# optuna.logging.WARNING
optuna.logging.set_verbosity(optuna.logging.WARNING)

pruner = optuna.pruners.MedianPruner(n_warmup_steps=20)
study = optuna.create_study(pruner=pruner, direction='maximize')
study.optimize(objective, n_trials=50)

RSM on GPU will work only for non-binary features. Plus current implementation will sample by groups, so this could slightly affect quality in positive or negative way
RSM on GPU will work only for non-binary features. Plus current implementation will sample by groups, so this could slightly affect quality in positive or negative way
RSM on GPU will work only for non-binary features. Plus current implementation will sample by groups, so this could slightly affect quality in positive or negative way
Default metric period is 5 because MAP is/are not implemented for GPU
Metric MAP:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
RSM on GPU will work only for non-binary features. Plus current implementation will sample by groups, so this could slightly affect quality in positive or negative way
RSM on GPU will work only for non-binary features. Plus current implementation will sample by groups, so this could slightly affec

In [23]:
print(f"Optimized MAP: {study.best_value:.4f}")
print("Best params:")
for key, value in study.best_params.items():
    print(f"\t{key}: {value}")

Optimized MAP: 0.9573
Best params:
	learning_rate: 0.9444081577388139
	l2_leaf_reg: 0.44684282200200404
	colsample_bylevel: 0.03693837102533545
	depth: 8
	min_data_in_leaf: 19


In [24]:
d_param = {
            "task_type":"GPU",
            "has_time":True,
            "save_snapshot":False,
            "use_best_model":True, # requires eval set to be set
            "loss_function":'PairLogitPairwise',
            'custom_metric': ['MAP:top=10']}
best_params = study.best_params
best_params.update(d_param)

bst_gbm = catboost.CatBoostRanker(**best_params)


bst_gbm.fit(train, 
    eval_set=test,
    verbose=0,
    early_stopping_rounds=100
)

test_score = np.mean(bst_gbm.get_evals_result()["learn"]['MAP:top=10'])
print('Optimized Test :  ',test_score)

RSM on GPU will work only for non-binary features. Plus current implementation will sample by groups, so this could slightly affect quality in positive or negative way
RSM on GPU will work only for non-binary features. Plus current implementation will sample by groups, so this could slightly affect quality in positive or negative way
RSM on GPU will work only for non-binary features. Plus current implementation will sample by groups, so this could slightly affect quality in positive or negative way
Default metric period is 5 because MAP is/are not implemented for GPU
Metric MAP:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Optimized Test :   0.957761643118786


## Model Evaluation

In [26]:
xgb_Prediction = LtrPrediction(bst_gbm, test_data)

In [27]:
results = xgb_Prediction.evaluate()
results

{'MAP@10': 0.18659737969261778,
 'Prediction_Coverage': 52.94,
 'Catalog_Coverage': 52.94,
 'Novelty': 0.36456050799499634,
 'personalization': 0.500952380952381}

In [28]:
top_k_best_score_ips = xgb_Prediction.get_recomendation()

In [29]:
tcm_le = DataPrep.return_inverse_transform()
top_k_best_score_ips['article_actual_ids'] = top_k_best_score_ips.apply(lambda row: tcm_le.inverse_transform(row['article_actual']) ,axis=1)
top_k_best_score_ips['article_recomended_ids'] = top_k_best_score_ips.apply(lambda row: tcm_le.inverse_transform(row['article_prediction']) ,axis=1)

In [30]:
pd.set_option('max_columns', None)
pd.set_option('display.max_colwidth', None)
top_k_best_score_ips[['article_actual_ids','article_recomended_ids','apk']].sort_values(by='apk',ascending = False).head(10)

Unnamed: 0_level_0,article_actual_ids,article_recomended_ids,apk
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
503477,"[tcm:526-656536, tcm:526-12180, tcm:526-674187, tcm:526-681575, tcm:526-16645, tcm:526-208086, tcm:526-209028, tcm:526-22867, tcm:526-273898, tcm:526-40469]","[tcm:526-22867, tcm:526-40469, tcm:526-209028, tcm:526-16645, tcm:526-551763, tcm:526-187302, tcm:526-273898, tcm:526-261006, tcm:526-682379, tcm:526-12174]",0.471429
452530,"[tcm:526-22867, tcm:526-10838, tcm:526-209028, tcm:526-712788]","[tcm:526-22867, tcm:526-187302, tcm:526-16645, tcm:526-209028, tcm:526-273898, tcm:526-40469, tcm:526-551763, tcm:526-10838, tcm:526-12174, tcm:526-261006]",0.46875
1602525,"[tcm:526-116442, tcm:526-16645, tcm:526-187302, tcm:526-209028, tcm:526-224772, tcm:526-249037, tcm:526-40469, tcm:526-46242]","[tcm:526-22867, tcm:526-40469, tcm:526-209028, tcm:526-16645, tcm:526-551763, tcm:526-187302, tcm:526-273898, tcm:526-261006, tcm:526-682379, tcm:526-12174]",0.322917
395075,"[tcm:526-12167, tcm:526-674187, tcm:526-149043, tcm:526-16645, tcm:526-18025, tcm:526-209028, tcm:526-261006, tcm:526-273898, tcm:526-276414]","[tcm:526-22867, tcm:526-209028, tcm:526-40469, tcm:526-273898, tcm:526-149043, tcm:526-16645, tcm:526-551763, tcm:526-187302, tcm:526-385364, tcm:526-276414]",0.307407
1523358,"[tcm:526-10838, tcm:526-656536, tcm:526-681575, tcm:526-688510, tcm:526-707567, tcm:526-224772, tcm:526-22867, tcm:526-46242]","[tcm:526-22867, tcm:526-10838, tcm:526-16645, tcm:526-40469, tcm:526-209028, tcm:526-273898, tcm:526-187302, tcm:526-551763, tcm:526-261006, tcm:526-385364]",0.25
1356756,"[tcm:526-13782, tcm:526-16645, tcm:526-684643, tcm:526-199040, tcm:526-231792, tcm:526-80807, tcm:526-40469]","[tcm:526-80807, tcm:526-388698, tcm:526-707567, tcm:526-682379, tcm:526-12174, tcm:526-385364, tcm:526-13782, tcm:526-261006, tcm:526-684643, tcm:526-674187]",0.231293
223190,"[tcm:526-551763, tcm:526-191289, tcm:526-707567, tcm:526-22867, tcm:526-231792, tcm:526-244935, tcm:526-249037, tcm:526-276414]","[tcm:526-276414, tcm:526-12174, tcm:526-682379, tcm:526-385364, tcm:526-261006, tcm:526-707567, tcm:526-80807, tcm:526-388698, tcm:526-70899, tcm:526-782343]",0.166667
379682,"[tcm:526-12167, tcm:526-13782, tcm:526-149043, tcm:526-784445, tcm:526-244935, tcm:526-273898, tcm:526-346380]","[tcm:526-22867, tcm:526-209028, tcm:526-149043, tcm:526-551763, tcm:526-273898, tcm:526-187302, tcm:526-16645, tcm:526-40469, tcm:526-13782, tcm:526-385364]",0.152381
60747,"[tcm:526-551763, tcm:526-12174, tcm:526-149043, tcm:526-681575, tcm:526-682576, tcm:526-18025, tcm:526-93195, tcm:526-259315, tcm:526-276414]","[tcm:526-80807, tcm:526-388698, tcm:526-707567, tcm:526-682379, tcm:526-12174, tcm:526-385364, tcm:526-276414, tcm:526-18025, tcm:526-674187, tcm:526-681575]",0.140079
47811,"[tcm:526-10787, tcm:526-670542, tcm:526-18025, tcm:526-684643, tcm:526-191289, tcm:526-239640, tcm:526-244935, tcm:526-418532]","[tcm:526-385364, tcm:526-261006, tcm:526-670542, tcm:526-682379, tcm:526-684643, tcm:526-707567, tcm:526-388698, tcm:526-80807, tcm:526-12180, tcm:526-70899]",0.091667
