# Odporúčanie

In [56]:
# dependencies
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import wandb
from wandb.xgboost import wandb_callback

from _ import constants
from _.functions import preprocess

Načítame predspracované dáta (prípadne predspracujeme)

In [5]:
# preprocess data if not exists
if not constants.PREPROCESSED.exists():
    df = preprocess(constants.DROPPED, constants.TRAIN, constants.METADATA,)
    df.to_parquet(constants.PREPROCESSED, index=False)
else:
    df = pd.read_parquet(constants.PREPROCESSED)

Nainicializujeme Weights and Biases (logovací nástroj pre strojové učenie)

In [8]:
wandb.login()

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/feri/.netrc


True

Rozdelíme dátovú sadu na trénovaciu a validačnú

In [9]:
df_train, df_valid = train_test_split(df, test_size=0.2)

In [10]:
df_train

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,current_filters,impressions,prices,impression_index,price,mean_price,device_desktop,device_mobile,device_tablet
5031312,3FWYPHE85TRC,101198,1541380615,36,4,3183561,2,4118,,,,,,,0,1,0
535973,WV5XYLK2TWQA,428060,1541121990,8,1,1668615,7,4706,[Best Value],"[1832001, 1668615, 1391336, 2569642, 1241054, ...","[29, 30, 26, 37, 37, 33, 37, 45, 30, 36, 78, 5...",1.0,30.0,42.04,1,0,0
3882567,87QQ3SHBP6LY,291003,1541099566,37,4,104808,7,26967,,,,,,,0,1,0
5477711,TU89APV44P9G,444566,1541531184,29,4,477751,12,5049,,,,,,,1,0,0
37370,LAVM6HJ6PD4B,104538,1541383802,37,4,4198568,7,16379,,,,,,,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8800092,HPVGSX1Q50FS,415858,1541373073,26,4,54117,8,11459,,,,,,,1,0,0
4478541,14YVK1DETU6H,384346,1541106063,15,4,2396262,50,16226,,,,,,,0,1,0
3973798,5NJ7Q9WFU53W,368273,1541106621,38,4,3972924,51,9044,,,,,,,1,0,0
1425355,LA70UI16N258,581922,1541178420,33,4,2937322,14,1827,,,,,,,1,0,0


Vytvoríme DMatrix (vstupný formát pre XGBoost)

In [37]:
def getDMatrix(df, label, labels_to_drop):
    data = df.drop(labels_to_drop, axis=1)
    data.fillna("0", inplace=True)
    
    for col in list(set(data.columns.values.tolist())):
        try:
            data[col] = pd.to_numeric(data[col])
        except:
            data[col] = -1


    return xgb.DMatrix(
        data=data,
        label=data[label].values,
        nthread=-1,
        feature_names=data.columns.tolist(),
    )

In [42]:
label = 'device_desktop'

labels_to_drop = [
    'user_id',
    'session_id', 
    'step',
]

dm_train = getDMatrix(df_train, label, labels_to_drop)
dm_valid = getDMatrix(df_valid, label, labels_to_drop)

Spustíme trénovanie

In [43]:
params={
  'eta': 0.02, 
  'booster': 'gbtree',
  'tree_method':'hist',
  'max_leaves': 350,
  'max_depth': 10,
  'nthread': -1,
  'subsample': 0.9,
  'colsample_bytree': 0.8,
  'colsample_bylevel': 0.8,
  'min_child_weight': 2,
  'alpha': 1,
  'objective': 'binary:logistic',
  'eval_metric': 'logloss',
  'random_state': 5478,
  'verbosity': 0,
}


run = 1
notes ='Initial training'
wandb_run = wandb.init(project="dp-xgboost-rec",name=f'run_{run}', notes=notes)
wandb_run.config.update(params)

clf = xgb.train(
    params=params,
    dtrain=dm_train,
    num_boost_round=300, #11927
    early_stopping_rounds=10,
    evals=[ (dm_train, 'train'), (dm_valid, 'valid')],
    verbose_eval=10,
    callbacks=[wandb_callback()]
)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…



Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.
Will train until valid-logloss hasn't improved in 10 rounds.
0	train-logloss:0.68110	valid-logloss:0.67226
10	train-logloss:0.52744	valid-logloss:0.52479
20	train-logloss:0.40623	valid-logloss:0.40613
30	train-logloss:0.32536	valid-logloss:0.32539
40	train-logloss:0.25789	valid-logloss:0.25785
50	train-logloss:0.20502	valid-logloss:0.20575
60	train-logloss:0.16859	valid-logloss:0.16826
70	train-logloss:0.13606	valid-logloss:0.13624
80	train-logloss:0.11072	valid-logloss:0.11068
90	train-logloss:0.09310	valid-logloss:0.09303
100	train-logloss:0.07845	valid-logloss:0.07841
110	train-logloss:0.06376	valid-logloss:0.06375
120	train-logloss:0.05190	valid-logloss:0.05188
130	train-logloss:0.04227	valid-logloss:0.04226
140	train-logloss:0.03462	valid-logloss:0.03463
150	train-logloss:0.02825	valid-logloss:0.02826
160	train-logloss:0.02333	valid-logloss:0.02332
170	train-logloss:0.01939	valid-logloss:0.019

Vypočítame si MRR na dátach

In [57]:
def compute_mean_reciprocal_rank(rs):
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

def evaluate(df_valid, dm_valid, label, clf):
    df_valid.loc[:,'scores'] = clf.predict(dm_valid)[:df_valid.shape[0]]
    groups = df_valid.groupby(by=['session_id'])
    rss = []
    
    for _, g in groups:
        scores = g.scores
        sorted_arg = np.flip(np.argsort(scores))
        rss.append(g[label].values[sorted_arg])

    mrr = compute_mean_reciprocal_rank(rss)
    return mrr

mrr = evaluate(df_valid, dm_valid, label, clf)

print("MRR score: ", mrr)
wandb.log({"mrr":mrr})

MRR score:  0.3660414119012581


Zobrazíme si dôležitosť jednotlivých atribútov na rozhodovanie algoritmu XGBoost

In [61]:
imp = clf.get_score( importance_type='gain')
imp_df = pd.DataFrame.from_dict(imp, orient='index').reset_index()

imp_df.columns=['name','importance']
imp_df.sort_values('importance', ascending=False, inplace=True)

print(imp_df.head(20))
wandb.log({"importance":dict(imp_df[:10].values)})

               name     importance
7    device_desktop  637333.250000
8     device_mobile  422360.437500
9     device_tablet   76941.710938
0         timestamp     154.500259
2          platform     133.275635
1       action_type      82.746422
6        mean_price      79.039299
5             price      35.720272
3              city      12.827970
4  impression_index       5.405089


In [70]:
clf.save_model(constants.DATA_DIR / f'run_{run}.model')
wandb_run.finish()