In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.2)

from IPython.display import clear_output
%matplotlib inline

import pandas as pd
import plotly.express as px
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from sklearn.metrics import accuracy_score, roc_auc_score

from tqdm.notebook import tqdm

import os
import scipy

In [2]:
from metrics import recall_k, ndcg_k, repeat_score_item, repeat_score_user
from boosting.create_dataset import Dataset

In [3]:
# The hyper-parameters below are commented out and therefore have no effect.
# NOTE(review): no visible cell reads them; presumably leftovers from an
# earlier experiment, confirm before deleting.
#history_len=20
#item_embed_size=128
#user_embed_size=32

#h1 = 128
#h2 = 128
#h3 = 128
#h4 = 128
#h5 = 128

# Dataset identifier: used to build the basket CSV paths and passed to Dataset.
dataset_name = 'dunnhumby_cj'

In [4]:
# Locations of the train / test / validation basket CSVs for the chosen dataset,
# relative to the notebook's working directory.
path_train = f'data/{dataset_name}/baskets/train_baskets.csv'
path_test = f'data/{dataset_name}/baskets/test_baskets.csv'
path_val = f'data/{dataset_name}/baskets/valid_baskets.csv'

## Создаем датасет

In [5]:
# Wrap the three basket CSVs in the project's Dataset helper.
# Note the positional order expected here: train, validation, test.
dataset = Dataset(
    path_train,
    path_val,
    path_test,
    dataset=dataset_name,
    history_len=50,
    basket_len=50,
)

Total users: 2483
Total items: 36963


In [6]:
# Build the training frame through the Dataset helper.
# NOTE(review): the recorded output names a `50_train.npz` file, which suggests
# the result is cached on disk; confirm whether it is read or written here.
train = dataset.create_train_data()

boosting/data/dunnhumby_cj/50_train.npz
Done!


In [7]:
# Build the validation frame (same helper as for test, selected via `mode`).
# NOTE(review): the recorded output names a `50_val.npz` file; presumably a
# cache, confirm whether it is read or written here.
val = dataset.create_val_test_data(mode='val')

boosting/data/dunnhumby_cj/50_val.npz
Done!


In [8]:
# Build the test frame (same helper as for validation, selected via `mode`).
# NOTE(review): the recorded output names a `50_test.npz` file; presumably a
# cache, confirm whether it is read or written here.
test = dataset.create_val_test_data(mode='test')

boosting/data/dunnhumby_cj/50_test.npz
Done!


In [9]:
# Sanity check: validation and test frames should have the same column count.
val.shape, test.shape

((177804, 55), (185085, 55))

In [10]:
# Sanity check: size of the training frame (rows, columns).
train.shape

(13974032, 55)

In [11]:
# Prediction helper: turns scored (user, item) rows into one ranked item list
# per user, and adds the users that received no predictions at all.
def predict(df, th=0, users=None):
    """Build a ranked list of recommended items for every requested user.

    Args:
        df: frame with `user_id`, `item_id` and `preds_scores` columns, one
            row per scored (user, item) candidate. The frame is not modified.
        th: minimum score (inclusive) a candidate needs in order to be kept.
        users: optional array-like of user ids that must appear in the result.
            When omitted, the unique users of the notebook-level
            `dataset.test_cleaned` frame are used.

    Returns:
        DataFrame with the columns `user_id` and `item_id`, one row per
        requested user. `item_id` holds a list of item ids ordered by
        descending score; users without any surviving candidate get `[]`.
    """
    if users is None:
        users = dataset.test_cleaned.user_id.unique()
    test_users = pd.DataFrame(users, columns=['user_id'])

    # Cast the ids on a copy so the caller's frame keeps its original dtypes.
    scored = df.assign(
        user_id=df['user_id'].astype(int),
        item_id=df['item_id'].astype(int),
    )
    scored = scored[scored['preds_scores'] >= th]
    scored = scored.sort_values(by='preds_scores', ascending=False)

    # groupby keeps the row order inside each group, so every list stays
    # sorted by descending score.
    ranked = scored.groupby('user_id').agg({'item_id': list}).reset_index()

    # Left join so that users without predictions are still reported.
    res = test_users.merge(ranked, how='left', on='user_id')
    res['item_id'] = res['item_id'].apply(
        lambda items: items if isinstance(items, list) else []
    )
    return res

## LGBM Ranker

In [12]:
from lightgbm import LGBMRanker

In [13]:
# Order every split by basket_id so that the rows of one basket are contiguous;
# the group sizes / group ids handed to the rankers later rely on this order.
train, val, test = (
    frame.sort_values(by='basket_id') for frame in (train, val, test)
)

In [14]:
# Grouping helper for ranking: number of rows per basket.
def get_group_size(df):
    """Return a Series of row counts per `basket_id`, indexed by basket id.

    The counts come back ordered by ascending basket id (groupby's default
    key sorting).
    """
    flat = df.reset_index()
    per_basket = flat.groupby("basket_id")
    return per_basket['basket_id'].count()

In [15]:
# item_id is a categorical feature; basket_id becomes a plain integer key.
# Both columns are accessed through `.sparse`, so they are densified first.
for frame in (train, val, test):
    frame['item_id'] = frame['item_id'].sparse.to_dense().astype('category')
    frame['basket_id'] = frame['basket_id'].sparse.to_dense().astype('int')

In [16]:
# Identifier and target columns are excluded from the model inputs.
non_feature_cols = ['user_id', 'labels', 'basket_id']

X_tr, y_tr = train.drop(columns=non_feature_cols), train['labels']
X_val, y_val = val.drop(columns=non_feature_cols), val['labels']
X_test, y_test = test.drop(columns=non_feature_cols), test['labels']

In [17]:
# Rows per basket for the training and validation splits (ranking groups).
train_groups, val_groups = (get_group_size(frame) for frame in (train, val))

In [18]:
# LightGBM ranker capped at 300 boosting rounds; every other hyper-parameter
# is left at the library default.
ranker = LGBMRanker(n_estimators=300)

In [19]:
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of `fit` is deprecated in LightGBM 3.3 and
# removed in 4.0, whereas the callback works across versions.
# To print validation metrics periodically, add `log_evaluation(period=...)`
# (LightGBM >= 3.3) to the callbacks list.
from lightgbm import early_stopping

ranker.fit(
    X_tr, y_tr, group=train_groups,
    eval_set=[(X_val, y_val)], eval_group=[val_groups],
    eval_at=(5, 10, 20),  # report NDCG@5 / @10 / @20 on the validation set
    callbacks=[early_stopping(stopping_rounds=100)],
)



[1]	valid_0's ndcg@5: 0.503759	valid_0's ndcg@10: 0.528215	valid_0's ndcg@20: 0.562548
Training until validation scores don't improve for 100 rounds
[2]	valid_0's ndcg@5: 0.535838	valid_0's ndcg@10: 0.55774	valid_0's ndcg@20: 0.588782
[3]	valid_0's ndcg@5: 0.538518	valid_0's ndcg@10: 0.55969	valid_0's ndcg@20: 0.59257
[4]	valid_0's ndcg@5: 0.540969	valid_0's ndcg@10: 0.562128	valid_0's ndcg@20: 0.593077
[5]	valid_0's ndcg@5: 0.540405	valid_0's ndcg@10: 0.562367	valid_0's ndcg@20: 0.59312
[6]	valid_0's ndcg@5: 0.541164	valid_0's ndcg@10: 0.563619	valid_0's ndcg@20: 0.594115
[7]	valid_0's ndcg@5: 0.542061	valid_0's ndcg@10: 0.564278	valid_0's ndcg@20: 0.595464
[8]	valid_0's ndcg@5: 0.540783	valid_0's ndcg@10: 0.563419	valid_0's ndcg@20: 0.595117
[9]	valid_0's ndcg@5: 0.540875	valid_0's ndcg@10: 0.563503	valid_0's ndcg@20: 0.595209
[10]	valid_0's ndcg@5: 0.541627	valid_0's ndcg@10: 0.563634	valid_0's ndcg@20: 0.595649
[11]	valid_0's ndcg@5: 0.542221	valid_0's ndcg@10: 0.564331	valid_0's n

[94]	valid_0's ndcg@5: 0.542718	valid_0's ndcg@10: 0.566479	valid_0's ndcg@20: 0.596647
[95]	valid_0's ndcg@5: 0.542908	valid_0's ndcg@10: 0.566746	valid_0's ndcg@20: 0.597026
[96]	valid_0's ndcg@5: 0.542787	valid_0's ndcg@10: 0.566433	valid_0's ndcg@20: 0.596946
[97]	valid_0's ndcg@5: 0.542787	valid_0's ndcg@10: 0.566438	valid_0's ndcg@20: 0.596948
[98]	valid_0's ndcg@5: 0.542797	valid_0's ndcg@10: 0.566637	valid_0's ndcg@20: 0.59671
[99]	valid_0's ndcg@5: 0.542963	valid_0's ndcg@10: 0.566576	valid_0's ndcg@20: 0.596857
[100]	valid_0's ndcg@5: 0.543127	valid_0's ndcg@10: 0.56657	valid_0's ndcg@20: 0.59685
[101]	valid_0's ndcg@5: 0.543278	valid_0's ndcg@10: 0.566584	valid_0's ndcg@20: 0.596858
[102]	valid_0's ndcg@5: 0.543335	valid_0's ndcg@10: 0.567115	valid_0's ndcg@20: 0.596669
[103]	valid_0's ndcg@5: 0.543335	valid_0's ndcg@10: 0.567134	valid_0's ndcg@20: 0.59674
[104]	valid_0's ndcg@5: 0.543243	valid_0's ndcg@10: 0.567144	valid_0's ndcg@20: 0.596767
[105]	valid_0's ndcg@5: 0.54317

[187]	valid_0's ndcg@5: 0.544235	valid_0's ndcg@10: 0.568757	valid_0's ndcg@20: 0.598171
[188]	valid_0's ndcg@5: 0.544392	valid_0's ndcg@10: 0.568261	valid_0's ndcg@20: 0.597931
[189]	valid_0's ndcg@5: 0.544308	valid_0's ndcg@10: 0.568196	valid_0's ndcg@20: 0.597867
[190]	valid_0's ndcg@5: 0.544287	valid_0's ndcg@10: 0.567922	valid_0's ndcg@20: 0.597824
[191]	valid_0's ndcg@5: 0.544003	valid_0's ndcg@10: 0.567753	valid_0's ndcg@20: 0.597695
[192]	valid_0's ndcg@5: 0.543972	valid_0's ndcg@10: 0.567613	valid_0's ndcg@20: 0.597716
[193]	valid_0's ndcg@5: 0.54401	valid_0's ndcg@10: 0.567705	valid_0's ndcg@20: 0.597853
[194]	valid_0's ndcg@5: 0.543864	valid_0's ndcg@10: 0.567556	valid_0's ndcg@20: 0.597852
[195]	valid_0's ndcg@5: 0.543819	valid_0's ndcg@10: 0.567515	valid_0's ndcg@20: 0.597838
[196]	valid_0's ndcg@5: 0.543873	valid_0's ndcg@10: 0.567602	valid_0's ndcg@20: 0.597872
[197]	valid_0's ndcg@5: 0.54399	valid_0's ndcg@10: 0.567725	valid_0's ndcg@20: 0.597801
[198]	valid_0's ndcg@5:

LGBMRanker(n_estimators=300)

### Важность признаков (LGBM Ranker)

In [20]:
# Feature importances reported by the fitted ranker, most important first.
importance_by_feature = dict(zip(X_tr.columns, ranker.feature_importances_))
ranked_importances = sorted(
    importance_by_feature.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
dict(ranked_importances)

{'item_id': 3515,
 49: 273,
 'basket_date': 186,
 48: 161,
 47: 139,
 46: 93,
 45: 68,
 44: 41,
 43: 37,
 39: 33,
 42: 29,
 40: 25,
 34: 21,
 41: 21,
 36: 18,
 38: 18,
 37: 16,
 33: 13,
 35: 13,
 32: 12,
 0: 11,
 25: 10,
 4: 9,
 30: 9,
 31: 9,
 6: 8,
 9: 8,
 12: 8,
 21: 8,
 5: 7,
 26: 7,
 28: 6,
 20: 5,
 23: 5,
 24: 5,
 29: 5,
 2: 4,
 10: 4,
 19: 4,
 22: 4,
 11: 3,
 13: 3,
 15: 3,
 1: 2,
 7: 2,
 14: 2,
 16: 2,
 18: 2,
 8: 1,
 17: 1,
 27: 1,
 3: 0}

In [21]:
# Score every candidate row of the test split with the fitted ranker.
preds = ranker.predict(X_test)
# ROC AUC here is pooled over all candidate rows, not averaged per basket.
roc_auc_score(y_test, preds)

0.7766981149115562

In [22]:
# Attach the scores to the (user, item) pairs of the test split.
res = test[['user_id', 'item_id']].copy()
res['preds_scores'] = preds
# A threshold below the minimum score keeps every candidate: items are only
# ranked here, not filtered.
result = predict(res, th = preds.min()-1)

  res = df.groupby('user_id').agg({'item_id': list}).reset_index()


In [23]:
#result['item_id'] = result['item_id'].fillna("").apply(list)

In [24]:
# Ground-truth test basket per user, read from the raw test CSV.
test_baskets = pd.read_csv(path_test)
user_test_baskets_df = test_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
user_test_baskets_dict = dict(zip(user_test_baskets_df['user_id'], user_test_baskets_df['item_id']))

# One lookup table instead of re-filtering `result` for every user and every k.
# drop_duplicates keeps the first row per user, like the former `.values[0]`.
predicted_items = result.drop_duplicates('user_id').set_index('user_id')['item_id']

print('predictions ready', result.user_id.nunique())
print('number of final test users:',result.user_id.nunique())
for k in [5,10,20,'B']:
    print(k)
    recall_scores = {}
    ndcg_scores = {}
    for user, top_items in predicted_items.items():
        true_items = user_test_baskets_dict[user]
        # 'B' cuts the ranking at the size of the user's true basket.
        cutoff = len(true_items) if k == 'B' else k
        recall_scores[user] = recall_k(true_items, top_items, cutoff)
        ndcg_scores[user] = ndcg_k(true_items, top_items, cutoff)
    print('recall:',np.mean(list(recall_scores.values())))
    print('ndcg:',np.mean(list(ndcg_scores.values())))

predictions ready 1224
number of final test users: 1224
5
recall: 0.1286724176006153
ndcg: 0.18645087578871805
10
recall: 0.16508327518318464
ndcg: 0.15243571079558688
20
recall: 0.2097245019043515
ndcg: 0.11935744124234535
B
recall: 0.15028780622183388
ndcg: 0.16846776049185638


## CatBoost Ranker

In [25]:
from catboost import CatBoostRanker, Pool

In [26]:
#X_tr['item_id'] = X_tr['item_id'].astype('str')
#X_val['item_id'] = X_val['item_id'].astype('str')
#X_test['item_id'] = X_test['item_id'].astype('str')

In [27]:
# Both pools share the same categorical column and use basket_id as the
# query/group identifier (rows were sorted by basket_id earlier).
cat_cols = ['item_id']

train_p = Pool(data=X_tr, label=y_tr, group_id=train['basket_id'], cat_features=cat_cols)
val_p = Pool(data=X_val, label=y_val, group_id=val['basket_id'], cat_features=cat_cols)

In [50]:
# CatBoost pairwise ranker (PairLogit loss) monitored with NDCG on the eval set.
# NOTE(review): `iterations=10` caps boosting at ten trees; in the recorded run
# the best iteration is the last one, so validation NDCG was still improving
# when training stopped. Consider raising `iterations`.
# NOTE(review): this rebinds `ranker`, which previously held the LightGBM model.
ranker = CatBoostRanker(cat_features = ['item_id'], iterations=10, 
                        eval_metric='NDCG', loss_function='PairLogit',
                        #custom_metric = ['RecallAt:top=5', 'RecallAt:top=10','RecallAt:top=20', 
                        #                   'NDCG:top=5', 'NDCG:top=10', 'NDCG:top=20'],
                        one_hot_max_size=300)

In [51]:
# Fit on the training pool and track the metric on the validation pool.
# NOTE(review): the ranker is configured with only 10 iterations, so a patience
# of 10 rounds presumably can never trigger; confirm against CatBoost's
# overfitting-detector semantics or raise the iteration budget.
ranker.fit(train_p, eval_set = val_p,
           early_stopping_rounds=10)

0:	test: 0.6070670	best: 0.6070670 (0)	total: 51.5s	remaining: 7m 43s
1:	test: 0.6340900	best: 0.6340900 (1)	total: 1m 51s	remaining: 7m 27s
2:	test: 0.6456531	best: 0.6456531 (2)	total: 2m 59s	remaining: 6m 58s
3:	test: 0.6500381	best: 0.6500381 (3)	total: 4m 14s	remaining: 6m 21s
4:	test: 0.6505601	best: 0.6505601 (4)	total: 5m 21s	remaining: 5m 21s
5:	test: 0.6514450	best: 0.6514450 (5)	total: 6m 34s	remaining: 4m 23s
6:	test: 0.6519955	best: 0.6519955 (6)	total: 7m 44s	remaining: 3m 19s
7:	test: 0.6542293	best: 0.6542293 (7)	total: 8m 53s	remaining: 2m 13s
8:	test: 0.6557606	best: 0.6557606 (8)	total: 9m 54s	remaining: 1m 6s
9:	test: 0.6566788	best: 0.6566788 (9)	total: 10m 55s	remaining: 0us

bestTest = 0.6566788376
bestIteration = 9



<catboost.core.CatBoostRanker at 0x7fb8e92fd310>

In [52]:
# Score every candidate row of the test split with the fitted CatBoost ranker.
preds = ranker.predict(X_test)
# ROC AUC here is pooled over all candidate rows, not averaged per basket.
roc_auc_score(y_test, preds)

0.7638481642710191

In [53]:
# Attach the CatBoost scores to the (user, item) pairs of the test split.
res = test[['user_id', 'item_id']].copy()
res['preds_scores'] = preds
# A threshold below the minimum score keeps every candidate: items are only
# ranked here, not filtered.
result = predict(res, th = preds.min()-1)

  res = df.groupby('user_id').agg({'item_id': list}).reset_index()


In [49]:
# NOTE(review): this cell's recorded execution count (49) precedes the CatBoost
# fit / predict cells (50-53), so the saved output may have been computed from
# an earlier `result`. Re-run this cell after `result` is rebuilt.

# Ground-truth test basket per user, read from the raw test CSV.
test_baskets = pd.read_csv(path_test)
user_test_baskets_df = test_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
user_test_baskets_dict = dict(zip(user_test_baskets_df['user_id'], user_test_baskets_df['item_id']))

# One lookup table instead of re-filtering `result` for every user and every k.
# drop_duplicates keeps the first row per user, like the former `.values[0]`.
predicted_items = result.drop_duplicates('user_id').set_index('user_id')['item_id']

print('predictions ready', result.user_id.nunique())
print('number of final test users:',result.user_id.nunique())
for k in [5,10,20,'B']:
    print(k)
    recall_scores = {}
    ndcg_scores = {}
    for user, top_items in predicted_items.items():
        true_items = user_test_baskets_dict[user]
        # 'B' cuts the ranking at the size of the user's true basket.
        cutoff = len(true_items) if k == 'B' else k
        recall_scores[user] = recall_k(true_items, top_items, cutoff)
        ndcg_scores[user] = ndcg_k(true_items, top_items, cutoff)
    print('recall:',np.mean(list(recall_scores.values())))
    print('ndcg:',np.mean(list(ndcg_scores.values())))

predictions ready 1224
number of final test users: 1224
5
recall: 0.10936589371002715
ndcg: 0.1744576899857848
10
recall: 0.15347440692784745
ndcg: 0.14549135937490631
20
recall: 0.19890798093295847
ndcg: 0.11406264366423985
B
recall: 0.11871559087978246
ndcg: 0.13761358335925986
