### Imports

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import time 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
import catboost as cb

import sys, os

def current_execute_directory():
    try:
        return os.path.dirname(os.path.abspath(__file__))
    except NameError:
        print('working in jupyter')
        return globals()['_dh'][0]

current_directory = current_execute_directory()
# Написанные нами функции
additional_functions_path = os.path.join(current_directory, os.pardir)
sys.path.insert(0, additional_functions_path)

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, process_user_item_features, calc_precision, calc_recall
from src.recommenders import MainRecommender, SecondLevelRecommendation
from src.PreprocessMatcherData import PreprocessMatcherData
from src.RecCandidates import RecommenderCandidates
from src import config as cfg
import pickle
MODELS_PATH = os.path.join(current_directory, os.pardir, 'models')

working in jupyter


### Read data

In [4]:
DATA_PATH = 'data/'
data = pd.read_csv(os.path.join(current_directory, 
                                os.pardir,  
                                DATA_PATH,
                                'retail_train.csv'))
item_features = pd.read_csv(os.path.join(current_directory, 
                                         os.pardir,  
                                         DATA_PATH,
                                         'product.csv'))
user_features = pd.read_csv(os.path.join(current_directory, 
                                         os.pardir,  
                                         DATA_PATH,
                                         'hh_demographic.csv'))

Here is how the **fact table** looks like:

In [3]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


And here are the **descriptive** datasets:

In [4]:
item_features.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [5]:
user_features.head()

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [5]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N - Neighbors
N_PREDICT = 50

### Preprocess 1st level dataset with PreprocessMatcherData class

In [33]:
preprocess_start_time = time.time()
print('{:*^46}'.format('Preprocess first-level dataset'))
preprocess = PreprocessMatcherData(data, item_features, user_features)
train_matcher = preprocess.train_matcher
val_matcher = preprocess.val_matcher

preprocess_end_time = time.time()
preprocess_time = preprocess_end_time - preprocess_start_time
print('Data preprocessed in \
      {} min {:.2f} sec'.format(int(preprocess_time//60), preprocess_time%60))

********Preprocess first-level dataset********
input data splitted:
train_matcher: 1-83 weeks
val_matcher: 84-91 weeks
train_ranker: 84-91 weeks
val_ranker: 92-95 weeks
== Starting prefilter info ==
shape: (2050409, 12)
# users: 2498
# items: 82583
Sparsity: 0.994%
== Ending prefilter info ==
shape: (587125, 13)
# users: 2471
# items: 5000
Sparsity: 4.752%
new_columns: {'price'}
Data preprocessed in       0 min 21.55 sec


### Get candidates from the 1st level recommendation model

In [8]:
candidates_start_time = time.time()
print('{:*^46}'.format(' Candidates processing '))
candidates_process = RecommenderCandidates(train_matcher, val_matcher) # fit 1st model
candidates_df = candidates_process.get_candidates(
     val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index(name=ACTUAL_COL),
     unstack=True)

candidates_end_time = time.time()
candidates_time = candidates_end_time - candidates_start_time
print('Got candidates in \
     {} min {:.2f} sec'.format(int(candidates_time//60), candidates_time%60))

pickle.dump(candidates_process, open('../data/candidates_process.pkl', 'wb'))
pickle.dump(candidates_df, open('../data/candidates_df.pkl', 'wb'))
candidates_df

*********** Candidates processing ************


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

Got candidates in      28 min 10.96 sec


Unnamed: 0,user_id,item_id
0,0,924423
0,0,13672182
0,0,848761
0,0,977545
0,0,1097792
...,...,...
2207,2207,948626
2207,2207,926917
2207,2207,10254382
2207,2207,1092948


In [7]:
candidates_df = pickle.load(open('semi_finished_data/candidates_df.pkl', 'rb'))
candidates_process = pickle.load(open('semi_finished_data/candidates_process.pkl', 'rb'))

In [7]:
candidates_df.user_id.unique().shape

(2208,)

### Catboost gridsearch

In [8]:
scnd_lvl_model = SecondLevelRecommendation(
    balanced=True, 
    ranker_train=preprocess.train_ranker, 
    ranker_val=preprocess.val_ranker, 
    candidates_df=candidates_df, 
    df_matcher=pd.concat([preprocess.train_matcher,
    preprocess.val_matcher]),
    item_features=preprocess.item_features,
    user_features=preprocess.user_features
)

In [9]:
best, cv_res = scnd_lvl_model.catboost_gridsearch()
print(best)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 193ms	remaining: 9.48s
1:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 337ms	remaining: 8.08s
2:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 413ms	remaining: 6.48s
3:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 464ms	remaining: 5.34s
4:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 506ms	remaining: 4.56s
5:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 539ms	remaining: 3.95s
6:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 579ms	remaining: 3.55s
7:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 610ms	remaining: 3.2s
8:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 635ms	remaining: 2.89s
9:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 699ms	remaining: 2.8s
10:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0)	total: 748ms	remaining: 2.65s
11:	learn: 0.9726919	test: 0.9714081	best: 0.9714081 (0

### Fit 2nd level model (catboost) with SecondLevelRecommendation class

In [11]:
params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'depth': 6,
        'l2_leaf_reg': 3,
        'iterations': 500,
        'learning_rate': 0.1,
        'verbose': False
        }

cb_start_time = time.time()
print('{:*^46}'.format('Train second-level model'))
cb_model = cb.CatBoostClassifier(**params)
second_lvl_model = SecondLevelRecommendation(
                                              balanced=True,
                                              ranker_train=preprocess.train_ranker,
                                              ranker_val=preprocess.val_ranker,
                                              candidates_df=candidates_df,
                                              df_matcher=pd.concat([preprocess.train_matcher,
                                                                   preprocess.val_matcher]),
                                              item_features=preprocess.item_features,
                                              user_features=preprocess.user_features
                                            )
second_lvl_model.fit(model=cb_model)
cb_end_time = time.time()
cb_time = cb_end_time - cb_start_time
print('Trained second level model in \
      {} min {:.2f} sec'.format(int(cb_time//60), cb_time%60))

pickle.dump(second_lvl_model, open('../models/second_lvl_fitted.pkl', 'wb'))

***********Train second-level model***********
Trained second level model in       2 min 28.46 sec


In [17]:
second_lvl_model = pickle.load(open('../models/second_lvl_fitted.pkl', 'rb'))
second_lvl_model.model.is_fitted()

True

### Test Evaluation

In [35]:
test_df = pd.read_csv('../data/retail_test1.csv')
test_df_common = test_df[test_df.user_id.isin(preprocess.common_users)]
result_eval_ranker = test_df_common.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]

# get candidates for test users
test_candidates = candidates_process.get_candidates(result_eval_ranker, unstack=True)
test_candidates = test_candidates.groupby(USER_COL)[ITEM_COL].unique().reset_index(name='candidates')

result_eval_ranker = result_eval_ranker.merge(test_candidates, on='user_id')

In [37]:
result_eval_ranker

Unnamed: 0,user_id,actual,candidates
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[12263614, 983936, 6979427, 1048383, 1021152, ..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[965982, 929472, 972041, 1082544, 969463, 1341..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[932182, 1065039, 908105, 1009449, 1088634, 82..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[1874568, 962568, 1098066, 1075470, 909894, 88..."
4,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[1874568, 999971, 928049, 900370, 868764, 8199..."
...,...,...,...
1383,1857,"[9297370, 893501, 925258, 1092026]","[874877, 963987, 1044971, 1049998, 887393, 837..."
1384,1858,"[834993, 843819, 1058997, 1061522, 1081177, 81...","[1106523, 1070015, 1037894, 912233, 928049, 10..."
1385,1861,"[844179, 863447, 926025, 964968, 986150, 99524...","[828139, 9364200, 967476, 13417591, 1114992, 8..."
1386,1862,"[854852, 834484, 848071, 970202, 1025650, 1049...","[993913, 905582, 1100125, 872729, 1031697, 831..."


### Dealing with new users

In [20]:
# concat new users to get candidates by baseline
new_users = test_df[~test_df[cfg.USER_COL].isin(test_df_common[cfg.USER_COL])].groupby([cfg.USER_COL]).item_id.unique().reset_index(name=ACTUAL_COL)
conc = pd.concat([result_eval_ranker, new_users], ignore_index=True)
conc.loc[conc['candidates'].isna(), 'candidates'] = \
     conc[conc['candidates'].isna()][USER_COL].apply(
        lambda x: candidates_process.model.get_similar_items_recommendation(x, 5)
        )
result_eval_ranker = conc
conc_df = conc[[USER_COL, 'candidates']]
conc_df.columns = [USER_COL, ITEM_COL]
dt = []

for i in conc_df.itertuples():
    lst = i[2]
    for col2 in lst:
        dt.append([i[1], col2])

second_lvl_model.balanced = True
test_candidates = pd.DataFrame(data=dt, columns=conc_df.columns)
df_ranker_test = second_lvl_model.merge_candidates_ranker_data(test_candidates, second_lvl_model.ranker_train)
df_ranker_test.target.value_counts()

0.0    26113
1.0     1211
Name: target, dtype: int64

In [30]:
X_test = df_ranker_test.drop('target', axis=1)
result_metrics = {}


preds = second_lvl_model.predict_proba(X_test[second_lvl_model.model.feature_names_])
df_ranker_predict = X_test[[USER_COL, ITEM_COL]].copy()
df_ranker_predict['proba_item_purchase'] = preds[:,1]
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[cfg.USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5)[cfg.ITEM_COL].tolist()

result_eval_ranker['catboost_clf'] = result_eval_ranker[cfg.USER_COL].apply(lambda user_id: rerank(user_id))

In [22]:
mask = result_eval_ranker.catboost_clf.map(lambda d: len(d) == 0)
result_eval_ranker.loc[mask, 'catboost_clf'] = result_eval_ranker.loc[mask, 'candidates']
result_eval_ranker.sample(5)

Unnamed: 0,user_id,actual,candidates,catboost_clf
983,1331,"[1026891, 1031316]","[1051402, 932179, 838602, 980666, 1044507, 821...","[1051402, 932179, 838602, 980666, 1044507, 821..."
558,749,[1028695],"[999730, 1007116, 1037894, 877358, 1070820, 10...","[999730, 1007116, 1037894, 877358, 1070820, 10..."
508,692,"[822073, 878996, 883287, 900697, 901452, 97241...","[5591490, 1103812, 909761, 1027447, 13095651, ...","[948190, 928486, 1128665, 993639, 1027447]"
1410,567,"[935008, 1044078, 1054383, 1076875, 10285116, ...","[904360, 883404, 951590, 1133018, 961554]","[904360, 883404, 951590, 1133018, 961554]"
798,1088,"[861701, 865330, 878570, 997011, 1036297, 1045...","[1077143, 928123, 896849, 915651, 1000707, 819...","[1077143, 928123, 896849, 915651, 1000707, 819..."


### Metrics Tables from CatBoostClassifier predictions

In [31]:
from src.utils import calc_recall, calc_precision

In [24]:
result_metrics = dict(candidates={}, catboost_clf={})

for N in (1, 10, 100, 200):
    p = sorted(calc_precision(result_eval_ranker, N), key=lambda x: x[1],reverse=True)
    for rec in p:
        result_metrics[rec[0]][f'Precision@{N}'] = round(rec[1], 5)
    r = sorted(calc_recall(result_eval_ranker, N), key=lambda x: x[1],reverse=True)
    for rec in r:
        result_metrics[rec[0]][f'Recall@{N}'] = round(rec[1], 5)

In [25]:
import tabulate
dd = pd.DataFrame.from_records(result_metrics).sort_index(axis=0)
print(tabulate.tabulate(dd, ['metric', 'candidates', 'cb_model'], tablefmt='fancy_grid'))

╒═══════════════╤══════════════╤════════════╕
│ metric        │   candidates │   cb_model │
╞═══════════════╪══════════════╪════════════╡
│ Precision@1   │      0.00672 │    0.02352 │
├───────────────┼──────────────┼────────────┤
│ Precision@10  │      0.01667 │    0.02614 │
├───────────────┼──────────────┼────────────┤
│ Precision@100 │      0.01725 │    0.026   │
├───────────────┼──────────────┼────────────┤
│ Precision@200 │      0.01725 │    0.026   │
├───────────────┼──────────────┼────────────┤
│ Recall@1      │      0.00017 │    0.00053 │
├───────────────┼──────────────┼────────────┤
│ Recall@10     │      0.00531 │    0.00584 │
├───────────────┼──────────────┼────────────┤
│ Recall@100    │      0.0164  │    0.01306 │
├───────────────┼──────────────┼────────────┤
│ Recall@200    │      0.0164  │    0.01306 │
╘═══════════════╧══════════════╧════════════╛


In [32]:
print(tabulate.tabulate(pd.DataFrame(second_lvl_model.evaluate_model(X_test=X_test)), headers='keys', tablefmt='fancy_grid'))

╒═══════════════╤════════════════╕
│               │   catboost_clf │
╞═══════════════╪════════════════╡
│ Precision@1   │      1         │
├───────────────┼────────────────┤
│ Precision@10  │      1         │
├───────────────┼────────────────┤
│ Precision@100 │      1         │
├───────────────┼────────────────┤
│ Precision@200 │      1         │
├───────────────┼────────────────┤
│ Recall@1      │      0.0200963 │
├───────────────┼────────────────┤
│ Recall@10     │      0.100481  │
├───────────────┼────────────────┤
│ Recall@100    │      0.100481  │
├───────────────┼────────────────┤
│ Recall@200    │      0.100481  │
╘═══════════════╧════════════════╛


In [27]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score
result_metrics = {}
X_val, y_true = second_lvl_model.X_val, second_lvl_model.y_val
preds = second_lvl_model.model.predict(X_val)

d = dict(y_true=y_true, y_pred=preds)
result_metrics['precision'] = precision_score(**d)
result_metrics['recall'] = recall_score(**d)
result_metrics['roc_auc_score'] = roc_auc_score(y_true=y_true, y_score=preds)
result_metrics

{'precision': 0.11197713691868018,
 'recall': 0.7171381031613977,
 'roc_auc_score': 0.8088469020942133}