In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
%matplotlib inline
from scipy.sparse import csr_matrix
from implicit import als
from lightgbm import LGBMClassifier
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

### Read data

In [292]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

### Process features dataset

In [293]:
item_col = 'item_id'
user_col = 'user_id'

In [294]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': item_col}, inplace=True)
user_features.rename(columns={'household_key': user_col}, inplace=True)

### Split dataset for train, eval, test

In [295]:
val_matcher_weeks = 6
val_ranker_weeks = 3

In [296]:
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (val_matcher_weeks + val_ranker_weeks)]

In [297]:
data['week_no'].max() - (val_matcher_weeks + val_ranker_weeks)

data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (val_matcher_weeks + val_ranker_weeks)) &
                       (data['week_no'] < data['week_no'].max() - (val_ranker_weeks))]

data_train_ranker = data_val_matcher.copy()

data_val_ranker = data[data['week_no'] >= data['week_no'].max() - val_ranker_weeks]

In [298]:
def print_stats_data(df_data, name_df):
    """Print data shape, users and item numbers."""
    
    print(name_df)
    print(f'Shape: {df_data.shape} Users: {df_data[user_col].nunique()} Items: {df_data[item_col].nunique()}')

In [299]:
print_stats_data(data_train_matcher, 'train_matcher')
print_stats_data(data_val_matcher, 'val_matcher')
print_stats_data(data_train_ranker, 'train_ranker')
print_stats_data(data_val_ranker, 'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [300]:
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [301]:
data_val_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


In [302]:
data_train_ranker.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


In [303]:
data_val_ranker.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0


### Prefilter items

In [304]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 5001


### Make cold-start to warm-start

In [305]:
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher, 'train_matcher')
print_stats_data(data_val_matcher, 'val_matcher')
print_stats_data(data_train_ranker, 'train_ranker')
print_stats_data(data_val_ranker, 'val_ranker')

train_matcher
Shape: (861404, 13) Users: 2495 Items: 5001
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


### Init/train recommender

In [307]:
recommender = MainRecommender(data_train_matcher)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [308]:
ACTUAL_COL = 'actual'

In [309]:
result_eval_matcher = data_val_matcher.groupby(user_col)[item_col].unique().reset_index()
result_eval_matcher.columns = [user_col, ACTUAL_COL]
result_eval_matcher.head()

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886..."


In [310]:
def select_match_candidates(df):
    """Selection of mathing candidates."""

    df['own_rec'] = df[user_col].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
    df['sim_item_rec'] = df[user_col].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
    df['als_rec'] = df[user_col].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))
    df['sim_user_rec'] = df[user_col].apply(lambda x: recommender.get_similar_users_recommendation(x, N=N_PREDICT))
    
    return df

In [20]:
N_PREDICT = 500

In [21]:
%%time

test_matcher_500 = select_match_candidates(result_eval_matcher.copy())

Wall time: 59min 26s


In [311]:
test_matcher_500.head()

Unnamed: 0,user_id,actual,own_rec,sim_item_rec,als_rec,sim_user_rec
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[856942, 9297615, 5577022, 877391, 9655212, 88...","[824758, 1007512, 9297615, 5577022, 965956, 98...","[856942, 1037332, 883616, 1062572, 824758, 108...","[918638, 949257, 916990, 1076187, 9526676, 994..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[911974, 1076580, 1103898, 5567582, 1056620, 9...","[8090509, 5569845, 1044078, 985999, 880888, 81...","[5569230, 916122, 5569845, 8090521, 6534480, 6...","[9419422, 6772833, 1063739, 1075368, 9655078, ..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[6391541, 1052294, 891423, 936470, 1137010, 11...","[1083856, 973135, 1048068, 999714, 8069350, 10...","[821741, 987044, 891423, 1091383, 1002771, 639...","[894360, 862535, 956125, 1122844, 832661, 9486..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[13003092, 972416, 995598, 923600, 1138596, 10...","[948650, 5569845, 8357613, 941361, 1074754, 11...","[1026118, 933637, 1098248, 1023720, 965267, 95...","[1113675, 948640, 7442505, 823031, 1074040, 10..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[998519, 894360, 7147142, 9338009, 896666, 939...","[832513, 1044078, 7147145, 880427, 1007462, 96...","[1041688, 8276172, 1100140, 8293447, 1039627, ...","[852015, 6552318, 837495, 6533681, 8020166, 80..."


In [23]:
def calc_recall(df_data, top_k):
    """Calculate recall."""
    
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [24]:
def calc_precision(df_data, top_k):
    """Calculate precision."""
    
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

#### Recall@20 of matching

In [312]:
TOPK_RECALL = 20

In [313]:
sorted(calc_recall(test_matcher_500, TOPK_RECALL), key=lambda x: x[1], reverse=True)

[('own_rec', 0.03928427679372909),
 ('als_rec', 0.030220330420997067),
 ('sim_item_rec', 0.016568871997012843),
 ('sim_user_rec', 0.004219863234793127)]

#### Recall@50 of matching

In [314]:
TOPK_RECALL = 50

In [315]:
sorted(calc_recall(test_matcher_500, TOPK_RECALL), key=lambda x: x[1], reverse=True)

[('own_rec', 0.06525657038145175),
 ('als_rec', 0.04835572223108255),
 ('sim_item_rec', 0.03335198066643014),
 ('sim_user_rec', 0.007094196375712481)]

#### Recall@100 of matching

In [316]:
TOPK_RECALL = 100

In [317]:
sorted(calc_recall(test_matcher_500, TOPK_RECALL), key=lambda x: x[1], reverse=True)

[('own_rec', 0.09604492955885034),
 ('als_rec', 0.06915950811947766),
 ('sim_item_rec', 0.05341166594997311),
 ('sim_user_rec', 0.010632450450746211)]

#### Recall@200 of matching

In [318]:
TOPK_RECALL = 200

In [319]:
sorted(calc_recall(test_matcher_500, TOPK_RECALL), key=lambda x: x[1], reverse=True)

[('own_rec', 0.13537278412833242),
 ('als_rec', 0.09782318920120979),
 ('sim_item_rec', 0.08556231580435741),
 ('sim_user_rec', 0.015426246201126516)]

#### Recall@500 of matching

In [320]:
TOPK_RECALL = 500

In [321]:
sorted(calc_recall(test_matcher_500, TOPK_RECALL), key=lambda x: x[1], reverse=True)

[('own_rec', 0.18205324555508678),
 ('als_rec', 0.14831537951778287),
 ('sim_item_rec', 0.13598744766595136),
 ('sim_user_rec', 0.025364568302290943)]

#### Precision@5 of matching

In [322]:
TOPK_PRECISION = 5

In [323]:
sorted(calc_precision(test_matcher_500, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.17712691771268974),
 ('als_rec', 0.11752673175267211),
 ('sim_item_rec', 0.06034402603440302),
 ('sim_user_rec', 0.013854021385402105)]

## Ranking part

### Preparation train data

In [324]:
N_PREDICT = 50

In [325]:
df_match_candidates = pd.DataFrame(data_train_ranker[user_col].unique())
df_match_candidates.columns = [user_col]

In [326]:
df_match_candidates['candidates'] = df_match_candidates[user_col].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [327]:
df_match_candidates.head()

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."
2,1753,"[967041, 963686, 948640, 1057168, 942475, 9421..."
3,2120,"[5707857, 1029743, 1106523, 5569230, 916122, 8..."
4,1346,"[1135983, 5569309, 1129982, 5574377, 5569993, ..."


In [328]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [329]:
df_items

0       1105426
0       1097350
0        879194
0        948640
0        928263
         ...   
2150     903454
2150    9419888
2150    1076769
2150    1092588
2150    1024051
Name: item_id, Length: 107550, dtype: int64

In [330]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [331]:
df_match_candidates.head()

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350
0,2070,879194
0,2070,948640
0,2070,928263


In [46]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (107550, 2) Users: 2151 Items: 4574


In [47]:
df_ranker_train = data_train_ranker[[user_col, item_col]].copy()
df_ranker_train['target'] = 1

In [48]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1


In [49]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[user_col, item_col], how='left')

df_ranker_train = df_ranker_train.drop_duplicates(subset=[user_col, item_col])

df_ranker_train['target'].fillna(0, inplace=True)

In [50]:
df_ranker_train.target.value_counts()

0.0    99177
1.0     7795
Name: target, dtype: int64

In [70]:
df_ranker_train['target'].mean()

0.07286953595333358

### Gradient boosting

#### Features preparation

In [51]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [52]:
user_features.head()

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [71]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0
2,2070,879194,0.0
3,2070,948640,0.0
4,2070,928263,0.0


In [73]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [54]:
def eval_user_average_check(user_id):
    """Evaluate user average check."""
    
    user_average_check = df_ranker_train[df_ranker_train['user_id'] == user_id].groupby('user_id')['sales_value'].sum() / \
                         df_ranker_train[df_ranker_train['user_id'] == user_id].groupby('user_id')['sales_value'].count()

    return user_average_check[user_id]

In [75]:
%%time
df_ranker_train['average_check'] = df_ranker_train['user_id'].apply(lambda x: eval_user_average_check(x))

Wall time: 2min 56s


In [76]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931


In [83]:
def eval_sales_quantity(user_id):
    """Evaluate user purchases quantity."""
    
    user_sales_quantity = data_train_ranker[data_train_ranker['user_id'] == user_id].groupby('user_id')['quantity'].sum()
    
    return user_sales_quantity[user_id]

In [84]:
%%time
df_ranker_train['purchases_quantity'] = df_ranker_train['user_id'].apply(lambda x: eval_sales_quantity(x))

Wall time: 1min 35s


In [85]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_quantity
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931,17550
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931,17550


In [107]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [112]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [333]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_quantity,price
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931,17550,3.905593
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931,17550,11.471481


In [271]:
def eval_price(item_id):
    """Evaluate item price."""
    
    price = data[data['item_id'] == item_id].groupby('item_id')['sales_value'].sum() / \
            data.groupby('item_id')['quantity'].sum()
    
    return price[item_id]

In [273]:
%%time
df_ranker_train['price'] = df_ranker_train['item_id'].apply(lambda x: eval_price(x))

Wall time: 3h 36min 35s


In [334]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_quantity,price
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931,17550,3.905593
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931,17550,11.471481


In [None]:
# %%time
# df_ranker_train['price'] = data_train_ranker.apply(lambda x: eval_week_purchases_number(x.item_id,\
#                                                                                                 x.week_no), axis=1)

In [335]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [336]:
cat_feats = X_train.columns[2:-3].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [337]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=8,
                     n_estimators=300,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(*args, **kwargs)




In [338]:
df_ranker_predict = df_ranker_train.copy()

In [339]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [340]:
df_ranker_predict.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,average_check,purchases_quantity,price,proba_item_purchase
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931,17550,3.905593,0.038727
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.025931,17550,11.471481,0.015334


### Evaluation on test dataset

In [341]:
result_eval_ranker = data_val_ranker.groupby(user_col)[item_col].unique().reset_index()
result_eval_ranker.columns=[user_col, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [342]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[user_col].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 6.58 s


In [343]:
sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.1444117647058813)]

In [344]:
def rerank(user_id):
    """Rerank matching."""
    
    return df_ranker_predict[df_ranker_predict[user_col]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [345]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[user_col].apply(lambda user_id: rerank(user_id))

In [346]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.1645953002610952)
('own_rec', 0.1444117647058813)


  return flags.sum() / len(recommended_list)
