In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
import catboost as cb

import sys, os

def current_execute_directory():
    try:
        return os.path.dirname(os.path.abspath(__file__))
    except NameError:
        print('working in jupyter')
        return globals()['_dh'][0]

current_directory = current_execute_directory()
# Написанные нами функции
additional_functions_path = os.path.join(current_directory, os.pardir)
sys.path.insert(0, additional_functions_path)

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, process_user_item_features
from src.recommenders import MainRecommender

import pickle
MODELS_PATH = os.path.join(current_directory, os.pardir, 'models')

working in jupyter



## Read data

In [3]:
DATA_PATH = '../data'
data = pd.read_csv(os.path.join(DATA_PATH,'retail_train_sample.csv'))
item_features = pd.read_csv(os.path.join(DATA_PATH,'product.csv'))
user_features = pd.read_csv(os.path.join(DATA_PATH,'hh_demographic.csv'))

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

In [5]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

data = data.drop('Unnamed: 0', axis=1, errors='ignore')

In [6]:
VAL_MATCHER_WEEKS = 8
VAL_RANKER_WEEKS = 3

data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)] # давние покупки
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

# ----
print('data_train_matcher: {}-{} weeks'.format(data_train_matcher.week_no.min(), data_train_matcher.week_no.max()))
print('data_val_matcher: {}-{} weeks'.format(data_val_matcher.week_no.min(), data_val_matcher.week_no.max()))

print('data_train_ranker: {}-{} weeks'.format(data_train_ranker.week_no.min(), data_train_ranker.week_no.max()))
print('data_val_ranker: {}-{} weeks'.format(data_val_ranker.week_no.min(), data_val_ranker.week_no.max()))

data_train_matcher: 1-79 weeks
data_val_matcher: 80-87 weeks
data_train_ranker: 80-87 weeks
data_val_ranker: 88-91 weeks


In [7]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

Here is how the fact table looks like:

In [8]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1078,35573861879,524,1082185,1,0.56,375,0.0,1440,76,0.0,0.0
1,324,29170411703,165,7168774,2,6.98,367,0.0,1115,24,0.0,0.0
2,1982,32957769022,404,12811490,1,3.99,319,0.0,2101,58,0.0,0.0
3,1023,34573871336,495,920025,1,5.99,299,0.0,1643,71,0.0,0.0
4,695,32672141822,383,941357,1,3.19,396,0.0,1743,55,0.0,0.0


And here are the descriptive datasets:

In [9]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [10]:
user_features.head()

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [11]:
data_train_matcher = prefilter_items(data=data_train_matcher
                                    , item_features=item_features
                                    , take_n_popular=5000)

== Starting prefilter info ==
shape: (194240, 12)
# users: 2481
# items: 32931
Sparsity: 0.238%
== Ending prefilter info ==
shape: (57309, 13)
# users: 2359
# items: 5000
Sparsity: 0.486%
new_columns: {'price'}


In [12]:
def print_stats_data(data, name):
    print(name)
    print(f'shape: {data.shape}\titems: {data[ITEM_COL].nunique()}\tusers: {data[USER_COL].nunique()}')

In [13]:
# make cold-start warm
common_users = data_train_matcher.user_id.values


data_val_matcher = data_val_matcher.loc[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker.loc[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker.loc[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')

print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
shape: (57309, 13)	items: 5000	users: 2359
val_matcher
shape: (21856, 12)	items: 10118	users: 1949
train_ranker
shape: (21856, 12)	items: 10118	users: 1949
val_ranker
shape: (11097, 12)	items: 6378	users: 1674


In [14]:
result_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_matcher.columns = [USER_COL, ACTUAL_COL]

result_matcher.head()

Unnamed: 0,user_id,actual
0,1,"[1075074, 1098248, 1069103, 1017299, 1077430, ..."
1,2,"[830127, 7442008, 899624, 944568]"
2,3,[946839]
3,4,"[990797, 13115703]"
4,5,"[13986893, 1065017, 13212967]"


In [16]:
# recommender = MainRecommender(data_train_matcher)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2359 [00:00<?, ?it/s]

In [16]:
path_to_save = os.path.join(MODELS_PATH, 'baseline_fitted_model.pkl')
pickle.dump(recommender, open(path_to_save, 'wb'))

FileNotFoundError: [Errno 2] No such file or directory: '/home/alina/patriotSSD/geekbrains/gb_recommender_systems/2Level-RecSys-retail/../models/baseline_fitted_model.pkl'

In [18]:
recommender = pickle.load(open(path_to_save, 'rb'))

In [17]:
def get_recommendations(user, model, N):
    if model == 'als':
        return recommender.get_als_recommendations(user, N=N)
    elif model == 'own':
        return recommender.get_own_recommendations(user, N=N)
    elif model == 'similar_items':
        return recommender.get_similar_items_recommendation(user, N=N)
    elif model == 'similar_users':
        return recommender.get_similar_users_recommendation(user, N=N)

In [18]:
matcher_users = result_matcher[USER_COL]

result_matcher['als'] = matcher_users.apply(lambda x: get_recommendations(x, 'als', N_PREDICT))
result_matcher['own'] = matcher_users.apply(lambda x: get_recommendations(x, 'own', N_PREDICT))
result_matcher['similar_items'] = matcher_users.apply(lambda x: get_recommendations(x, 'similar_items', N_PREDICT))
result_matcher['similar_users'] = matcher_users.apply(lambda x: get_recommendations(x, 'similar_users', N_PREDICT))


result_matcher.to_pickle(os.path.join(MODELS_PATH, 'result_matcher.pkl'))
result_matcher.head()

: 

: 

In [23]:
recs = result_matcher.columns.tolist()[2:]

def calc_recall(df_data, columns, top_k):
    for col_name in columns:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

def calc_precision(df_data, columns, top_k):
    for col_name in columns:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

sorted(calc_recall(result_matcher, recs, 50), key=lambda x: x[1],reverse=True)

[]

In [44]:
sorted(calc_precision(result_matcher, recs, 5), key=lambda x: x[1],reverse=True)

[('own', 0.019497178040020526),
 ('als', 0.01385325808106719),
 ('similar_items', 0.011800923550538715),
 ('similar_users', 0.005028219599794766)]

-------

In [None]:
# take users from train ranker
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

# get candidates using model that perfomanced the best
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

: 

: 

In [None]:

# unstack users to items candidates
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = ITEM_COL
# join 
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
df_match_candidates.head()

In [None]:
print_stats_data(data_train_ranker, 'train ranker')
print_stats_data(df_match_candidates, 'match_candidates')

train ranker
shape: (21856, 12)	items: 10118	users: 1949
match_candidates
shape: (97450, 2)	items: 4903	users: 1949


In [None]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1 # only purchases

# merge candidates
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')
# drop dublicates
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

In [None]:
df_ranker_train.sample(5)

Unnamed: 0,user_id,item_id,target
33336,1336,827683,0.0
51057,1264,883404,0.0
20916,2173,909894,0.0
4026,361,1070898,0.0
72890,1298,1098066,0.0


In [None]:
df_ranker_train.target.value_counts()

0.0    93906
1.0     1675
Name: target, dtype: int64

Generate new features

In [None]:
dt = df_join_train_matcher.copy()
users = user_features.copy()
items = item_features.copy()

users, items = process_user_item_features(dt, users, items)
df_ranker_train = df_ranker_train.merge(users, how='left', on='user_id').merge(items, how='left', on='item_id')
df_ranker_train.head(3)

Unnamed: 0,user_id,item_id,target,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,quantity_per_month,Unnamed: 12,AUTOMOTIVE,CHEF SHOPPE,CNTRL/STORE SUP,COSMETICS,COUP/STR & MFG,DAIRY DELI,DELI,DRUG GM,FLORAL,FROZEN GROCERY,GARDEN CENTER,GM MERCH EXP,GRO BAKERY,GROCERY,KIOSK-GAS,MEAT,MEAT-PCKGD,MISC SALES TRAN,MISC. TRANS.,NUTRITION,PASTRY,PHARMACY SUPPLY,PHOTO,PRODUCE,RESTAURANT,SALAD BAR,SEAFOOD,SEAFOOD-PCKGD,SPIRITS,TRAVEL & LEISUR,VIDEO RENTAL,total_user_sales_value,user_quantity_per_week,user_quantity_per_baskter,user_freq_per_basket,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,mean_price_by_department,price_rel_mean_by_department,item_quantity_per_basket,total_quantity_value,total_item_sales_value,item_freq,item_freq_per_basket,sales_by_week
0,2498,5995427,0.0,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,1017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2675,0.0,0.0,0.0,0.0,0.0,2.538621,0.0,3.92,6.0,28.64,0.0,2.495,1.5,0.0,0.0,2.836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,224.53,152.0,0.127027,0.00073,1329,GROCERY,National,DRY BN/VEG/POTATO/RICE,RICE - INSTANT & MICROWAVE,8.8 OZ,1.79,2.058508,0.869562,5.8e-05,6.0,9.45,6.0,5.8e-05,0.196875
1,2498,1039840,0.0,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,1017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2675,0.0,0.0,0.0,0.0,0.0,2.538621,0.0,3.92,6.0,28.64,0.0,2.495,1.5,0.0,0.0,2.836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,224.53,152.0,0.127027,0.00073,693,DRUG GM,National,CANDY - PACKAGED,SEASONAL MISCELLANEOUS,1.2 OZ,0.5,3.284538,0.152228,7.7e-05,8.0,4.0,3.0,2.9e-05,2.0
2,2498,5995483,0.0,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,1017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2675,0.0,0.0,0.0,0.0,0.0,2.538621,0.0,3.92,6.0,28.64,0.0,2.495,1.5,0.0,0.0,2.836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,224.53,152.0,0.127027,0.00073,1329,GROCERY,National,DRY BN/VEG/POTATO/RICE,RICE - INSTANT & MICROWAVE,8.8 OZ,1.79,2.058508,0.869562,0.000106,11.0,17.37,8.0,7.7e-05,0.4825


In [None]:
# define categorical columns and convert them to category inplace
object_cols = df_ranker_train.select_dtypes(include=['object']).columns.tolist()
department_cols = [col for col in df_ranker_train.columns if col in item_features.department.unique().tolist()]
categorical_cols = object_cols + department_cols + ['manufacturer']
df_ranker_train[categorical_cols] = df_ranker_train[categorical_cols].astype('category')

In [249]:
def convert_categorical_to_int(series):
    try:
        np.mean(series.values.tolist())
        return series.astype('int64')
    except (ValueError, TypeError):
        return series
    
df_ranker_train.dropna(inplace=True)           
for col in categorical_cols:
    df_ranker_train[col] = convert_categorical_to_int(df_ranker_train[col])

In [None]:
df_ranker_train.to_pickle(path_to_save)

In [250]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]
print('X_train', X_train.shape)
print('y_train', y_train.shape)

X_train (38219, 61)
y_train (38219, 1)


In [251]:
X_train.isna().sum()

user_id                         0
item_id                         0
age_desc                        0
marital_status_code             0
income_desc                     0
homeowner_desc                  0
hh_comp_desc                    0
household_size_desc             0
kid_category_desc               0
quantity_per_month              0
                                0
AUTOMOTIVE                      0
CHEF SHOPPE                     0
CNTRL/STORE SUP                 0
COSMETICS                       0
COUP/STR & MFG                  0
DAIRY DELI                      0
DELI                            0
DRUG GM                         0
FLORAL                          0
FROZEN GROCERY                  0
GARDEN CENTER                   0
GM MERCH EXP                    0
GRO BAKERY                      0
GROCERY                         0
KIOSK-GAS                       0
MEAT                            0
MEAT-PCKGD                      0
MISC SALES TRAN                 0
MISC. TRANS.  

In [252]:
import catboost as cb

train_dataset = cb.Pool(X_train,y_train, 
                        cat_features=categorical_cols)                                                      
# test_dataset = cb.Pool(X_val,y_val,           
                    #    cat_features=categorical_cols)
                    
model = cb.CatBoostClassifier(loss_function='Logloss',  
                              eval_metric='AUC')
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5,],
        'iterations': [50, 100, 150]}
model.grid_search(grid,train_dataset)

pickle.dump(model, path_to_save)
pred = model.predict_proba(X_train)

CatBoostError: catboost/libs/train_lib/dir_helper.cpp:20: Can't create train working dir: catboost_info

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
def plot_roc_auc_score(y_test, y_pred):
    auc = roc_auc_score(y_test, y_pred)

    false_positive_rate, true_positive_rate, thresolds = roc_curve(y_test, y_pred)

    plt.figure(figsize=(10, 8), dpi=100)
    plt.axis('scaled')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title("AUC & ROC Curve")
    plt.plot(false_positive_rate, true_positive_rate, 'g')
    plt.fill_between(false_positive_rate, true_positive_rate, facecolor='lightgreen', alpha=0.7)
    plt.text(0.95, 0.05, 'AUC = %0.4f' % auc, ha='right', fontsize=12, weight='bold', color='blue')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()

In [88]:
from sklearn.metrics import roc_curve
def calc_roc_auc_score():
    pass
model_cols = result_matcher.columns[2:]
k=[1, 2, 5, 10]

recs = result_matcher[model_cols[0]][0][:k[1]]
actual = result_matcher[ACTUAL_COL][0]
tp_fp = np.array([1 if rec in actual else 0 for rec in recs])
tp = tp_fp[tp_fp == 1].size
fp = tp_fp[tp_fp == 0].size
fn_mask = np.array([1 if purchase in recs else 0 for purchase in actual])
fn = fn_mask[fn_mask==0].size
tp, fp, fn


(1, 1, 18)