In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

## Read data

In [25]:
DATA_PATH = '../data'
data = pd.read_csv(os.path.join(DATA_PATH,'retail_train.csv'))
item_features = pd.read_csv(os.path.join(DATA_PATH,'product.csv'))
user_features = pd.read_csv(os.path.join(DATA_PATH,'hh_demographic.csv'))

In [26]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

In [27]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

In [28]:
VAL_MATCHER_WEEKS = 8
VAL_RANKER_WEEKS = 3

data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)] # давние покупки
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

# ----
print('data_train_matcher: {}-{} weeks'.format(data_train_matcher.week_no.min(), data_train_matcher.week_no.max()))
print('data_val_matcher: {}-{} weeks'.format(data_val_matcher.week_no.min(), data_val_matcher.week_no.max()))

print('data_train_ranker: {}-{} weeks'.format(data_train_ranker.week_no.min(), data_train_ranker.week_no.max()))
print('data_val_ranker: {}-{} weeks'.format(data_val_ranker.week_no.min(), data_val_ranker.week_no.max()))

data_train_matcher: 1-83 weeks
data_val_matcher: 84-91 weeks
data_train_ranker: 84-91 weeks
data_val_ranker: 92-95 weeks


In [29]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [30]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

print('data_train shape: %d\tdata_test shape: %d' % (data_train.shape[0], data_test.shape[0]))

data_train shape: 2278490	data_test shape: 118314


In [31]:
pd.set_option('display.max_columns', None)

In [43]:
result_matcher = data_test.groupby('user_id').item_id.unique().reset_index().rename(columns={'item_id': 'actual'})

result_matcher.head()

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [33]:
test_users = result_matcher.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('There are {} users in test dataset'.format(test_users))
print('here are {} new users in test dataset'.format(new_test_users))

There are 2042 users in test dataset
here are 0 new users in test dataset


Here is how the fact table looks like:

In [13]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


And here are the descriptive datasets:

In [34]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [35]:
user_features.head()

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [36]:
data_train_matcher = prefilter_items(data=data_train_matcher
                                    , item_features=item_features
                                    , take_n_popular=5000)

== Starting prefilter info ==
shape: (2050409, 12)
# users: 2498
# items: 82583
Sparsity: 0.994%
== Ending prefilter info ==
shape: (587125, 13)
# users: 2471
# items: 5000
Sparsity: 4.752%


In [39]:
def print_stats_data(data, name):
    print(name)
    print(f'shape: {data.shape}\titems: {data[ITEM_COL].nunique()}\tusers: {data[USER_COL].nunique()}')

In [40]:
# make cold-start warm
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
shape: (587125, 13)	items: 5000	users: 2471
val_matcher
shape: (226449, 12)	items: 31102	users: 2208
train_ranker
shape: (226449, 12)	items: 31102	users: 2208
val_ranker
shape: (117465, 12)	items: 24236	users: 2021


In [41]:
recommender = MainRecommender(data_train_matcher)

100%|██████████| 15/15 [00:02<00:00,  6.71it/s]
100%|██████████| 5000/5000 [00:00<00:00, 8887.85it/s] 


In [45]:
def get_recommendations(user, model, N):
    if model == 'als':
        return recommender.get_als_recommendations(user, N=N)
    elif model == 'own':
        return recommender.get_own_recommendations(user, N=N)
    elif model == 'similar_items':
        return recommender.get_similar_items_recommendation(user, N=N)
    elif model == 'similar_users':
        return recommender.get_similar_users_recommendation(user, N=N)

In [46]:
result_matcher['als'] = result_matcher['user_id'].apply(lambda x: get_recommendations(x, 'als', N_PREDICT))
result_matcher['own'] = result_matcher['user_id'].apply(lambda x: get_recommendations(x, 'own', N_PREDICT))
result_matcher['similar_items'] = result_matcher['user_id'].apply(lambda x: get_recommendations(x, 'similar_items', N_PREDICT))
result_matcher['similar_users'] = result_matcher['user_id'].apply(lambda x: get_recommendations(x, 'similar_users', N_PREDICT))

result_matcher.head()

IndexError: index 2472 is out of bounds for axis 0 with size 2472

In [52]:
result_matcher.loc[result_matcher.index == result_matcher.index.max()]

Unnamed: 0,user_id,actual
2041,2500,"[852182, 856345, 923746, 948670, 1018007, 1044..."
