## Финальный проект

- Целевая метрика — precision@5. Порог для успешной сдачи проекта: precision@5 > 0.27 (т.е. 27%)
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать двухуровневые рекомендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 

### Подключение библиотек и скриптов

In [1]:
# `display` and `HTML` belong to the public `IPython.display` API. Importing them from
# `IPython.core.display` is deprecated and no longer works on newer IPython releases.
from IPython.display import display, HTML

# Inject CSS that widens the notebook page container so wide DataFrame outputs fit on screen.
display(HTML("<style>.container { width:72.5% !important; }</style>"))

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os, sys

# Absolute path of the parent of the current working directory; it is appended to
# sys.path (once) so that the `src.*` imports below can be resolved from there.
module_path = os.path.abspath(os.pardir)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)
    
from scipy.sparse import csr_matrix

from implicit import als
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# from statistics import mean

from src.metrics import precision_at_k, recall_at_k, money_precision_at_k
from src.utils import prefilter_items, extend_user_item_new_features, get_important_features, get_final_recomendations
from src.recommenders import MainRecommender

from tqdm import tqdm
tqdm.pandas()

In [4]:
# Default font size for every matplotlib figure in this notebook.
plt.rcParams['font.size'] = 14
# pandas display options.
# Fully-qualified option names are used on purpose: short patterns such as 'precision'
# or 'max_columns' also match `styler.*` options in newer pandas, which makes
# `set_option` raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 3)
# Floats are rendered with 5 decimals (this formatter is what the saved outputs show).
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# The original set max_columns twice (100, then 500); only the final value is kept.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 300)

### Пути к директориям и файлам / Загрузка данных

In [5]:
# Read the four input tables (paths are relative to the working directory):
# train transactions, public test transactions, product attributes, household demographics.
data, data_test, item_features, user_features = map(
    pd.read_csv,
    (
        'data/retail_train_sample.csv',
        'data/retail_test1.csv',
        'data/product.csv',
        'data/hh_demographic.csv',
    ),
)

In [6]:
# Preview the raw training transactions (pandas truncates the rendering of long frames).
data

Unnamed: 0.1,Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1832874,1078,35573861879,524,1082185,1,0.56000,375,0.00000,1440,76,0.00000,0.00000
1,402281,324,29170411703,165,7168774,2,6.98000,367,0.00000,1115,24,0.00000,0.00000
2,1348564,1982,32957769022,404,12811490,1,3.99000,319,0.00000,2101,58,0.00000,0.00000
3,1714815,1023,34573871336,495,920025,1,5.99000,299,0.00000,1643,71,0.00000,0.00000
4,1266182,695,32672141822,383,941357,1,3.19000,396,0.00000,1743,55,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
227844,636125,881,30217075044,226,917816,1,4.02000,445,0.00000,1740,33,0.00000,0.00000
227845,673777,92,30578626938,235,987838,1,0.69000,374,0.00000,1511,34,0.00000,0.00000
227846,63366,409,27732789861,63,883306,2,2.00000,367,-0.78000,1728,10,0.00000,0.00000
227847,289453,18,28697885642,135,961554,1,1.00000,414,-0.69000,1230,20,0.00000,0.00000


In [7]:
# Drop the unnamed column that came in with the CSV (looks like a saved row index — TODO confirm).
# Rebinding with `errors='ignore'` keeps the cell idempotent: the original in-place drop
# raised KeyError when the cell was executed a second time.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# Lower-case the column names of both feature tables.
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

# Rename the key columns to the `item_id` / `user_id` names used by the transaction log.
item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})

In [None]:
# Number of recommendations (passed to the feature-building helper further down)
N = 150

# Offset, in weeks, from the last observed week that defines the hold-out boundary
VAL_SIZE = 5

# Rows strictly before the boundary week train the first-level model;
# rows from the boundary week onward are held out.
# NOTE(review): because the comparison is `>=`, the hold-out covers VAL_SIZE + 1
# distinct week numbers (boundary week through the last week) — confirm this is intended.
week_no = data['week_no']
split_week = week_no.max() - VAL_SIZE

train_1 = data.loc[week_no < split_week]
val = data.loc[week_no >= split_week]

# An independent copy of the hold-out; it is the input for the second-level training features.
train_2 = val.copy()

### First level model

In [8]:
# Report how many distinct items survive the prefiltering step.
# NOTE(review): the cell rebinds `train_1` to its own filtered version, so running it twice
# filters already-filtered data. Also, the saved output reports 3001 items, which does not
# obviously match take_n_popular=777 — confirm the output was produced by this code.
n_items_before = train_1.item_id.nunique()
train_1 = prefilter_items(
    train_1,
    item_features=item_features,
    take_n_popular=777,
)
n_items_after = train_1.item_id.nunique()

print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 34338 to 3001


In [9]:
# Build the first-level recommender from the prefiltered first-level training transactions.
# NOTE(review): the progress bars in the saved output suggest that model fitting happens
# inside the constructor — confirm in src/recommenders.py.
recommender = MainRecommender(train_1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3001.0), HTML(value='')))




In [10]:
# Pull the item and user embedding tables exposed by the recommender into notebook
# variables; both are handed to the feature-building helper below.
items_emb_df, users_emb_df = recommender.items_emb_df, recommender.users_emb_df

### Add New Features

![image](0_PSZK-yUYwQJsN5A8.png) 

In [11]:
# Build the second-level training table from the hold-out transactions (`train_2`).
# NOTE(review): rows 3 and 4 of the saved preview are identical — check whether the
# merges inside the helper duplicate rows.
train = extend_user_item_new_features(
    train_2,
    train_1,
    recommender,
    item_features,
    user_features,
    items_emb_df,
    users_emb_df,
    N,
)
train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc_x,coupon_match_disc,price,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,10_x,11_x,12_x,13_x,14_x,15_x,16_x,17_x,18_x,19_x,coupon_disc_y,sales_count_per_dep,qnt_of_sales_per_item_per_dep_per_week,quantity_of_sales,sales_count_per_week,qnt_of_sales_per_sub_commodity_desc,qnt_of_sales_per_item_per_sub_commodity_desc_per_week,marital_status_code,homeowner_desc,hh_comp_desc,household_size_desc,0_y,1_y,2_y,3_y,4_y,5_y,6_y,7_y,8_y,9_y,10_y,11_y,12_y,13_y,14_y,15_y,16_y,17_y,18_y,19_y,mean_time,age,income,children,avr_bask,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean,target
0,1268,41260142401,635,1027102,1,1.19,31742,-0.1,1752,91,0.0,0.0,1.19,764,GROCERY,National,WAREHOUSE SNACKS,CANISTER POTATO/TORT CHIPS,5.125 OZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11135,0.04164,1,0.16667,23,0.03305,U,Homeowner,2 Adults Kids,4,-0.62519,-1.02351,2.75077,-2.45418,-2.63651,0.19711,6.09738,-0.82764,4.03496,-0.65235,1.16003,-1.83066,-0.21952,2.07773,-4.90499,1.42154,3.01281,3.07661,-3.04802,2.58444,1596.0,40.0,15.0,2.0,6.34,4.22667,0.09091,0.08201,0.0
1,1240,40841021630,611,900358,2,2.67,375,-1.11,1153,88,0.0,0.0,1.335,999999999,GROCERY,National,OLIVES,RIPE OLIVES,6 OZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11135,0.04164,3,0.5,16,0.0404,U,Homeowner,2 Adults No Kids,2,9.64213,2.60239,0.91207,6.28148,-1.41772,-1.31956,-5.1998,0.72529,14.85787,-6.49456,-1.06729,1.36767,-2.77351,-0.66267,-2.75059,-6.57307,6.99544,-2.89659,8.78763,-3.13539,1472.94116,50.0,15.0,0.0,4.54559,12.87917,0.25,0.18376,0.0
2,386,41259157348,633,1058686,1,1.0,410,0.0,2111,91,0.0,0.0,1.0,181,GROCERY,National,DRY BN/VEG/POTATO/RICE,NOODLE SIDE DISH MIXES,4.6 OZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11135,0.04164,2,0.33333,28,0.04023,B,Probable Owner,Single Female,1,2.60841,7.71556,0.47389,-5.24648,7.57187,-2.61625,-2.94132,4.46773,-9.81925,4.28712,-3.92433,-5.76048,3.18639,0.29442,-0.12397,-0.5768,-3.60939,10.4043,0.57809,9.50372,1764.75,40.0,30.0,0.0,0.935,2.49333,0.03448,0.04919,0.0
3,2107,40788501083,607,916122,2,14.68,450,-2.94,1251,87,0.0,0.0,7.34,999999999,MEAT,National,CHICKEN,CHICKEN BREAST BONELESS,,0.00892,0.00754,0.00526,0.01283,0.00987,0.01097,0.01055,0.01216,0.0114,0.00798,0.01116,0.01423,0.00841,0.00771,0.00797,0.01269,0.01261,0.01009,0.01232,0.01173,0.0,541,0.03158,26,4.33333,36,0.06818,A,Homeowner,2 Adults Kids,5+,-0.47717,1.60205,-4.01173,5.46621,-1.89164,-3.5407,1.20226,3.87196,3.909,7.5948,9.75381,-0.63935,1.74283,-8.67358,-2.70649,-5.05198,0.38578,-5.4475,4.73088,5.17536,1440.75,40.0,95.0,0.0,2.83,3.77333,0.08,0.10866,1.0
4,2107,40788501083,607,916122,2,14.68,450,-2.94,1251,87,0.0,0.0,7.34,999999999,MEAT,National,CHICKEN,CHICKEN BREAST BONELESS,,0.00892,0.00754,0.00526,0.01283,0.00987,0.01097,0.01055,0.01216,0.0114,0.00798,0.01116,0.01423,0.00841,0.00771,0.00797,0.01269,0.01261,0.01009,0.01232,0.01173,0.0,541,0.03158,26,4.33333,36,0.06818,A,Homeowner,2 Adults Kids,5+,-0.47717,1.60205,-4.01173,5.46621,-1.89164,-3.5407,1.20226,3.87196,3.909,7.5948,9.75381,-0.63935,1.74283,-8.67358,-2.70649,-5.05198,0.38578,-5.4475,4.73088,5.17536,1440.75,40.0,95.0,0.0,2.83,3.77333,0.08,0.10866,1.0


In [12]:
# Separate the label from the features; `y_train` stays a one-column DataFrame.
X_train = train.drop(columns=['target'])
y_train = train.loc[:, ['target']]

In [13]:
# Collect the object-dtype columns; they are treated as categorical features.
# `np.object` was only a deprecated alias of the builtin `object` and was removed in
# NumPy 1.24 (AttributeError), so the dtype is compared against the builtin directly.
cat_features = [col for col, dtype in X_train.dtypes.items() if dtype == object]

# The identifier columns are cast to category together with the object columns.
categorical_cols = cat_features + ['user_id', 'item_id']
X_train[categorical_cols] = X_train[categorical_cols].astype('category')

In [25]:
# Show which columns were detected as categorical (object dtype) in X_train.
cat_features

['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'marital_status_code',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc']

In [14]:
# Build the test feature table with the same helper and the same first-level artefacts
# as the training table, then split off the label and mirror the categorical casting.
# NOTE(review): the features are derived from `data_test` itself, including transaction-level
# columns such as basket_id / day / quantity — verify this does not leak the target.
test = extend_user_item_new_features(
    data_test,
    train_1,
    recommender,
    item_features,
    user_features,
    items_emb_df,
    users_emb_df,
    N,
)
X_test = test.drop(columns=['target'])
y_test = test.loc[:, ['target']]

test_categorical_cols = cat_features + ['user_id', 'item_id']
X_test[test_categorical_cols] = X_test[test_categorical_cols].astype('category')

### Second level model

In [15]:
# Second-level ranker: binary LightGBM classifier with depth-limited trees.
# The object-dtype column names are forwarded as `categorical_column`.
lgbm_params = dict(
    objective='binary',
    max_depth=7,
    categorical_column=cat_features,
)
lgbc = LGBMClassifier(**lgbm_params)

In [16]:
# Select the feature subset used by the second-level model.
# `get_important_features` is a project helper (src.utils); presumably it fits `lgbc` on
# X_train / y_train and returns the names of the features it considers important —
# verify against src/utils.py.
important_features = get_important_features(lgbc, X_train, y_train)
important_features

['user_id',
 'basket_id',
 'day',
 'item_id',
 'quantity',
 'sales_value',
 'store_id',
 'retail_disc',
 'trans_time',
 'price',
 'manufacturer',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 '0_x',
 '1_x',
 '2_x',
 '3_x',
 '4_x',
 '5_x',
 '6_x',
 '7_x',
 '8_x',
 '9_x',
 '10_x',
 '11_x',
 '12_x',
 '13_x',
 '14_x',
 '15_x',
 '16_x',
 '17_x',
 '18_x',
 '19_x',
 'coupon_disc_y',
 'sales_count_per_dep',
 'qnt_of_sales_per_item_per_dep_per_week',
 'quantity_of_sales',
 'qnt_of_sales_per_sub_commodity_desc',
 'qnt_of_sales_per_item_per_sub_commodity_desc_per_week',
 'marital_status_code',
 '0_y',
 '1_y',
 '2_y',
 '3_y',
 '4_y',
 '5_y',
 '6_y',
 '7_y',
 '8_y',
 '9_y',
 '10_y',
 '11_y',
 '12_y',
 '13_y',
 '14_y',
 '15_y',
 '16_y',
 '17_y',
 '18_y',
 '19_y',
 'mean_time',
 'age',
 'income',
 'avr_bask',
 'sum_per_week',
 'count_purchases_week_mean',
 'sum_purchases_week_mean']

In [17]:
# Fit the second-level model on the selected features.
# `y_train` is a one-column DataFrame, i.e. an (n, 1) column vector; estimators expect a
# 1-D target, so it is flattened explicitly instead of relying on an implicit conversion
# (which only emits a warning that this notebook suppresses).
lgbc.fit(X_train[important_features], np.ravel(y_train))

LGBMClassifier(categorical_column=['department', 'brand', 'commodity_desc',
                                   'sub_commodity_desc', 'curr_size_of_product',
                                   'marital_status_code', 'homeowner_desc',
                                   'hh_comp_desc', 'household_size_desc'],
               max_depth=7, objective='binary')

In [18]:
# Score the test rows with the fitted model and keep the second probability column
# (index 1), i.e. the probability assigned to the second class in `lgbc.classes_`.
test_features = X_test[important_features]
test_preds_proba = lgbc.predict_proba(test_features)[:, 1]

In [29]:
# Turn the per-row scores into per-user recommendation lists via the project helper.
# The saved progress bar shows this step took about 5.5 minutes for 1885 iterations.
result = get_final_recomendations(
    X_test,
    test_preds_proba,
    data_test,
    train_1,
    item_features,
)

100%|██████████| 1885/1885 [05:33<00:00,  5.66it/s]


In [30]:
# price = train_1.groupby('item_id')['price'].mean().reset_index()

### Целевая метрика — precision@5 > 0.27 (27%)

In [31]:
def _row_precision(row):
    """Precision of one user's recommendation list against that user's actual purchases."""
    # `k` is left at the helper's default — presumably 5, to match precision@5; verify in src/metrics.py.
    return precision_at_k(row['recomendations'], row['actual'])


# Average the per-user precision over all users in `result`.
# NOTE(review): the saved score (~0.749) is far above the 0.27 threshold; since the test
# features are built from the same `data_test` that supplies `actual`, check for leakage.
final_result = result.apply(_row_precision, axis=1).mean()

In [32]:
# Mean per-user precision computed in the previous cell.
final_result

0.7490716180371216

### Сохранение результатов

In [23]:
# Remove the ground-truth column before exporting the submission.
# Rebinding with `errors='ignore'` makes the cell safe to re-run: the original in-place
# drop raised KeyError once `actual` had already been removed.
result = result.drop(columns=['actual'], errors='ignore')

In [24]:
# Write the recommendations to a CSV file in the working directory, without the DataFrame index.
result.to_csv('recommendations.csv', index=False)