## Финальный проект

- Целевая метрика - precision@5. Порог для успешной сдачи проекта: precision@5 > 0.27 (т.е. 27%)
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать двухуровневые рекомендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 

### Подключение библиотек и скриптов

In [2]:
# Inject CSS that sets the notebook's `.container` element width to 72.5%.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:72.5% !important; }</style>"))

In [3]:
import warnings

# Install a blanket 'ignore' filter so that no warnings are shown.
warnings.simplefilter('ignore')

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix
from implicit import als

from lightgbm import LGBMClassifier, Dataset
import lightgbm

from sklearn.model_selection import GridSearchCV

import os
import sys

# Put the parent directory on the import path (presumably where the `src`
# package imported below lives); skip it if it is already listed.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender
from src.preprocessing import new_item_features, new_user_features, train_test_preprocessing
from src.preprocessing import popularity_recommendation, get_important_features, get_final_recomendation

In [5]:
# Set matplotlib's default font size to 14.
plt.rcParams['font.size'] = 14
# Pandas display configuration.
# Fully-qualified option names are used because short patterns such as
# 'precision' or 'max_columns' match more than one option in recent pandas
# releases (display.* and styler.*) and raise OptionError.
# The former 'max_columns' = 100 setting is dropped: it was immediately
# overridden by display.max_columns = 500.
pd.set_option('display.precision', 3)
# When float_format is set, it is what formats floats in rendered output.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 300)

### Пути к директориям и файлам / Загрузка данных

In [6]:
# Transaction data: the training log and the first test file.
data, test_1 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/retail_train.csv', 'data/retail_test1.csv')
)

# Side tables with product and household attributes.
item_features, user_features = (
    pd.read_csv(csv_path)
    for csv_path in ('data/product.csv', 'data/hh_demographic.csv')
)

In [7]:
# Lower-case every header, then rename the key columns to the
# `item_id` / `user_id` names used throughout the rest of the notebook.
for frame in (item_features, user_features):
    frame.columns = [name.lower() for name in frame.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

### Разбиение датасета на обучающую и две валидационные выборки

In [8]:
# Offset subtracted from the last week number to obtain the split boundary.
VAL_SIZE = 5

# Rows strictly before the boundary week go to train_1; rows at or after it go
# to val_1. The boundary week itself is included in val_1, so val_1 covers
# week numbers max - VAL_SIZE .. max.
split_week = data['week_no'].max() - VAL_SIZE
train_1 = data.loc[data['week_no'] < split_week]
val_1 = data.loc[data['week_no'] >= split_week]

# The second-level model is trained on an independent copy of the hold-out rows.
train_2 = val_1.copy()

### First level model

In [9]:
# Count distinct items before and after the pre-filter to report its effect.
n_items_before = train_1['item_id'].nunique()
# Replace train_1 with its pre-filtered version (see src.utils.prefilter_items).
train_1 = prefilter_items(train_1, item_features=item_features, take_n_popular = 777)
n_items_after = train_1['item_id'].nunique()

print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 85828 to 778


In [10]:
# Build the first-level recommender from the pre-filtered train_1 frame.
recommender = MainRecommender(train_1)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=778.0), HTML(value='')))




In [11]:
# Keep handles to the item and user embedding tables exposed by the
# recommender; they are passed to the feature builders below.
items_emb_df, users_emb_df = recommender.items_emb_df, recommender.users_emb_df

### Add New Features

In [12]:
# Rebuild the item table from train_2, the current item table and the item
# embeddings (see src.preprocessing.new_item_features), then preview it.
item_features = new_item_features(train_2, item_features, items_emb_df)
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,coupon_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,qnt_of_sales_per_sub_commodity_desc,qnt_of_sales_per_item_per_sub_commodity_desc_per_week
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,,,,,,,,,,,,,,,,,0,0.0,112255,0.13731,101,0.13467
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,,,,,,,,,,,,,,,,,0,0.0,226,0.05858,225,0.06443
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,,,,,,,,,,,,,,,,,,0,0.0,2436,0.10227,356,0.14027
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,,,,,,,,,,,,,,,,,0,0.0,112255,0.13731,141,0.14157
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ,,,,,,,,,,,,,,,,,0,0.0,112255,0.13731,183,0.11868


In [13]:
# Rebuild the user table from train_2, the current user table and the user
# embeddings (see src.preprocessing.new_user_features), then preview it.
user_features = new_user_features(train_2, user_features, users_emb_df)
user_features.head()

Unnamed: 0,marital_status_code,homeowner_desc,hh_comp_desc,household_size_desc,user_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,mean_time,age,income,kids,average_basket,sum_per_week
0,A,Homeowner,2 Adults No Kids,2,1,-0.98972,11.65656,12.14103,-20.26703,32.25811,3.87047,18.16497,17.74592,-1.41428,11.17883,14.47547,3.15616,6.64345,-8.53435,-3.2776,1324.80371,70,45,0,2.48729,44.35667
1,A,Homeowner,2 Adults No Kids,2,7,12.8309,17.50558,-9.37853,13.91107,-0.27647,6.38449,9.0137,-2.14391,3.22221,-0.81677,16.48158,6.29593,3.46122,13.24896,6.43395,1622.86292,50,70,0,2.54726,74.295
2,U,Unknown,2 Adults Kids,3,8,7.1858,10.82225,18.26776,7.07987,2.60226,8.52062,5.36119,5.69092,9.05097,6.64241,21.9811,18.83528,8.68491,10.10595,-7.61124,1824.99011,30,30,1,2.41379,81.66667
3,U,Homeowner,2 Adults Kids,4,13,-8.38492,0.23031,6.27797,2.63349,5.31923,21.22956,19.41909,5.49119,4.23955,3.98877,23.83595,16.72478,-1.78818,15.08645,4.67522,1608.36365,30,95,2,6.44618,177.27
4,B,Homeowner,Single Female,1,16,0.33884,0.74507,-0.62937,0.21601,4.25549,1.41309,1.93392,3.4936,-0.51308,0.50685,-0.73145,-2.16217,-1.65512,0.53238,-0.24137,1317.42859,50,70,0,2.26286,2.64


In [14]:
# Build the second-level feature matrix and target from train_2.
# NOTE(review): the roles of the train_1 / recommender arguments are defined in
# src.preprocessing.train_test_preprocessing — confirm there.
X_train, y_train = train_test_preprocessing(train_2, train_1, recommender, item_features, user_features)

In [15]:
# Every object-dtype column of X_train is treated as a categorical feature.
object_columns = X_train.select_dtypes(include=['object']).columns
cat_feats = list(object_columns)
cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'marital_status_code',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc']

In [16]:
# Columns cast to pandas 'category' dtype: the object columns plus both id keys.
categorical_columns = cat_feats + ['user_id', 'item_id']

X_train[categorical_columns] = X_train[categorical_columns].astype('category')

# Same feature construction and cast for the test_1 frame.
X_test_1, y_test_1 = train_test_preprocessing(test_1, train_1, recommender, item_features, user_features)
X_test_1[categorical_columns] = X_test_1[categorical_columns].astype('category')

### Second level model

In [23]:
# Second-level model: a LightGBM classifier with a binary objective.
# NOTE(review): LightGBM's scikit-learn wrapper documents `categorical_feature`
# as a fit() argument; confirm that passing it to the constructor has the
# intended effect.
lgbc = LGBMClassifier(objective='binary', categorical_feature=cat_feats)

### Отбор признаков

In [24]:
# Select the feature subset that the second-level model is fitted on.
# The classifier must be referenced as `lgbc` in pure ASCII: the previous
# spelling ended in a Cyrillic "с" (U+0441), which is a different, undefined
# identifier and raises NameError on a fresh kernel.
basic_feats = get_important_features(lgbc, X_train, y_train)
basic_feats

['user_id',
 'item_id',
 'manufacturer',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 '0_x',
 '1_x',
 '2_x',
 '3_x',
 '4_x',
 '5_x',
 '6_x',
 '7_x',
 '8_x',
 '9_x',
 '10_x',
 '11_x',
 '12_x',
 '13_x',
 '14_x',
 'coupon_disc',
 'quantity_of_sales',
 'qnt_of_sales_per_dep',
 'qnt_of_sales_per_item_per_dep_per_week',
 'qnt_of_sales_per_sub_commodity_desc',
 'qnt_of_sales_per_item_per_sub_commodity_desc_per_week',
 'hh_comp_desc',
 'household_size_desc',
 '0_y',
 '1_y',
 '2_y',
 '3_y',
 '4_y',
 '5_y',
 '6_y',
 '7_y',
 '8_y',
 '9_y',
 '10_y',
 '11_y',
 '12_y',
 '13_y',
 '14_y',
 'mean_time',
 'age',
 'income',
 'kids',
 'average_basket',
 'sum_per_week']

### Подбор параметров и обучение модели

In [25]:
# param_grid = {
#     'learning_rate': np.linspace(0.005, 0.03, num=5), 
#     'max_depth': [3, 4], 
#     'reg_alpha': np.linspace(0.000001, 0.0001, num=5), 
#     'reg_lambda': np.linspace(0.001, 0.05, num=5)
# }

In [26]:
# %%time
# gbm = GridSearchCV(lgb, param_grid, cv=5, n_jobs=-1)
# gbm.fit(X_train[basic_feats], y_train)

# print('Best parameters found by grid search are:', gbm.best_params_)

In [27]:
# params = gbm.best_params_
# lgb = LGBMClassifier(
#     objective='binary', 
#     categorical_feature=cat_feats, 
#     random_state=1, 
#     **params, 
#     n_jobs=-1, 
#     n_estimators=500
# )
# Fit the classifier created earlier on the selected features; the tuned
# configuration in the commented-out code above is not used.
lgbc.fit(X_train[basic_feats], y_train)

LGBMClassifier(categorical_feature=['manufacturer', 'department', 'brand',
                                    'commodity_desc', 'sub_commodity_desc',
                                    'curr_size_of_product',
                                    'marital_status_code', 'homeowner_desc',
                                    'hh_comp_desc', 'household_size_desc'],
               objective='binary')

### Целевая метрика - precision@5 > 0.27

In [28]:
# Score every test_1 row; column index 1 of predict_proba is used as the score.
test_preds_proba = lgbc.predict_proba(X_test_1[basic_feats])[:, 1]

# Assemble the result frame (see src.preprocessing.get_final_recomendation);
# its 'recomendations' and 'actual' columns are used for the metric below.
result = get_final_recomendation(X_test_1, test_preds_proba, test_1, train_1)

In [29]:
# Mean of the row-wise precision between 'recomendations' and 'actual'.
# Relies on precision_at_k's default k — presumably 5; confirm in src.metrics.
result.apply(lambda row: precision_at_k(row['recomendations'], row['actual']), axis=1).mean()

0.2835013262599455

In [30]:
# Build the feature matrix from the full `data` frame and cast categoricals.
# NOTE(review): `data` is the frame that train_1 and train_2 were sliced from,
# so everything derived from X_test_2 is in-sample — confirm this is intended.
X_test_2, y_test_2 = train_test_preprocessing(data, train_1, recommender, item_features, user_features)
X_test_2[cat_feats + ['user_id', 'item_id']] = X_test_2[cat_feats + ['user_id', 'item_id']].astype('category')

In [31]:
# Score the rows built from the full `data` frame (column index 1 of predict_proba).
test_preds_proba = lgbc.predict_proba(X_test_2[basic_feats])[:, 1]

# This overwrites `result`; it is this frame that is written to
# recommendations.csv at the end of the notebook.
# NOTE(review): `data` contains the rows the models were trained on — confirm
# that an in-sample result is the intended submission.
result = get_final_recomendation(X_test_2, test_preds_proba, data, train_1)

In [32]:
# Mean row-wise precision for the result built from the full `data` frame.
# NOTE(review): the 'actual' items here come from `data`, which includes the
# training rows, so this value is not a hold-out estimate.
result.apply(lambda row: precision_at_k(row['recomendations'], row['actual']), axis=1).mean()

0.9714285714285718

### Сохранение результатов

In [33]:
# Remove the 'actual' column in place so it is not written to the output file.
result.drop('actual', axis=1, inplace=True)

In [34]:
# Write the result frame to CSV without the index column.
result.to_csv('recommendations.csv', index=False)