# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [79]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('./data/retail_train.csv')
item_features = pd.read_csv('./data/product.csv')
user_features = pd.read_csv('./data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2109568,856,40630539494,593,1043142,1,1.99,372,-1.00,1831,85,0.0,0.0
2109569,856,40630539494,593,1120213,1,1.67,372,0.00,1831,85,0.0,0.0
2109570,856,40630539494,593,1132814,1,5.69,372,-0.30,1831,85,0.0,0.0
2109571,856,40630539494,593,9420044,1,10.99,372,-3.30,1831,85,0.0,0.0


In [3]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))
data_train_lvl_1

Decreased # items from 83685 to 5001


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2109568,856,40630539494,593,999999,1,1.99,372,-1.00,1831,85,0.0,0.0
2109569,856,40630539494,593,1120213,1,1.67,372,0.00,1831,85,0.0,0.0
2109570,856,40630539494,593,999999,1,5.69,372,-0.30,1831,85,0.0,0.0
2109571,856,40630539494,593,999999,1,10.99,372,-3.30,1831,85,0.0,0.0


In [4]:
recommender = MainRecommender(data_train_lvl_1)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [5]:
recommender.get_als_recommendations(2375, N=5)

[899624, 1004906, 1000753, 1046545, 871756]

In [6]:
recommender.get_own_recommendations(2375, N=5)

[1036501, 1079023, 1085983, 907099, 910439]

In [7]:
recommender.get_similar_items_recommendation(2375, N=5)

[868764, 889731, 1055646, 1046545, 981760]

In [8]:
recommender.get_similar_users_recommendation(2375, N=5)

[1065538, 1073150, 1082212, 822677, 1067443]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [9]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']

# Для ускорения расчетов берем 1000 юзеров
result_lvl_1 = result_lvl_1[:1000].copy()

In [10]:
# your_code

Задание 1.А

In [11]:
k = 50

In [12]:
result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda user: recommender.get_als_recommendations(user,k))
result_lvl_1.apply(lambda row: recall_at_k(row['als'], row['actual']), axis=1).mean()

0.017051177657295916

In [13]:
result_lvl_1['own'] = result_lvl_1['user_id'].apply(lambda user: recommender.get_own_recommendations(user,k))
result_lvl_1.apply(lambda row: recall_at_k(row['own'], row['actual']), axis=1).mean()

0.030228510213009537

In [14]:
result_lvl_1['similar_items'] = result_lvl_1['user_id'].apply(lambda user: recommender.get_similar_items_recommendation(user,k))
result_lvl_1.apply(lambda row: recall_at_k(row['similar_items'], row['actual']), axis=1).mean()

0.011501073542851068

Own recommendations дают лучший результат

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [15]:
# your_code
result_lvl_1

Unnamed: 0,user_id,actual,als,own,similar_items
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1104349, 1131488, 905436, 1082185, 1094924, 8...","[856942, 9297615, 5577022, 1074612, 9655212, 9...","[12352293, 999999, 981760, 1098066, 12349795, ..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1041259, 1106523, 916122, 1004906, 1033142, 5...","[1076580, 911974, 826784, 1083296, 838136, 820...","[1137346, 1133018, 5569845, 1106523, 985999, 8..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[1119454, 902172, 891423, 8090521, 846550, 999...","[891423, 910109, 887003, 1121367, 951821, 1115...","[865528, 926905, 1074754, 1055425, 930917, 880..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1082185, 878996, 857006, 930118, 863632, 9652...","[13003092, 1119051, 9911484, 8203834, 1108094,...","[999999, 904360, 825541, 845208, 948650, 55698..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[853643, 1003188, 1130111, 938187, 912553, 598...","[845814, 1075524, 1097544, 1112957, 9338009, 6...","[999999, 849843, 1015247, 1094955, 9488055, 11..."
...,...,...,...,...,...
995,1159,"[839191, 985399, 1050741, 1065593, 1085256, 80...","[878996, 1082185, 8203451, 911622, 1022254, 11...","[8090556, 1054483, 1050741, 1028422, 1011312, ...","[882305, 999999, 1051323, 6533362, 5569379, 55..."
996,1160,"[833025, 844179, 849843, 859075, 871756, 89516...","[5592931, 1132770, 1050131, 1096036, 1043590, ...","[947267, 1106520, 900370, 845109, 1108654, 100...","[1106523, 1133018, 1106301, 1071939, 897678, 9..."
997,1161,"[868971, 913201, 1085256, 8090537, 10285144, 8...","[1082185, 831536, 6534178, 994928, 995242, 981...","[992622, 1006555, 822677, 1039081, 921288, 962...","[1058997, 894447, 862349, 1116821, 12731808, 1..."
998,1163,"[6534178, 825343, 899624, 903325, 921836, 9601...","[1082185, 840361, 995242, 961554, 1127831, 981...","[960142, 1070803, 926763, 12263788, 834236, 55...","[981760, 882305, 999999, 1030819, 9834988, 104..."


In [16]:
users_lvl_2 = pd.DataFrame(result_lvl_1['user_id'].unique())
users_lvl_2.columns = ['user_id']
users_lvl_2['candidates'] = result_lvl_1['own']
users_lvl_2

Unnamed: 0,user_id,candidates
0,1,"[856942, 9297615, 5577022, 1074612, 9655212, 9..."
1,2,"[1076580, 911974, 826784, 1083296, 838136, 820..."
2,4,"[891423, 910109, 887003, 1121367, 951821, 1115..."
3,6,"[13003092, 1119051, 9911484, 8203834, 1108094,..."
4,7,"[845814, 1075524, 1097544, 1112957, 9338009, 6..."
...,...,...
995,1159,"[8090556, 1054483, 1050741, 1028422, 1011312, ..."
996,1160,"[947267, 1106520, 900370, 845109, 1108654, 100..."
997,1161,"[992622, 1006555, 822677, 1039081, 921288, 962..."
998,1163,"[960142, 1070803, 926763, 12263788, 834236, 55..."


In [17]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1
users_lvl_2

Unnamed: 0,user_id,item_id,flag
0,1,856942,1
0,1,9297615,1
0,1,5577022,1
0,1,1074612,1
0,1,9655212,1
...,...,...,...
999,1164,824305,1
999,1164,1113274,1
999,1164,5589247,1
999,1164,6633342,1


In [68]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)
targets_lvl_2

Unnamed: 0,user_id,item_id,target
0,1,856942,1.0
1,1,856942,1.0
2,1,9297615,1.0
3,1,9297615,1.0
4,1,5577022,1.0
...,...,...,...
53723,1164,824305,0.0
53724,1164,1113274,0.0
53725,1164,5589247,0.0
53726,1164,6633342,0.0


In [69]:
targets_lvl_2['target'].mean()

0.1832191780821918

In [70]:
item_features.columns

Index(['item_id', 'manufacturer', 'department', 'brand', 'commodity_desc',
       'sub_commodity_desc', 'curr_size_of_product'],
      dtype='object')

In [71]:
user_col = ['user_id', 'age_desc', 'marital_status_code', 'income_desc']
item_col = ['item_id', 'department', 'brand','commodity_desc']

In [72]:
user_feat = pd.DataFrame(user_features, columns=user_col)
user_feat.set_index('user_id', inplace=True)
user_feat = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat = pd.DataFrame(item_features, columns=item_col)
item_feat.set_index('item_id', inplace=True)
item_feat = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

In [73]:
targ_lvl_2 = targets_lvl_2.copy()
targ_lvl_2 = targ_lvl_2.merge(item_feat, on='item_id', how='left')
targ_lvl_2 = targ_lvl_2.merge(user_feat, on='user_id', how='left')
targ_lvl_2

Unnamed: 0,user_id,item_id,target,department_,department_AUTOMOTIVE,department_CHARITABLE CONT,department_CHEF SHOPPE,department_CNTRL/STORE SUP,department_COSMETICS,department_COUP/STR & MFG,...,income_desc_15-24K,income_desc_150-174K,income_desc_175-199K,income_desc_200-249K,income_desc_25-34K,income_desc_250K+,income_desc_35-49K,income_desc_50-74K,income_desc_75-99K,income_desc_Under 15K
0,1,856942,1.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,856942,1.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,9297615,1.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,9297615,1.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,5577022,1.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53723,1164,824305,0.0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53724,1164,1113274,0.0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53725,1164,5589247,0.0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53726,1164,6633342,0.0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
X_train = targ_lvl_2.drop('target', axis=1)
y_train = targ_lvl_2[['target']]

In [75]:
model = XGBClassifier()
model.fit(X_train, y_train)
train_preds = model.predict(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




In [76]:
targ = pd.concat([targets_lvl_2, pd.DataFrame(train_preds, columns=['pred'])], axis=1)
targ

Unnamed: 0,user_id,item_id,target,pred
0,1,856942,1.0,1.0
1,1,856942,1.0,1.0
2,1,9297615,1.0,1.0
3,1,9297615,1.0,1.0
4,1,5577022,1.0,0.0
...,...,...,...,...
53723,1164,824305,0.0,0.0
53724,1164,1113274,0.0,0.0
53725,1164,5589247,0.0,0.0
53726,1164,6633342,0.0,0.0


Как из этих данных посчитать precision@5 не знаю, наверняка есть типовое решение, на вебинаре не разбиралось. И это на трейне. На тесте предикт не проходит, так как на трейне и на тесте количество фичей не совпадает (из-за get_dummies).

### Финальный проект

Мы уже прошли всю необходимуб теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).
Рекомендуем вам **начать делать проект сразу после этого домашнего задания**
- Целевая метрика - precision@5. Порог для уcпешной сдачи проекта precision@5 > 25%
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать 2-ух уровневые рекоммендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 