# Course project


**Основное**
- Дедлайн - 19 февраля 23:59
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должны быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать выши рекомендации, и посчитать на actual покупках precision@5. 

**!! Мы не рассматриваем холодный старт для пользователя, все наши пользователя одинаковы во всех сетах, поэтому нужно позаботиться об их исключении из теста.**


**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матирцы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

## Read data

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [3]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


# Set global const

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 110 

# Process features dataset

In [5]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [6]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 7 недель -- | -- 4 недель -- 
# подобрать размер 2-ого датасета (7 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
VAL_MATCHER_WEEKS = 11
VAL_RANKER_WEEKS = 3

In [7]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [8]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [9]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [10]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (1968369, 12) Users: 2498 Items: 80902
val_matcher
Shape: (310121, 12) Users: 2293 Items: 35272
train_ranker
Shape: (310121, 12) Users: 2293 Items: 35272
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [11]:
# выше видим разброс по пользователям и товарам и дальше мы перейдем к warm-start (только известные пользователи)

In [12]:
data_val_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
1963047,2375,40186322971,559,827667,2,6.0,364,-1.78,34,81,0.0,0.0
1963048,2375,40186322971,559,834631,1,1.69,364,0.0,34,81,0.0,0.0


# Prefilter items

In [13]:
n_items_before = data_train_matcher['item_id'].nunique()

n_popular = 1500

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=n_popular)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 80902 to 1501


# Make cold-start to warm-start

In [14]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

#выбираем товары из подготовленного набора
prefiltered_items = list(set(data_train_matcher['item_id']))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

#оставляем подготовленные товары по всем массивам данных
#data_val_matcher = data_val_matcher[data_val_matcher.item_id.isin(prefiltered_items)]
#data_train_ranker = data_train_ranker[data_train_ranker.item_id.isin(prefiltered_items)]
#data_val_ranker = data_val_ranker[data_val_ranker.item_id.isin(prefiltered_items)]


print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (739351, 13) Users: 1981 Items: 1501
val_matcher
Shape: (297421, 12) Users: 1981 Items: 34563
train_ranker
Shape: (297421, 12) Users: 1981 Items: 34563
val_ranker
Shape: (117148, 12) Users: 1981 Items: 24216


In [15]:
data_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
11,1364,26984896261,1,999999,1,2.19,31742,0.0,1520,1,0.0,0.0,2.19
12,1364,26984896261,1,999999,1,2.99,31742,-0.4,1520,1,0.0,0.0,2.99
13,1364,26984896261,1,999999,1,3.09,31742,0.0,1520,1,0.0,0.0,3.09
14,1364,26984896261,1,999999,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5


# Init/train recommender

In [16]:
recommender = MainRecommender(data_train_matcher, weighting='bm25', B=0.895, 
                              n_factors=200, regularization=0.001, iterations=35, num_threads=4, 
                              K=1)



  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/1501 [00:00<?, ?it/s]

### Варианты, как получить кандидатов

Можно потом все эти варианты соединить в один

(!) Если модель рекомендует < N товаров, то рекомендации дополняются топ-популярными товарами до N

# Eval recall of matching

### Измеряем recall@k

In [17]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[840361, 856942, 857006, 859676, 868006, 87737..."
1,3,"[833723, 843744, 844839, 910032, 913210, 92134..."


In [18]:
def recommended_items(calc_type, users_list, number_items, function):
    if calc_type == 0:
        return [function(user, number_items) for user in users_list]
    elif calc_type == 1:
        return users_list.apply(lambda x: function(x, N=number_items))
    else:
        return list(map(lambda x: function(x, N=number_items), users_list))

In [19]:
#result_eval_matcher['own_rec'] = recommended_items(0, result_eval_matcher[USER_COL], N_PREDICT, recommender.get_own_recommendations)

### Пример оборачивания

In [20]:
def evalRecall(df_result, target_col_name, recommend_model, result_col_name='result', 
               actual_col = 'actual', N_PREDICT=50, inplace = False):
    df_result[result_col_name] = recommended_items(0, df_result[target_col_name], N_PREDICT, recommend_model)
    result = np.mean([recall_at_k(a, b, k=N_PREDICT) for a,b in zip(df_result[result_col_name], df_result[actual_col])])
    if not(inplace):
        df_result.drop(columns=result_col_name, inplace=True)
    return result

In [21]:
def evalPrecision(df_result, target_col_name, recommend_model, result_col_name='result', 
                  actual_col = 'actual', N_PREDICT=50, inplace = False):
    df_result[result_col_name] = recommended_items(0, df_result[target_col_name], N_PREDICT, recommend_model)
    result = np.mean([precision_at_k(a, b, k=N_PREDICT) for a,b in zip(df_result[result_col_name], df_result[actual_col])])
    if not(inplace):
        df_result.drop(columns=result_col_name, inplace=True)
    return result

In [22]:
# evalRecall(result_eval_matcher, USER_COL, recommender.get_own_recommendations)

In [23]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [24]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

### Recall@50 of matching

In [25]:
TOPK_RECALL = 50

In [26]:
#sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

### Precision@5 of matching

In [27]:
TOPK_PRECISION = 5

In [28]:
#sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах

In [29]:
# -- давние покупки -- | -- 7 недель -- | -- 4 недель -- 

## Подготовка данных для трейна

In [30]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [31]:
# собираем кандитатов с первого этапа (matcher)
#df_match_candidates['candidates'] = recommended_items(0, df_match_candidates[USER_COL], N_PREDICT, recommender.get_own_recommendations)
df_match_candidates['candidates'] = recommended_items(0, df_match_candidates[USER_COL], N_PREDICT, recommender.get_als_recommendations)

In [32]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2375,"[1046545, 999104, 1000753, 899624, 902172, 101..."
1,989,"[8090509, 985999, 957951, 1020581, 893018, 880..."


In [33]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [34]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [35]:
#df_match_candidates.head(4)

### Check warm start

In [36]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (217910, 2) Users: 1981 Items: 1482


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [37]:
data_train_ranker.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
1963047,2375,40186322971,559,827667,2,6.0,364,-1.78,34,81,0.0,0.0
1963048,2375,40186322971,559,834631,1,1.69,364,0.0,34,81,0.0,0.0
1963049,2375,40186322971,559,855914,1,2.79,364,0.0,34,81,0.0,0.0
1963050,2375,40186322971,559,860776,2,1.58,364,0.0,34,81,0.0,0.0
1963051,2375,40186322971,559,872062,1,2.59,364,0.0,34,81,-1.0,0.0


In [38]:
def df_preparation(data, keep_columns=[USER_COL, ITEM_COL, 'basket_id', 'quantity', 'sales_value', 'retail_disc', 
                                       'week_no', 'store_id']):
    data = data[keep_columns].copy()
    return data

In [39]:
#df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL, 'basket_id', 'quantity', 'sales_value', 'retail_disc', 'week_no', 'store_id']].copy()
#df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL, 'basket_id', 'quantity', 'sales_value', 'retail_disc', 'week_no', 'store_id']].copy()
columns = [USER_COL, ITEM_COL, 'basket_id', 'quantity', 'sales_value', 'retail_disc', 'week_no', 'store_id']

df_ranker_train = df_preparation(data_train_ranker, keep_columns=columns)
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [40]:
df_ranker_train.target.value_counts()

0.0    205632
1.0     22234
Name: target, dtype: int64

(!) На каждого юзера 50 item_id-кандидатов

In [41]:
df_ranker_train['target'].mean()

0.0975748905058236

## Подготавливаем фичи для обучения модели

### Описательные фичи

In [42]:
#item_features.head(2)
#user_features.head(2)

In [43]:
def df_merge_features(data, item_features, user_features):
    data = data.merge(item_features, on='item_id', how='left')
    data = data.merge(user_features, on='user_id', how='left')
    return data

In [44]:
df_ranker_train = df_merge_features(df_ranker_train, item_features, user_features)


**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером

**Фичи item_id**:
    - Кол-во покупок в неделю
    - Среднее ол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [45]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,basket_id,quantity,sales_value,retail_disc,week_no,store_id,target,manufacturer,...,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2375,1046545,,,,,,,0.0,69,...,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,,,,,,,
1,2375,999104,40283390000.0,1.0,2.69,-0.15,82.0,364.0,1.0,1194,...,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64OZ,,,,,,,
2,2375,1000753,,,,,,,0.0,2822,...,BEEF,SELECT BEEF,,,,,,,,
3,2375,899624,40186320000.0,4.0,15.96,0.0,81.0,364.0,1.0,69,...,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,,,,,,,
4,2375,899624,40595020000.0,2.0,5.98,-2.0,85.0,364.0,1.0,69,...,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,,,,,,,


## !!! Пока выполните нотбук без этих строк, потом вернитесь и запустите их, обучите ранкер и посмотрите на метрики с ранжированием

In [46]:
def df_add_features(data, initial_df, ITEM_COL='item_id', USER_COL='user_id'):
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
    #data = data.merge(initial_df.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)
    #data = data.merge(initial_df.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/initial_df.week_no.nunique(), how='left',on=ITEM_COL)
    data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_user')/initial_df.user_id.nunique(), how='left',on=ITEM_COL)
    #data = data.merge(initial_df.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/initial_df.week_no.nunique(), how='left',on=USER_COL)
    #data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/initial_df.basket_id.nunique(), how='left',on=ITEM_COL)
    #data = data.merge(initial_df.groupby(by=ITEM_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/initial_df.basket_id.nunique(), how='left',on=USER_COL)
    #data = data.merge(initial_df.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/initial_df.basket_id.nunique(), how='left',on=ITEM_COL)
    #data = data.merge(initial_df.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/initial_df.basket_id.nunique(), how='left',on=USER_COL)
    return data

In [47]:
"""df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
#df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)
#df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_user')/df_join_train_matcher.user_id.nunique(), how='left',on=ITEM_COL)

#df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)

#df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

#df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

#df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

#df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)
"""

df_ranker_train = df_add_features(df_ranker_train, df_join_train_matcher, ITEM_COL=ITEM_COL, USER_COL=USER_COL)

In [48]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [49]:
def train_data_preparation(data):
    data['quantity'].fillna(value=0, inplace=True)
    data['sales_value'].fillna(value=0, inplace=True)
    data['retail_disc'].fillna(value=0, inplace=True)
    data['age_desc'].fillna(value='19-65+', inplace=True)
    data['marital_status_code'].fillna( value='C', inplace=True)
    data['income_desc'].fillna( value='0-250K+', inplace=True)
    data['homeowner_desc'].fillna( value='Unknown', inplace=True)
    data['hh_comp_desc'].fillna( value='Unknown', inplace=True)
    data['household_size_desc'].fillna( value='Unknown', inplace=True)
    data['kid_category_desc'].fillna( value='None/Unknown', inplace=True)
    data['store_id'] = X_train['store_id'].astype(str)
    data['basket_id'] = X_train['basket_id'].astype(str)
    data.replace('nan', 'U', inplace=True)
    data['week_no'].fillna(value=0, inplace=True)
    data['week_no'] = data['week_no'].astype(int)
    
    return data


In [50]:
X_train = train_data_preparation(X_train)

In [51]:
#unimportant_features = ['brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 
#                        'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 
#                        'household_size_desc', 'kid_category_desc', 'item_quantity_per_week'
#                       ]
"""unimportant_features = ['basket_id']
X_train.drop(columns = unimportant_features, inplace=True)"""

"unimportant_features = ['basket_id']\nX_train.drop(columns = unimportant_features, inplace=True)"

In [52]:
num_feats = [col for col in X_train.columns if (X_train[col].dtype == 'int64') or (X_train[col].dtype == 'float64')]
num_feats = set(num_feats) - set(['manufacturer','user_id', 'item_id', 'week_no'])
#num_feats

In [53]:
cat_feats = X_train.columns[0:].tolist()
cat_feats = list(set(cat_feats) - set(num_feats))
#cat_feats

In [54]:
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [55]:
#X_train.info()

## Обучение модели ранжирования

In [56]:
# week for validation
#data = X_train
#data['target'] = y_train

#X_val = data[data['week_no'] >= data['week_no'].max() - 2]
#X_train = data[data['week_no'] < data['week_no'].max() - 2]

#y_train = X_train[['target']]
#X_train = X_train.drop('target', axis=1)

#y_val = X_val[['target']]
#X_val = X_val.drop('target', axis=1)

In [57]:
#feature_weight = {"week_no":0.005,"store_id":0.01, "quantity":0.01, "sales_value":0.01, 
#                  "retail_disc":0.01, "basket_id":0.11, "retail_disc":0.005, "item_id":0.1, "user_id":0.1}
feature_weight = {"week_no":0.08, "sales_value":0.083, "quantity":0.084, "store_id":0.08,
                  "retail_disc":0.5, 'basket_id':0}
#feature_weight = {"quantity":1}

model = CatBoostClassifier(
    iterations=70,
    learning_rate=0.05,
    depth=10,
    random_seed=12,
    logging_level='Silent',
    cat_features=cat_feats,
    feature_weights = feature_weight,
    custom_metric=['Logloss', 'Precision', 'F1', 'Recall']
)
#use_best_model=True

model.fit(
    X_train, y_train,
    plot=True
)
#eval_set=(X_val, y_val),

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7fa11724c340>

In [58]:
#model.get_feature_importance(prettified = True)

In [59]:
"""from hyperopt import fmin, tpe, hp, Trials
n_iter = 30
random_st = 12

def score_func(params, data=df_ranker_train, X_train=X_train,
               y_train=y_train, X_val=X_val, y_val=y_val,
               data_val=item_feat_lightfm, random_state_val=random_st,
               TOPK_PRECISION = 5
              ):
    
    # the function gets a set of variable parameters in "param"
    model_params = {'iterations': int(params['iterations']),
                    'learning_rate': params['learning_rate'],
                    'depth': int(params['depth']),
                    'logging_level': params['logging_level'],
                    'cat_features': params['cat_features'],
                    'feature_weights': f'({",".join([str(item) for item in params['feature_weights']])})'
                   }
    
    # we use this params to create a new CatBoost model
    model = CatBoostClassifier(random_seed=random_state_val, **model_params)
    
    #use_best_model=True
    model.fit(X_train, y_train, plot=False)
    
    train_preds = model.predict_proba(X_train)
    data['proba_item_purchase'] = train_preds[:,1]
    result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
    result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
    def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()
    result_eval_ranker['reranked_als_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))
    current_precision = sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)[0][1]

    # precision
    if current_precision > 0:
        score = 1/current_precision
    else:
        score = 1e100
    
    return score

param={'iterations': hp.uniform('iterations', 10, 100),
       'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
       'depth': hp.uniform('depth', 1, 12),
       'logging_level': 'Silent',
       'cat_features': cat_feats,
       'feature_weights': [hp.uniform('feature_weights', 0.0, 1) for i in range(len(X_train.columns))]
      }

%%time

best=fmin(score_func, # function to optimize
          space=param, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
         )
# computing the score on the test set
model = CatBoostClassifier(random_state=random_st, no_components=int(best['no_components']),
                learning_rate=best['learning_rate'],item_alpha=best['item_alpha'],
                user_alpha=best['user_alpha'], loss='warp'
               )"""

'from hyperopt import fmin, tpe, hp, Trials\nn_iter = 30\nrandom_st = 12\n\ndef score_func(params, data=df_ranker_train, X_train=X_train,\n               y_train=y_train, X_val=X_val, y_val=y_val,\n               data_val=item_feat_lightfm, random_state_val=random_st,\n               TOPK_PRECISION = 5\n              ):\n    \n    # the function gets a set of variable parameters in "param"\n    model_params = {\'iterations\': int(params[\'iterations\']),\n                    \'learning_rate\': params[\'learning_rate\'],\n                    \'depth\': int(params[\'depth\']),\n                    \'logging_level\': params[\'logging_level\'],\n                    \'cat_features\': params[\'cat_features\'],\n                    \'feature_weights\': f\'({",".join([str(item) for item in params[\'feature_weights\']])})\'\n                   }\n    \n    # we use this params to create a new CatBoost model\n    model = CatBoostClassifier(random_seed=random_state_val, **model_params)\n    \n   

In [60]:
#train_preds = model.predict_proba(data.drop('target', axis=1))
train_preds = model.predict_proba(X_train)
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]
#df_ranker_predict.head(12)

## Подведем итоги

    Мы обучили модель ранжирования на покупках из сета data_train_ranker и на кандитатах от als_recommendations, что является тренировочным сетом, и теперь наша задача предсказать и оценить именно на тестовом сете.

# Evaluation on test dataset

In [61]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [62]:
%%time
result_eval_ranker['own_rec'] = recommended_items(0, result_eval_ranker[USER_COL], N_PREDICT, recommender.get_own_recommendations)
result_eval_ranker['als_rec'] = recommended_items(0, result_eval_ranker[USER_COL], N_PREDICT, recommender.get_als_recommendations)

CPU times: user 50 s, sys: 1.7 s, total: 51.7 s
Wall time: 28.5 s


In [63]:
f'TOPK_PRECISION = {TOPK_PRECISION}'

'TOPK_PRECISION = 5'

In [64]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

#sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

## Eval re-ranked matched result on test dataset
    Вспомним df_match_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значи и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.
    

In [65]:
def rerank(user_id, df = df_ranker_predict):
    return df[df[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [66]:
data_val_ranker = df_preparation(data_val_ranker, keep_columns=columns)
data_val_ranker = df_merge_features(data_val_ranker, item_features, user_features)
data_val_ranker = df_add_features(data_val_ranker, df_join_train_matcher, ITEM_COL=ITEM_COL, USER_COL=USER_COL)
data_val_ranker = train_data_preparation(data_val_ranker)


train_preds = model.predict_proba(data_val_ranker)
data_val_ranker['proba_item_purchase'] = train_preds[:,1]

result_eval_ranker['reranked_als_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id, data_val_ranker))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True),  f'TOPK_PRECISION = {TOPK_PRECISION}', sep='\n')

('reranked_als_rec', 0.9606511862695566)
('als_rec', 0.13084300858152378)
('own_rec', 0.02796567390206973)
TOPK_PRECISION = 5


In [67]:
#precision@5 >= 0.25

In [68]:
# смотрим на метрики выше и сравниваем что с ранжированием и без, добавляем фичи и то же смотрим

# Оценка на тесте для выполнения курсового проекта

In [69]:
TOPK_PRECISION = 5

df_test = pd.read_csv('retail_test1.csv')
#df_transactions = pd.read_csv('retail_train.csv')

In [70]:
#df_test = df_test[df_test.user_id.isin(common_users)]

In [71]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [72]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [73]:
df_test = df_preparation(df_test, keep_columns=columns)
df_test = df_merge_features(df_test, item_features, user_features)
df_test = df_add_features(df_test, df_join_train_matcher, ITEM_COL=ITEM_COL, USER_COL=USER_COL)
df_test = train_data_preparation(df_test)

train_preds = model.predict_proba(df_test)
df_test['proba_item_purchase'] = train_preds[:,1]

In [74]:
#sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

In [75]:
result_test['reranked_als_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id, df_test))

In [76]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True),  f'TOPK_PRECISION = {TOPK_PRECISION}', sep='\n')

('reranked_als_rec', 0.9564456233421706)
TOPK_PRECISION = 5
