# Курсовой проект

**Основное**
- Дедлайн - 27 декабря 23:59
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должны быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать выши рекомендации, и посчитать на actual покупках precision@5. 

**!! Мы не рассматриваем холодный старт для пользователя, все наши пользователя одинаковы во всех сетах, поэтому нужно позаботиться об их исключении из теста.**


**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матирцы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

### Подключаем библиотеки

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
from tqdm import tqdm
tqdm.pandas()

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items, extend_user_item_new_features, get_important_features, get_popularity_recommendations, postfilter_items
from recommenders import MainRecommender

### Чтение данных

In [10]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')
data_test = pd.read_csv('data/retail_test1.csv')

In [11]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


### Установка глобальных констант

In [12]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

### Процес выбора фичей из набора данных

In [13]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

### 1. Разделение набора данных на обучающий, проверочной, тестовый

In [14]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [15]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [16]:
# сделаем объединенный сет данных из обучающей и проверочной выборках для первого уровня
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [17]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [18]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


### Предобработка выборок

In [19]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


### Делаем холодный старт для теплого старта

In [20]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (967226, 13) Users: 1910 Items: 5001
val_matcher
Shape: (163165, 12) Users: 1910 Items: 27115
train_ranker
Shape: (163165, 12) Users: 1910 Items: 27115
val_ranker
Shape: (115813, 12) Users: 1910 Items: 24036


### 2. Инициализация и обучение модели первого уровня MainRecommender

In [21]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

### Эмбеддинги признаков

In [28]:
items_embeding_df = recommender.items_emb_df
users_embeding_df = recommender.users_emb_df

In [30]:
train = extend_user_item_new_features(data_train_ranker, data_train_matcher, recommender, item_features, user_features, items_embeding_df, users_emb_df, N_PREDICT)
train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,19_y,mean_time,age,income,children,avr_bask,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean,target
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,...,1.722283,1274.421509,50.0,70.0,0.0,2.290045,77.86153,0.000686,0.002871,0.0
1,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002571,0.00263,0.0
2,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002721,0.002794,1.0
3,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002721,0.002794,1.0
4,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003986,0.005455,0.0


In [77]:
cat_features=[]
for col in X_train.columns:
    if(X_train[col].dtype == np.object):
          cat_features.append(col)
            
X_train[cat_features + ['user_id', 'item_id']] = X_train[cat_features + ['user_id', 'item_id']].astype('category')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if(X_train[col].dtype == np.object):


In [37]:
cat_features

['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'marital_status_code',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc']

In [40]:
test = extend_user_item_new_features(data_test, data_train_matcher, recommender, item_features, user_features, users_embeding_df, users_emb_df, N_PREDICT)
X_test = test.drop(['target'], axis=1)
y_test = test[['target']]
X_test[cat_features + ['user_id', 'item_id']] = X_test[cat_features + ['user_id', 'item_id']].astype('category')

### 3. Построение модели второго уровня

In [42]:
lgbc = LGBMClassifier(objective='binary', 
                      max_depth=4, 
                      categorical_column=cat_features)

In [103]:
important_features = get_important_features(lgbc, X_train, y_train)


  return f(*args, **kwargs)


In [46]:
lgbc.fit(X_train[important_features], y_train)

  return f(*args, **kwargs)


LGBMClassifier(categorical_column=['department', 'brand', 'commodity_desc',
                                   'sub_commodity_desc', 'curr_size_of_product',
                                   'marital_status_code', 'homeowner_desc',
                                   'hh_comp_desc', 'household_size_desc'],
               max_depth=4, objective='binary')

In [57]:
preds = lgbc.predict(X_test[important_features])
test_preds_proba = lgbc.predict_proba(X_test[important_features])[:, 1]

### Оценка на тестовом наборе данных

In [101]:
def get_recommendate(X_test, preds_proba, data, train_1, item_features):
    """Рекомендованные товары"""

    X_test['predict_proba'] = preds_proba

    X_test.sort_values(['user_id', 'predict_proba'], ascending=False, inplace=True)
    recs = X_test.groupby('user_id')['item_id']
    recomendations = []
    for user, preds in recs:
        recomendations.append({'user_id': user, 'recomendations': preds.tolist()})

    recomendations = pd.DataFrame(recomendations)

    result_2 = data.groupby('user_id')['item_id'].unique().reset_index()
    result_2.columns = ['user_id', 'actual']

    result = result_2.merge(recomendations, how='left')
    result['recomendations'] = result['recomendations'].fillna(0)

    price = train_1.groupby('item_id')['price'].mean().reset_index()

    pop_rec = get_popularity_recommendations(train_1, n=500)
    list_pop_rec = []
    [list_pop_rec.append(item) for item in pop_rec if price \
        .loc[price['item_id'] == item]['price'].values > 1]

    result['recomendations'] = result.progress_apply \
        (lambda x: postfilter_items(x, item_info=item_features, train_1=train_1, price=price, list_pop_rec=list_pop_rec,
                                    N=5), axis=1)

    return result

In [102]:
result = get_recommendate(X_test, test_preds_proba, data, data_train_matcher, item_features)

100%|██████████████████████████████████████████████████████████████████████████████| 2499/2499 [18:07<00:00,  2.30it/s]


In [60]:
final_result = result.apply(lambda row: precision_at_k(row['recomendations'], row['actual']), axis=1).mean()

In [61]:
final_result

0.44025610244097685