# course project

---

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt
# %matplotlib inline

# from pyspark.ml.recommendation import ALS
# from pyspark.sql import SparkSession
# from pyspark.sql.types import DoubleType
# import pyspark.sql.functions as sf

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys

# Append the absolute path of the parent directory to the import path, once.
# Presumably this is where the project helper modules imported right below live.
module_path = os.path.abspath(os.pardir)
already_registered = any(entry == module_path for entry in sys.path)
if not already_registered:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k, money_precision_at_k
from utils import prefilter_items
from recommenders import MainRecommender
from preprocessing import new_features, train_test_preprocessing, get_important_features, get_final_recomendation

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

Загрузка данных и разделение на тренировочный и тестовый датасеты:

In [None]:
# Raw inputs: the transaction log plus the item and user attribute tables.
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# Lower-case the attribute-table headers and rename their key columns to the
# `item_id` / `user_id` names used by the transaction log.
item_features.columns = item_features.columns.str.lower()
item_features = item_features.rename(columns={'product_id': 'item_id'})

user_features.columns = user_features.columns.str.lower()
user_features = user_features.rename(columns={'household_key': 'user_id'})

# Time-based split on `week_no`: rows strictly before the boundary week go to
# train, rows at or after it go to test.
# NOTE(review): `>= max - test_size_weeks` keeps week numbers max-3 .. max, i.e.
# up to 4 distinct weeks for test_size_weeks = 3 — confirm this is intended.
test_size_weeks = 3
split_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < split_week]
data_test = data[data['week_no'] >= split_week]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Предобработка тренировочного датасета: 

In [None]:
# Count distinct items before and after filtering to report the filter's effect.
n_items_before = data_train['item_id'].nunique()

# Filter the training transactions with the project helper `prefilter_items`.
# Note: `data_train` is rebound here, so re-running this cell filters the
# already-filtered frame again.
data_train = prefilter_items(
    data_train,
    take_n_popular=5000,
    item_features=item_features,
)

n_items_after = data_train['item_id'].nunique()

print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


Создание объекта класса MainRecommender: 

In [None]:
# Build the project's MainRecommender from the prefiltered training transactions.
recommender = MainRecommender(data_train)

Получение эмбеддингов: 

In [None]:
# Expose the recommender's item and user embedding tables under notebook-level
# names; they are passed to `new_features` in later cells.
items_emb_df, users_emb_df = recommender.items_emb_df, recommender.users_emb_df

Создание item-user матрицы и получение новых фич (свойств): 

In [None]:
%%time

# Build the training feature table with the project helper `new_features`.
# NOTE(review): `N` is not defined anywhere in the visible notebook, so this call
# raises NameError on a fresh kernel (Restart & Run All). Define it in a
# configuration cell near the top — presumably the number of candidates per
# user; confirm against `new_features`.
train = new_features(data, data_train, recommender, item_features, user_features, items_emb_df, users_emb_df, N)

train.head(2)

In [None]:
# Separate the label from the model inputs; `y_train` stays a one-column DataFrame.
target_col = ['target']
X_train = train.drop(columns=target_col)
y_train = train[target_col]

In [None]:
# Collect the columns stored with the generic `object` dtype; these are the
# features handed to LightGBM as categorical.
# The builtin `object` is used instead of `np.object`: that alias was deprecated
# in NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
cat_feats = [col for col in X_train.columns if X_train[col].dtype == object]

# Cast the object columns plus the two id columns to pandas' `category` dtype.
categorical_cols = cat_feats + ['user_id', 'item_id']
X_train[categorical_cols] = X_train[categorical_cols].astype('category')

In [None]:
# NOTE(review): this call takes exactly the same arguments as the `train`
# construction, so `test` presumably duplicates the training features — confirm
# this is intended (the metric computed from it is labelled as the train one).
test = new_features(
    data, data_train, recommender,
    item_features, user_features,
    items_emb_df, users_emb_df, N,
)

# Split off the label; `y_test` stays a one-column DataFrame.
X_test = test.drop(columns=['target'])
y_test = test[['target']]

# Same categorical casting as applied to `X_train`.
test_categorical_cols = cat_feats + ['user_id', 'item_id']
X_test[test_categorical_cols] = X_test[test_categorical_cols].astype('category')

Выделение признаков, влияющих на результат предсказания модели:

In [None]:
%%time

lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
basic_feats = get_important_features(lgb, X_train, y_train)

In [None]:
%%time

lgb = LGBMClassifier(
    objective='binary',
    max_depth=7,
    categorical_feature=cat_feats
)
lgb.fit(X_train[basic_feats], y_train)

In [None]:
%%time

preds = lgb.predict(X_test[basic_feats])
test_preds_proba = lgb.predict_proba(X_test[basic_feats])[:, 1]

In [None]:
# Turn the scored candidates into per-row recommendations with the project helper
# `get_final_recomendation`; the result is later read via its 'recomendations'
# and 'actual' columns.
result_train = get_final_recomendation(X_test, test_preds_proba, data, data_train, item_features)

In [None]:
# Mean price per item, used by `money_precision_at_k` in the metric cells.
# NOTE(review): `train_1` is not defined anywhere in the visible notebook, so this
# raises NameError on a fresh kernel. It must be a frame with `item_id` and
# `price` columns — presumably the prefiltered training data; confirm and define it.
df_price = train_1.groupby('item_id')['price'].mean().reset_index()

Получение money precision @k для тренировочного датасета: 

In [None]:
# Money precision@k for each row of `result_train`, then the mean over all rows.
train_money_precision = result_train.apply(
    lambda row: money_precision_at_k(row['recomendations'], row['actual'], df_price),
    axis=1,
)
train_money_precision.mean()

0.8981900760304122

In [None]:
# Feature table for the hold-out weeks: same helper and arguments as before,
# but built from `data_test` instead of the full `data`.
test_2 = new_features(
    data_test, data_train, recommender,
    item_features, user_features,
    items_emb_df, users_emb_df, N,
)

# Split off the label; `y_test_2` stays a one-column DataFrame.
X_test_2 = test_2.drop(columns=['target'])
y_test_2 = test_2[['target']]

# Same categorical casting as applied to `X_train`.
holdout_categorical_cols = cat_feats + ['user_id', 'item_id']
X_test_2[holdout_categorical_cols] = X_test_2[holdout_categorical_cols].astype('category')

In [None]:
# Score the hold-out candidates with the fitted model (column 1 of `predict_proba`).
test_preds_proba = lgb.predict_proba(X_test_2[basic_feats])[:, 1]

# NOTE(review): the original passed `test_1` / `train_1`, which are never defined
# in the notebook and raise NameError on a fresh kernel. `data_test` /
# `data_train` are used instead, mirroring the train-side call (which passes
# `data` / `data_train`) and the frames `test_2` was built from — confirm these
# are the intended inputs.
result = get_final_recomendation(X_test_2, test_preds_proba, data_test, data_train, item_features)

Получение money precision @k для тестового датасета:

In [None]:
# Money precision@k for each row of `result`, then the mean over all rows.
holdout_money_precision = result.apply(
    lambda row: money_precision_at_k(row['recomendations'], row['actual'], df_price),
    axis=1,
)
holdout_money_precision.mean()

0.5570026525198939

In [None]:
# Remove the ground-truth column before export. Rebinding (instead of
# `inplace=True`) together with `errors='ignore'` keeps this cell idempotent:
# re-running it no longer raises KeyError once 'actual' is already gone.
result = result.drop(columns='actual', errors='ignore')

In [None]:
# Write the final recommendations to a CSV file in the working directory,
# without the DataFrame index.
result.to_csv('recommendations.csv', index=False)