In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
warnings.filterwarnings('ignore')
%matplotlib inline

Загружаем данные

In [3]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

In [4]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [5]:
item_features.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [6]:
user_features.head()

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [7]:
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

In [8]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [9]:
user_features.columns = [col.lower() for col in user_features.columns]
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)
user_features.head()

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


Разбиваем датасет на три части - давние покупки, 6 недель, 3 - недели.

In [10]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


Производим предварительную фильтрацию, выбрав только 5000 уникальных товаров

In [11]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1)

n_items_after = data_train_lvl_1['item_id'].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 83685 to 5001


Создаем рекомендательную модель 1 уровня

In [12]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




Сгруппируем данные для второй части данных

In [13]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1['actual'] = list(result_lvl_1['actual'])
result_lvl_1.head()

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886..."


Для модели первого уровня выберем модель, рекомендующую 50 кандидатов среди тех товаров, которые юзер уже купил. Найдем кандидатов для второго уровня модели

In [14]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
users_lvl_2.head()

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."
2,1753,"[967041, 963686, 948640, 1057168, 942475, 9553..."
3,2120,"[5707857, 1029743, 1106523, 5569230, 916122, 8..."
4,1346,"[1135983, 5569309, 1129982, 5574377, 5569993, ..."


Тренировочный набор данных для второй модели

In [15]:
data_train_lvl_2.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0
2107469,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,0.0,0.0
2107470,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,0.0,0.0
2107471,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,0.0,0.0


Сгруппируем данные по парам покупатель - товар

In [16]:
df=pd.DataFrame({'user_id':users_lvl_2.user_id.values.repeat(len(users_lvl_2.candidates[0])),
                 'item_id':np.concatenate(users_lvl_2.candidates.values)})
df.head()

Unnamed: 0,user_id,item_id
0,2070,1105426
1,2070,1097350
2,2070,879194
3,2070,948640
4,2070,928263


Добавим два признака 'quantity', 'sales_value'

In [17]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id', 'quantity', 'sales_value']].copy()
targets_lvl_2['target'] = 1
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,quantity,sales_value,target
2104867,2070,1019940,1,1.0,1
2107468,2021,840361,1,0.99,1
2107469,2021,856060,1,1.77,1
2107470,2021,869344,1,1.67,1
2107471,2021,896862,2,5.0,1


Соединим данные

In [18]:
targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
targets_lvl_2.head(10)

Unnamed: 0,user_id,item_id,quantity,sales_value,target
0,2070,1105426,,,
1,2070,1097350,,,
2,2070,879194,,,
3,2070,948640,,,
4,2070,928263,,,
5,2070,944588,,,
6,2070,1032703,,,
7,2070,10198378,,,
8,2070,1138596,,,
9,2070,1092937,1.0,3.29,1.0


Заменим NAN на 0

In [19]:
targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2['quantity'].fillna(0, inplace= True)
targets_lvl_2['sales_value'].fillna(0, inplace= True)

targets_lvl_2.head()

Unnamed: 0,user_id,item_id,quantity,sales_value,target
0,2070,1105426,0.0,0.0,0.0
1,2070,1097350,0.0,0.0,0.0
2,2070,879194,0.0,0.0,0.0
3,2070,948640,0.0,0.0,0.0
4,2070,928263,0.0,0.0,0.0


Добавим признаки покупателей и признаки товаров

In [20]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head()

Unnamed: 0,user_id,item_id,quantity,sales_value,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,0.0,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,0.0,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,879194,0.0,0.0,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,948640,0.0,0.0,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,928263,0.0,0.0,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


Поделим данные на X и Y

In [21]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]
X_train.head()

Unnamed: 0,user_id,item_id,quantity,sales_value,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,879194,0.0,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,948640,0.0,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,928263,0.0,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


Создадим список категорийных признаков

In [22]:
cat_feats = X_train.columns[4:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

Для модели второго уровня выберем LGBMRanker

In [23]:
from lightgbm import LGBMRanker

lgb_params = { 
    'objective':'lambdarank',
    'boosting_type': 'gbdt',
    'n_estimators': 500,
    'learning_rate': 0.1,
    'max_depth': 4,
    'categorical_column': cat_feats,
    'random_state': 27,
    'verbose': 1,
    'is_unbalance': True
}

Создадим группы запросов

In [24]:
X_train['num'] = np.arange(len(X_train))
gr = X_train[['user_id', 'num']].groupby('user_id', sort=False).max()['num'].values
grs = np.concatenate((np.array([gr[0]+1]), gr[1:] - gr[:-1]))

Проверим правильность групп

In [25]:
X_train.shape

(111366, 18)

In [26]:
y_train.shape

(111366, 1)

In [27]:
np.sum(grs)

111366

Группы созданы правильно. Натренируем модель

In [28]:
lgb = LGBMRanker(**lgb_params, silent=False)
lgb.fit(X_train, y_train, group=grs,
        eval_set=[(X_train, y_train)], eval_group=[grs], 
        
        eval_metric=['ndcg'],
        eval_at=[5, 10 ], early_stopping_rounds=50)

[1]	training's ndcg@5: 0.999742	training's ndcg@10: 0.99967
Training until validation scores don't improve for 50 rounds
[2]	training's ndcg@5: 0.999742	training's ndcg@10: 0.99967
[3]	training's ndcg@5: 1	training's ndcg@10: 0.999963
[4]	training's ndcg@5: 1	training's ndcg@10: 1
[5]	training's ndcg@5: 1	training's ndcg@10: 1
[6]	training's ndcg@5: 1	training's ndcg@10: 1
[7]	training's ndcg@5: 1	training's ndcg@10: 1
[8]	training's ndcg@5: 1	training's ndcg@10: 1
[9]	training's ndcg@5: 1	training's ndcg@10: 1
[10]	training's ndcg@5: 1	training's ndcg@10: 1
[11]	training's ndcg@5: 1	training's ndcg@10: 1
[12]	training's ndcg@5: 1	training's ndcg@10: 1
[13]	training's ndcg@5: 1	training's ndcg@10: 1
[14]	training's ndcg@5: 1	training's ndcg@10: 1
[15]	training's ndcg@5: 1	training's ndcg@10: 1
[16]	training's ndcg@5: 1	training's ndcg@10: 1
[17]	training's ndcg@5: 1	training's ndcg@10: 1
[18]	training's ndcg@5: 1	training's ndcg@10: 1
[19]	training's ndcg@5: 1	training's ndcg@10: 1
[20

[175]	training's ndcg@5: 1	training's ndcg@10: 1
[176]	training's ndcg@5: 1	training's ndcg@10: 1
[177]	training's ndcg@5: 1	training's ndcg@10: 1
[178]	training's ndcg@5: 1	training's ndcg@10: 1
[179]	training's ndcg@5: 1	training's ndcg@10: 1
[180]	training's ndcg@5: 1	training's ndcg@10: 1
[181]	training's ndcg@5: 1	training's ndcg@10: 1
[182]	training's ndcg@5: 1	training's ndcg@10: 1
[183]	training's ndcg@5: 1	training's ndcg@10: 1
[184]	training's ndcg@5: 1	training's ndcg@10: 1
[185]	training's ndcg@5: 1	training's ndcg@10: 1
[186]	training's ndcg@5: 1	training's ndcg@10: 1
[187]	training's ndcg@5: 1	training's ndcg@10: 1
[188]	training's ndcg@5: 1	training's ndcg@10: 1
[189]	training's ndcg@5: 1	training's ndcg@10: 1
[190]	training's ndcg@5: 1	training's ndcg@10: 1
[191]	training's ndcg@5: 1	training's ndcg@10: 1
[192]	training's ndcg@5: 1	training's ndcg@10: 1
[193]	training's ndcg@5: 1	training's ndcg@10: 1
[194]	training's ndcg@5: 1	training's ndcg@10: 1
[195]	training's ndc

[343]	training's ndcg@5: 1	training's ndcg@10: 1
[344]	training's ndcg@5: 1	training's ndcg@10: 1
[345]	training's ndcg@5: 1	training's ndcg@10: 1
[346]	training's ndcg@5: 1	training's ndcg@10: 1
[347]	training's ndcg@5: 1	training's ndcg@10: 1
[348]	training's ndcg@5: 1	training's ndcg@10: 1
[349]	training's ndcg@5: 1	training's ndcg@10: 1
[350]	training's ndcg@5: 1	training's ndcg@10: 1
[351]	training's ndcg@5: 1	training's ndcg@10: 1
[352]	training's ndcg@5: 1	training's ndcg@10: 1
[353]	training's ndcg@5: 1	training's ndcg@10: 1
[354]	training's ndcg@5: 1	training's ndcg@10: 1
[355]	training's ndcg@5: 1	training's ndcg@10: 1
[356]	training's ndcg@5: 1	training's ndcg@10: 1
[357]	training's ndcg@5: 1	training's ndcg@10: 1
[358]	training's ndcg@5: 1	training's ndcg@10: 1
[359]	training's ndcg@5: 1	training's ndcg@10: 1
[360]	training's ndcg@5: 1	training's ndcg@10: 1
[361]	training's ndcg@5: 1	training's ndcg@10: 1
[362]	training's ndcg@5: 1	training's ndcg@10: 1
[363]	training's ndc

LGBMRanker(boosting_type='gbdt',
           categorical_column=['manufacturer', 'department', 'brand',
                               'commodity_desc', 'sub_commodity_desc',
                               'curr_size_of_product', 'age_desc',
                               'marital_status_code', 'income_desc',
                               'homeowner_desc', 'hh_comp_desc',
                               'household_size_desc', 'kid_category_desc'],
           class_weight=None, colsample_bytree=1.0, importance_type='split',
           is_unbalance=True, learning_rate=0.1, max_depth=4,
           min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
           n_estimators=500, n_jobs=-1, num_leaves=31, objective='lambdarank',
           random_state=27, reg_alpha=0.0, reg_lambda=0.0, silent=False,
           subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
           verbose=1)

Сделаем предсказания на тренировочных данных

In [29]:
train_preds = lgb.predict(X_train)

In [30]:
train_preds

array([-0.50526121, -0.50526121, -0.50526121, ..., -0.51443757,
       -0.51443757, -0.51443757])

In [31]:
X_train["predicted_ranking"] = train_preds
X_train.sort_values("predicted_ranking", ascending=False)
X_train.head()

Unnamed: 0,user_id,item_id,quantity,sales_value,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,num,predicted_ranking
0,2070,1105426,0.0,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,-0.505261
1,2070,1097350,0.0,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,1,-0.505261
2,2070,879194,0.0,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,2,-0.505261
3,2070,948640,0.0,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3,-0.505261
4,2070,928263,0.0,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,4,-0.505261


In [61]:
result_gbm = X_train.groupby('user_id')['item_id'].unique().reset_index()
result_gbm.columns = ['user_id', 'predicted_ranking']
result_gbm.head()

Unnamed: 0,user_id,predicted_ranking
0,1,"[856942, 9297615, 5577022, 877391, 9655212, 88..."
1,2,"[911974, 1076580, 1103898, 5567582, 1056620, 9..."
2,4,"[6391541, 1052294, 891423, 936470, 1137010, 11..."
3,6,"[13003092, 972416, 995598, 923600, 1138596, 10..."
4,7,"[998519, 894360, 7147142, 9338009, 896666, 939..."


Валидационные данные

In [63]:
data_val_lvl_2.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0
2277418,338,41260573635,636,5592737,2,1.58,369,-0.2,112,92,0.0,0.0
2277419,338,41260573635,636,7441679,1,3.69,369,0.0,112,92,0.0,0.0
2277420,338,41260573635,636,7442317,1,2.69,369,0.0,112,92,0.0,0.0


Сгруппируем данные

In [71]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns = ['user_id', 'actual']
result_lvl_2.head()

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [72]:
result_lvl_2['predicted_ranking'] = result_gbm['predicted_ranking']
result_lvl_2['precision'] = precision_at_k(list(result_gbm['predicted_ranking'])[0], list(result_lvl_2['actual'])[0], 5)
result_lvl_2.head()

Unnamed: 0,user_id,actual,predicted_ranking,precision
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 9297615, 5577022, 877391, 9655212, 88...",0.6
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[911974, 1076580, 1103898, 5567582, 1056620, 9...",0.6
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[6391541, 1052294, 891423, 936470, 1137010, 11...",0.6
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[13003092, 972416, 995598, 923600, 1138596, 10...",0.6
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[998519, 894360, 7147142, 9338009, 896666, 939...",0.6


In [69]:
result_lvl_2.max()

user_id      2500.0
precision       0.6
dtype: float64

precision@5 составлят 0.6

In [None]:
result_lvl_2.to_csv('out.csv') recommendations.csv