#Курсовой проект

##Построить модель и проверить её эффективность на данных из файла 'retail_test1.csv' #датасет из baseline https://github.com/geangohn/recsys-tutorial

In [1]:
!pip install implicit



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#from sklearn.preprocessing import OneHotEncoder

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix
import implicit

# Матричная факторизация
from implicit import als

# Модель второго уровня
#from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender



In [3]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель --
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
def get_result_table(data: pd.DataFrame):
    result = data.groupby('user_id')['item_id'].unique().reset_index()
    result.columns = ['user_id', 'actual']
    return result

result_lvl_1 = get_result_table(data_val_lvl_1)
result_lvl_2 = get_result_table(data_val_lvl_2)
display(result_lvl_1.head(2), result_lvl_2.head(2))

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [5]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1, top_n_popular = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 10086


In [6]:
common_users = data_train_lvl_1.user_id.values

data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1.user_id.isin(common_users)]
data_train_lvl_2 = data_train_lvl_2[data_train_lvl_2.user_id.isin(common_users)]
data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(common_users)]

In [7]:
recommender = MainRecommender(data_train_lvl_1, top_n_popular)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [8]:
model_recs = recommender.get_model_recommendation()

In [9]:
similar_items_recs = recommender.get_similar_items_recommendation()

In [10]:
recall_at_k_dict = {}

k_list = [100, 500, 1000]

for k in k_list:
    comp_name = 'model_rec_' + str(k)

    model_recs = \
        recommender.get_model_recommendation(N=k)

    result_lvl_1 = result_lvl_1.merge(model_recs,
                                      on='user_id',
                                      how='inner').rename(columns={'model_rec': comp_name})

    recall_at_k_dict[comp_name] = \
        result_lvl_1.apply(lambda row: recall_at_k(row[comp_name],
                                                   row['actual'], k=k), axis=1).mean()

In [11]:
recall_at_k_dict

{'model_rec_100': 0.1028343416002825,
 'model_rec_500': 0.23340826484704907,
 'model_rec_1000': 0.31422401196781335}

In [12]:
for k in k_list:
    comp_name = 'similar_recommendation_' + str(k)

    similar_items_recs = \
        recommender.get_similar_items_recommendation(N=k)

    result_lvl_1 = result_lvl_1.merge(similar_items_recs,
                                      on='user_id',
                                      how='inner').rename(columns={'similar_recommendation': comp_name})

    recall_at_k_dict[comp_name] = \
        result_lvl_1.apply(lambda row: recall_at_k(row[comp_name],
                                                   row['actual'], k=k), axis=1).mean()

In [13]:
recall_at_k_dict

{'model_rec_100': 0.1028343416002825,
 'model_rec_500': 0.23340826484704907,
 'model_rec_1000': 0.31422401196781335,
 'similar_recommendation_100': 0.09544208353508009,
 'similar_recommendation_500': 0.22289857342646854,
 'similar_recommendation_1000': 0.2925944059483107}

##На первом уровне лучший результат показала модель ALS при к = 1000  с параметрами n_factors=50, regularization=0.001, iterations=15, num_threads=4

##Обучаем модель 2-ого уровня

In [14]:
#формируем датафрэйм для 2-ого уровня
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

In [15]:
users_lvl_2 = users_lvl_2.merge(recommender.get_model_recommendation(N=1000),
                                on='user_id',
                                how='inner')

users_lvl_2.columns = ['user_id', 'candidates']
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1107553, 1085604, 879755, 923746, 883404, 908..."
1,2021,"[871756, 951590, 12731544, 1131344, 896938, 99..."


In [16]:
#Разворачиваем кандидатов, чтобы в каждой строке был только один:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,1107553,1
0,2070,1085604,1
0,2070,879755,1
0,2070,923746,1


In [17]:
data_train_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


In [18]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id', 'quantity', 'sales_value']].copy()
targets_lvl_2['target'] = 1  # тут только покупки

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,target
0,2070,1107553,,,0.0
1,2070,1085604,1.0,1.11,1.0


In [19]:
item_features.info(), user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92353 entries, 0 to 92352
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   item_id               92353 non-null  int64 
 1   manufacturer          92353 non-null  int64 
 2   department            92353 non-null  object
 3   brand                 92353 non-null  object
 4   commodity_desc        92353 non-null  object
 5   sub_commodity_desc    92353 non-null  object
 6   curr_size_of_product  92353 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   age_desc             801 non-null    object
 1   marital_status_code  801 non-null    object
 2   income_desc          801 non-null    object
 3   homeowner_desc       801 non-null    o

(None, None)

In [20]:
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2152669 entries, 0 to 2152668
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   item_id      int64  
 2   quantity     float64
 3   sales_value  float64
 4   target       float64
dtypes: float64(3), int64(2)
memory usage: 98.5 MB


In [21]:
## Подготавливаем фичи для обучения модели
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1107553,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1085604,1.0,1.11,1.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [22]:
targets_lvl_2['quantity'].fillna(targets_lvl_2['quantity'].median(),
                                 inplace=True)
targets_lvl_2['sales_value'].fillna(targets_lvl_2['sales_value'].mean(),
                                    inplace=True)

targets_lvl_2.info(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2152669 entries, 0 to 2152668
Data columns (total 18 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               int64  
 1   item_id               int64  
 2   quantity              float64
 3   sales_value           float64
 4   target                float64
 5   manufacturer          int64  
 6   department            object 
 7   brand                 object 
 8   commodity_desc        object 
 9   sub_commodity_desc    object 
 10  curr_size_of_product  object 
 11  age_desc              object 
 12  marital_status_code   object 
 13  income_desc           object 
 14  homeowner_desc        object 
 15  hh_comp_desc          object 
 16  household_size_desc   object 
 17  kid_category_desc     object 
dtypes: float64(3), int64(3), object(12)
memory usage: 312.0+ MB


In [23]:
#Топ категорий товаров, которые купил пользователь.
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='department',
                    values='quantity',
                    aggfunc='count',
                    fill_value=0)

df = df.idxmax(axis=1).reset_index()
df.columns = ['user_id', 'top_department']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner')

In [24]:
#Топ брендов, которые купил пользователь.
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='brand',
                    values='quantity',
                    aggfunc='count',
                    fill_value=0)

df = df.idxmax(axis=1).reset_index()
df.columns = ['user_id', 'top_brand']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner')

In [25]:
# # Средняя сумма чека пользователя в каждой категории товаров (department)
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='department',
                    values='sales_value',
                    aggfunc='mean',
                    fill_value=0
                    )

df = df.stack().reset_index()
df.columns = ['user_id', 'department', 'mean_sales_value_of_user_in_department']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on=['user_id', 'department'],
                                    how='inner')


In [26]:
targets_lvl_2['age_desc'].unique()

array(['45-54', nan, '35-44', '55-64', '25-34', '65+', '19-24'],
      dtype=object)

In [27]:
df = \
    targets_lvl_2.groupby(by=['user_id'])['age_desc']\
    .apply(lambda x: pd.Series.mode(x))
df = df.reset_index()
df.drop(columns='level_1',
        inplace=True)

df.columns=['user_id', 'age_desc_corrected']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner')

In [28]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,...,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,top_department,top_brand,mean_sales_value_of_user_in_department,age_desc_corrected
0,2070,1107553,1.0,2.425202,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,...,U,50-74K,Unknown,Unknown,1,None/Unknown,GROCERY,National,2.402958,45-54
1,2070,1085604,1.0,1.11,1.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,...,U,50-74K,Unknown,Unknown,1,None/Unknown,GROCERY,National,2.402958,45-54


In [29]:
targets_lvl_2.columns

Index(['user_id', 'item_id', 'quantity', 'sales_value', 'target',
       'manufacturer', 'department', 'brand', 'commodity_desc',
       'sub_commodity_desc', 'curr_size_of_product', 'age_desc',
       'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc',
       'household_size_desc', 'kid_category_desc', 'top_department',
       'top_brand', 'mean_sales_value_of_user_in_department',
       'age_desc_corrected'],
      dtype='object')

In [30]:
feature_columns = \
    ['user_id',
     'item_id',
     'quantity',
     'sales_value',
     'department',
     'manufacturer',
     'age_desc_corrected',
     'brand',
     'top_department',
     'top_brand',
     'mean_sales_value_of_user_in_department'
    ]

In [31]:
targets_lvl_2[feature_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 798096 entries, 0 to 798095
Data columns (total 11 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   user_id                                 798096 non-null  int64  
 1   item_id                                 798096 non-null  int64  
 2   quantity                                798096 non-null  float64
 3   sales_value                             798096 non-null  float64
 4   department                              798096 non-null  object 
 5   manufacturer                            798096 non-null  int64  
 6   age_desc_corrected                      798096 non-null  object 
 7   brand                                   798096 non-null  object 
 8   top_department                          798096 non-null  object 
 9   top_brand                               798096 non-null  object 
 10  mean_sales_value_of_user_in_department  7980

In [32]:
X_train = targets_lvl_2[feature_columns]
y_train = targets_lvl_2['target']

X_train.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,department,manufacturer,age_desc_corrected,brand,top_department,top_brand,mean_sales_value_of_user_in_department
0,2070,1107553,1.0,2.425202,GROCERY,103,45-54,National,GROCERY,National,2.402958
1,2070,1085604,1.0,1.11,GROCERY,103,45-54,National,GROCERY,National,2.402958


In [33]:
cat_feats = ['user_id', 'item_id', 'manufacturer',
              'age_desc_corrected', 'department',
             'brand', 'top_department','top_brand']

X_train[cat_feats] = X_train[cat_feats].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[cat_feats] = X_train[cat_feats].astype('category')


In [34]:
X_train.isna().sum()

user_id                                   0
item_id                                   0
quantity                                  0
sales_value                               0
department                                0
manufacturer                              0
age_desc_corrected                        0
brand                                     0
top_department                            0
top_brand                                 0
mean_sales_value_of_user_in_department    0
dtype: int64

In [35]:
len(X_train)

798096

In [36]:
!pip install catboost



In [37]:
from catboost import CatBoostClassifier

In [38]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns = ['user_id', 'actual']
result_lvl_2.head(5)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [39]:
def map_at_k(recommended_list, bought_list, k=5):
    flags = np.isin(np.array(recommended_list), np.array(bought_list))
    if sum(flags) == 0:
        return 0
    sum_ = 0
    for i in range(min(k, len(flags))):
        if flags[i]:
            p_k = precision_at_k(recommended_list, bought_list, k=i+1)
            sum_ += p_k
    return sum_ / k


In [None]:
table_metric = {'iterations':[], 'learning_rate':[], 'depth':[],'Precision@5':[], 'MAP@5':[]}

for iterations_ in [50, 100]:
  for learning_rate_ in [0.01, 0.05]:
    for depth_ in [10, 15]:
      model_ = CatBoostClassifier(
        random_seed=25,
        iterations=iterations_,
        learning_rate=learning_rate_,
        depth=depth_)

      model_.fit(X_train, y_train,
              cat_features=cat_feats,
              verbose=50)
      train_preds_ = model_.predict(X_train)
      train_preds_ = train_preds_.astype(bool)

      rec_items_ = X_train[train_preds_].groupby(by=['user_id'])['item_id'].unique().reset_index()
      rec_items_.columns = ['user_id', f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}']

      rec_items_[f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}'] = \
      rec_items_[f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}'].apply(lambda x: x[:5] if len(x) >= 5 else x)

      result_lvl_2 = result_lvl_2.merge(rec_items_,
                                   on='user_id',
                                   how='inner')

      test_presicion_ = result_lvl_2.apply(lambda row: precision_at_k(row[f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}'], row['actual']), axis=1).mean()
      test_map = result_lvl_2.apply(lambda row:  map_at_k(row[f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}'], row['actual']), axis=1).mean()

      table_metric['iterations'].append(iterations_)
      table_metric['learning_rate'].append(learning_rate_)
      table_metric['depth'].append(depth_)
      table_metric['Precision@5'].append(test_presicion_)
      table_metric['MAP@5'].append(test_map)


In [42]:
table_metric = pd.DataFrame(table_metric)
table_metric.sort_values(by=['Precision@5'], ascending=False)

Unnamed: 0,iterations,learning_rate,depth,Precision@5,MAP@5
2,50,0.05,10,0.328806,0.23568
3,50,0.05,15,0.328806,0.23568
5,100,0.01,15,0.328806,0.235912
6,100,0.05,10,0.328806,0.23568
7,100,0.05,15,0.328806,0.23568
0,50,0.01,10,0.32854,0.235649
1,50,0.01,15,0.32854,0.235649
4,100,0.01,10,0.32854,0.235649


###На втором уровне наиболее оптимальными параметрами модели CatBoostClassifier являются: iterations = 50, learning_rate = 0,01 и depth = 10. Метрика Precision@5 = 0,328806.  Её и оставим для финального прогноза.

##Сделаем прогноз на основании нашей модели и посчитаем метрику для тестовых данных, которые не участвовали в обучении. Из файла retail_test1.

In [64]:
model_pred = CatBoostClassifier(
        random_seed=25,
        iterations=50,
        learning_rate=0.05,
        depth=10)

model_pred.fit(X_train, y_train,
              cat_features=cat_feats,
              verbose=50)
train_preds = model_pred.predict(X_train)
train_preds = train_preds_.astype(bool)

pred_items = X_train[train_preds_].groupby(by=['user_id'])['item_id'].unique().reset_index()
pred_items.columns = ['user_id', 'final_recommendations']

pred_items['final_recommendations'] = \
      pred_items['final_recommendations'].apply(lambda x: x[:10] if len(x) >= 10 else x)


0:	learn: 0.6445250	total: 550ms	remaining: 27s
49:	learn: 0.0415015	total: 23.1s	remaining: 0us


In [74]:
#получили предсказания
pred_items.head(3)

Unnamed: 0,user_id,final_recommendations
0,1,"[856942, 1025641, 958046, 5577022, 1075074, 96..."
1,7,"[1003188, 1052046, 1126899, 1072483, 1031833, ..."
2,8,"[840361, 940631, 1127624, 8181377, 917381, 967..."


In [44]:
test = pd.read_csv('retail_test1.csv') #датасет из baseline https://github.com/geangohn/recsys-tutorial
test.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0


In [106]:
users_lvl_3 = test.user_id.unique()
final_result = test.groupby('user_id')['item_id'].unique().reset_index()
final_result.columns=['user_id', 'test_actual']

In [99]:
len(pred_items), len(users_lvl_3)

(790, 1885)

###В тестовом датасете количество пользователей значительно больше, чем в предсказаниях. Это может значительно снизить итоговую метрику. Для них по умолчанию будем рекомендовать самые популярные товары.

In [107]:
 #добавим предсказания к тестовой таблице
 final_result = final_result.merge(pred_items,
                                   on='user_id',
                                   how='left')

In [108]:
#заполним предсказания по "холодным" пользователям первыми десятью из топ-продуктов

overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
overall_top_purchases = overall_top_purchases[overall_top_purchases['item_id'] != 999999]
overall_top_purchases = overall_top_purchases.item_id.tolist()

final_result['final_recommendations'] = final_result['final_recommendations'].apply(lambda x: overall_top_purchases[:10] if isinstance(x, float) else x)

In [109]:
final_result

Unnamed: 0,user_id,test_actual,final_recommendations
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[856942, 1025641, 958046, 5577022, 1075074, 96..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
4,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[1003188, 1052046, 1126899, 1072483, 1031833, ..."
...,...,...,...
1880,2496,"[829291, 862139, 912704, 933067, 933835, 95537...","[899624, 995785, 979707, 855672, 995965, 82624..."
1881,2497,[6534178],"[1038217, 897125, 1066685, 896938, 884896, 965..."
1882,2498,"[1053690, 1076875, 12386123, 858303, 920109, 1...","[914190, 940766, 1053690, 951197, 1100379, 918..."
1883,2499,"[826249, 895327, 9858944, 820321, 829291, 8323...","[826249, 883404, 5569327, 5568378, 830887, 999..."


In [None]:
#посчитаем итоговую метрику
result_presicion = final_result.apply(lambda row: precision_at_k(row['final_recommendations'], row['test_actual']), axis=1).mean()


In [111]:
result_presicion

0.20012437810945274

In [None]:
#Рассчитаем метрику только по пользователям, для которых делали предсказания.

In [None]:
final_result = test.groupby('user_id')['item_id'].unique().reset_index()
final_result.columns=['user_id', 'test_actual']

final_result = final_result.merge(pred_items,
                                   on='user_id',
                                   how='inner')

result_presicion = final_result.apply(lambda row: precision_at_k(row['final_recommendations'], row['test_actual']), axis=1).mean()



In [114]:
result_presicion

0.2716870194482135

In [None]:
#неплохой результат)