# Course project

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [3]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [5]:
# делим на 3 части DF. Всего 95 недель
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

# берем данные для тренировки matching модели (1-85 неделя)
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)] 

# берем данные для валидации matching модели (86-91 неделя)
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))] # 

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy() 

# берем данные для теста ranking, matching модели (92-95 неделя)
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

# сделаем объединенный сет данных для первого уровня (матчинга)  (1-91 неделя)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [6]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [7]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


# Prefilter items

In [8]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 5001


# Make cold-start to warm-start

In [9]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (784420, 13) Users: 1915 Items: 4999
val_matcher
Shape: (163261, 12) Users: 1915 Items: 27118
train_ranker
Shape: (163261, 12) Users: 1915 Items: 27118
val_ranker
Shape: (115989, 12) Users: 1915 Items: 24042


In [10]:
result = data_train_matcher.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[825123, 999999, 845307, 852014, 856942, 99102..."
1,6,"[851819, 851903, 863447, 876232, 907099, 99079..."


# 1. Бейзлайны

In [24]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    popular=popular.loc[popular.item_id !=999999]
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [25]:
popular_recs = popularity_recommendation(data_train_matcher, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

Unnamed: 0,user_id,actual,popular_recommendation
0,1,"[825123, 999999, 845307, 852014, 856942, 99102...","[1029743, 916122, 5569230, 1106523, 844179]"
1,6,"[851819, 851903, 863447, 876232, 907099, 99079...","[1029743, 916122, 5569230, 1106523, 844179]"


In [26]:
print("Precision:",result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean())

Precision: 0.4375979112271543


In [39]:
# подгатавливаем данные для ALS

user_item_matrix = pd.pivot_table(data_train_matcher, index='user_id', columns='item_id', 
               values='quantity', 
               aggfunc='count', fill_value=0)

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()


userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [40]:
def get_recommendations(user, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in model.recommend(userid=userid_to_id[user],  # userid - id от 0 до N
                           user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                           N=N, # кол-во рекомендаций 
                           filter_already_liked_items=False, 
                           filter_items=[itemid_to_id[999999]], 
                           recalculate_user=False)]
    return res

In [41]:
from implicit.bpr import BayesianPersonalizedRanking
from implicit.als import AlternatingLeastSquares

In [52]:
from hyperopt import fmin, tpe, hp, Trials

# als
# best{'IT': 52, 'N': 196, 'regular': 0.03666397621050393}

space = [hp.randint('N', 90, 200),
         hp.uniform('regular', 0.001, 0.05),
         hp.randint('IT', 40, 80)]
mmm=0
def f(args):
    N, regular ,IT = args
    
    
    model = AlternatingLeastSquares(factors=N, 
                                regularization=regular,
                                iterations=IT, 
                                calculate_training_loss=True, 
                                num_threads=0)
    #model = BayesianPersonalizedRanking(factors=N, 
    #                            regularization=regular,
    #                            learning_rate=0.01,
    #                            iterations=IT, 
    #                            num_threads=4)

    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)
    mmm=+1
    result['BPR_test_'+str(mmm)] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=50))
    PRECISION=result.apply(lambda row: precision_at_k(row['BPR_test_'+str(mmm)], row['actual'], k=50), axis=1).mean()
    print(f'PRECISION= {PRECISION} and N, regular ,IT= {N, regular ,IT}')
    
    return 1-PRECISION

In [53]:
%%time

trials = Trials()

best = fmin(f, space, algo = tpe.suggest, max_evals=15, trials=trials)
print ('TPE result: ', best)

  0%|                                                                           | 0/15 [00:00<?, ?trial/s, best loss=?]

  0%|          | 0/60 [00:00<?, ?it/s]

PRECISION= 0.5907362924281995 and N, regular ,IT= (96, 0.008904567165385176, 60)                                       
  7%|███▏                                           | 1/15 [03:19<46:30, 199.30s/trial, best loss: 0.40926370757180053]

  0%|          | 0/52 [00:00<?, ?it/s]

PRECISION= 0.740731070496087 and N, regular ,IT= (196, 0.03666397621050393, 52)                                        
 13%|██████▌                                          | 2/15 [06:40<43:28, 200.66s/trial, best loss: 0.259268929503913]

  0%|          | 0/42 [00:00<?, ?it/s]

PRECISION= 0.5837284595300268 and N, regular ,IT= (92, 0.009397680415671059, 42)                                       
 20%|█████████▊                                       | 3/15 [10:06<40:34, 202.89s/trial, best loss: 0.259268929503913]

  0%|          | 0/68 [00:00<?, ?it/s]

PRECISION= 0.6400522193211499 and N, regular ,IT= (122, 0.03401067034373752, 68)                                       
 27%|█████████████                                    | 4/15 [13:31<37:20, 203.66s/trial, best loss: 0.259268929503913]

  0%|          | 0/70 [00:00<?, ?it/s]

PRECISION= 0.6390182767624041 and N, regular ,IT= (121, 0.04433674155745719, 70)                                       
 33%|████████████████▎                                | 5/15 [17:02<34:25, 206.52s/trial, best loss: 0.259268929503913]

  0%|          | 0/61 [00:00<?, ?it/s]

PRECISION= 0.6726370757180178 and N, regular ,IT= (142, 0.02559541285337843, 61)                                       
 40%|███████████████████▌                             | 6/15 [20:32<31:06, 207.42s/trial, best loss: 0.259268929503913]

  0%|          | 0/43 [00:00<?, ?it/s]

PRECISION= 0.6797702349869484 and N, regular ,IT= (147, 0.019024952260328812, 43)                                      
 47%|██████████████████████▊                          | 7/15 [23:56<27:30, 206.30s/trial, best loss: 0.259268929503913]

  0%|          | 0/67 [00:00<?, ?it/s]

PRECISION= 0.6998537859007862 and N, regular ,IT= (161, 0.02596147665623352, 67)                                       
 53%|██████████████████████████▏                      | 8/15 [27:25<24:11, 207.39s/trial, best loss: 0.259268929503913]

  0%|          | 0/62 [00:00<?, ?it/s]

PRECISION= 0.5955300261096612 and N, regular ,IT= (98, 0.030514864141171088, 62)                                       
 60%|█████████████████████████████▍                   | 9/15 [30:49<20:37, 206.21s/trial, best loss: 0.259268929503913]

  0%|          | 0/79 [00:00<?, ?it/s]

PRECISION= 0.6788720626631872 and N, regular ,IT= (146, 0.04794291105144384, 79)                                       
 67%|████████████████████████████████                | 10/15 [34:27<17:29, 209.99s/trial, best loss: 0.259268929503913]

  0%|          | 0/79 [00:00<?, ?it/s]

PRECISION= 0.6271122715404714 and N, regular ,IT= (115, 0.0362348903997725, 79)                                        
 73%|███████████████████████████████████▏            | 11/15 [38:05<14:09, 212.45s/trial, best loss: 0.259268929503913]

  0%|          | 0/46 [00:00<?, ?it/s]

PRECISION= 0.6736814621409943 and N, regular ,IT= (143, 0.020276670556757612, 46)                                      
 80%|██████████████████████████████████████▍         | 12/15 [41:39<10:38, 212.78s/trial, best loss: 0.259268929503913]

  0%|          | 0/41 [00:00<?, ?it/s]

PRECISION= 0.7013890339425619 and N, regular ,IT= (163, 0.015861336096657325, 41)                                      
 87%|█████████████████████████████████████████▌      | 13/15 [45:10<07:04, 212.17s/trial, best loss: 0.259268929503913]

  0%|          | 0/52 [00:00<?, ?it/s]

PRECISION= 0.7372741514360355 and N, regular ,IT= (193, 0.006069764127138768, 52)                                      
 93%|████████████████████████████████████████████▊   | 14/15 [48:41<03:31, 211.84s/trial, best loss: 0.259268929503913]

  0%|          | 0/77 [00:00<?, ?it/s]

PRECISION= 0.7377336814621455 and N, regular ,IT= (193, 0.011213130664924086, 77)                                      
100%|████████████████████████████████████████████████| 15/15 [52:14<00:00, 208.99s/trial, best loss: 0.259268929503913]
TPE result:  {'IT': 52, 'N': 196, 'regular': 0.03666397621050393}
Wall time: 52min 14s


In [42]:
# best {'IT': 47, 'N': 62}
model = BayesianPersonalizedRanking(factors=62, 
                                regularization=0.01,
                                learning_rate=0.01,
                                iterations=47, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

  0%|          | 0/47 [00:00<?, ?it/s]

In [43]:
%%time
result['bpr_bm50'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=50))

print("Precision:",result.apply(lambda row: precision_at_k(row['bpr_bm50'], row['actual']), axis=1).mean())

Precision: 0.5755613577023487
Wall time: 3min 14s


In [56]:
#best{'IT': 52, 'N': 196, 'regular': 0.03666397621050393}

model = AlternatingLeastSquares(factors=196, 
                                regularization=0.0366,
                                iterations=52, 
                                calculate_training_loss=True, 
                                num_threads=0)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

  0%|          | 0/52 [00:00<?, ?it/s]

In [57]:
%%time
result['als_bm50'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=50))

print("Precision:",result.apply(lambda row: precision_at_k(row['als_bm50'], row['actual']), axis=1).mean())

Precision: 0.971697127937336
Wall time: 3min 18s


In [58]:
result.head(5)

Unnamed: 0,user_id,actual,popular_recommendation,bpr_bm50,als_bm50,BPR_test_1
0,1,"[825123, 999999, 845307, 852014, 856942, 99102...","[1029743, 916122, 5569230, 1106523, 844179]","[912704, 852856, 1022097, 5568447, 908318, 110...","[1029743, 5569374, 940947, 10149640, 8090521, ...","[8090521, 1029743, 1004906, 10149640, 940947, ..."
1,6,"[851819, 851903, 863447, 876232, 907099, 99079...","[1029743, 916122, 5569230, 1106523, 844179]","[930118, 878996, 965267, 1023720, 1127831, 112...","[12301109, 1044078, 844179, 874972, 1070820, 8...","[874972, 12301109, 1044078, 844179, 1023720, 8..."
2,7,"[999999, 1020581, 1029743, 1040183, 1068504, 1...","[1029743, 916122, 5569230, 1106523, 844179]","[916122, 1029743, 897954, 866211, 985999, 1126...","[1122358, 12810393, 1106523, 893018, 866211, 8...","[1122358, 1106523, 12810393, 866211, 893018, 1..."
3,8,"[999999, 841220, 860501, 888543, 902094, 90813...","[1029743, 916122, 5569230, 1106523, 844179]","[844179, 916122, 5569230, 1004906, 892531, 823...","[12301109, 823704, 1029743, 1004906, 8090521, ...","[12301109, 823704, 844179, 1029743, 1081177, 1..."
4,9,"[882190, 949294, 999999, 1070820, 5568845, 556...","[1029743, 916122, 5569230, 1106523, 844179]","[1029743, 8090521, 1106523, 8090537, 5569230, ...","[1029743, 5569230, 1070820, 862799, 8090521, 8...","[1029743, 5569230, 1070820, 893018, 862799, 80..."


In [48]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [59]:
TOPK_PRECISION = 50
sorted(calc_precision(result, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('als_bm50', 0.7406788511749378),
 ('BPR_test_1', 0.7377336814621455),
 ('popular_recommendation', 0.4375979112271543),
 ('bpr_bm50', 0.2972114882506528)]

## Используем модуль recommender

In [60]:
recommender = MainRecommender(data_train_matcher)

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/4999 [00:00<?, ?it/s]

In [76]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [77]:
%%time

result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))
#result_eval_matcher['sim_user_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_users_recommendation(x, N=N_PREDICT))

popular_recs = popularity_recommendation(data_val_matcher, n=50)
result_eval_matcher['best_rec']=result_eval_matcher[USER_COL].apply(lambda x: popular_recs )

Wall time: 57.7 s


In [78]:
result_eval_matcher['als_50']=result_eval_matcher['user_id'].apply(lambda x: get_recommendations(x, model=model, N=50))

In [79]:
# в кач-ве эксперемента возьмем объединение 2ух разных рекомендательных систем. 
# ТК нужно 50, то дополним объдинения популярными товарами.

top50 = popularity_recommendation(data_val_matcher, n=N_PREDICT)
data_list=[]
for i in range(len(result_eval_matcher)):
    intersect=list(np.intersect1d(result_eval_matcher['own_rec'][i], result_eval_matcher['als_rec'][i]))
    if len(intersect)<N_PREDICT:
        intersect+= top50[:(N_PREDICT-len(intersect))]
                            
    data_list.append(intersect)
result_eval_matcher.insert(2, 'own_als', data_list)
result_eval_matcher.head()

Unnamed: 0,user_id,actual,own_als,own_rec,sim_item_rec,als_rec,best_rec,als_50
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[825123, 835796, 849066, 856942, 877391, 91145...","[856942, 9297615, 5577022, 877391, 9655212, 10...","[1045586, 5582712, 9297615, 5577022, 1132231, ...","[965766, 871756, 5577022, 1013167, 856942, 929...","[1029743, 916122, 5569230, 1106523, 844179, 10...","[1029743, 5569374, 940947, 10149640, 8090521, ..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[819308, 900802, 995598, 996269, 1037337, 1078...","[13003092, 995598, 923600, 972416, 1084036, 11...","[948650, 1021324, 13002975, 941361, 5569230, 8...","[878996, 896613, 965267, 1105488, 996087, 9346...","[1029743, 916122, 5569230, 1106523, 844179, 10...","[12301109, 1044078, 844179, 874972, 1070820, 8..."
2,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[855350, 870882, 871513, 916575, 928932, 93968...","[998519, 894360, 7147142, 9338009, 896666, 939...","[1038985, 7147145, 982469, 6602697, 1014565, 9...","[12384775, 1122358, 7144132, 825994, 1028953, ...","[1029743, 916122, 5569230, 1106523, 844179, 10...","[1122358, 12810393, 1106523, 893018, 866211, 8..."
3,8,"[868075, 886787, 945611, 1005186, 1008787, 101...","[829722, 849018, 859259, 878636, 930385, 96277...","[12808385, 939860, 981660, 7410201, 5577022, 6...","[1021324, 1110111, 982469, 7155863, 6463649, 1...","[823704, 1131438, 844179, 12172240, 1132198, 7...","[1029743, 916122, 5569230, 1106523, 844179, 10...","[12301109, 823704, 1029743, 1004906, 8090521, ..."
4,9,"[883616, 1029743, 1039126, 1051323, 1082772, 1...","[851683, 862799, 872146, 872762, 882190, 89301...","[872146, 918046, 9655676, 985622, 1056005, 109...","[1056005, 5569230, 9526278, 1026946, 862799, 7...","[1056005, 862799, 1128244, 872146, 5568845, 55...","[1029743, 916122, 5569230, 1106523, 844179, 10...","[1029743, 5569230, 1070820, 862799, 8090521, 8..."


In [84]:
TOPK_PRECISION = 50
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_als', 0.11592689295039169),
 ('als_50', 0.10597389033942596),
 ('als_rec', 0.0959373368146218),
 ('best_rec', 0.09264751958224568),
 ('own_rec', 0.07660574412532668),
 ('sim_item_rec', 0.04629765013054847)]

In [85]:
TOPK_PRECISION = 5
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.18872062663185182),
 ('als_50', 0.16605744125326158),
 ('als_rec', 0.16187989556135587),
 ('own_als', 0.1362924281984325),
 ('best_rec', 0.11394255874673555),
 ('sim_item_rec', 0.07383812010443891)]

# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах
- Я *для примера* сгенерирую топ-50 кадидиатов через get_own_recommendations
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

In [86]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [88]:
df_match_candidates=df_match_candidates.merge(result_eval_matcher[['user_id', 'own_als']], how='left', on='user_id')
df_match_candidates.rename(columns={'own_als': 'candidates'}, inplace=True)

In [91]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[829955, 837865, 917033, 926905, 935770, 94458..."
1,2021,"[835578, 845208, 858373, 863762, 870515, 88393..."


In [92]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,829955
0,2070,837865
0,2070,917033
0,2070,926905


In [95]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (95750, 2) Users: 1915 Items: 4450


In [96]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [97]:
df_ranker_train.target.value_counts()

0.0    84565
1.0    18949
Name: target, dtype: int64

In [98]:
df_ranker_train.head(9)

Unnamed: 0,user_id,item_id,target
0,2070,829955,0.0
1,2070,837865,0.0
2,2070,917033,0.0
3,2070,926905,0.0
4,2070,935770,0.0
5,2070,944588,0.0
6,2070,970866,0.0
7,2070,1008814,0.0
8,2070,1016800,0.0


In [99]:
df_ranker_train['target'].mean()

0.18305736422126476

In [100]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,829955,0.0,1094,MEAT-PCKGD,National,LUNCHMEAT,LOAVE,8 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,837865,0.0,781,MEAT-PCKGD,National,BACON,FLAVORED/OTHER,1 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [101]:
df_join_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


# Добавляем Фичи

In [102]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


In [103]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,2070,829955,0.0,1094,MEAT-PCKGD,National,LUNCHMEAT,LOAVE,8 OZ,45-54,...,57,56,1996,5754.86,0.626374,1218.32967,0.000232,0.452137,0.000228,0.00814
1,2070,837865,0.0,781,MEAT-PCKGD,National,BACON,FLAVORED/OTHER,1 LB,45-54,...,213,155,1996,5754.86,2.340659,1218.32967,0.000869,0.452137,0.000632,0.00814
2,2070,917033,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,481,267,1996,5754.86,5.285714,1218.32967,0.001962,0.452137,0.001089,0.00814
3,2070,926905,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,786,553,1996,5754.86,8.637363,1218.32967,0.003205,0.452137,0.002255,0.00814
4,2070,935770,0.0,69,GROCERY,Private,CHEESE,CHEESE CRACKERS (CHEEZ-ITS/GOL,8 OZ,45-54,...,53,47,1996,5754.86,0.582418,1218.32967,0.000216,0.452137,0.000192,0.00814


In [104]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [105]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'total_item_sales_value',
 'total_quantity_value',
 'item_freq',
 'user_freq',
 'total_user_sales_value',
 'item_quantity_per_week',
 'user_quantity_per_week',
 'item_quantity_per_basket',
 'user_quantity_per_baskter',
 'item_freq_per_basket',
 'user_freq_per_basket']

In [106]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103514 entries, 0 to 103513
Data columns (total 26 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   user_id                    103514 non-null  int64   
 1   item_id                    103514 non-null  int64   
 2   manufacturer               103514 non-null  category
 3   department                 103514 non-null  category
 4   brand                      103514 non-null  category
 5   commodity_desc             103514 non-null  category
 6   sub_commodity_desc         103514 non-null  category
 7   curr_size_of_product       103514 non-null  category
 8   age_desc                   43085 non-null   category
 9   marital_status_code        43085 non-null   category
 10  income_desc                43085 non-null   category
 11  homeowner_desc             43085 non-null   category
 12  hh_comp_desc               43085 non-null   category
 13  household_size

In [270]:
#import xgboost as xgb
#from xgboost.sklearn import XGBClassifier,XGBRegressor
#from catboost import CatBoostClassifier
#from sklearn.ensemble import RandomForestClassifier

In [107]:
# {'D': 24, 'N': 743, 'rate': 0.19655305158077582} LGBM 
# PRECISION= 0.15488250652741367, n,d,rate=(1145, 47, 0.18924870049845754)   BEST
# {'D': 34, 'N': 953, 'rate': 0.21277255242925144}LGBM1
# {'D': 39, 'N': 14, 'rate': 0.4309099723394805} CatBoostClassifier
# {'criterion': 1, 'max_depth': 18, 'max_features': 527, 'n_estimators': 12}  RandomForestClassifier
# PRECISION:0.15759791122715236, criterion: entropy, max_depth: 26, max_features: 901, n_estimators: 29 
#{'D': 47, 'N': 1145, 'rate': 0.18924870049845754}
#n,d,rate=(714, 41, 0.17261300008166008) 

#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.model_selection import cross_val_score

#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#n_scores = cross_val_score(lgb, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
#print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

#lgb = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
#                max_depth = 5, alpha = 10, n_estimators = 10)

#lgb = CatBoostClassifier(iterations=39, learning_rate=0.4309, depth=14, cat_features=cat_feats[:13])
#lgb=RandomForestClassifier(max_depth=26, max_features=901, n_estimators=12, criterion='entropy')

#lgb = xgb.XGBClassifier(tree_method="exact", enable_categorical=True, use_label_encoder=False)

  return f(*args, **kwargs)


In [None]:
# После проверки различных вариантов, алгоритм LGBMClassifier показал лучшие рез-ты
lgb = LGBMClassifier(objective='binary',
                     max_depth=47,
                     n_estimators=1145,
                     learning_rate=0.18924,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)
train_preds = lgb.predict_proba(X_train)

In [108]:
df_ranker_predict = df_ranker_train.copy()

In [109]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [110]:
df_ranker_predict.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,proba_item_purchase
0,2070,829955,0.0,1094,MEAT-PCKGD,National,LUNCHMEAT,LOAVE,8 OZ,45-54,...,56,1996,5754.86,0.626374,1218.32967,0.000232,0.452137,0.000228,0.00814,0.002112
1,2070,837865,0.0,781,MEAT-PCKGD,National,BACON,FLAVORED/OTHER,1 LB,45-54,...,155,1996,5754.86,2.340659,1218.32967,0.000869,0.452137,0.000632,0.00814,0.003477
2,2070,917033,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,267,1996,5754.86,5.285714,1218.32967,0.001962,0.452137,0.001089,0.00814,0.002809
3,2070,926905,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,553,1996,5754.86,8.637363,1218.32967,0.003205,0.452137,0.002255,0.00814,0.003706
4,2070,935770,0.0,69,GROCERY,Private,CHEESE,CHEESE CRACKERS (CHEEZ-ITS/GOL,8 OZ,45-54,...,47,1996,5754.86,0.582418,1218.32967,0.000216,0.452137,0.000192,0.00814,0.000316
5,2070,944588,0.0,1094,MEAT-PCKGD,National,LUNCHMEAT,HAM,12 OZ,45-54,...,67,1996,5754.86,0.824176,1218.32967,0.000306,0.452137,0.000273,0.00814,0.002477
6,2070,970866,0.0,5612,GROCERY,National,SUGARS/SWEETNERS,SWEETENERS,9.7 OZ,45-54,...,152,1996,5754.86,1.736264,1218.32967,0.000644,0.452137,0.00062,0.00814,0.006879
7,2070,1008814,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,150,1996,5754.86,2.362637,1218.32967,0.000877,0.452137,0.000612,0.00814,0.000345
8,2070,1016800,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,450,1996,5754.86,8.186813,1218.32967,0.003038,0.452137,0.001835,0.00814,0.00495
9,2070,1053016,0.0,69,GROCERY,Private,CHEESE,NATURAL CHEESE EXACT WT CHUNKS,24 OZ,45-54,...,154,1996,5754.86,1.791209,1218.32967,0.000665,0.452137,0.000628,0.00814,0.001326


## След 2 блоки использовались для поиска оптимальных пар-ов!

In [127]:
# подборка пар-ов

from hyperopt import fmin, tpe, hp, Trials
from sklearn.ensemble import RandomForestClassifier

#LGBMClassifier
space = [hp.randint('N', 300, 1000), 
        hp.randint('D', 10, 35),
        hp.uniform('rate', 0.10, 0.3)]

#CatBoostClassifier
#space = [hp.randint('N', 5, 15), 
#         hp.randint('D', 10, 50),
#         hp.uniform('rate', 0.3, 0.9)]

#RandomForestClassifier
#space = [hp.choice('max_depth', range(15,35)),
#        hp.choice('max_features', range(500,2000)),
#        hp.choice('n_estimators', range(10,30)),
#        hp.choice('criterion', ["gini", "entropy"])]

mmm=0
def f(args):    
    N, D,rate = args
    #max_depth, max_features, n_estimators, criterion = args
    
    
    lgb = LGBMClassifier(objective='binary',
                     max_depth=D,
                     n_estimators=N,
                     learning_rate=rate,
                     categorical_column=cat_feats)
    
    #lgb = CatBoostClassifier(iterations=D, learning_rate=rate, depth=N, cat_features=cat_feats[:13])
    #lgb=RandomForestClassifier(max_depth=max_depth, max_features=max_features, n_estimators=n_estimators, criterion=criterion)
    
    lgb.fit(X_train, y_train)
    
    mmm=+1
    train_preds = lgb.predict_proba(X_train)
    df_ranker_predict = df_ranker_train.copy()
    df_ranker_predict['proba_item_purchase'] = train_preds[:,1]
    
    result_eval_ranker['reranked_own_rec_'+str(mmm)] = result_eval_ranker[USER_COL].apply(lambda user_id: df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist())
    
    
    #result['BPR_test_'+str(mmm)] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))
    
    PRECISION=result_eval_ranker.apply(lambda row: precision_at_k(row['reranked_own_rec_'+str(mmm)], row['actual']), axis=1).mean()
    
    print(f'PRECISION= {PRECISION}, n,d,rate={N, D, rate}')
    #print(f'PRECISION= {PRECISION} and for index 1 prec= {result["BPR_test_"+str(mmm)].iloc[1]}')
    #print(f'PRECISION:{PRECISION}, criterion: {criterion}, max_depth: {max_depth}, max_features: {max_features}, n_estimators: {n_estimators}' )
    #print(f'N={N}, trees={depth}, l_rate={l_rate}, pred={train_preds[:,1]}')
    
    return 1-PRECISION

In [334]:
%%time

trials = Trials()

best = fmin(f, space, algo = tpe.suggest, max_evals=20, trials=trials)
print ('TPE result: ', best)

  0%|                                                                           | 0/20 [00:00<?, ?trial/s, best loss=?]

  return f(*args, **kwargs)




PRECISION= 0.15195822454307958, n,d,rate=(1243, 57, 0.1819723830161179)                                                
  5%|██▍                                              | 1/20 [00:07<02:20,  7.37s/trial, best loss: 0.8480417754569204]

  return f(*args, **kwargs)




PRECISION= 0.15154046997388898, n,d,rate=(1090, 35, 0.2248330249893687)                                                
 10%|████▉                                            | 2/20 [00:13<02:03,  6.87s/trial, best loss: 0.8480417754569204]

  return f(*args, **kwargs)




PRECISION= 0.14945169712793582, n,d,rate=(1016, 50, 0.19738283088688202)                                               
 15%|███████▎                                         | 3/20 [00:21<02:03,  7.26s/trial, best loss: 0.8480417754569204]

  return f(*args, **kwargs)




PRECISION= 0.15164490861618657, n,d,rate=(1263, 49, 0.2485408073878282)                                                
 20%|█████████▊                                       | 4/20 [00:29<01:59,  7.45s/trial, best loss: 0.8480417754569204]

  return f(*args, **kwargs)




PRECISION= 0.15373368146213945, n,d,rate=(714, 41, 0.17261300008166008)                                                
 25%|████████████▎                                    | 5/20 [00:34<01:37,  6.53s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15227154046997252, n,d,rate=(1024, 63, 0.17468955870880415)                                               
 30%|██████████████▋                                  | 6/20 [00:41<01:33,  6.67s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15133159268929353, n,d,rate=(1156, 59, 0.23230380169805948)                                               
 35%|█████████████████▏                               | 7/20 [00:49<01:32,  7.08s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15321148825065128, n,d,rate=(888, 44, 0.26191130812960717)                                                
 40%|███████████████████▌                             | 8/20 [00:54<01:19,  6.65s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.14945169712793605, n,d,rate=(1417, 65, 0.17006072729094415)                                               
 45%|██████████████████████                           | 9/20 [01:03<01:19,  7.26s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.149138381201043, n,d,rate=(1018, 62, 0.2767217319596358)                                                  
 50%|████████████████████████                        | 10/20 [01:09<01:10,  7.02s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15174934725848416, n,d,rate=(1206, 51, 0.2598872038277134)                                                
 55%|██████████████████████████▍                     | 11/20 [01:18<01:07,  7.47s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15070496083550766, n,d,rate=(1354, 60, 0.22476041955341816)                                               
 60%|████████████████████████████▊                   | 12/20 [01:26<01:02,  7.75s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15060052219321, n,d,rate=(1277, 60, 0.2164551781244452)                                                   
 65%|███████████████████████████████▏                | 13/20 [01:34<00:54,  7.78s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15018276762401947, n,d,rate=(1185, 66, 0.2171563502042413)                                                
 70%|█████████████████████████████████▌              | 14/20 [01:43<00:47,  7.96s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15268929503916276, n,d,rate=(1086, 44, 0.1625124039967712)                                                
 75%|████████████████████████████████████            | 15/20 [01:49<00:37,  7.54s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15174934725848405, n,d,rate=(775, 69, 0.2552466165840268)                                                 
 80%|██████████████████████████████████████▍         | 16/20 [01:54<00:27,  6.79s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15185378590078175, n,d,rate=(1412, 59, 0.17605453600834783)                                               
 85%|████████████████████████████████████████▊       | 17/20 [02:04<00:23,  7.72s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15216710182767457, n,d,rate=(1447, 45, 0.21660294887119524)                                               
 90%|███████████████████████████████████████████▏    | 18/20 [02:12<00:15,  7.89s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15289817232375844, n,d,rate=(1167, 58, 0.17094544812765167)                                               
 95%|█████████████████████████████████████████████▌  | 19/20 [02:20<00:07,  7.70s/trial, best loss: 0.8462663185378605]

  return f(*args, **kwargs)




PRECISION= 0.15488250652741367, n,d,rate=(1145, 47, 0.18924870049845754)                                               
100%|████████████████████████████████████████████████| 20/20 [02:27<00:00,  7.35s/trial, best loss: 0.8451174934725864]
TPE result:  {'D': 47, 'N': 1145, 'rate': 0.18924870049845754}
Wall time: 2min 27s


In [111]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


In [113]:
result_eval_ranker=result_eval_ranker.merge(result_eval_matcher[['user_id', 'own_als']], how='left', on='user_id')
result_eval_ranker

Unnamed: 0,user_id,actual,own_als
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[825123, 835796, 849066, 856942, 877391, 91145..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[819308, 900802, 995598, 996269, 1037337, 1078..."
2,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[855350, 870882, 871513, 916575, 928932, 93968..."
3,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[829722, 849018, 859259, 878636, 930385, 96277..."
4,9,"[864335, 990865, 1029743, 9297474, 10457112, 8...","[851683, 862799, 872146, 872762, 882190, 89301..."
...,...,...,...
1910,2496,[6534178],"[829291, 991546, 995876, 1056509, 1077231, 559..."
1911,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[820612, 849202, 870515, 1000462, 1004436, 100..."
1912,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[891632, 892048, 924864, 925364, 931579, 93335..."
1913,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[835819, 864857, 869322, 889989, 919766, 94179..."


In [114]:
# Используем только товары, вероятность которых болле 50%. Если их менее 5ти, то добавляем из списка рекомендованные. 

populal_rec=popularity_recommendation(data, n=5)
def rerank(user_id):
    df_=df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False)
    df_=list(set(df_.loc[df_.proba_item_purchase >0.5].item_id.tolist()))
    if len(df_)<5:
        df_=df_+populal_rec[:(5-len(df_))]
    return df_[:5]

In [115]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [116]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.29681462140992104)
('own_als', 0.09827676240208855)


In [128]:
#precision= 0.2968, что больше нужного пар-ра 0.25

# Оценка на тесте для выполнения курсового проекта

In [118]:
df_test = pd.read_csv('retail_test1.csv')
df_transactions = pd.read_csv('retail_train.csv')

In [119]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [120]:
df_transactions.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [121]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [122]:
# ищем общих пользователей
common_users1= list(set(df_test.user_id.values)&set(common_users))

# оставляем общих пользователей
result_test = result_test[result_test.user_id.isin(common_users1)]

In [123]:
result_test

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84..."
4,7,"[847270, 855557, 859987, 863407, 895454, 90663..."
5,8,"[846334, 850834, 857503, 862139, 865891, 87829..."
6,9,"[883404, 995242, 1056005, 889692, 911140, 918046]"
...,...,...
1880,2496,"[829291, 862139, 912704, 933067, 933835, 95537..."
1881,2497,[6534178]
1882,2498,"[1053690, 1076875, 12386123, 858303, 920109, 1..."
1883,2499,"[826249, 895327, 9858944, 820321, 829291, 8323..."


In [124]:
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

In [125]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.27047504509921566)


In [129]:
#что так же больше 0.25