In [170]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# #5 Ranking and Hybrid Recommender Systems

[Построение длинного и короткого портфеля акций с помощью
нового алгоритма listwise learn-to-ranking](https://arxiv.org/pdf/2104.12484.pdf)_(pdf)_

[Recommender system using Bayesian personalized ranking](https://towardsdatascience.com/recommender-system-using-bayesian-personalized-ranking-d30e98bba0b9)_(towardsdatascience)_

[Intro to WARP Loss, automatic differentiation and PyTorch](https://medium.com/@gabrieltseng/intro-to-warp-loss-automatic-differentiation-and-pytorch-b6aa5083187a)_(towardsdatascience)_

In [1]:
import os, sys
import pandas as pd
import numpy as np
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

from lightfm import LightFM
from lightfm.evaluation import precision_at_k


module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.utils import prefilter_items
print('Done')

Done


In [3]:
data = pd.read_csv('../data/retail_train.csv')

item_features = pd.read_csv('../features_data/product.csv')
user_features = pd.read_csv('../features_data/hh_demographic.csv')

In [4]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
# prefilter data
print('{:=^79}'.format('Train Sample'))
data_train_filtered = prefilter_items(data_train, take_n_popular=5000, item_features=item_features)


# prepare CSR train data
user_item_matrix = pd.pivot_table(data_train_filtered, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', 
                                  aggfunc='count', 
                                  fill_value=0
                                 ).astype(float) 
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

# prepare CSR test data
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]
test_user_item_matrix = pd.pivot_table(data_test, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )
test_user_item_matrix = test_user_item_matrix.astype(float)

== Starting prefilter info ==
shape: (2278490, 12)
# users: 2499
# items: 86865
Sparsity: 1.050%
== Ending prefilter info ==
shape: (641574, 13)
# users: 2474
# items: 5000
Sparsity: 5.187%
[1mnew_columns:[0m {'price'}


In [6]:
# prepare dicts
userids = np.union1d(user_item_matrix.index.values, test_user_item_matrix.index.values)
itemids = np.union1d(user_item_matrix.columns.values, test_user_item_matrix.columns.values)

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [7]:
# prepare user/items features
user_feat = pd.DataFrame(user_item_matrix.index)
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat.set_index('user_id', inplace=True)

item_feat = pd.DataFrame(user_item_matrix.columns)
item_feat = item_feat.merge(item_features, on='item_id', how='left')
item_feat.set_index('item_id', inplace=True)


# encoding features
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

## *Grid Search with `hyperopt`*

In [182]:
# %pip install hyperopt

In [178]:
def evaluate_model(model, filtered_data, user_features, item_features) -> float:
    user_item_matrix = pd.pivot_table(
        filtered_data, 
        index='user_id', columns='item_id', 
        values='price',
        aggfunc='sum', 
        fill_value=0
    ).astype(float)
    sparse_user_item = csr_matrix(user_item_matrix).tocsr()

    user_feat = pd.DataFrame(user_item_matrix.index)
    user_feat = user_feat.merge(user_features, on='user_id', how='left')
    user_feat.set_index('user_id', inplace=True)
    item_feat = pd.DataFrame(user_item_matrix.columns)
    item_feat = item_feat.merge(item_features, on='item_id', how='left')
    item_feat.set_index('item_id', inplace=True)

    user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
    item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

    model.fit(
          (sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
          sample_weight=coo_matrix(user_item_matrix),
          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=4,
          verbose=False
          )

    p = precision_at_k(
        model, 
        sparse_user_item, 
        user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
        item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
        k=5).mean()

    return p, model

In [179]:
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK
from math import e

def hyperopt_obj(params, data):
    t = params['type']
    del params['type']
    if t == 'lightfm':
        clf = LightFM(**params)
    else:
        return 0
    
    return evaluate_model(model=clf, 
                          filtered_data=data,
                          user_features=user_features,
                          item_features=item_features)

In [1]:
space = hp.choice('task', 
    [
        {'type': 'lightfm',
         'loss': hp.choice('loss', ['warp', 'bpr']),
         'no_components': hp.choice('n_components', range(40, 150)),
         'learning_rate': hp.choice('learning_rate', [.005, .01, .1, .15,]),
         'max_sampled': hp.choice('max_sampled', range(5, 16)),
         'epsilon': hp.uniform('epsilon', 1e-6, 1e-2),
         'item_alpha': hp.uniform('item_alpha', 1e-3, 0.1),
         'user_alpha': hp.uniform('user_alpha', 1e-3, 0.1),
         'random_state': hp.choice('random_state', [111])}
    ])

count = 0
best = 0
def f(params):
    global best, count
    best_acc_delta = 10e3 # I want to get minimal train and test accuracy delta 
    count += 1
    acc_train, model = hyperopt_obj(params.copy(), data_train_filtered)
    acc_test = precision_at_k(model, csr_matrix(test_user_item_matrix).tocsr(), 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                k=5).mean()
    acc_delta = abs(acc_train - acc_test)
    if acc_delta < best_acc_delta:
        print('{:=^79}'.format(f'new best! train: {acc_train} test: {acc_test}'))
        print('using: {}'.format(model.get_params()))
        best_acc_delta = acc_delta
        best = acc_test

    return {'loss': -acc_test, 'status': STATUS_OK}


trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=200, trials=trials)
print(f'best:\n{best}')

# output in output_1.txt

In [183]:
best = {'loss': 'bpr', 
        'no_components': 75, 
        'learning_rate': 0.1,
        'epsilon': 0.00037744557464881477, 
        'max_sampled': 5, 
        'item_alpha': 0.010879605896560014, 
        'user_alpha': 0.03401434201826126, 
        'random_state': 0}

## *Init model*

In [184]:
model = LightFM(**best)

## *Fit Train*

In [185]:
%%time

model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
          sample_weight=coo_matrix(user_item_matrix),
          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=4,
          verbose=False) 

CPU times: user 3min 54s, sys: 408 ms, total: 3min 55s
Wall time: 1min


<lightfm.lightfm.LightFM at 0x7fbaf2a748e0>

## *Evaluation*

In [186]:
user_emb = model.get_user_representations(features=csr_matrix(user_feat_lightfm.values).tocsr())
item_emb = model.get_item_representations(features=csr_matrix(item_feat_lightfm.values).tocsr())

print('user_emb shapes:', user_emb[0].shape, user_emb[1].shape)
print('item_emb shapes:', item_emb[0].shape, item_emb[1].shape)

user_emb shapes: (2455,) (2455, 75)
item_emb shapes: (5000,) (5000, 75)


In [187]:
test_item_ids = np.array([1, 2, 3, 200, 1200, 4, 5])

predictions = model.predict(user_ids=0, item_ids=test_item_ids,
                            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                            num_threads=4)
predictions

array([-231.96638, -232.30417, -232.56096, -232.52869, -232.30084,
       -232.48209, -232.49571], dtype=float32)

In [188]:
print('{} new items'.format(len([i for i in test_user_item_matrix.columns if i not in user_item_matrix.columns])))
print('{} new users'.format(len([i for i in test_user_item_matrix.index if i not in user_item_matrix.index])))

1589 new items
18 new users


In [189]:
users_purchases = data_train_filtered.copy()
users_purchases = users_purchases.groupby('user_id').apply(lambda df: list(map(lambda x: int(x), df.item_id.unique()))).reset_index(name='purchases')
users_purchases['purchases_count'] = users_purchases['purchases'].apply(lambda x: len(x))
users_purchases

Unnamed: 0,user_id,purchases,purchases_count
0,1,"[912676, 945805, 958046, 977545, 1043064, 1055...",142
1,2,"[1077555, 846833, 1133018, 824005, 941183, 845...",132
2,3,"[878996, 882830, 904360, 951590, 964968, 10920...",182
3,4,"[877523, 883932, 901032, 904973, 938566, 96496...",38
4,5,"[938983, 866292, 925054, 946308, 1022011, 1071...",35
...,...,...,...
2450,2496,"[886703, 956609, 1105467, 1138132, 866227, 955...",242
2451,2497,"[1101956, 900802, 951590, 1065067, 1119089, 88...",235
2452,2498,"[1031697, 1049832, 908531, 985480, 1025457, 82...",140
2453,2499,"[853197, 903230, 821344, 822346, 846830, 85167...",214


In [190]:
users_purchases.query('user_id == {}'.format(1)).purchases

0    [912676, 945805, 958046, 977545, 1043064, 1055...
Name: purchases, dtype: object

In [191]:
user_features.query('user_id == 2455')

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
788,45-54,B,25-34K,Homeowner,2 Adults No Kids,2,None/Unknown,2455


In [192]:
# from lightfm.py
user_ids = np.repeat(np.int32(0), len(test_item_ids))
user_ids.max()+1, user_features.shape[0]

(1, 801)

In [193]:
[i for i in users_purchases.user_id if i not in user_feat_lightfm.index]

[]

In [2]:
test_item_ids = [i for i in range(100)] 
    # как и где лучше взять тестовые значения?

for user_id in users_purchases.user_id:
    print('USER: {}'.format(user_id))
    bought = users_purchases.query('user_id == {}'.format(user_id)).purchases.values.tolist()[0]
    print('bought: {}'.format(len(bought)))
    predictions = model.predict(user_ids=user_id, item_ids=test_item_ids,
                                user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                num_threads=4)
    print(predictions.argsort(axis=-1))
    
    
#     когда юзер = 2455, алгоритм ругается на отсутствие его в
#     user-features датасете, хотя он есть. 

#     Буду благодарна за комментарии по этому поводу

# output in output_2.txt

In [195]:
user_features.query('user_id == 2455')

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
788,45-54,B,25-34K,Homeowner,2 Adults No Kids,2,None/Unknown,2455


In [196]:
# from lightfm.py
user_ids = np.repeat(np.int32(0), len(test_item_ids))
user_ids.max()+1, user_features.shape[0]

(1, 801)

In [197]:
[i for i in users_purchases.user_id if i not in user_feat_lightfm.index]

[]

In [157]:
def print_bold_intersect(actual, preds):
    for pred in preds:
        if pred in actual:
            print('\033[1m'+str(pred), end=' ')
            continue
        print('\033[0m'+str(pred), end = ' ')
    print('\033[0m')

In [3]:
from src.metrics import precision_at_k, recall_at_k

N = 10
precisions = []
recalls = []

for user_id in users_purchases.user_id:
    bought = users_purchases.query('user_id == {}'.format(user_id)).purchases.values.tolist()[0]
    try:
        predictions = model.predict(user_ids=user_id, item_ids=test_item_ids,
                                user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                num_threads=4)
    except Exception as e:
        print('USER_ID: {}'.format(user_id))
        print(str(e))
    ranged_items_ids = [int(id_to_itemid[i]) for i in predictions.argsort(axis=-1)[:N]]

    p = precision_at_k(ranged_items_ids, bought)
    r = recall_at_k(ranged_items_ids, bought)
    if p*r != 0:
        print('USER_ID: {}\npredictions:'.format(user_id))
        print_bold_intersect(bought, ranged_items_ids)
        print('precision@k50:', p)
        print('recall@50:', r)
        
# output in output_3.txt

## *Train Precision*

In [125]:
%%time
from lightfm.evaluation import precision_at_k

train_precision = precision_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()

train_precision 

CPU times: user 11.3 s, sys: 48.2 ms, total: 11.3 s
Wall time: 11.5 s


0.12668024

## *Test Precision*

In [127]:
test_precision = precision_at_k(model, csr_matrix(test_user_item_matrix).tocsr(), 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()

test_precision 

0.0034650788

In [161]:
from lightfm.evaluation import recall_at_k, reciprocal_rank, auc_score

In [162]:
test_recall = recall_at_k(model, csr_matrix(test_user_item_matrix).tocsr(), 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()

test_recall

0.0012809259397784205

In [163]:
test_rr = reciprocal_rank(model, csr_matrix(test_user_item_matrix).tocsr(), 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                ).mean()

test_rr

0.016455995

In [164]:
test_auc = auc_score(model, csr_matrix(test_user_item_matrix).tocsr(), 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 ).mean()

test_auc

0.4973105

очень плохие результаты