# #5 Ranking and Hybrid Recommender Systems

[Построение длинного и короткого портфеля акций с помощью
нового алгоритма listwise learn-to-ranking](https://arxiv.org/pdf/2104.12484.pdf)_(pdf)_

[Recommender system using Bayesian personalized ranking](https://towardsdatascience.com/recommender-system-using-bayesian-personalized-ranking-d30e98bba0b9)_(towardsdatascience)_

[Intro to WARP Loss, automatic differentiation and PyTorch](https://medium.com/@gabrieltseng/intro-to-warp-loss-automatic-differentiation-and-pytorch-b6aa5083187a)_(towardsdatascience)_

In [2]:
!conda install lightfm

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [100]:
import os, sys
import pandas as pd
import numpy as np
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

from lightfm import LightFM
from lightfm.evaluation import precision_at_k


module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.utils import prefilter_items
print('Done')

Done


In [82]:
data = pd.read_csv('../../retail_train.csv')

item_features = pd.read_csv('../../product.csv')
user_features = pd.read_csv('../../hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [83]:
# prefilter data
print('Train Sample')
data_train_filtered = prefilter_items(data_train, take_n_popular=5000, item_features=item_features)
print()
print('Test Sample')
data_test_filtered = prefilter_items(data_test, take_n_popular=5000, item_features=item_features)


# prepare CSR train data
user_item_matrix = pd.pivot_table(data_train_filtered, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', 
                                  aggfunc='count', 
                                  fill_value=0
                                 ).astype(float) 
sparse_user_item = csr_matrix(user_item_matrix).tocsr()


# prepare CSR test data
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]
test_user_item_matrix = pd.pivot_table(data_test_filtered, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )
test_user_item_matrix = test_user_item_matrix.astype(float)


# prepare dicts
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))


# prepare user/items features
user_feat = pd.DataFrame(user_item_matrix.index)
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat.set_index('user_id', inplace=True)

item_feat = pd.DataFrame(user_item_matrix.columns)
item_feat = item_feat.merge(item_features, on='item_id', how='left')
item_feat.set_index('item_id', inplace=True)


# encoding features
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

Train Sample
== Starting prefilter info ==
shape: (2278490, 12)
# users: 2499
# items: 86865
Sparsity: 1.050%
== Ending prefilter info ==
shape: (1060465, 12)
# users: 2459
# items: 5000
Sparsity: 8.625%

Test Sample
== Starting prefilter info ==
shape: (118314, 12)
# users: 2042
# items: 24329
Sparsity: 0.238%
== Ending prefilter info ==
shape: (66494, 12)
# users: 1905
# items: 5000
Sparsity: 0.698%


## *Grid Search with **hyperopt***

In [23]:
# !pip install hyperopt

In [29]:
model.get_params()

{'loss': 'warp',
 'learning_schedule': 'adagrad',
 'no_components': 40,
 'learning_rate': 0.05,
 'k': 5,
 'n': 10,
 'rho': 0.95,
 'epsilon': 1e-06,
 'max_sampled': 10,
 'item_alpha': 0.1,
 'user_alpha': 0.1,
 'random_state': RandomState(MT19937) at 0x7FC8E8B74E40}

In [91]:
def evaluate_model(model, filtered_data, user_features, item_features) -> float:
    user_item_matrix = pd.pivot_table(
        filtered_data, 
        index='user_id', columns='item_id', 
        values='quantity',
        aggfunc='count', 
        fill_value=0
    ).astype(float)
    sparse_user_item = csr_matrix(user_item_matrix).tocsr()

    user_feat = pd.DataFrame(user_item_matrix.index)
    user_feat = user_feat.merge(user_features, on='user_id', how='left')
    user_feat.set_index('user_id', inplace=True)
    item_feat = pd.DataFrame(user_item_matrix.columns)
    item_feat = item_feat.merge(item_features, on='item_id', how='left')
    item_feat.set_index('item_id', inplace=True)

    user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
    item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

    model.fit(
          (sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
          sample_weight=coo_matrix(user_item_matrix),
          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=4,# A sum of an array that contains non-finite values
            # will also be non-finite, and we avoid creating a
            # large boolean temporary.
          verbose=False
          )

    p = precision_at_k(
        model, 
        sparse_user_item, 
        user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
        item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
        k=5).mean()

    return p, model

In [93]:
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK
from math import e

def hyperopt_obj(params, data):
    t = params['type']
    del params['type']
    if t == 'lightfm':
        clf = LightFM(**params)
    else:
        return 0
    
    return evaluate_model(model=clf, 
                          filtered_data=data,
                          user_features=user_features,
                          item_features=item_features)

space = hp.choice('task', 
    [
        {'type': 'lightfm',
         'loss': hp.choice('loss', ['warp', 'bpr']),
         'no_components': hp.choice('n_components', range(40, 150)),
         'learning_rate': hp.choice('learning_rate', [.005, .01, .1, .15,]),
         'max_sampled': hp.choice('max_sampled', range(5, 16)),
         'epsilon': hp.uniform('epsilon', 1e-6, 1e-2),
         'item_alpha': hp.uniform('item_alpha', 1e-3, 0.1),
         'user_alpha': hp.uniform('user_alpha', 1e-3, 0.1),
         'random_state': hp.choice('random_state', [111])}
    ])

count = 0
best = 0
def f(params):
    global best, count
    count += 1
    acc, model = hyperopt_obj(params.copy(), data_train_filtered)
    acc_test = precision_at_k(model, csr_matrix(test_user_item_matrix).tocsr(), 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()
    if acc > best:
        print('new best:', acc, 'using', params['type'], sep=' ')
        best = acc
    if count % 50 == 0:
        print('iters:', count, ', acc:', acc, 'using', params, sep=' ')
    return {'loss': -acc, 'status': STATUS_OK}


trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=200, trials=trials)
print(f'best:\n{best}')

new best:
0.14485563
using
lightfm
new best:
0.30467674
using
lightfm
new best:
0.35290772
using
lightfm
new best:
0.3536397
using
lightfm
iters:
50
, acc:
0.347621
using
{'epsilon': 0.00042298141597467333, 'item_alpha': 0.00948529229349785, 'learning_rate': 0.1, 'loss': 'warp', 'max_sampled': 12, 'no_components': 133, 'random_state': 111, 'type': 'lightfm', 'user_alpha': 0.039538871217095393}
new best:
0.36445713
using
lightfm
iters:
100
, acc:
0.36445713
using
{'epsilon': 0.001081398222679707, 'item_alpha': 0.07624550897302623, 'learning_rate': 0.005, 'loss': 'warp', 'max_sampled': 8, 'no_components': 118, 'random_state': 111, 'type': 'lightfm', 'user_alpha': 0.024828070837681973}
new best:
0.37259048
using
lightfm
iters:
150
, acc:
0.26116312
using
{'epsilon': 0.0007773693800380998, 'item_alpha': 0.0787134782645475, 'learning_rate': 0.005, 'loss': 'warp', 'max_sampled': 8, 'no_components': 76, 'random_state': 111, 'type': 'lightfm', 'user_alpha': 0.027743744179105925}
iters:
200
, a

In [94]:
best = {'epsilon': 0.0011206811216958146, 
        'item_alpha': 0.07983765012392162, 
        'learning_rate': 0.005, 
        'loss': 'warp', 
        'max_sampled': 12, 
        'no_components': 44, 
        'random_state': 111, 
        'type': 'lightfm', 
        'user_alpha': 0.04263323414053811}
        
del best['type']

## *Init model*

In [95]:
model = LightFM(**best)

## *Fit Train*

In [96]:
%%time

model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
          sample_weight=coo_matrix(user_item_matrix),
          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=4,
          verbose=False) 

CPU times: user 4min 9s, sys: 1.85 s, total: 4min 11s
Wall time: 1min 20s


<lightfm.lightfm.LightFM at 0x7fc8d9af6b80>

## *Evaluation*

In [97]:
user_emb = model.get_user_representations(features=csr_matrix(user_feat_lightfm.values).tocsr())
item_emb = model.get_item_representations(features=csr_matrix(item_feat_lightfm.values).tocsr())

print('user_emb shapes:', user_emb[0].shape, user_emb[1].shape)
print('item_emb shapes:', item_emb[0].shape, item_emb[1].shape)

user_emb shapes: (2459,) (2459, 44)
item_emb shapes: (5000,) (5000, 44)


## *Train Precision*

In [98]:
%%time
train_precision = precision_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()

train_precision 

CPU times: user 12.2 s, sys: 139 µs, total: 12.2 s
Wall time: 12.3 s


0.15803173

## *Test Precision*

In [99]:
test_precision = precision_at_k(model, csr_matrix(test_user_item_matrix).tocsr(), 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()

test_precision 

0.13703436