# Prepare sample dataset

In [None]:
# Mount Google Drive into the Colab runtime; a later cell copies data.zip
# from "/content/drive/MyDrive/...".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!rm -r /content/RandDRecSys

In [None]:
!git clone https://Ayatafoy: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@github.com/MakDaffi/RandDRecSys.git
!cd RandDRecSys && git checkout aromanov

Cloning into 'RandDRecSys'...
remote: Enumerating objects: 379, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 379 (delta 56), reused 66 (delta 34), pack-reused 288[K
Receiving objects: 100% (379/379), 4.08 MiB | 30.75 MiB/s, done.
Resolving deltas: 100% (200/200), done.
Branch 'aromanov' set up to track remote branch 'aromanov' from 'origin'.
Switched to a new branch 'aromanov'


In [None]:
!cp "/content/drive/MyDrive/Colab Notebooks/Kaggle/data.zip" /content
!unzip data.zip 

In [None]:
# Install the repository's pinned dependencies from inside the checkout,
# forcing a reinstall over whatever the Colab image already ships.
!cd RandDRecSys && pip install -r requirements.txt --force-reinstall
# Remove the standalone `bson` distribution afterwards.
# NOTE(review): presumably it clashes with another package providing `bson` — confirm why this is needed.
!pip uninstall bson -y

In [None]:
import sys

# Make the cloned repository and its first-stage model folder importable.
sys.path.extend([
    '/content/RandDRecSys/',
    '/content/RandDRecSys/1st_stage_models',
])

In [None]:
from recommenders.utils.constants import SEED
from scripts.metrics.cross_validation import cross_validation, cross_validation_lgbm
from scripts.metrics.mapk import mapk
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from hyperopt import space_eval
import numpy as np
import pandas as pd
import shutil
import warnings
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and numerical warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
from scripts.first_stage_models.LastPurchasesPopularity import LastPurchasesPopularity
from scripts.first_stage_models.BiVAERecommender import BiVAERecommender
from scripts.first_stage_models.SVDRecommender import SVDRecommender
from scripts.utils import create_predictions_for_second_stage, create_labels_for_second_stage

In [None]:
# Configuration passed as `config` to the 'lpp' first-stage model.
lpp_best_config = dict(num_weeks=3)

In [None]:
# Configuration passed as `config` to the 'bivae' first-stage model.
bivae_best_config = dict(
    act_fn='sigmoid',
    batch_size=96,
    beta_kl=1.6324305360560756,
    encoder_dims=[54],
    item_frequency_threshold=60,
    latent_dim=43,
    likelihood='bern',
    lr=0.005997693826960126,
    num_epochs=141,
    user_frequency_threshold=38,
    seed=42,
    gpu=True,
    verbose=False,
)

In [None]:
# Configuration passed as `config` to the 'svd' first-stage model.
svd_best_config = dict(
    bias=False,
    iterations=180,
    k=2,
    learning_rate=0.007659023027711605,
    method='stochastic',
    num_weeks=7,
    regularizer=0.0038804778747317226,
    verbose=False,
)

In [None]:
# Locations of the parquet inputs inside the unpacked /content/data folder.
PATH_TO_CUSTOMERS = "/content/data/customers.parquet"
PATH_TO_TRANSACTIONS = "/content/data/transactions.parquet"

In [None]:
# Load both input tables into memory; every later cell works from these two frames.
df_transactions = pd.read_parquet(PATH_TO_TRANSACTIONS)
df_customers = pd.read_parquet(PATH_TO_CUSTOMERS)

# First stage CV

In [None]:
# Output folders for the second-stage (ranker) training set and its labels.
# NOTE(review): one path ends with a slash and the other does not — confirm the
# helpers join paths in a way that makes this irrelevant.
PATH_TO_RANKER_TRAIN_SET = '/content/data/ranker_train_set/'
PATH_TO_LABELS = '/content/data/ranker_train_labels'

In [None]:
# Generate the ranker labels from the transactions and store them under
# PATH_TO_LABELS; the cross-validation cells read the same folder afterwards.
create_labels_for_second_stage(
    top_k=12,
    num_train_weeks=20,
    path_to_destination_save=PATH_TO_LABELS,
    transactions=df_transactions,
)

In [None]:
# 5-fold cross-validation of the 'lpp' model, scored with `mapk`.
# The paths reuse the notebook-level constants instead of repeating the literals;
# the appended slash keeps the labels-folder string identical to the one used before.
lpp_cv = cross_validation(
    model_name='lpp',
    config=lpp_best_config,
    metric=mapk,
    num_folds=5,
    path_to_transactions=PATH_TO_TRANSACTIONS,
    path_to_labels_folder=PATH_TO_LABELS + '/'
)

In [None]:
# 5-fold cross-validation of the 'bivae' model, scored with `mapk`.
# The paths reuse the notebook-level constants instead of repeating the literals;
# the appended slash keeps the labels-folder string identical to the one used before.
bivae_cv = cross_validation(
    model_name='bivae',
    config=bivae_best_config,
    metric=mapk,
    num_folds=5,
    path_to_transactions=PATH_TO_TRANSACTIONS,
    path_to_labels_folder=PATH_TO_LABELS + '/'
)

In [None]:
# 5-fold cross-validation of the 'svd' model, scored with `mapk`.
# The paths reuse the notebook-level constants instead of repeating the literals;
# the appended slash keeps the labels-folder string identical to the one used before.
svd_cv = cross_validation(
    model_name='svd',
    config=svd_best_config,
    metric=mapk,
    num_folds=5,
    path_to_transactions=PATH_TO_TRANSACTIONS,
    path_to_labels_folder=PATH_TO_LABELS + '/'
)

In [None]:
# Average of the `mapk` scores returned by cross_validation for 'lpp'
# (presumably one score per fold).
np.mean(lpp_cv)

0.020718563498927794

In [None]:
# Average of the `mapk` scores returned by cross_validation for 'bivae'
# (presumably one score per fold).
np.mean(bivae_cv)

0.003417723734021058

In [None]:
# Average of the `mapk` scores returned by cross_validation for 'svd'
# (presumably one score per fold).
np.mean(svd_cv)

0.02353109177184844

# Second stage model CV

In [None]:
from lightgbm import LGBMRanker
from scripts.utils import combine_train_sets_and_labels

In [None]:
# All three first-stage models are run on the same inputs, so the shared
# arguments are built once instead of being repeated (and the unique customer
# ids recomputed) for every call.
all_customer_ids = df_customers['customer_id'].unique()
first_stage_kwargs = dict(
    transactions=df_transactions,
    all_customers=all_customer_ids,
    path_to_destination_save=PATH_TO_RANKER_TRAIN_SET,
    num_train_weeks=20,
    top_k=12,
)

# The return values are kept because the hyperopt objective passes them back
# through `models=` (presumably so the first-stage models are not refitted
# on every trial — confirm in scripts.utils).
lpp_models = create_predictions_for_second_stage(
    model_name='lpp',
    config=lpp_best_config,
    **first_stage_kwargs
)
bivae_models = create_predictions_for_second_stage(
    model_name='bivae',
    config=bivae_best_config,
    **first_stage_kwargs
)
svd_models = create_predictions_for_second_stage(
    model_name='svd',
    config=svd_best_config,
    **first_stage_kwargs
)

In [None]:
# Hyperopt search space: for each first-stage model, choose how many
# candidates (an integer from 3 to 31) it contributes.
space = {
    key: hp.choice(key, list(range(3, 32)))
    for key in ('bivae_top_k', 'lpp_top_k', 'svd_top_k')
}

In [None]:
def objective(params):
  """Hyperopt objective: score one choice of per-model candidate counts.

  Re-creates the second-stage train set with the `top_k` values in `params`,
  re-creates the labels, combines both and cross-validates an LGBM ranker
  configuration on the result.

  Args:
    params: dict with the keys 'lpp_top_k', 'bivae_top_k' and 'svd_top_k'.

  Returns:
    dict with 'loss' (the negated mean CV score, because hyperopt minimises),
    'status' (STATUS_OK) and 'metrics' (the mean CV score itself).
  """
  # NOTE(review): PATH_TO_LABELS and PATH_TO_RANKER_TRAIN_SET are reused across
  # trials without being cleaned — presumably the helpers overwrite earlier
  # files; confirm that no stale output from a previous trial can leak in.
  num_train_weeks = 20
  # (model name, config, objects returned by the initial first-stage run),
  # listed in the same order as the original calls.
  first_stage = [
    ('lpp', lpp_best_config, lpp_models),
    ('bivae', bivae_best_config, bivae_models),
    ('svd', svd_best_config, svd_models),
  ]
  model_names = [name for name, _, _ in first_stage]
  ranker_config = {
      'boosting_type': 'dart',
      'max_depth': 7,
      'n_estimators': 100,
      'importance_type': 'gain',
  }
  # Computed once and shared by all three prediction calls.
  all_customers = df_customers['customer_id'].unique()
  for name, config, models in first_stage:
    create_predictions_for_second_stage(
        model_name=name,
        config=config,
        transactions=df_transactions,
        all_customers=all_customers,
        path_to_destination_save=PATH_TO_RANKER_TRAIN_SET,
        num_train_weeks=num_train_weeks,
        top_k=params[name + '_top_k'],
        models=models
    )
  # NOTE(review): the labels' top_k follows 'svd_top_k', while the standalone
  # labels cell uses 12 and the score is reported as MAP@12 — this looks like
  # a copy-paste slip; left unchanged pending confirmation.
  create_labels_for_second_stage(
      transactions=df_transactions,
      path_to_destination_save=PATH_TO_LABELS,
      num_train_weeks=num_train_weeks,
      top_k=params['svd_top_k']
  )
  train_w_labels = combine_train_sets_and_labels(
    path_to_train_set=PATH_TO_RANKER_TRAIN_SET,
    path_to_train_labels=PATH_TO_LABELS,
    model_names=model_names,
    num_train_weeks=num_train_weeks
  )
  # Binarise the label: any positive value becomes 1, everything else 0.
  train_w_labels['label'] = (train_w_labels['label'] > 0).astype(int)
  cv = cross_validation_lgbm(
    ranker_config=ranker_config,
    metric=mapk,
    num_folds=5,
    model_names=model_names,
    train_w_labels=train_w_labels,
    path_to_labels=PATH_TO_LABELS
  )
  mean_cv = np.mean(cv)
  print('------------------')
  print('MAP@12:', mean_cv)
  print('Params', params)
  return {'loss': -mean_cv, 'status': STATUS_OK, 'metrics': mean_cv}

In [None]:
trials = Trials()

# Run 100 TPE trials of `objective` over `space`, recording every trial in `trials`.
# NOTE(review): no `rstate` is passed, so the search is not reproducible between
# runs; consider seeding it (the expected type depends on the hyperopt version).
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

# For hp.choice dimensions fmin returns the *index* of the selected option,
# not the option itself; space_eval maps the result back to real top_k values.
best_params = space_eval(space, best)
print("Best: {}".format(best_params))

------------------
MAP@12:
0.021123536306926226
Params
{'bivae_top_k': 14, 'lpp_top_k': 25, 'svd_top_k': 5}
------------------
MAP@12:
0.020221301540571186
Params
{'bivae_top_k': 24, 'lpp_top_k': 27, 'svd_top_k': 17}
------------------
MAP@12:
0.019333877999355636
Params
{'bivae_top_k': 13, 'lpp_top_k': 25, 'svd_top_k': 30}
------------------
MAP@12:
0.021070630192757324
Params
{'bivae_top_k': 21, 'lpp_top_k': 12, 'svd_top_k': 9}
------------------
MAP@12:
0.02000097720049055
Params
{'bivae_top_k': 7, 'lpp_top_k': 20, 'svd_top_k': 20}
------------------
MAP@12:
0.02081847532803398
Params
{'bivae_top_k': 24, 'lpp_top_k': 13, 'svd_top_k': 10}
------------------
MAP@12:
0.02165185710944689
Params
{'bivae_top_k': 5, 'lpp_top_k': 25, 'svd_top_k': 7}
------------------
MAP@12:
0.020231456328366208
Params
{'bivae_top_k': 11, 'lpp_top_k': 22, 'svd_top_k': 25}
------------------
MAP@12:
0.021765108217598216
Params
{'bivae_top_k': 5, 'lpp_top_k': 30, 'svd_top_k': 6}
------------------
MAP@12:
0.