# Prepare sample dataset

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read the dataset
# archive from it and write the final archive back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!git clone https://Ayatafoy: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@github.com/MakDaffi/RandDRecSys.git
!cd RandDRecSys && git checkout aromanov

In [None]:
!cp "/content/drive/MyDrive/Colab Notebooks/Kaggle/data.zip" /content
!unzip data.zip 

In [None]:
# Install the dependencies listed in the repository's requirements.txt,
# forcing a reinstall of packages that are already present in the runtime.
!cd RandDRecSys && pip install -r requirements.txt --force-reinstall
# NOTE(review): bson is removed right after the install; presumably it clashes with
# another package pulled in by requirements.txt — confirm why this is needed.
!pip uninstall bson -y

In [None]:
# Make the cloned repository and its 1st_stage_models folder importable.
import sys
sys.path.extend([
    '/content/RandDRecSys/',
    '/content/RandDRecSys/1st_stage_models',
])

In [None]:
import warnings

import numpy as np
import torch
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials
from hyperopt import space_eval
from hyperopt.pyll.base import scope
from recommenders.utils.constants import SEED

from scripts.metrics.cross_validation import cross_validation, cross_validation_lgbm
from scripts.metrics.mapk import mapk

warnings.filterwarnings('ignore')

# Last purchases popularity cross validation

In [None]:
from scripts.first_stage_models.LastPurchasesPopularity import LastPurchasesPopularity

In [None]:
# Hyperopt search space for LastPurchasesPopularity: num_weeks is chosen from 1..29.
space = dict(num_weeks=hp.choice('num_weeks', [*range(1, 30)]))

In [None]:
def objective(params):
    """Hyperopt objective for the LastPurchasesPopularity model.

    Builds the model from ``params['num_weeks']``, scores it with 5-fold
    ``cross_validation`` using ``mapk`` and prints the mean score together
    with the sampled parameters.

    Args:
        params: dict sampled from the search space; must contain ``'num_weeks'``.

    Returns:
        dict with ``'loss'`` (negated mean score, since the search minimises
        the loss), ``'status'`` (``STATUS_OK``) and ``'metrics'`` (mean score).
    """
    recommender = LastPurchasesPopularity({'num_weeks': params['num_weeks']})
    fold_scores = cross_validation(
        model=recommender,
        metric=mapk,
        num_folds=5,
        path_to_transactions='/content/data/transactions.parquet',
        path_to_labels_folder='/content/data/ranker_train_labels/',
    )
    score = np.mean(fold_scores)

    separator = '------------------'
    print(separator)
    print('MAP@12:', score)
    print('Params', params)
    print(separator)

    return dict(loss=-score, status=STATUS_OK, metrics=score)

In [None]:
# Run a TPE-driven search: 20 evaluations of `objective`, every trial recorded in `trials`.
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)

print("Best: {}".format(best))

In [None]:
# Resolve the search result against `space` to get the concrete configuration
# (space_eval presumably maps hp.choice indices back to values — see hyperopt docs).
lpp_best_config = space_eval(space, best)

In [None]:
# Hard-coded configuration. It overrides the value computed by space_eval in the
# previous cell, so the search cells can be skipped.
# NOTE(review): presumably the best result of an earlier search run — confirm.
lpp_best_config = {
    'num_weeks': 3
}

In [None]:
# Evaluate the selected LastPurchasesPopularity configuration with 5-fold
# cross-validation, scored by mapk.
model = LastPurchasesPopularity(lpp_best_config)
cv = cross_validation(
    model=model,
    metric=mapk,
    num_folds=5,
    path_to_transactions='/content/data/transactions.parquet',
    path_to_labels_folder='/content/data/ranker_train_labels/'
)

In [None]:
# Display the per-fold scores returned by cross_validation.
cv

[0.017960453624700758,
 0.021046465504951255,
 0.019447218965448133,
 0.02411957241961997,
 0.021247798406847936]

In [None]:
# Mean score across the folds.
np.mean(cv)

0.02184548204570087

# BiVAE cross validation

In [None]:
from scripts.first_stage_models.BiVAERecommender import BiVAERecommender

In [None]:
# Hyperopt search space for the BiVAE recommender. Replaces the `space` defined
# for the previous model.
# NOTE(review): hp.randint('num_epochs', 600) presumably samples from [0, 600), so a
# trial with 0 epochs looks possible — confirm against the hyperopt docs.
space = {
    "batch_size": hp.choice('batch_size', list(range(64, 201))),
    "act_fn": hp.choice('act_fn', ['sigmoid', 'tanh', 'elu', 'relu', 'relu6']),
    "likelihood": hp.choice('likelihood', ['gaus', 'bern', 'pois']),
    'num_epochs': hp.randint('num_epochs', 600),
    'lr': hp.uniform('lr', 1e-5, 1e-2),
    'beta_kl': hp.uniform('beta_kl', 0.0, 2.0),
    # A single integer is sampled here; `objective` wraps it in a one-element list.
    "encoder_dims": hp.choice('encoder_dims', list(range(20, 100))),
    "latent_dim": hp.choice('latent_dim', list(range(20, 100))),
    "user_frequency_threshold": hp.choice('user_frequency_threshold', list(range(5, 100))),
    "item_frequency_threshold": hp.choice('item_frequency_threshold', list(range(5, 100)))
}

In [None]:
def objective(params):
    """Hyperopt objective for the BiVAE recommender.

    Builds a BiVAERecommender from the sampled hyper-parameters, scores it with
    5-fold ``cross_validation`` using ``mapk`` and prints the mean score together
    with the sampled parameters.

    Args:
        params: dict sampled from the BiVAE search space. Must contain
            ``user_frequency_threshold``, ``item_frequency_threshold``,
            ``latent_dim``, ``encoder_dims``, ``act_fn``, ``likelihood``,
            ``num_epochs``, ``batch_size``, ``lr`` and ``beta_kl``.

    Returns:
        dict with ``'loss'`` (negated mean score, since the search minimises
        the loss), ``'status'`` (``STATUS_OK``) and ``'metrics'`` (mean score).
    """
    config = {
        'user_frequency_threshold': params['user_frequency_threshold'],
        'item_frequency_threshold': params['item_frequency_threshold'],
        'latent_dim': params['latent_dim'],
        # The space samples a single integer; the config carries it as a one-element list.
        'encoder_dims': [params['encoder_dims']],
        'act_fn': params['act_fn'],
        'likelihood': params['likelihood'],
        # 'num_epochs' used to be listed twice in this literal; the duplicate is removed.
        'num_epochs': params['num_epochs'],
        'batch_size': params['batch_size'],
        'lr': params['lr'],
        'beta_kl': params['beta_kl'],
        'seed': SEED,
        # Requires `torch` to be imported in the notebook's import cell.
        'gpu': torch.cuda.is_available(),
        'verbose': True,
    }
    model = BiVAERecommender(config)
    cv = cross_validation(
        model=model,
        metric=mapk,
        num_folds=5,
        path_to_transactions='/content/data/transactions.parquet',
        path_to_labels_folder='/content/data/ranker_train_labels/',
    )
    mean_cv = np.mean(cv)
    print('------------------')
    print('MAP@12:', mean_cv)
    print('Params', params)
    print('------------------')
    return {'loss': -mean_cv, 'status': STATUS_OK, 'metrics': mean_cv}


In [None]:
# Run the TPE-driven search for BiVAE, recording every trial in `trials`.
# NOTE(review): only a single evaluation is requested here — presumably a smoke
# test rather than a real search; confirm the intended max_evals.
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=1, trials=trials)

print("Best: {}".format(best))

In [None]:
# Turn the search result into a complete BiVAE config: resolve the sampled values,
# then add the fixed settings and wrap encoder_dims in a list, as `objective` does.
bivae_best_config = space_eval(space, best)
bivae_best_config.update(
    seed=SEED,
    gpu=torch.cuda.is_available(),
    verbose=True,
    encoder_dims=[bivae_best_config['encoder_dims']],
)

In [None]:
# Hard-coded BiVAE configuration. It overrides the value computed from the search
# in the previous cell, so the search cells can be skipped.
# NOTE(review): presumably the best result of an earlier search run — confirm.
# NOTE(review): 'gpu' is fixed to True here, while the search cells use
# torch.cuda.is_available(); verify this works on a CPU-only runtime.
bivae_best_config = {
  'act_fn': 'sigmoid',
  'batch_size': 96,
  'beta_kl': 1.6324305360560756,
  'encoder_dims': [54],
  'item_frequency_threshold': 60,
  'latent_dim': 43,
  'likelihood': 'bern',
  'lr': 0.005997693826960126,
  'num_epochs': 141,
  'user_frequency_threshold': 38,
  'seed': 42,
  'gpu': True,
  'verbose': True
}

In [None]:
# Evaluate the selected BiVAE configuration with 5-fold cross-validation,
# scored by mapk.
model = BiVAERecommender(bivae_best_config)
cv = cross_validation(
    model=model,
    metric=mapk,
    num_folds=5,
    path_to_transactions='/content/data/transactions.parquet',
    path_to_labels_folder='/content/data/ranker_train_labels/'
)

  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/141 [00:00<?, ?it/s]

In [None]:
# Display the per-fold scores returned by cross_validation for BiVAE.
cv

[0.001056067588325653,
 0.00026408450704225355,
 0.0048071687222630615,
 0.0060224089635854345,
 0.004938888888888889]

In [None]:
# Mean BiVAE score across the folds.
np.mean(cv)

0.003417723734021058

# SVD cross validation

In [None]:
from scripts.first_stage_models.SVDRecommender import SVDRecommender

In [None]:
# Hyperopt search space for the SVD recommender. Replaces the `space` defined
# for the previous model.
# NOTE(review): hp.randint('k', 10) and hp.randint('iterations', 300) presumably
# sample from a range starting at 0, so k=0 or 0 iterations look possible —
# confirm against the hyperopt docs.
space = {
    'k': hp.randint('k', 10),
    'learning_rate': hp.uniform('learning_rate', 1e-5, 1e-2),
    'regularizer': hp.uniform('regularizer', 1e-5, 1e-2),
    'iterations': hp.randint('iterations', 300),
    # Only one option is offered, so 'method' is effectively fixed.
    'method': hp.choice('method', ['stochastic']),
    'bias': hp.choice('bias', [True, False]),
    'num_weeks': hp.choice('num_weeks', list(range(3, 8)))
}

In [None]:
def objective(params):
    """Hyperopt objective for the SVD recommender.

    Copies the tuned hyper-parameters from ``params`` into the model config
    (with ``verbose`` switched off), scores the model with 5-fold
    ``cross_validation`` using ``mapk`` and prints the mean score together
    with the sampled parameters.

    Args:
        params: dict sampled from the SVD search space. Must contain ``k``,
            ``learning_rate``, ``regularizer``, ``iterations``, ``method``,
            ``bias`` and ``num_weeks``.

    Returns:
        dict with ``'loss'`` (negated mean score, since the search minimises
        the loss), ``'status'`` (``STATUS_OK``) and ``'metrics'`` (mean score).
    """
    tuned_keys = (
        'k',
        'learning_rate',
        'regularizer',
        'iterations',
        'method',
        'bias',
        'num_weeks',
    )
    settings = {key: params[key] for key in tuned_keys}
    settings['verbose'] = False

    recommender = SVDRecommender(settings)
    fold_scores = cross_validation(
        model=recommender,
        metric=mapk,
        num_folds=5,
        path_to_transactions='/content/data/transactions.parquet',
        path_to_labels_folder='/content/data/ranker_train_labels/',
    )
    score = np.mean(fold_scores)

    separator = '------------------'
    print(separator)
    print('MAP@12:', score)
    print('Params', params)
    print(separator)

    return dict(loss=-score, status=STATUS_OK, metrics=score)


In [None]:
# Run a TPE-driven search: 100 evaluations of `objective`, every trial recorded in `trials`.
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

print("Best: {}".format(best))

In [None]:
# Resolve the search result into a concrete SVD config and switch verbosity off,
# matching the config built inside `objective`.
svd_best_config = space_eval(space, best)
svd_best_config.update(verbose=False)

In [None]:
# Hard-coded SVD configuration. It overrides the value computed from the search
# in the previous cell, so the search cells can be skipped.
# NOTE(review): presumably the best result of an earlier search run — confirm.
svd_best_config = {
    'bias': False,
    'iterations': 180,
    'k': 2,
    'learning_rate': 0.007659023027711605,
    'method': 'stochastic',
    'num_weeks': 7,
    'regularizer': 0.0038804778747317226,
    'verbose': False
}

In [None]:
# Evaluate the selected SVD configuration with 5-fold cross-validation,
# scored by mapk.
model = SVDRecommender(svd_best_config)
cv = cross_validation(
    model=model,
    metric=mapk,
    num_folds=5,
    path_to_transactions='/content/data/transactions.parquet',
    path_to_labels_folder='/content/data/ranker_train_labels/'
)

In [None]:
# Display the per-fold scores returned by cross_validation for SVD.
cv

[0.021085008877885312,
 0.02359039808472701,
 0.023511776444409133,
 0.02872756518600466,
 0.02449473030102706]

In [None]:
# Mean SVD score across the folds.
np.mean(cv)

0.024281895778810635

# Make predictions for the second stage model

In [None]:
from scripts.utils import create_predictions_for_second_stage
import pandas as pd

In [None]:
# Locations of the input tables used to build the second-stage training set.
PATH_TO_TRANSACTIONS = "/content/data/transactions.parquet"
PATH_TO_CUSTOMERS = "/content/data/customers.parquet"

In [None]:
# Load the customer and transaction tables into DataFrames.
df_customers = pd.read_parquet(PATH_TO_CUSTOMERS)
df_transactions = pd.read_parquet(PATH_TO_TRANSACTIONS)

In [None]:
# Produce the 'lpp' (LastPurchasesPopularity) predictions for the second-stage
# ranker, using the selected LPP config, for every unique customer_id.
# NOTE(review): top_k=12 presumably means 12 items per customer, matching the
# MAP@12 label printed by the objectives — confirm in scripts.utils.
create_predictions_for_second_stage(
    model_name='lpp',
    config=lpp_best_config,
    transactions=df_transactions,
    all_customers=df_customers['customer_id'].unique(),
    path_to_destination_save='/content/data/ranker_train_set/',
    num_train_weeks= 20,
    top_k=12
)

In [None]:
# Produce the 'bivae' predictions for the second-stage ranker.
# The BiVAE model must receive the BiVAE configuration; this call previously
# passed svd_best_config, whose keys belong to the SVD recommender.
create_predictions_for_second_stage(
    model_name='bivae',
    config=bivae_best_config,
    transactions=df_transactions,
    all_customers=df_customers['customer_id'].unique(),
    path_to_destination_save='/content/data/ranker_train_set/',
    num_train_weeks=20,
    top_k=12
)

In [None]:
# Produce the 'svd' predictions for the second-stage ranker.
# This call previously passed config='' without a trailing comma, which is a
# SyntaxError and supplied no configuration; it now passes the selected SVD config.
create_predictions_for_second_stage(
    model_name='svd',
    config=svd_best_config,
    transactions=df_transactions,
    all_customers=df_customers['customer_id'].unique(),
    path_to_destination_save='/content/data/ranker_train_set/',
    num_train_weeks=20,
    top_k=12
)

In [None]:
# Archive the data directory (presumably now including the generated
# ranker_train_set predictions) and copy the archive back to Google Drive.
# NOTE(review): `data` is a relative path, so this assumes the working directory
# is /content — confirm.
!zip -r /content/final_data.zip data
!cp /content/final_data.zip "/content/drive/MyDrive/Colab Notebooks/Kaggle"