# Training

In [1]:
!git clone https://github.com/adhiiisetiawan/nbr-time_aware_item_weighting
%cd nbr-time_aware_item_weighting/
!pip install -r requirements.txt --quiet

Cloning into 'nbr-time_aware_item_weighting'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 98 (delta 6), reused 18 (delta 4), pack-reused 77[K
Unpacking objects: 100% (98/98), 54.89 MiB | 9.43 MiB/s, done.
/kaggle/working/nbr-time_aware_item_weighting
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
beatrix-jupyterlab 2023.621.222118 requires jupyter-server~=1.16, but you have jupyter-server 2.6.0 which is incompatible.
chex 0.1.82 requires numpy>=1.25.0, but you have numpy 1.22.4 which is incompatible.
fitter 1.6.0 requires pandas<3.0.0,>=2.0.3, but you have pandas 1.5.3 which is incompatible.
fitter 1.6.0 requires tqdm<5.0.0,>=4.65.1, but you have

In [2]:
import os
import sys
sys.path.append("..")
from nbr.preparation import Preprocess, save_split, Corpus
from nbr.trainer import NBRTrainer
from nbr.model import BPR, SLRC, NBRKNN, RepurchaseModule
import torch
import random
import numpy as np
import optuna
import warnings
warnings.filterwarnings("ignore")

In [3]:
# One shared seed for every random number generator this notebook touches.
# `seed` is reused further down when the Optuna sampler is created.
seed = 10
random.seed(seed)        # Python standard-library RNG
torch.manual_seed(seed)  # PyTorch default generator
np.random.seed(seed)     # NumPy global RNG (kept last: returns None, so the cell shows no output)

In [4]:
!mkdir /kaggle/working/epm-prep
!cp /kaggle/input/epm-prep/EPM-prep.txt /kaggle/working/epm-prep

In [5]:
# Directory and base name of the dataset (EPM-prep.txt was copied into this directory above).
corpus_path = "/kaggle/working/epm-prep/"
dataset_name = "EPM-prep"

# Load the raw interactions and write the split dataset to disk.
# NOTE(review): the two zeros and filt=False are presumably minimum-count thresholds and a
# filtering switch — confirm in nbr.preparation. The logged user/item/click counts are
# identical before and after preprocessing.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(0, 0, filt=False)
# The log reports the output location as <corpus_path>/data_<dataset_name>/.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 465, #items = 1477, #clicks = 518655 (#illegal records = 0)
After preprocessing: #users = 465, #items = 1477, #clicks = 518655
Saving dataset in /kaggle/working/epm-prep//data_EPM-prep/...


In [6]:
# Build the Corpus from the same path/name pair and load its data
# (presumably the split files written by save_split — confirm in nbr.preparation).
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

In [7]:
# Trainer used to fit the repurchase model. Constructing it prepares the
# train/dev/test datasets (see the progress output of this cell).
# NOTE(review): max_epochs and early_stop_num presumably bound the training loop and the
# early-stopping patience, and topk the ranking cutoff of the metrics — confirm in nbr.trainer.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=100,
    topk=10,
    early_stop_num=3
)

train dataset preparing...


100%|██████████| 465/465 [00:00<00:00, 3736.52it/s]


dev dataset preparing...


100%|██████████| 453/453 [00:02<00:00, 159.84it/s]


test dataset preparing...


100%|██████████| 453/453 [00:02<00:00, 160.21it/s]


In [8]:
# Fixed training hyperparameters (presumably the outcome of an earlier tuning run).
slrc_best_params = {
    'batch_size': 256,
    'lr': 0.00011201144001505824,
    'l2_reg_coef': 0.00011498224071460201,
}

# Model first, then the three hyperparameters; unpacking the dict yields the
# same keys in the same order as listing them one by one.
params = {
    "model": RepurchaseModule(
        item_num=corpus.n_items,
        avg_repeat_interval=corpus.total_avg_interval,
    ),
    **slrc_best_params,
}

# Configure the trainer and fit; the value returned by train() is the cell output.
trainer.init_hyperparams(**params)
trainer.train()

Epoch 1:


Batch loss = 0.488323: 100%|██████████| 2009/2009 [00:30<00:00, 65.81it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 190.09it/s]


 {'precision': 0.1163355408388521, 'recall': 0.336101357398414, 'ndcg': 0.2532928957482408}
Epoch 2:



Batch loss = 0.485001: 100%|██████████| 2009/2009 [00:30<00:00, 66.00it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 214.35it/s]


 {'precision': 0.11655629139072848, 'recall': 0.3339989711900675, 'ndcg': 0.2530833708195809}
Epoch 3:



Batch loss = 0.482208: 100%|██████████| 2009/2009 [00:30<00:00, 66.41it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 210.87it/s]


 {'precision': 0.11655629139072847, 'recall': 0.3337782206381912, 'ndcg': 0.2526803670881091}
Epoch 4:



Batch loss = 0.479917: 100%|██████████| 2009/2009 [00:29<00:00, 66.98it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 216.77it/s]


 {'precision': 0.1163355408388521, 'recall': 0.33553604910683643, 'ndcg': 0.2536662786434289}
Epoch 5:



Batch loss = 0.4781: 100%|██████████| 2009/2009 [00:30<00:00, 66.86it/s]  


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 209.95it/s]


 {'precision': 0.11589403973509933, 'recall': 0.33282922686359034, 'ndcg': 0.2535918676510746}
Epoch 6:



Batch loss = 0.476693: 100%|██████████| 2009/2009 [00:30<00:00, 66.51it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 222.71it/s]


 {'precision': 0.1163355408388521, 'recall': 0.33394173956550693, 'ndcg': 0.25427135270451645}
Epoch 7:



Batch loss = 0.475605: 100%|██████████| 2009/2009 [00:30<00:00, 66.92it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 206.26it/s]


 {'precision': 0.11677704194260487, 'recall': 0.33422717892379394, 'ndcg': 0.25469569958241484}
Epoch 8:



Batch loss = 0.474743: 100%|██████████| 2009/2009 [00:29<00:00, 67.05it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 206.66it/s]


 {'precision': 0.11699779249448125, 'recall': 0.33453728088952506, 'ndcg': 0.2545190105553853}
Epoch 9:



Batch loss = 0.474031: 100%|██████████| 2009/2009 [00:29<00:00, 67.05it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 212.42it/s]


 {'precision': 0.1163355408388521, 'recall': 0.3337656759512535, 'ndcg': 0.25598752590458335}
Epoch 10:



Batch loss = 0.473415: 100%|██████████| 2009/2009 [00:29<00:00, 67.09it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 207.91it/s]


 {'precision': 0.11611479028697572, 'recall': 0.33580039426342045, 'ndcg': 0.2547083614754228}
Epoch 11:



Batch loss = 0.472862: 100%|██████████| 2009/2009 [00:30<00:00, 66.92it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 217.22it/s]


 {'precision': 0.11523178807947021, 'recall': 0.3335718648825732, 'ndcg': 0.2525921218037337}
Epoch 12:



Batch loss = 0.472349: 100%|██████████| 2009/2009 [00:29<00:00, 67.87it/s]


Evaluation (dev):



100%|██████████| 453/453 [00:02<00:00, 217.59it/s]


 {'precision': 0.1152317880794702, 'recall': 0.334086949503618, 'ndcg': 0.2515599525230275}





RepurchaseModule()

In [9]:
# Keep a dataset-specific copy of best_checkpoint.pth (presumably written by trainer.train() above).
!cp /kaggle/working/nbr-time_aware_item_weighting/best_checkpoint.pth /kaggle/working/nbr-time_aware_item_weighting/best-epm.pth

In [10]:
# Collect the trainer's predictions on the dev and test splits.
# They are handed to NBRKNN further down as its `user_emb` argument.
dev_user_emb = trainer.get_predictions(mode="dev")
test_user_emb = trainer.get_predictions(mode="test")

100%|██████████| 453/453 [00:02<00:00, 219.06it/s]
100%|██████████| 453/453 [00:02<00:00, 221.88it/s]


In [11]:
# Fresh trainer for the NBRKNN search: the following cells only call evaluate() on it,
# and max_epochs / early_stop_num are left as None.
# Note that `trainer` is rebound here, so the trainer that fitted the repurchase model is replaced.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

train dataset preparing...


100%|██████████| 465/465 [00:00<00:00, 3710.22it/s]


dev dataset preparing...


100%|██████████| 453/453 [00:02<00:00, 164.64it/s]


test dataset preparing...


100%|██████████| 453/453 [00:02<00:00, 161.15it/s]


In [12]:
def objective(trial):
    """Optuna objective: dev-set NDCG of an NBRKNN model built from sampled hyperparameters.

    Reads the notebook globals ``corpus``, ``dev_user_emb`` and ``trainer``.

    Args:
        trial: Optuna trial used to sample ``nearest_neighbors_num`` (integer
            between 1 and 200) and ``alpha`` (float between 0.0 and 1.0 in
            steps of 0.05).

    Returns:
        The ``"ndcg"`` entry of the metrics returned by
        ``trainer.evaluate(mode="dev")``.
    """
    # Draw nearest_neighbors_num first, then alpha.
    neighbors = trial.suggest_int("nearest_neighbors_num", low=1, high=200)
    alpha = trial.suggest_float("alpha", 0.0, 1.0, step=0.05)

    knn = NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=neighbors,
        alpha=alpha,
        user_emb=dev_user_emb,
    )

    # Install the candidate model on the shared trainer and score it on the dev split.
    trainer.init_hyperparams(model=knn)
    return trainer.evaluate(mode="dev")["ndcg"]

# Seeded TPE sampler (same `seed` as the other RNGs), then 20 trials
# maximising the dev NDCG returned by `objective`.
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=20)

[32m[I 2023-11-06 04:25:04,524][0m A new study created in memory with name: no-name-0eb670a0-f31a-4fce-acb8-7c6c1d642c93[0m
100%|██████████| 453/453 [00:01<00:00, 432.21it/s]
[32m[I 2023-11-06 04:25:05,617][0m Trial 0 finished with value: 0.10235132866320823 and parameters: {'nearest_neighbors_num': 155, 'alpha': 0.0}. Best is trial 0 with value: 0.10235132866320823.[0m
100%|██████████| 453/453 [00:01<00:00, 441.12it/s]
[32m[I 2023-11-06 04:25:06,681][0m Trial 1 finished with value: 0.2400623923858455 and parameters: {'nearest_neighbors_num': 127, 'alpha': 0.75}. Best is trial 1 with value: 0.2400623923858455.[0m
100%|██████████| 453/453 [00:01<00:00, 421.58it/s]
[32m[I 2023-11-06 04:25:07,792][0m Trial 2 finished with value: 0.15299845632990838 and parameters: {'nearest_neighbors_num': 100, 'alpha': 0.2}. Best is trial 1 with value: 0.2400623923858455.[0m
100%|██████████| 453/453 [00:00<00:00, 508.96it/s]
[32m[I 2023-11-06 04:25:08,719][0m Trial 3 finished with value: 0.

In [13]:
# Final evaluation on the test split with the best hyperparameters found by the study.
# A new trainer is created, which prepares the train/dev/test datasets again (see the output).
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

# Same construction as in `objective`, but with the study's best values.
params = {
    "model": NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=study.best_params["nearest_neighbors_num"],
        alpha=study.best_params["alpha"],
        user_emb=dev_user_emb
    )
}

# The model is built with the dev predictions and then handed the test predictions.
# NOTE(review): presumably set_emb replaces the embeddings given at construction — confirm in nbr.model.
params["model"].set_emb(test_user_emb)

trainer.init_hyperparams(**params)
metrics = trainer.evaluate(mode="test")
# Bare expression so the metrics dict is rendered as the cell output.
metrics

train dataset preparing...


100%|██████████| 465/465 [00:00<00:00, 3757.00it/s]


dev dataset preparing...


100%|██████████| 453/453 [00:02<00:00, 163.05it/s]


test dataset preparing...


100%|██████████| 453/453 [00:02<00:00, 162.53it/s]
100%|██████████| 453/453 [00:01<00:00, 443.91it/s]


{'precision': 0.11567328918322298,
 'recall': 0.3663527527189077,
 'ndcg': 0.2778382944080823}

In [14]:
import pandas as pd

# Raw table; `inference` reads its customer_name and item_code columns to label the printed rows.
data = pd.read_csv('/kaggle/input/epm-prep/EPM.csv')

In [16]:
def inference(user, topk):
    """Print the ``topk`` highest-scoring items for a single user.

    Reads the notebook globals ``corpus``, ``trainer`` and ``data``.

    Args:
        user: Integer user index. It is passed to
            ``trainer.model.predict_for_user`` and also used as a position in
            the sorted unique ``customer_name`` values of ``data``.
            NOTE(review): this assumes the model's user indices follow that
            sorted order — confirm against the preprocessing.
        topk: Number of items to print.

    Returns:
        None. Output goes to stdout: the customer name, a two-line header, one
        ``item | score`` row per recommendation (best first) and a blank line.
    """
    n_items = corpus.n_items

    # Reshape the scores into rows of n_items; only the first row is reported.
    items_scores = trainer.model.predict_for_user(user).view(-1, n_items)
    best = torch.topk(items_scores, k=topk, dim=1, sorted=True)
    # Convert only the selected entries to Python numbers instead of the whole score row.
    top_items = best.indices[0].tolist()
    top_scores = best.values[0].tolist()

    print(sorted(data['customer_name'].unique())[user])
    print("Item  | Score")
    print('-------------')
    # Build the index -> item code lookup once rather than once per printed row.
    # NOTE(review): assumes item indices follow the sorted unique item codes — confirm.
    item_codes = sorted(data['item_code'].unique())
    for item_idx, score in zip(top_items, top_scores):
        print("{:5} | {:<5}".format(item_codes[item_idx], round(score, 3)))
    print()

# Number of recommendations printed per user.
top_k = 5

# Print recommendations for a few sample user indices.
for i in [1, 12, 123]:
    inference(i, top_k)

JK1 - AP.  ARINI_GROUP_NA
Item  | Score
-------------
49401 | 5.27 
1650G | 4.772
2717H | 4.347
AVCRA | 4.175
23397 | 4.09 

JK1 - AP. FARZANA_GROUP_NA
Item  | Score
-------------
10224 | 16.191
49401 | 15.632
18350 | 15.004
23397 | 14.292
BLMMN | 13.401

JK1-AP. ESTETIKA_GROUP_NA
Item  | Score
-------------
DZCSB | 5.093
BMTLD | 1.845
CEXLB | 1.446
MCSYB | 1.18 
CGM40 | 1.07 



# Load the trained checkpoint and run inference

In [1]:
# !git clone https://github.com/adhiiisetiawan/nbr-time_aware_item_weighting
# %cd nbr-time_aware_item_weighting/
!pip install -r ../requirements.txt --quiet

DEPRECATION: celery 4.4.1 has a non-standard dependency specifier pytz>dev. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of celery or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [2]:
import sys
sys.path.append("..")
from nbr.preparation import Preprocess, save_split, Corpus
from nbr.trainer import NBRTrainer
from nbr.model import NBRKNN, RepurchaseModule
import torch
import random
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [3]:
# One shared seed for every random number generator this notebook touches.
seed = 10
random.seed(seed)        # Python standard-library RNG
torch.manual_seed(seed)  # PyTorch default generator
np.random.seed(seed)     # NumPy global RNG (kept last: returns None, so the cell shows no output)

In [4]:
# Directory (relative to the notebook's working directory) and base name of the dataset.
corpus_path = "data/"
dataset_name = "EPM-prep"

# Load the raw interactions and write the split dataset to disk.
# NOTE(review): the two zeros and filt=False are presumably minimum-count thresholds and a
# filtering switch — confirm in nbr.preparation. The logged counts are identical before and after.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(0, 0, filt=False)
# The log reports the output location as <corpus_path>/data_<dataset_name>/.
save_split(corpus_path, dataset_name, preprocessor)

# Build the Corpus from the same path/name pair and load its data
# (presumably the split files written by save_split — confirm in nbr.preparation).
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Before preprocessing: #users = 465, #items = 1477, #clicks = 518655 (#illegal records = 0)
After preprocessing: #users = 465, #items = 1477, #clicks = 518655
Saving dataset in data//data_EPM-prep/...


In [6]:
# Rebuild the trainer and the model with the same arguments that were used for
# training, then restore the saved weights instead of calling trainer.train().
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=100,
    topk=10,
    early_stop_num=3
)

# Fixed hyperparameters, identical to the values used in the training section.
slrc_best_params = {'batch_size': 256, 'lr': 0.00011201144001505824, 'l2_reg_coef': 0.00011498224071460201}

params = {
    "model": RepurchaseModule(
        item_num=corpus.n_items,
        avg_repeat_interval=corpus.total_avg_interval
    ),
    "batch_size": slrc_best_params["batch_size"],
    "lr": slrc_best_params["lr"],
    "l2_reg_coef": slrc_best_params["l2_reg_coef"]
}

trainer.init_hyperparams(**params)
# map_location="cpu" lets a checkpoint that was saved on a GPU host be read on a
# CPU-only machine; load_state_dict then copies the values into the model's
# existing parameters.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
with open("best_checkpoint.pth", "rb") as f:
    checkpoint = torch.load(f, map_location="cpu")
    # Kept inside the `with` block so the call's return value is not echoed as cell output.
    trainer.model.load_state_dict(checkpoint)

train dataset preparing...


  0%|          | 0/465 [00:00<?, ?it/s]

100%|██████████| 465/465 [00:00<00:00, 1186.24it/s]


dev dataset preparing...


100%|██████████| 453/453 [00:06<00:00, 64.90it/s] 


test dataset preparing...


100%|██████████| 453/453 [00:05<00:00, 79.22it/s] 


In [7]:
# Collect the restored model's predictions on the dev and test splits.
# They are handed to NBRKNN in the next cell as its `user_emb` argument.
dev_user_emb = trainer.get_predictions(mode="dev")
test_user_emb = trainer.get_predictions(mode="test")

100%|██████████| 453/453 [00:12<00:00, 37.50it/s]
100%|██████████| 453/453 [00:10<00:00, 43.36it/s]


In [8]:
# Evaluation-only trainer: only evaluate() is called on it below, and
# max_epochs / early_stop_num are left as None.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

# Hard-coded NBRKNN hyperparameters, taken from the training process.
# NOTE(review): the best Optuna trial is not visible in the saved training output — verify
# these values if the search is ever re-run.
best_study = {
    'nearest_neighbors_num': 138,
    'alpha': 1.0
}

params = {
    "model": NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=best_study["nearest_neighbors_num"],
        alpha=best_study["alpha"],
        user_emb=dev_user_emb
    )
}
# The model is built with the dev predictions and then handed the test predictions.
# NOTE(review): presumably set_emb replaces the embeddings given at construction — confirm in nbr.model.
params["model"].set_emb(test_user_emb)

trainer.init_hyperparams(**params)
# Bare expression so the metrics dict is rendered as the cell output.
trainer.evaluate(mode='test')

train dataset preparing...


100%|██████████| 465/465 [00:00<00:00, 2057.55it/s]


dev dataset preparing...


100%|██████████| 453/453 [00:05<00:00, 87.28it/s] 


test dataset preparing...


100%|██████████| 453/453 [00:05<00:00, 81.38it/s] 
100%|██████████| 453/453 [00:03<00:00, 150.82it/s]


{'precision': 0.11567328918322298,
 'recall': 0.3663527527189077,
 'ndcg': 0.2778382944080823}

In [9]:
import pandas as pd

# Raw table; `inference` reads its customer_name and item_code columns to label the printed rows.
data = pd.read_csv('data/EPM.csv')

In [10]:
def inference(user, topk):
    """Print the ``topk`` highest-scoring items for a single user.

    Reads the notebook globals ``corpus``, ``trainer`` and ``data``.

    Args:
        user: Integer user index. It is passed to
            ``trainer.model.predict_for_user`` and also used as a position in
            the sorted unique ``customer_name`` values of ``data``.
            NOTE(review): this assumes the model's user indices follow that
            sorted order — confirm against the preprocessing.
        topk: Number of items to print.

    Returns:
        None. Output goes to stdout: the customer name, a two-line header, one
        ``item | score`` row per recommendation (best first) and a blank line.
    """
    n_items = corpus.n_items

    # Reshape the scores into rows of n_items; only the first row is reported.
    items_scores = trainer.model.predict_for_user(user).view(-1, n_items)
    best = torch.topk(items_scores, k=topk, dim=1, sorted=True)
    # Convert only the selected entries to Python numbers instead of the whole score row.
    top_items = best.indices[0].tolist()
    top_scores = best.values[0].tolist()

    print(sorted(data['customer_name'].unique())[user])
    print("Item  | Score")
    print('-------------')
    # Build the index -> item code lookup once rather than once per printed row.
    # NOTE(review): assumes item indices follow the sorted unique item codes — confirm.
    item_codes = sorted(data['item_code'].unique())
    for item_idx, score in zip(top_items, top_scores):
        print("{:5} | {:<5}".format(item_codes[item_idx], round(score, 3)))
    print()

# Number of recommendations printed per user.
top_k = 5

# Print recommendations for a few sample user indices.
for i in [1, 12, 123]:
    inference(i, top_k)

JK1 - AP.  ARINI_GROUP_NA
Item  | Score
-------------
49401 | 5.27 
1650G | 4.772
2717H | 4.347
AVCRA | 4.175
23397 | 4.09 

JK1 - AP. FARZANA_GROUP_NA
Item  | Score
-------------
10224 | 16.191
49401 | 15.632
18350 | 15.004
23397 | 14.292
BLMMN | 13.401

JK1-AP. ESTETIKA_GROUP_NA
Item  | Score
-------------
DZCSB | 5.093
BMTLD | 1.845
CEXLB | 1.446
MCSYB | 1.18 
CGM40 | 1.07 

