### Preparation

In [1]:
import pandas as pd

data = pd.read_csv('/kaggle/input/emos-prep/EMOS.csv')

In [2]:
!git clone https://github.com/KalbeDigitalLab/next-basket-recommendation_time-aware
%cd next-basket-recommendation_time-aware/
!pip install -r requirements.txt -q
!pip install sentence-transformers -q
!pip install numpy --upgrade -q

Cloning into 'next-basket-recommendation_time-aware'...
remote: Enumerating objects: 170, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 170 (delta 6), reused 8 (delta 3), pack-reused 153[K
Receiving objects: 100% (170/170), 55.37 MiB | 15.91 MiB/s, done.
Resolving deltas: 100% (37/37), done.
Updating files: 100% (78/78), done.
/kaggle/working/next-basket-recommendation_time-aware
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
keras-cv 0.8.2 requires keras-core, which is not installed.
keras-nlp 0.8.2 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
albumentations 1.4.0 requires numpy>=1.24.4, but you have numpy 1.22.4 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have di

In [3]:
import sys
sys.path.append("..")

from nbr.common import get_precision, get_recall, get_ndcg
from nbr.model import NBRKNN, RepurchaseModule
from nbr.preparation import Preprocess, save_split, Corpus
from nbr.trainer import NBRTrainer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

import numpy as np
import random
import torch
import warnings

warnings.filterwarnings("ignore")

In [4]:
seed = 10
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

In [5]:
!mkdir /kaggle/working/emos-prep
!cp /kaggle/input/emos-prep/EMOS-prep.txt /kaggle/working/emos-prep

In [6]:
# TODO: Set apakah data perlu mem filter user dan item yang data transaksinya cukup sedikit
# Jika iya, min_user adalah minimal jumlah transaksi tiap user
# dan min_item minimal jumlah transaksi tiap item

filter = False
min_user = 0
min_item = 0

corpus_path = "/kaggle/working/emos-prep/"
dataset_name = "EMOS-prep"

preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(min_user, min_item, filt=filter)
save_split(corpus_path, dataset_name, preprocessor)
del preprocessor

Before preprocessing: #users = 55596, #items = 3422, #clicks = 10524887 (#illegal records = 0)
After preprocessing: #users = 55596, #items = 3422, #clicks = 10524887
Saving dataset in /kaggle/working/emos-prep//data_EMOS-prep/...


In [7]:
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

# Cek transaksi user 0
# len(corpus.book[0]), corpus.book[0]
len(corpus.book[0])

16

### Load Repurchase Module

This notebook uses pre-trained model for the Repurchase Module part. If you want to train the model, you can use in the `TAIWI-emos.ipynb`.

In [8]:
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=100,
    topk=10,
    early_stop_num=3
)

# slrc_best_params = {'batch_size': 256, 'lr': 0.00011201144001505824, 'l2_reg_coef': 0.00011498224071460201}
slrc_best_params = {'batch_size': 256, 'lr': 0.00001, 'l2_reg_coef': 0.01}

params = {
    "model": RepurchaseModule(
        item_num=corpus.n_items,
        avg_repeat_interval=corpus.total_avg_interval
    ),
    "batch_size": slrc_best_params["batch_size"],
    "lr": slrc_best_params["lr"],
    "l2_reg_coef": slrc_best_params["l2_reg_coef"]
}

trainer.init_hyperparams(**params)
with open("/kaggle/input/emos-prep/best_checkpoint.pth", "rb") as f:
    checkpoint = torch.load(f)
    trainer.model.load_state_dict(checkpoint)

train dataset preparing...


100%|██████████| 55596/55596 [00:59<00:00, 930.84it/s]


dev dataset preparing...


100%|██████████| 49168/49168 [01:19<00:00, 621.95it/s] 


test dataset preparing...


100%|██████████| 49168/49168 [01:22<00:00, 596.45it/s] 


In [9]:
# TODO: Set tanggal tertentu yang ingin diprediksi, DEFAULT nya adalah tanggal terakhir pembelian tiap item

# date_to_predict = "2023-12-29"
date_to_predict = None

test_user_emb, test_data = trainer.get_predictions(mode="test", timestamp=date_to_predict)
print('user_id:      ', test_data[0]['user_id'][0])
print('predict_time: ', test_data[0]['t'][0])
print(test_user_emb[0][:10])
print(test_data[0]['length'][:10])

del trainer, test_data

100%|██████████| 49168/49168 [07:41<00:00, 106.65it/s]


user_id:       tensor(0)
predict_time:  tensor(19630., dtype=torch.float64)
[3.23156183e-136 2.38299906e-071 2.85903533e-158 1.28597604e-058
 1.59666785e-015 7.39043365e-016 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000]
tensor([1., 2., 1., 4., 3., 3., 0., 0., 0., 0.], dtype=torch.float64)


### Train Neighborhood Module (Base Model)

In [21]:
trainer2 = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

In [29]:
# Hyperparameter Tuning for Alpha and Num Neighbors
x = [i for i in range(20, 101, 20)]
y = [i / 10 for i in range(2, 11, 2)]
scores = []

for j in y:
    sc = []
    for i in x:
        params2 = {
            "model": NBRKNN(
                item_num=trainer2.corpus.n_items,
                user_num=trainer2.corpus.n_users,
                nearest_neighbors_num=i,
                alpha=j,
                user_emb=test_user_emb
            )
        }

        trainer2.init_hyperparams(**params2)
        metrics = trainer2.evaluate(mode="test")
        score = metrics['ndcg']
        sc.append(score)
    scores.append(sc)

for i in range(len(y)):
    for j in range(len(x)):
        print(y[i], x[j], scores[i][j])
    print()


0.2 20 0.05785389666825502
0.2 40 0.05691591829785123
0.2 60 0.053917713908474996
0.2 80 0.056093853417267844
0.2 100 0.05623027768645418

0.4 20 0.05833025092339749
0.4 40 0.05754026191265765
0.4 60 0.054531497553505774
0.4 80 0.056676290138430424
0.4 100 0.05682330497011047

0.6 20 0.05891525315646821
0.6 40 0.058107258324397094
0.6 60 0.05516513531101642
0.6 80 0.057208231030789036
0.6 100 0.05742357384798719

0.8 20 0.05964867235679014
0.8 40 0.06100472493172865
0.8 60 0.055732577546437664
0.8 80 0.05785398936865899
0.8 100 0.05798447128973061

1.0 20 0.23619340692858368
1.0 40 0.23619340692858368
1.0 60 0.23619340692858368
1.0 80 0.23619340692858368
1.0 100 0.23619340692858368



In [None]:
params2 = {
    "model": NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=40,
        alpha=0.8,
        user_emb=test_user_emb
    )
}

trainer2.init_hyperparams(**params2)
metrics = trainer2.evaluate(mode="test")
metrics

 19%|█▉        | 9450/49168 [01:28<06:09, 107.37it/s]

In [None]:
def inference(user, topk):
    n_items = corpus.n_items

    items_scores = trainer2.model.predict_for_user(user).view(-1, n_items)
    top_items = torch.topk(items_scores, k=topk, dim=1, sorted=True).indices
    top_score = [float(i) for i in items_scores[0]]

    print(sorted(data['ship_to_id_emos'].unique())[user])
#     print("Item  | Score")
#     print('-------------')
    total = sum(items_scores[0])
    results = []
    for i in top_items[0]:
        if top_score[i] == 0:
            break
        item = data['prod_id'].unique()[int(i)]
        results.append(item)
        dataItem = dataEmb[dataEmb['prod_id'] == item]['desc']
#         dataItem = []
        if len(dataItem) > 0:
            dataItem = dataItem.item()
        else:
            dataItem = 'None'
        print(item)
#         print(dataItem)
#         print("{:5} | {:<5} | {:<5} % | {}".format(item, round(top_score[i], 3), round(top_score[i] / float(total) * 100, 2), ""))    
    print()

    return results

top_k = 10
user_id = 61
# 1
# 2
# 61
# 73
inference(user_id, top_k)

In [None]:
# Generate All Prediction Data
dfInference1 = pd.DataFrame(columns=[i for i in range(10)])
for i in tqdm(range(50000)):
    
    res = inference(i, 10)
    while len(res) < 10:
        res.append(0)
    dfInference1.loc[len(dfInference1)] = res
    
dfInference1

### Embedding Experiment

In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(2, 384)


In [11]:
dataE = pd.read_csv('/kaggle/input/emos-prep/master_product_epm_recsys_update.csv', sep=';')
dataE = dataE.fillna('')
dataE.head()

Unnamed: 0,prod_id,prod_name,category_product,classification_product,name_prin,name_brand,flag_kalbe
0,10137,F. SEASON MDCT,MEDICAL,ESSENTIAL OIL,FITTO NATURA MEDICA,SHEN NONG SI FOUR SEASON,NON-KALBE
1,10138,NFDEO RO BRIGHT,CONSUMER,10D. FEMININE DEODORANTS,BEIERSDORF INDONESIA,NIVEA,NON-KALBE
2,10139,GLISODIN 30 KAP,MEDICAL,LAINNYA,KALBE FARMA,GLISODIN,KALBE
3,10140,ENTRAMIX COKELA,CONSUMER,13.3 PANGAN OLAHAN UNTUK KEPERLUAN MEDIS KHUSU...,SANGHIANG PERKASA,ENTRAMIX,KALBE
4,10141,DANCOW 1+ MaduN,CONSUMER,"13.1 FORMULA BAYI, FORMULA LANJUTAN DAN FORMUL...",NESTLE INDONESIA,DANCOW,NON-KALBE


In [22]:
n_items = corpus.n_items

items_scores = trainer2.model.predict_for_user(1).view(-1, n_items)
top_items = torch.topk(items_scores, k=3422, dim=1, sorted=True).indices
top_items

cleaned_item = data['prod_id'].unique()
cleaned_item = sorted([cleaned_item[int(i)] for i in top_items[0]])
cleaned_item[:10]

[10137, 10138, 10139, 10140, 10141, 10144, 10145, 10148, 10149, 10150]

In [23]:
from tqdm.auto import tqdm

dataEmb = dataE.copy()
dataEmb['desc'] = dataEmb['prod_name'] + ' ' + dataEmb['category_product'] + ' ' + dataEmb['classification_product'] + ' ' + dataEmb['name_prin'] + ' ' + dataEmb['name_brand'] + ' ' + dataEmb['flag_kalbe']
dataEmb = dataEmb[['prod_id', 'desc']]
dataEmb = dataEmb[dataEmb['prod_id'].isin(cleaned_item)].reset_index(drop=True).fillna('')

item_emb = []
for i in tqdm(cleaned_item):
    dataEmbItem = dataEmb[dataEmb['prod_id'] == i].reset_index(drop=True)
    if len(dataEmbItem) > 0:
        item_emb.append(model.encode(dataEmbItem.loc[0, 'desc'], show_progress_bar=False))
    else:
        item_emb.append(model.encode('', show_progress_bar=False))

len(item_emb)

  0%|          | 0/3422 [00:00<?, ?it/s]

3422

In [24]:
# - Vector user per item dikali for each item embedding -> matriks user (item x dimensi) -> (?)ambil rata2 per dimensi
# - modification: user embedding per item -> user embedding per dimensi

#     I1  I2  I3
# U1  1   1   2
# U2  4   5   0 
# U3  9   1   6

# I1: [1, 1, 1, 10]
# I2: [2, 2, 2, 0]
# I3: [2, 1, 0, 7]


# U1: 1 * [1, 1, 1, 10] + 1 * [2, 2, 2, 0] + 2 * [2, 1, 0, 7] = [7, 5, 3, 24]

item_emb = np.array(item_emb)
emb_new = (np.dot(test_user_emb, item_emb)) # emb_new = (np.dot(test_user_emb, lst) / test_user_emb.shape[0])
emb_new.shape

(55596, 384)

### Train Neighborhood Module (Embedding Model)

In [1]:
trainer3 = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

params3 = {
    "model": NBRKNN(
        # item_num=corpus.n_items,
        item_num=item_emb.shape[1],
        user_num=corpus.n_users,
        nearest_neighbors_num=100,
        alpha=0.8,
        # user_emb=test_user_emb,
        user_emb=emb_new
    )
}

trainer3.init_hyperparams(**params3)
# metrics = trainer3.evaluate(mode="test")

In [None]:
precisions, recalls, ndcgs = [], [], []
trainer3.model.eval()
test_dataloader = trainer3.test_dataloader
progress_bar = tqdm(test_dataloader)
for batch in progress_bar:
    proper_items = batch["proper_items"]
    batch = {k: v.to(trainer3.device) for k, v in batch.items() if k != "proper_items"}

    items_scores = trainer3.model.predict_for_user(
        user_id=batch["user_id"][0],
        t=batch["t"],
        length=batch["length"],
        history_time=batch["history_time"],
    )

    items_scores = items_scores.view(-1, item_emb.shape[1])

    sims = []
    for i in range(len(item_emb)):
        similarity = cosine_similarity(item_emb[i].reshape(1, -1), items_scores.reshape(1, -1))[0][0]
        sims.append(similarity)

    sims = np.array(sims)
    top_items = np.array([sims.argsort()[::-1][:10]])

    for i in range(top_items.shape[0]):
        precision = get_precision(top_items[i], proper_items[i])
        recall = get_recall(top_items[i], proper_items[i])
        ndcg = get_ndcg(top_items[i], proper_items[i])

        precisions.append(precision)
        recalls.append(recall)
        ndcgs.append(ndcg)

In [None]:
precision = np.array(precisions).mean()
recall = np.array(recalls).mean()
ndcg = np.array(ndcgs).mean()

{"precision": precision, "recall": recall, "ndcg": ndcg}

In [42]:
def inference2(user, topk):
    n_items = corpus.n_items

    items_scores = trainer3.model.predict_for_user(user)
    items_scores = items_scores.view(-1, item_emb.shape[1])

    sims = []
    for i in range(len(item_emb)):
        similarity = cosine_similarity(item_emb[i].reshape(1, -1), items_scores.reshape(1, -1))[0][0]
        sims.append(similarity)

    sims = np.array(sims)
    top_items = np.array([sims.argsort()[::-1][:10]])
    top_items = torch.topk(items_scores, k=topk, dim=1, sorted=True).indices
    top_score = [float(i) for i in items_scores[0]]

    print(sorted(data['ship_to_id_emos'].unique())[user])
    print("Item  | Score")
    print('-------------')
    total = sum(items_scores[0])
    results = []
    for i in top_items[0]:
        if top_score[i] == 0:
            break
        item = data['prod_id'].unique()[int(i)]
        results.append(item)

        dataItem = dataEmb[dataEmb['prod_id'] == item]['desc']
        if len(dataItem) > 0:
            dataItem = dataItem.item()
        else:
            dataItem = 'None'    
        print(item)
#         print(dataItem)
#         print("{:5} | {:<5} | {}".format(item, round(top_score[i], 3), dataItem))
#     print()
    
    return results

top_k = 10
user_id = 73

res = inference2(user_id, top_k)

35522
Item  | Score
-------------
13526
11846
10831
13645
15085
11609
12793
12879
10239
10813


### Historical Data

In [None]:
dataReal = data[data['ship_to_id_emos'] == 35522]
# print(dataReal.tail(15))
print(sorted(list(dataReal['prod_id'].unique())))
# for prod in dataReal.tail(15)['prod_id']:
#     dataItem = dataEmb[dataEmb['prod_id'] == prod]['desc']
#     if len(dataItem) > 0:
#         dataItem = dataItem.item()
#     else:
#         dataItem = 'None'
# #     print(dataItem)
#     print(prod)

lst = list(data.groupby('ship_to_id_emos').count().sort_values('prod_id', ascending=False)[:4].index)
print(lst)

for l in lst:
    idx = np.where(data['ship_to_id_emos'].unique() == l)
    print(idx)