In [12]:
import sys
import os
import numpy as np
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
# from recommenders.models.newsrec.models.lstur import LSTURModel
from newsrec import LSTURModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Record the interpreter and TensorFlow builds this run was executed with.
print(f"System version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.7.11 (default, Jul 27 2021, 09:42:29) [MSC v.1916 64 bit (AMD64)]
Tensorflow version: 2.8.0


# Priprema parametara

In [13]:
# Which MIND dataset variant to fetch. Options: demo, small, large
MIND_type = "demo"

# Training settings: `epochs` and `batch_size` are forwarded to prepare_hparams,
# `seed` is handed to the model constructor.
batch_size = 32
epochs = 5
seed = 40

# Preuzimanje i priprema podataka

In [14]:
# All data is placed in a fresh temporary directory; keep a reference to
# `tmpdir` so the directory is not cleaned up while the notebook is running.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

# Sub-directories for the training split, validation split and shared utilities.
train_dir = os.path.join(data_path, "train")
valid_dir = os.path.join(data_path, "valid")
utils_dir = os.path.join(data_path, "utils")

# Files the later cells read from those directories.
train_news_file = os.path.join(train_dir, "news.tsv")
train_behaviors_file = os.path.join(train_dir, "behaviors.tsv")
valid_news_file = os.path.join(valid_dir, "news.tsv")
valid_behaviors_file = os.path.join(valid_dir, "behaviors.tsv")
wordEmb_file = os.path.join(utils_dir, "embedding.npy")
userDict_file = os.path.join(utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(utils_dir, "word_dict.pkl")
yaml_file = os.path.join(utils_dir, "lstur.yaml")

# Resolve the base URL and archive names for the selected dataset variant.
mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# (marker file, base URL, destination directory, archive name) — an archive is
# downloaded only when its marker file is not already present. The order
# matches the original sequence: train, valid, utils.
pending_downloads = (
    (train_news_file, mind_url, train_dir, mind_train_dataset),
    (valid_news_file, mind_url, valid_dir, mind_dev_dataset),
    (
        yaml_file,
        "https://recodatasets.z20.web.core.windows.net/newsrec/",
        utils_dir,
        mind_utils,
    ),
)
for marker_file, base_url, target_dir, archive_name in pending_downloads:
    if not os.path.exists(marker_file):
        download_deeprec_resources(base_url, target_dir, archive_name)

100%|██████████| 17.0k/17.0k [00:03<00:00, 5.28kKB/s]
100%|██████████| 9.84k/9.84k [00:03<00:00, 3.13kKB/s]
100%|██████████| 95.0k/95.0k [00:27<00:00, 3.49kKB/s]


# Priprema hiperparametara

In [15]:
# Values passed on top of what the YAML file provides: resource file
# locations plus the batch size and epoch count chosen in the config cell.
hparam_overrides = {
    "wordEmb_file": wordEmb_file,
    "wordDict_file": wordDict_file,
    "userDict_file": userDict_file,
    "batch_size": batch_size,
    "epochs": epochs,
}
hparams = prepare_hparams(yaml_file, **hparam_overrides)

# Show the resulting hyperparameter set for the record.
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 32, 'show_step': 100000, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'lstur', 'loss': 'cross_entropy_loss', 'wordEmb_file': 'C:\\Users\\krick\\AppData\\Local\\Temp\\tmppntpy3_3\\utils\\embedding.npy', 'wordDict_file': 'C:\\Users\\krick\\AppData\\Local\\Temp\\tmppntpy3_3\\utils\\word_dict.pkl', 'userDict_file': 'C:\\Users\\krick\\AppData\\Local\\Temp\\tmppntpy3_3\\utils\\uid2index.pkl'}


In [16]:
# Bind the iterator *class* (it is not instantiated here); it is passed as-is
# to the LSTURModel constructor, which presumably creates the instances it
# needs — TODO confirm against the model implementation.
iterator = MINDIterator

# Treniranje LSTUR modela

In [17]:
# Build the LSTUR model from the prepared hyperparameters, the iterator class
# and the configured seed.
model = LSTURModel(
    hparams,
    iterator,
    seed=seed,
)

Tensor("conv1d_1/Relu:0", shape=(None, 30, 400), dtype=float32)
Tensor("att_layer2_1/Sum_1:0", shape=(None, 400), dtype=float32)


  super(Adam, self).__init__(name, **kwargs)


In [18]:
# Evaluate on the validation files before any call to fit(), giving a
# reference point for the metrics reported during training.
baseline_metrics = model.run_eval(valid_news_file, valid_behaviors_file)
print(baseline_metrics)

  updates=self.state_updates,
586it [00:05, 111.34it/s]
236it [01:46,  2.23it/s]
7538it [00:00, 12563.38it/s]


{'group_auc': 0.5201, 'mean_mrr': 0.2214, 'ndcg@5': 0.2292, 'ndcg@10': 0.2912}


In [19]:
%%time
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

1086it [39:13,  2.17s/it]
586it [00:07, 83.27it/s]
236it [01:55,  2.04it/s]
7538it [00:00, 10426.05it/s]


at epoch 1
train info: logloss loss:1.486980970942074
eval info: group_auc:0.5952, mean_mrr:0.2551, ndcg@10:0.3442, ndcg@5:0.2799
at epoch 1 , train time: 2353.1 eval time: 129.1


1086it [39:30,  2.18s/it]
586it [00:06, 84.43it/s]
236it [01:56,  2.03it/s]
7538it [00:00, 10064.14it/s]


at epoch 2
train info: logloss loss:1.406497059715826
eval info: group_auc:0.6143, mean_mrr:0.2735, ndcg@10:0.3635, ndcg@5:0.2986
at epoch 2 , train time: 2370.0 eval time: 129.5


1086it [37:30,  2.07s/it]
586it [00:06, 85.72it/s]
236it [01:50,  2.13it/s]
7538it [00:00, 10768.08it/s]


at epoch 3
train info: logloss loss:1.3586423224726654
eval info: group_auc:0.6257, mean_mrr:0.2849, ndcg@10:0.3765, ndcg@5:0.3135
at epoch 3 , train time: 2250.3 eval time: 124.0


1086it [38:12,  2.11s/it]
586it [00:06, 86.51it/s]
236it [01:45,  2.24it/s]
7538it [00:00, 11074.13it/s]


at epoch 4
train info: logloss loss:1.3229021605646194
eval info: group_auc:0.6249, mean_mrr:0.2792, ndcg@10:0.373, ndcg@5:0.3056
at epoch 4 , train time: 2292.2 eval time: 118.5


1086it [36:46,  2.03s/it]
586it [00:06, 87.69it/s]
236it [01:45,  2.23it/s]
7538it [00:00, 10908.94it/s]


at epoch 5
train info: logloss loss:1.2889748356728703
eval info: group_auc:0.642, mean_mrr:0.2943, ndcg@10:0.3878, ndcg@5:0.3237
at epoch 5 , train time: 2206.4 eval time: 118.7
Wall time: 3h 21min 31s


<newsrec.LSTURModel at 0x2145df73408>

In [20]:
# NOTE(review): `model_path` sits inside `data_path`, which is the
# TemporaryDirectory created earlier — the checkpoint is removed together with
# that directory on cleanup. Point `model_path` at a persistent location if the
# trained weights need to outlive this session.
model_path = os.path.join(data_path, "model")
os.makedirs(model_path, exist_ok=True)

# Write the weights of the underlying Keras model under the "lstur_ckpt" prefix.
checkpoint_prefix = os.path.join(model_path, "lstur_ckpt")
model.model.save_weights(checkpoint_prefix)

In [21]:
# Run the fast evaluation path on the validation files and keep its three
# results (impression indexes, labels, predictions) for later use.
(
    group_impr_indexes,
    group_labels,
    group_preds,
) = model.run_fast_eval(valid_news_file, valid_behaviors_file)

586it [00:07, 81.24it/s]
236it [02:04,  1.90it/s]
7538it [00:00, 11723.24it/s]
