## Global settings and imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os
import abc
import time
import random
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf
from tensorflow.compat.v1 import keras
from tensorflow.compat.v1.linalg import einsum
from tensorflow.compat.v1.keras import layers
from tensorflow.compat.v1.keras import backend as K


tf.get_logger().setLevel('ERROR') # only show error messages

from sklearn.metrics import (
    roc_auc_score,
    log_loss,
    mean_squared_error,
    accuracy_score,
    f1_score,
)

import yaml
import zipfile
import pickle as pkl


print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.7.15 (default, Oct 12 2022, 19:14:55) 
[GCC 7.5.0]
Tensorflow version: 2.9.2


## Utils

In [None]:
def flat_config(config):
    """Collapse a two-level configuration mapping into a single flat dict.
    Args:
        config (dict): Configuration loaded from a yaml file; each top-level
            key is a category whose value is a dict of parameters.
    Returns:
        dict: All parameters of every category merged into one dictionary.
            When a key appears in several categories the last one wins.
    """
    return {
        name: value
        for section in config.values()
        for name, value in section.items()
    }

In [None]:
def check_type(config):
    """Check that the config parameters are the correct type
    Only parameters that are present in ``config`` are checked; absent ones
    are ignored.
    Args:
        config (dict): Configuration dictionary.
    Raises:
        TypeError: If the parameters are not the correct type.
    """

    int_parameters = [
        "word_size",
        "his_size",
        "title_size",
        "body_size",
        "npratio",
        "word_emb_dim",
        "attention_hidden_dim",
        "epochs",
        "batch_size",
        "show_step",
        "save_epoch",
        "head_num",
        "head_dim",
        "user_num",
        "filter_num",
        "window_size",
        "gru_unit",
        "user_emb_dim",
        "vert_emb_dim",
        "subvert_emb_dim",
    ]
    float_parameters = ["learning_rate", "dropout"]
    str_parameters = [
        "wordEmb_file",
        "wordDict_file",
        "userDict_file",
        "vertDict_file",
        "subvertDict_file",
        "method",
        "loss",
        "optimizer",
        "cnn_activation",
        # These must be two separate entries: without the comma Python
        # concatenates the adjacent literals into "dense_activationtype"
        # and neither parameter is ever checked.
        "dense_activation",
        "type",
    ]
    list_parameters = ["layer_sizes", "activation"]
    bool_parameters = ["support_quick_scoring"]

    # (parameter names, required type, type name used in the error message)
    expectations = (
        (int_parameters, int, "int"),
        (float_parameters, float, "float"),
        (str_parameters, str, "str"),
        (list_parameters, list, "list"),
        (bool_parameters, bool, "bool"),
    )
    for names, expected_type, type_name in expectations:
        for param in names:
            if param in config and not isinstance(config[param], expected_type):
                raise TypeError(
                    "Parameters {0} must be {1}".format(param, type_name)
                )

In [None]:
def check_nn_config(f_config):
    """Check neural networks configuration.
    Verifies that every parameter required by the selected ``model_type`` is
    present, that ``data_format`` matches the model, and finally that the
    parameter types are correct (via ``check_type``). Model types other than
    nrms / naml / lstur / npa have no required parameters.
    Args:
        f_config (dict): Neural network configuration. Must contain the key
            ``model_type``.
    Raises:
        ValueError: If the parameters are not correct.
        TypeError: If a parameter has the wrong type (raised by ``check_type``).
    """
    model_type = f_config["model_type"]

    # Parameters needed by every supported model for the attention / loss part.
    attention_parameters = ["attention_hidden_dim", "loss", "dropout"]

    if model_type in ["nrms", "NRMS"]:
        required_parameters = [
            "title_size",
            "his_size",
            "wordEmb_file",
            "wordDict_file",
            "userDict_file",
            "npratio",
            "data_format",
            "word_emb_dim",
            # nrms
            "head_num",
            "head_dim",
        ] + attention_parameters
    elif model_type in ["naml", "NAML"]:
        required_parameters = [
            "title_size",
            "body_size",
            "his_size",
            "wordEmb_file",
            "subvertDict_file",
            "vertDict_file",
            "wordDict_file",
            "userDict_file",
            "npratio",
            "data_format",
            "word_emb_dim",
            "vert_emb_dim",
            "subvert_emb_dim",
            # naml
            "filter_num",
            "cnn_activation",
            "window_size",
            "dense_activation",
        ] + attention_parameters
    elif model_type in ["lstur", "LSTUR"]:
        required_parameters = [
            "title_size",
            "his_size",
            "wordEmb_file",
            "wordDict_file",
            "userDict_file",
            "npratio",
            "data_format",
            "word_emb_dim",
            # lstur
            "gru_unit",
            "type",
            "filter_num",
            "cnn_activation",
            "window_size",
        ] + attention_parameters
    elif model_type in ["npa", "NPA"]:
        required_parameters = [
            "title_size",
            "his_size",
            "wordEmb_file",
            "wordDict_file",
            "userDict_file",
            "npratio",
            "data_format",
            "word_emb_dim",
            # npa
            "user_emb_dim",
            "filter_num",
            "cnn_activation",
            "window_size",
        ] + attention_parameters
    else:
        required_parameters = []

    # check required parameters
    for param in required_parameters:
        if param not in f_config:
            raise ValueError("Parameters {0} must be set".format(param))

    # The error messages name the models that the branch actually covers.
    if model_type in ["nrms", "NRMS", "lstur", "LSTUR"]:
        if f_config["data_format"] != "news":
            raise ValueError(
                "For nrms and lstur model, data format must be 'news', but your set is {0}".format(
                    f_config["data_format"]
                )
            )
    elif model_type in ["naml", "NAML"]:
        if f_config["data_format"] != "naml":
            raise ValueError(
                "For naml model, data format must be 'naml', but your set is {0}".format(
                    f_config["data_format"]
                )
            )

    check_type(f_config)

In [None]:
def load_yaml(filename):
    """Load a yaml file.
    Args:
        filename (str): Filename.
    Returns:
        dict: Dictionary.
    Raises:
        FileNotFoundError: If ``filename`` does not exist (re-raised unchanged).
        IOError: For any other failure while opening or parsing the file; the
            original exception is attached as ``__cause__``.
    """
    try:
        with open(filename, "r") as f:
            # SafeLoader only builds plain Python objects from the document.
            return yaml.load(f, yaml.SafeLoader)
    except FileNotFoundError:  # for file not found
        raise
    except Exception as exc:  # for other exceptions
        # Chain explicitly so the real reason (parse error, permissions, ...)
        # is reported as the direct cause of the IOError.
        raise IOError("load {0} error!".format(filename)) from exc

In [None]:
class HParams:
    """Container that exposes a dictionary of hyperparameters as attributes."""

    def __init__(self, hparams_dict):
        """Validate the hyperparameter values and publish them as attributes.
        Args:
            hparams_dict (dict): Dictionary with the model hyperparameters.
        Raises:
            ValueError: If a value is not an int, float, str or list.
        """
        for value in hparams_dict.values():
            if isinstance(value, (int, float, str, list)):
                continue
            raise ValueError(
                "Hyperparameter value {} should be integer, float, string or list.".format(
                    value
                )
            )
        # The dictionary is stored by reference, not copied.
        self._values = hparams_dict
        for name, value in hparams_dict.items():
            setattr(self, name, value)

    def __repr__(self):
        return "HParams object with values {}".format(repr(self._values))

    def values(self):
        """Give access to the dictionary the object was built from.
        Returns:
            dict: Dictionary with the hyperparameter values.
        """
        return self._values

In [None]:
def create_hparams(flags):
    """Build the hyperparameter object from defaults overridden by ``flags``.
    Args:
        flags (dict): Dictionary with the model requirements; its entries
            replace the defaults below and may add further keys.
    Returns:
        HParams: Hyperparameter object.
    """
    hyperparameters = dict(
        # data
        support_quick_scoring=False,
        # models
        dropout=0.0,
        attention_hidden_dim=200,
        # nrms
        head_num=4,
        head_dim=100,
        # naml
        filter_num=200,
        window_size=3,
        vert_emb_dim=100,
        subvert_emb_dim=100,
        # lstur
        gru_unit=400,
        type="ini",
        # npa
        user_emb_dim=50,
        # train
        learning_rate=0.001,
        optimizer="adam",
        epochs=10,
        batch_size=1,
        # show info
        show_step=1,
    )
    hyperparameters.update(flags)
    return HParams(hyperparameters)

In [None]:
def prepare_hparams(yaml_file=None, **kwargs):
    """Assemble, validate and wrap the model hyperparameters.
    The flattened yaml configuration (if a file is given) is overridden by
    the keyword arguments, validated with ``check_nn_config`` and then merged
    with the defaults in ``create_hparams``.
    Args:
        yaml_file (str): YAML file as configuration.
        **kwargs: Individual hyperparameters that override the yaml values.
    Returns:
        HParams: Hyperparameter object.
    """
    config = {} if yaml_file is None else flat_config(load_yaml(yaml_file))
    config.update(kwargs)

    check_nn_config(config)
    return create_hparams(config)

In [None]:
def word_tokenize(sent):
    """Lower-case a sentence and split it into word and punctuation tokens.
    Args:
        sent (str): Input sentence
    Return:
        list: word list; empty when ``sent`` is not a string.
    """
    if not isinstance(sent, str):
        return []
    # A token is either a run of word characters or one of . , ! ? ; |
    return re.findall(r"[\w]+|[.,!?;|]", sent.lower())

In [None]:
def newsample(news, ratio):
    """Draw ``ratio`` entries from the news list.
    When the list holds fewer than ``ratio`` entries nothing is sampled:
    the list is returned in its original order, padded with zeros.
    Args:
        news (list): input news list
        ratio (int): sample number
    Returns:
        list: output of sample list.
    """
    shortfall = ratio - len(news)
    if shortfall > 0:
        return news + [0] * shortfall
    return random.sample(news, ratio)

In [None]:
def mrr_score(y_true, y_score):
    """Compute the reciprocal-rank score of one ranked list.
    The labels are ordered by descending score; each label is divided by its
    1-based rank and the total is normalised by the sum of the labels.
    Args:
        y_true (np.ndarray): Ground-truth labels.
        y_score (np.ndarray): Predicted labels.
    Returns:
        numpy.ndarray: mrr scores.
    """
    ranked_labels = np.take(y_true, np.argsort(y_score)[::-1])
    ranks = np.arange(len(ranked_labels)) + 1
    return np.sum(ranked_labels / ranks) / np.sum(ranked_labels)


def ndcg_score(y_true, y_score, k=10):
    """Compute the normalised dcg at cut-off k.
    The dcg of the predicted ranking is divided by the dcg obtained when the
    labels themselves are used as scores.
    Args:
        y_true (np.ndarray): Ground-truth labels.
        y_score (np.ndarray): Predicted labels.
        k (int): Cut-off passed on to ``dcg_score``.
    Returns:
        numpy.ndarray: ndcg scores.
    """
    ideal_dcg = dcg_score(y_true, y_true, k)
    achieved_dcg = dcg_score(y_true, y_score, k)
    return achieved_dcg / ideal_dcg


def hit_score(y_true, y_score, k=10):
    """Computing hit score metric at k.
    Args:
        y_true (array-like): ground-truth labels.
        y_score (array-like): predicted labels.
        k (int): Number of top-scored positions to inspect.
    Returns:
        int: 1 if any of the k highest-scored items has label 1, else 0.
    """
    # Coerce to an array first: comparing a plain Python list with 1 yields
    # a single False, so list labels would never register a hit.
    ground_truth = np.where(np.asarray(y_true) == 1)[0]
    top_k = np.argsort(y_score)[::-1][:k]
    return int(np.isin(top_k, ground_truth).any())


def dcg_score(y_true, y_score, k=10):
    """Compute the discounted cumulative gain at cut-off k.
    The labels of the k highest-scored items contribute a gain of
    ``2 ** label - 1`` discounted by ``log2(position + 1)`` (1-based position).
    Args:
        y_true (np.ndarray): Ground-truth labels.
        y_score (np.ndarray): Predicted labels.
        k (int): Cut-off; capped at the length of the last axis of ``y_true``.
    Returns:
        np.ndarray: dcg scores.
    """
    cutoff = min(np.shape(y_true)[-1], k)
    top_positions = np.argsort(y_score)[::-1][:cutoff]
    relevance = np.take(y_true, top_positions)
    discounts = np.log2(np.arange(len(relevance)) + 2)
    return np.sum((2 ** relevance - 1) / discounts)

In [None]:
def _parse_cutoffs(metric, default=(1, 2)):
    """Extract the cut-off list from a metric spec such as ``ndcg@2;4;6``."""
    parts = metric.split("@")
    if len(parts) > 1:
        return [int(token) for token in parts[1].split(";")]
    return list(default)


def _binarize(preds, threshold=0.5):
    """Return a thresholded *copy* of ``preds`` (1 at/above threshold, else 0)."""
    pred = np.array(preds)  # np.array copies, so the caller's data is untouched
    pred[pred >= threshold] = 1
    pred[pred < threshold] = 0
    return pred


def cal_metric(labels, preds, metrics):
    """Calculate metrics.
    Available options are: `auc`, `rmse`, `logloss`, `acc` (accuracy), `f1`, `mean_mrr`,
    `ndcg` (format like: ndcg@2;4;6;8), `hit` (format like: hit@2;4;6;8), `group_auc`.
    `mean_mrr`, `ndcg`, `hit` and `group_auc` iterate over ``zip(labels, preds)``
    and average the per-group score; the other metrics use ``labels`` and
    ``preds`` as a whole. Every value is rounded to 4 decimals. ``preds`` is
    never modified, so the order of ``metrics`` does not affect the results.
    Args:
        labels (array-like): Labels.
        preds (array-like): Predictions.
        metrics (list): List of metric names.
    Return:
        dict: Metrics.
    Raises:
        ValueError: If a metric name is not one of the options above.
    Examples:
        >>> cal_metric(labels, preds, ["ndcg@2;4;6", "group_auc"])
        {'ndcg@2': 0.4026, 'ndcg@4': 0.4953, 'ndcg@6': 0.5346, 'group_auc': 0.8096}
    """
    res = {}
    for metric in metrics:
        if metric == "auc":
            auc = roc_auc_score(np.asarray(labels), np.asarray(preds))
            res["auc"] = round(auc, 4)
        elif metric == "rmse":
            mse = mean_squared_error(np.asarray(labels), np.asarray(preds))
            # Round the final RMSE, not the intermediate MSE.
            res["rmse"] = round(np.sqrt(mse), 4)
        elif metric == "logloss":
            # avoid logloss nan; keep the clipped values local so that later
            # metrics still see the original predictions
            clipped = [max(min(p, 1.0 - 10e-12), 10e-12) for p in preds]
            logloss = log_loss(np.asarray(labels), np.asarray(clipped))
            res["logloss"] = round(logloss, 4)
        elif metric == "acc":
            acc = accuracy_score(np.asarray(labels), _binarize(preds))
            res["acc"] = round(acc, 4)
        elif metric == "f1":
            f1 = f1_score(np.asarray(labels), _binarize(preds))
            res["f1"] = round(f1, 4)
        elif metric == "mean_mrr":
            mean_mrr = np.mean(
                [
                    mrr_score(each_labels, each_preds)
                    for each_labels, each_preds in zip(labels, preds)
                ]
            )
            res["mean_mrr"] = round(mean_mrr, 4)
        elif metric.startswith("ndcg"):  # format like:  ndcg@2;4;6;8
            for k in _parse_cutoffs(metric):
                ndcg_temp = np.mean(
                    [
                        ndcg_score(each_labels, each_preds, k)
                        for each_labels, each_preds in zip(labels, preds)
                    ]
                )
                res["ndcg@{0}".format(k)] = round(ndcg_temp, 4)
        elif metric.startswith("hit"):  # format like:  hit@2;4;6;8
            for k in _parse_cutoffs(metric):
                hit_temp = np.mean(
                    [
                        hit_score(each_labels, each_preds, k)
                        for each_labels, each_preds in zip(labels, preds)
                    ]
                )
                res["hit@{0}".format(k)] = round(hit_temp, 4)
        elif metric == "group_auc":
            group_auc = np.mean(
                [
                    roc_auc_score(each_labels, each_preds)
                    for each_labels, each_preds in zip(labels, preds)
                ]
            )
            res["group_auc"] = round(group_auc, 4)
        else:
            raise ValueError("Metric {0} not defined".format(metric))
    return res

## Prepare parameters

In [None]:
# Training settings; epochs and batch_size are passed to prepare_hparams below.
epochs = 10
# NOTE(review): seed is not used in the cells visible here; presumably it is
# passed to the model constructor further down - confirm.
seed = 42
batch_size = 32

# Dataset root on the mounted Google Drive (see drive.mount above).
data_path = '/content/drive/MyDrive/GH x RippleAI/Dataset/movielens/MINDLike'

## Load and Split Data

### Train / Valid Split

In [None]:
# Unsplit input files; both are read as header-less, tab-separated tables.
items_file = os.path.join(data_path,'items.tsv')
behaviors_file = os.path.join(data_path,'behaviors.tsv')

In [None]:
def train_valid_split(items_file, behaviors_file, output_dir=None, train_ratio=0.8):
    """Split the behaviors into train / valid parts and write per-split files.

    Both inputs are read as header-less, tab-separated tables. Column 0 of the
    behaviors table is treated as a numeric id: rows with an id up to
    ``int(number_of_distinct_ids * train_ratio)`` form the training split and
    all remaining rows form the validation split, so no row is lost when the
    boundary is not an integer. For each split, ``<output_dir>/<split>/``
    receives ``behaviors.tsv`` and an ``items.tsv`` restricted to the items
    referenced by that split (column 2: space-separated history ids; column 3:
    space-separated ``<item>-<label>`` impressions). Files that already exist
    are left untouched.

    Args:
        items_file (str): Path of the items table; column 0 holds the item id.
        behaviors_file (str): Path of the behaviors table.
        output_dir (str): Directory in which the ``train`` and ``valid``
            folders are created. Defaults to the directory of ``items_file``.
        train_ratio (float): Fraction of the ids assigned to the train split.
    """
    if output_dir is None:
        output_dir = os.path.dirname(items_file)

    items_df = pd.read_csv(items_file, sep='\t', header=None)
    behav_df = pd.read_csv(behaviors_file, sep='\t', header=None)

    def referenced_items(behaviors):
        """Collect every item id found in the history and impression columns."""
        item_ids = set()
        for history in behaviors[2].dropna():
            item_ids.update(history.split(' '))
        for impressions in behaviors[3].dropna():
            # Each impression token looks like "<item>-<label>".
            item_ids.update(token.split('-')[0] for token in impressions.split(' '))
        return item_ids

    # An integer boundary guarantees that every id lands in exactly one split.
    boundary = int(behav_df[0].nunique() * train_ratio)
    in_train = behav_df[0] <= boundary
    splits = {'train': behav_df[in_train], 'valid': behav_df[~in_train]}

    for split_name, split_behav in splits.items():
        split_dir = os.path.join(output_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)

        items_path = os.path.join(split_dir, r'items.tsv')
        if not os.path.exists(items_path):
            # Use the mask to select rows instead of storing it in a column,
            # so the items table keeps all of its original columns.
            keep = items_df[0].astype(str).isin(referenced_items(split_behav))
            items_df[keep].to_csv(items_path, sep='\t', index=False, header=False)

        behaviors_path = os.path.join(split_dir, r'behaviors.tsv')
        if not os.path.exists(behaviors_path):
            split_behav.to_csv(behaviors_path, sep='\t', index=False, header=False)

In [None]:
# Write the train/ and valid/ folders under data_path; output files that
# already exist are not regenerated.
train_valid_split(items_file, behaviors_file)

### Load Data

In [None]:
# Per-split files produced by train_valid_split above.
train_items_file = os.path.join(data_path, 'train', r'items.tsv')
train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')
valid_items_file = os.path.join(data_path, 'valid', r'items.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')
# Resources expected under <data_path>/utils; they are not created in the
# cells shown here.
wordEmb_file = os.path.join(data_path, "utils", "embedding.npy")
userDict_file = os.path.join(data_path, "utils", "uid2index.pkl")
wordDict_file = os.path.join(data_path, "utils", "word_dict.pkl")
yaml_file = os.path.join(data_path, "utils", r'nrms.yaml')

## Create hyper-parameters

In [None]:
# Keyword arguments take precedence over the values loaded from the yaml file.
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          batch_size=batch_size,
                          epochs=epochs,
                          show_step=10)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 20, 'head_dim': 20, 'filter_num': 200, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 10, 'batch_size': 32, 'show_step': 10, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'model_type': 'nrms', 'loss': 'cross_entropy_loss', 'wordEmb_file': '/content/drive/MyDrive/GH x RippleAI/Dataset/movielens/MINDLike/utils/embedding.npy', 'wordDict_file': '/content/drive/MyDrive/GH x RippleAI/Dataset/movielens/MINDLike/utils/word_dict.pkl', 'userDict_file': '/content/drive/MyDrive/GH x RippleAI/Dataset/movielens/MINDLike/utils/uid2index.pkl'}


## Model definition

### Base Model

In [None]:
# BaseModel below creates a tf.compat.v1.Session, so eager execution is
# switched off before any model is built.
tf.compat.v1.disable_eager_execution()
# NOTE(review): the reason for enabling output_all_intermediates is not
# evident from this notebook - confirm it is still required.
tf.compat.v1.experimental.output_all_intermediates(True)

class BaseModel:
    """Basic class of models

    Subclasses implement ``_build_graph`` and ``_get_input_label_from_iter``.
    The fast evaluation path additionally relies on ``userencoder``,
    ``newsencoder``, ``_get_user_feature_from_iter`` and
    ``_get_news_feature_from_iter``, none of which is defined in this class.

    Attributes:
        hparams (HParams): A HParams object, holds the entire set of hyperparameters.
        train_iterator (object): An iterator to load the data in training steps.
        test_iterator (object): An iterator to load the data in testing steps.
        seed (int): Random seed.
        support_quick_scoring (bool): Copied from ``hparams``; selects the fast
            or the slow evaluation path in ``run_eval``.
        model (object): Training model returned by ``_build_graph``.
        scorer (object): Scoring model returned by ``_build_graph``.
        loss (str): Loss name handed to ``model.compile``.
        train_optimizer (object): Optimizer handed to ``model.compile``.
    """

    def __init__(
        self,
        hparams,
        iterator_creator,
        seed=None,
    ):
        """Initializing the model. Create common logics which are needed by all deeprec models, such as loss function,
        parameter set.
        Args:
            hparams (HParams): A HParams object, holds the entire set of hyperparameters.
            iterator_creator (object): Callable that builds a data iterator; it is called
                once with ``hparams.npratio`` (training) and once without (testing).
            seed (int): Random seed.
        """
        self.seed = seed
        tf.compat.v1.set_random_seed(seed)
        np.random.seed(seed)

        self.train_iterator = iterator_creator(
            hparams,
            hparams.npratio,
            col_spliter="\t",
        )
        self.test_iterator = iterator_creator(
            hparams,
            col_spliter="\t",
        )

        self.hparams = hparams
        self.support_quick_scoring = hparams.support_quick_scoring

        # set GPU use with on demand growth
        gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
        sess = tf.compat.v1.Session(
            config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)
        )

        # set this TensorFlow session as the default session for Keras
        tf.compat.v1.keras.backend.set_session(sess)

        # IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras!
        # Otherwise, their weights will be unavailable in the threads after the session there has been set
        self.model, self.scorer = self._build_graph()

        self.loss = self._get_loss()
        self.train_optimizer = self._get_opt()

        self.model.compile(loss=self.loss, optimizer=self.train_optimizer)

    def _init_embedding(self, file_path):
        """Load pre-trained embeddings from a ``.npy`` file.
        Args:
            file_path (str): the pre-trained glove embeddings file path.
        Returns:
            numpy.ndarray: The array stored in the file.
        """

        return np.load(file_path)

    # The class does not use abc.ABCMeta, so the two decorators below only
    # document intent; they do not prevent instantiation.
    @abc.abstractmethod
    def _build_graph(self):
        """Subclass will implement this. Must return ``(model, scorer)``."""
        pass

    @abc.abstractmethod
    def _get_input_label_from_iter(self, batch_data):
        """Subclass will implement this. Must return ``(input, label)``."""
        pass

    def _get_loss(self):
        """Translate ``hparams.loss`` into the Keras loss name.
        Returns:
            str: Keras loss function name.
        Raises:
            ValueError: If ``hparams.loss`` is not a supported loss.
        """
        if self.hparams.loss == "cross_entropy_loss":
            data_loss = "categorical_crossentropy"
        elif self.hparams.loss == "log_loss":
            data_loss = "binary_crossentropy"
        else:
            raise ValueError("this loss not defined {0}".format(self.hparams.loss))
        return data_loss

    def _get_opt(self):
        """Get the optimizer according to configuration. Usually we will use Adam.
        Returns:
            object: An optimizer.
        Raises:
            ValueError: If ``hparams.optimizer`` is not a supported optimizer.
        """
        lr = self.hparams.learning_rate
        optimizer = self.hparams.optimizer

        if optimizer == "adam":
            # `learning_rate` is the documented keyword; `lr` is a deprecated alias.
            return keras.optimizers.Adam(learning_rate=lr)

        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError("this optimizer not defined {0}".format(optimizer))

    def _get_pred(self, logit, task):
        """Make final output as prediction score, according to different tasks.
        Args:
            logit (object): Base prediction value.
            task (str): A task (values: regression/classification)
        Returns:
            object: Transformed score
        Raises:
            ValueError: If ``task`` is neither regression nor classification.
        """
        if task == "regression":
            pred = tf.identity(logit)
        elif task == "classification":
            pred = tf.sigmoid(logit)
        else:
            raise ValueError(
                "method must be regression or classification, but now is {0}".format(
                    task
                )
            )
        return pred

    def train(self, train_batch_data):
        """Go through the optimization step once with one batch of training data.
        Args:
            train_batch_data (object): One batch as produced by the training iterator.
        Returns:
            object: The value returned by ``self.model.train_on_batch``.
        """
        train_input, train_label = self._get_input_label_from_iter(train_batch_data)
        rslt = self.model.train_on_batch(train_input, train_label)
        return rslt

    def eval(self, eval_batch_data):
        """Score one batch of evaluation data with the current scorer.
        Args:
            eval_batch_data (dict): One batch as produced by the test iterator; must
                contain the key ``impression_index_batch``.
        Returns:
            tuple: Predicted scores, ground-truth labels and impression indexes.
        """
        eval_input, eval_label = self._get_input_label_from_iter(eval_batch_data)
        imp_index = eval_batch_data["impression_index_batch"]

        pred_rslt = self.scorer.predict_on_batch(eval_input)

        return pred_rslt, eval_label, imp_index

    def fit(
        self,
        train_news_file,
        train_behaviors_file,
        valid_news_file,
        valid_behaviors_file,
        test_news_file=None,
        test_behaviors_file=None,
    ):
        """Fit the model with the training files. Evaluate the model on the validation files
        per epoch to observe the training status. If test_news_file is not None, evaluate it too.
        Args:
            train_news_file (str): training news file.
            train_behaviors_file (str): training behaviors file.
            valid_news_file (str): validation news file.
            valid_behaviors_file (str): validation behaviors file.
            test_news_file (str): optional test news file.
            test_behaviors_file (str): optional test behaviors file.
        Returns:
            object: An instance of self.
        """

        for epoch in range(1, self.hparams.epochs + 1):
            step = 0
            self.hparams.current_epoch = epoch
            epoch_loss = 0
            train_start = time.time()

            tqdm_util = tqdm(
                self.train_iterator.load_data_from_file(
                    train_news_file, train_behaviors_file
                )
            )

            for batch_data_input in tqdm_util:

                step_data_loss = self.train(batch_data_input)

                epoch_loss += step_data_loss
                step += 1
                if step % self.hparams.show_step == 0:
                    tqdm_util.set_description(
                        "step {0:d} , total_loss: {1:.4f}, data_loss: {2:.4f}".format(
                            step, epoch_loss / step, step_data_loss
                        )
                    )

            train_end = time.time()
            train_time = train_end - train_start

            eval_start = time.time()

            train_info = "logloss loss" + ":" + str(epoch_loss / step)

            eval_res = self.run_eval(valid_news_file, valid_behaviors_file)
            eval_info = ", ".join(
                [
                    str(item[0]) + ":" + str(item[1])
                    for item in sorted(eval_res.items(), key=lambda x: x[0])
                ]
            )
            report = (
                "at epoch {0:d}".format(epoch)
                + "\ntrain info: "
                + train_info
                + "\neval info: "
                + eval_info
            )
            if test_news_file is not None:
                test_res = self.run_eval(test_news_file, test_behaviors_file)
                test_info = ", ".join(
                    [
                        str(item[0]) + ":" + str(item[1])
                        for item in sorted(test_res.items(), key=lambda x: x[0])
                    ]
                )
                report += "\ntest info: " + test_info
            eval_end = time.time()
            eval_time = eval_end - eval_start

            print(report)
            print(
                "at epoch {0:d} , train time: {1:.1f} eval time: {2:.1f}".format(
                    epoch, train_time, eval_time
                )
            )

        return self

    def group_labels(self, labels, preds, group_keys):
        """Divide labels and preds into several group according to values in group keys.
        Args:
            labels (list): ground truth label list.
            preds (list): prediction score list.
            group_keys (list): group key list.
        Returns:
            list, list, list:
            - Keys after group (sorted, unique).
            - Labels after group.
            - Preds after group.
        """

        all_keys = list(set(group_keys))
        all_keys.sort()
        group_labels = {k: [] for k in all_keys}
        group_preds = {k: [] for k in all_keys}

        for label, p, k in zip(labels, preds, group_keys):
            group_labels[k].append(label)
            group_preds[k].append(p)

        all_labels = []
        all_preds = []
        for k in all_keys:
            all_labels.append(group_labels[k])
            all_preds.append(group_preds[k])

        return all_keys, all_labels, all_preds

    def run_eval(self, news_filename, behaviors_file):
        """Evaluate the given files and return the metrics listed in ``hparams.metrics``.
        Args:
            news_filename (str): News file to evaluate.
            behaviors_file (str): Behaviors file to evaluate.
        Returns:
            dict: A dictionary that contains evaluation metrics.
        """

        if self.support_quick_scoring:
            _, group_labels, group_preds = self.run_fast_eval(
                news_filename, behaviors_file
            )
        else:
            _, group_labels, group_preds = self.run_slow_eval(
                news_filename, behaviors_file
            )
        res = cal_metric(group_labels, group_preds, self.hparams.metrics)
        return res

    def user(self, batch_user_input):
        """Encode one batch of users.
        Args:
            batch_user_input (dict): Batch containing the key ``impr_index_batch``.
        Returns:
            tuple: Impression indexes of the batch and the user encoder output.
        """
        user_input = self._get_user_feature_from_iter(batch_user_input)
        user_vec = self.userencoder.predict_on_batch(user_input)
        user_index = batch_user_input["impr_index_batch"]

        return user_index, user_vec

    def news(self, batch_news_input):
        """Encode one batch of news.
        Args:
            batch_news_input (dict): Batch containing the key ``news_index_batch``.
        Returns:
            tuple: News indexes of the batch and the news encoder output.
        """
        news_input = self._get_news_feature_from_iter(batch_news_input)
        news_vec = self.newsencoder.predict_on_batch(news_input)
        news_index = batch_news_input["news_index_batch"]

        return news_index, news_vec

    def run_user(self, news_filename, behaviors_file):
        """Encode every user batch yielded by the test iterator.
        Returns:
            dict: Mapping from impression index to user vector.
        Raises:
            ValueError: If the model has no ``userencoder`` attribute.
        """
        if not hasattr(self, "userencoder"):
            raise ValueError("model must have attribute userencoder")

        user_indexes = []
        user_vecs = []
        for batch_data_input in tqdm(
            self.test_iterator.load_user_from_file(news_filename, behaviors_file)
        ):
            user_index, user_vec = self.user(batch_data_input)
            user_indexes.extend(np.reshape(user_index, -1))
            user_vecs.extend(user_vec)

        return dict(zip(user_indexes, user_vecs))

    def run_news(self, news_filename):
        """Encode every news batch yielded by the test iterator.
        Returns:
            dict: Mapping from news index to news vector.
        Raises:
            ValueError: If the model has no ``newsencoder`` attribute.
        """
        if not hasattr(self, "newsencoder"):
            raise ValueError("model must have attribute newsencoder")

        news_indexes = []
        news_vecs = []
        for batch_data_input in tqdm(
            self.test_iterator.load_news_from_file(news_filename)
        ):
            news_index, news_vec = self.news(batch_data_input)
            news_indexes.extend(np.reshape(news_index, -1))
            news_vecs.extend(news_vec)

        return dict(zip(news_indexes, news_vecs))

    def run_slow_eval(self, news_filename, behaviors_file):
        """Score every batch with the scorer model and group the results by impression.
        Returns:
            tuple: Impression indexes, grouped labels and grouped predictions.
        """
        preds = []
        labels = []
        imp_indexes = []

        for batch_data_input in tqdm(
            self.test_iterator.load_data_from_file(news_filename, behaviors_file)
        ):
            step_pred, step_labels, step_imp_index = self.eval(batch_data_input)
            preds.extend(np.reshape(step_pred, -1))
            labels.extend(np.reshape(step_labels, -1))
            imp_indexes.extend(np.reshape(step_imp_index, -1))

        group_impr_indexes, group_labels, group_preds = self.group_labels(
            labels, preds, imp_indexes
        )
        return group_impr_indexes, group_labels, group_preds

    def run_fast_eval(self, news_filename, behaviors_file):
        """Score impressions from pre-computed news and user vectors.
        All news and user vectors are encoded once (and kept on ``self.news_vecs``
        / ``self.user_vecs``); each impression is then scored as the dot product
        of its candidate news vectors with the user vector of that impression.
        Returns:
            tuple: Impression indexes, labels and predictions, one entry per impression.
        """
        news_vecs = self.run_news(news_filename)
        user_vecs = self.run_user(news_filename, behaviors_file)

        self.news_vecs = news_vecs
        self.user_vecs = user_vecs

        group_impr_indexes = []
        group_labels = []
        group_preds = []

        for (
            impr_index,
            news_index,
            user_index,
            label,
        ) in tqdm(self.test_iterator.load_impression_from_file(behaviors_file)):
            pred = np.dot(
                np.stack([news_vecs[i] for i in news_index], axis=0),
                user_vecs[impr_index],
            )
            group_impr_indexes.append(impr_index)
            group_labels.append(label)
            group_preds.append(pred)

        return group_impr_indexes, group_labels, group_preds

### Layers

In [None]:
class AttLayer2(layers.Layer):
    """Soft-alignment (additive) attention pooling layer.

    Reduces a rank-3 input to rank 2 by taking an attention-weighted sum over
    axis 1.

    Attributes:
        dim (int): attention hidden dim.
        seed (int): seed passed to the glorot-uniform initializers of W and q.
    """

    def __init__(self, dim=200, seed=0, **kwargs):
        """Initialization steps for AttLayer2.
        Args:
            dim (int): attention hidden dim.
            seed (int): seed for the weight initializers.
            **kwargs: forwarded to ``layers.Layer.__init__``.
        """

        self.dim = dim
        self.seed = seed
        super(AttLayer2, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the variables of AttLayer2.
        There are three variables in AttLayer2, i.e. W, b and q.
        Args:
            input_shape (object): shape of input tensor; must have rank 3.
        """

        assert len(input_shape) == 3
        dim = self.dim
        # W projects the last input axis onto the attention hidden dim.
        self.W = self.add_weight(
            name="W",
            shape=(int(input_shape[-1]), dim),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        self.b = self.add_weight(
            name="b",
            shape=(dim,),
            initializer=keras.initializers.Zeros(),
            trainable=True,
        )
        # q maps each hidden vector to a single attention score.
        self.q = self.add_weight(
            name="q",
            shape=(dim, 1),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        super(AttLayer2, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, inputs, mask=None, **kwargs):
        """Core implementation of soft attention.
        Args:
            inputs (object): input tensor (rank 3, as asserted in ``build``).
            mask (object): optional mask; when given, it is cast to float32 and
                multiplied into the un-normalised attention scores.
        Returns:
            object: weighted sum of the input over axis 1.
        """

        attention = K.tanh(K.dot(inputs, self.W) + self.b)
        attention = K.dot(attention, self.q)

        # Drop the trailing singleton axis produced by the (dim, 1) projection.
        attention = K.squeeze(attention, axis=2)

        if mask is None:
            attention = K.exp(attention)
        else:
            attention = K.exp(attention) * K.cast(mask, dtype="float32")

        # Normalise over the last axis; epsilon guards against a zero denominator.
        attention_weight = attention / (
            K.sum(attention, axis=-1, keepdims=True) + K.epsilon()
        )

        attention_weight = K.expand_dims(attention_weight)
        weighted_input = inputs * attention_weight
        return K.sum(weighted_input, axis=1)

    def compute_mask(self, input, input_mask=None):
        """Compute output mask value.
        Args:
            input (object): input tensor.
            input_mask: input mask
        Returns:
            object: always None, i.e. no mask is propagated past this layer.
        """
        return None

    def compute_output_shape(self, input_shape):
        """Compute shape of output tensor.
        Args:
            input_shape (tuple): shape of input tensor.
        Returns:
            tuple: shape of output tensor (first and last input dimensions).
        """
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Add dim and seed to the layer config so the layer can be serialized.
        Returns:
            dict: config of AttLayer2 layer.
        """
        config = super(AttLayer2, self).get_config()
        config.update(
            {
                "dim": self.dim,
                "seed": self.seed,
            }
        )
        return config

In [None]:
class SelfAttention(layers.Layer):
    """Multi-head self attention layer.

    The layer is called on a list of either three tensors ``[Q, K, V]`` or five
    tensors ``[Q, K, V, Q_len, V_len]``.

    Args:
        multiheads (int): The number of heads.
        head_dim (int): Dimension of each head.
        seed (int): Seed for the glorot-uniform initializers of WQ, WK and WV.
        mask_right (boolean): whether to mask right words.
    Returns:
        object: Weighted sum after attention.
    """

    def __init__(self, multiheads, head_dim, seed=0, mask_right=False, **kwargs):
        """Initialization steps for SelfAttention.
        Args:
            multiheads (int): The number of heads.
            head_dim (int): Dimension of each head.
            seed (int): Seed for the weight initializers.
            mask_right (boolean): Whether to mask right words.
            **kwargs: forwarded to ``layers.Layer.__init__``.
        """

        self.multiheads = multiheads
        self.head_dim = head_dim
        self.output_dim = multiheads * head_dim
        self.mask_right = mask_right
        self.seed = seed
        super(SelfAttention, self).__init__(**kwargs)

    def compute_output_shape(self, input_shape):
        """Compute shape of output tensor.
        Args:
            input_shape (list): shapes of the input tensors; only the query
                shape (first entry) is used.
        Returns:
            tuple: output shape tuple.
        """

        return (input_shape[0][0], input_shape[0][1], self.output_dim)

    def build(self, input_shape):
        """Initialization for variables in SelfAttention.
        There are three variables in SelfAttention, i.e. WQ, WK and WV.
        WQ is used for linear transformation of query.
        WK is used for linear transformation of key.
        WV is used for linear transformation of value.
        Args:
            input_shape (object): shapes of the input tensors (query, key, value, ...).
        """

        self.WQ = self.add_weight(
            name="WQ",
            shape=(int(input_shape[0][-1]), self.output_dim),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        self.WK = self.add_weight(
            name="WK",
            shape=(int(input_shape[1][-1]), self.output_dim),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        self.WV = self.add_weight(
            name="WV",
            shape=(int(input_shape[2][-1]), self.output_dim),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        super(SelfAttention, self).build(input_shape)

    def Mask(self, inputs, seq_len, mode="add"):
        """Mask operation used in multi-head self attention.
        Args:
            inputs (object): tensor to mask along axis 1.
            seq_len (object): sequence length of inputs; None disables masking.
            mode (str): "mul" zeroes masked positions, "add" subtracts a large
                constant from them (for use before a softmax).
        Returns:
            object: tensors after masking.
        Raises:
            ValueError: if ``mode`` is neither "mul" nor "add".
        """

        if seq_len is None:
            return inputs

        if mode not in ("mul", "add"):
            raise ValueError(
                "mode must be 'mul' or 'add', got {!r}".format(mode)
            )

        # one_hot marks position seq_len; 1 - cumsum gives 1 before it and 0 from
        # it onwards.
        mask = K.one_hot(indices=seq_len[:, 0], num_classes=K.shape(inputs)[1])
        mask = 1 - K.cumsum(mask, axis=1)

        # Add trailing axes so the mask broadcasts against `inputs`.
        for _ in range(len(inputs.shape) - 2):
            mask = K.expand_dims(mask, 2)

        if mode == "mul":
            return inputs * mask
        return inputs - (1 - mask) * 1e12

    def call(self, QKVs):
        """Core logic of multi-head self attention.
        Args:
            QKVs (list): inputs of multi-head self attention, i.e. query, key and
                value, optionally followed by query length and value length.
        Returns:
            object: output tensors.
        Raises:
            ValueError: if ``QKVs`` does not contain exactly 3 or 5 elements.
        """
        if len(QKVs) == 3:
            Q_seq, K_seq, V_seq = QKVs
            Q_len, V_len = None, None
        elif len(QKVs) == 5:
            Q_seq, K_seq, V_seq, Q_len, V_len = QKVs
        else:
            raise ValueError(
                "SelfAttention expects 3 or 5 inputs, got {}".format(len(QKVs))
            )

        # Project and split each of Q/K/V into heads: [batch, heads, seq, head_dim].
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(
            Q_seq, shape=(-1, K.shape(Q_seq)[1], self.multiheads, self.head_dim)
        )
        Q_seq = K.permute_dimensions(Q_seq, pattern=(0, 2, 1, 3))

        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(
            K_seq, shape=(-1, K.shape(K_seq)[1], self.multiheads, self.head_dim)
        )
        K_seq = K.permute_dimensions(K_seq, pattern=(0, 2, 1, 3))

        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(
            V_seq, shape=(-1, K.shape(V_seq)[1], self.multiheads, self.head_dim)
        )
        V_seq = K.permute_dimensions(V_seq, pattern=(0, 2, 1, 3))

        # Scaled dot-product scores.
        A = einsum("abij, abkj -> abik", Q_seq, K_seq) / K.sqrt(
            K.cast(self.head_dim, dtype="float32")
        )
        A = K.permute_dimensions(
            A, pattern=(0, 3, 2, 1)
        )  # A.shape=[batch_size,K_sequence_length,Q_sequence_length,self.multiheads]

        A = self.Mask(A, V_len, "add")
        A = K.permute_dimensions(A, pattern=(0, 3, 2, 1))

        if self.mask_right:
            ones = K.ones_like(A[:1, :1])
            # tf.linalg.band_part is the TF 2.x name of matrix_band_part; the
            # keras backend module does not expose a `tf` attribute.
            lower_triangular = tf.linalg.band_part(ones, num_lower=-1, num_upper=0)
            mask = (ones - lower_triangular) * 1e12
            A = A - mask
        A = K.softmax(A)

        O_seq = einsum("abij, abjk -> abik", A, V_seq)
        O_seq = K.permute_dimensions(O_seq, pattern=(0, 2, 1, 3))

        # Merge the heads back into a single feature axis.
        O_seq = K.reshape(O_seq, shape=(-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, "mul")
        return O_seq

    def get_config(self):
        """Add multiheads, head_dim, seed and mask_right into layer config.
        Returns:
            dict: config of SelfAttention layer.
        """
        config = super(SelfAttention, self).get_config()
        config.update(
            {
                "multiheads": self.multiheads,
                "head_dim": self.head_dim,
                "seed": self.seed,
                "mask_right": self.mask_right,
            }
        )
        return config

### NRMS

In [None]:
__all__ = ["NRMSModel"]


class NRMSModel(BaseModel):
    """NRMS: Neural News Recommendation with Multi-Head Self-Attention.

    Reference: Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang and
    Xing Xie, "Neural News Recommendation with Multi-Head Self-Attention",
    EMNLP-IJCNLP 2019.

    Attributes:
        word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix.
        hparam (object): Global hyper-parameters.
    """

    def __init__(self, hparams, iterator_creator, seed=None):
        """Load the word embedding matrix, then run BaseModel's initialization.

        Args:
            hparams (object): Global hyper-parameters; ``wordEmb_file`` is read
                here, ``head_num``/``head_dim`` etc. when the graph is built.
            iterator_creator (object): data loader class handed to BaseModel.
            seed (int): forwarded to BaseModel.
        """
        # The embedding is loaded first, before BaseModel.__init__ is invoked.
        self.word2vec_embedding = self._init_embedding(hparams.wordEmb_file)
        super().__init__(hparams, iterator_creator, seed=seed)

    def _get_input_label_from_iter(self, batch_data):
        """Split a training batch into model inputs and labels.

        Args:
            batch_data (dict): batch produced by the iterator.
        Returns:
            list: model inputs (clicked titles, then candidate titles).
            numpy.ndarray: labels.
        """
        features = [
            batch_data[key]
            for key in ("clicked_title_batch", "candidate_title_batch")
        ]
        return features, batch_data["labels"]

    def _get_user_feature_from_iter(self, batch_data):
        """Pick the user-encoder input out of a user batch.

        Args:
            batch_data (dict): batch produced by the user iterator.
        Returns:
            numpy.ndarray: clicked title batch.
        """
        return batch_data["clicked_title_batch"]

    def _get_news_feature_from_iter(self, batch_data):
        """Pick the news-encoder input out of a news batch.

        Args:
            batch_data (dict): batch produced by the news iterator.
        Returns:
            numpy.ndarray: candidate title batch.
        """
        return batch_data["candidate_title_batch"]

    def _build_graph(self):
        """Build the NRMS training model and its scorer.

        Returns:
            object: a model used to train.
            object: a model used to evaluate and inference.
        """
        return self._build_nrms()

    def _build_userencoder(self, titleencoder):
        """Create the NRMS user encoder.

        The click history is encoded title-by-title with ``titleencoder``, passed
        through multi-head self attention and pooled with additive attention.

        Args:
            titleencoder (object): the news encoder of NRMS.
        Return:
            object: the user encoder of NRMS.
        """
        hp = self.hparams
        history_input = keras.Input(
            shape=(hp.his_size, hp.title_size), dtype="int32"
        )

        encoded_history = layers.TimeDistributed(titleencoder)(history_input)
        self_attention = SelfAttention(hp.head_num, hp.head_dim, seed=self.seed)
        attended = self_attention(
            [encoded_history, encoded_history, encoded_history]
        )
        pooling = AttLayer2(hp.attention_hidden_dim, seed=self.seed)
        user_vector = pooling(attended)

        return keras.Model(history_input, user_vector, name="user_encoder")

    def _build_newsencoder(self, embedding_layer):
        """Create the NRMS news encoder.

        Pipeline: embedding -> dropout -> multi-head self attention -> dropout
        -> additive attention pooling.

        Args:
            embedding_layer (object): a word embedding layer.
        Return:
            object: the news encoder of NRMS.
        """
        hp = self.hparams
        title_input = keras.Input(shape=(hp.title_size,), dtype="int32")

        embedded_title = embedding_layer(title_input)

        hidden = layers.Dropout(hp.dropout)(embedded_title)
        self_attention = SelfAttention(hp.head_num, hp.head_dim, seed=self.seed)
        hidden = self_attention([hidden, hidden, hidden])
        hidden = layers.Dropout(hp.dropout)(hidden)
        pooling = AttLayer2(hp.attention_hidden_dim, seed=self.seed)
        title_vector = pooling(hidden)

        return keras.Model(title_input, title_vector, name="news_encoder")

    def _build_nrms(self):
        """Assemble the NRMS training model and the single-candidate scorer.

        Both models share one user encoder and one news encoder, which are also
        stored as ``self.userencoder`` and ``self.newsencoder``.

        Returns:
            object: a model used to train.
            object: a model used to evaluate and inference.
        """
        hp = self.hparams

        # Inputs: click history, (npratio + 1) training candidates, one candidate
        # for scoring.
        history_input = keras.Input(
            shape=(hp.his_size, hp.title_size), dtype="int32"
        )
        candidates_input = keras.Input(
            shape=(hp.npratio + 1, hp.title_size), dtype="int32"
        )
        single_candidate_input = keras.Input(
            shape=(1, hp.title_size), dtype="int32"
        )
        single_candidate_flat = layers.Reshape((hp.title_size,))(
            single_candidate_input
        )

        # Trainable embedding initialised from the pretrained matrix.
        word_embedding = layers.Embedding(
            self.word2vec_embedding.shape[0],
            hp.word_emb_dim,
            weights=[self.word2vec_embedding],
            trainable=True,
        )

        news_encoder = self._build_newsencoder(word_embedding)
        self.userencoder = self._build_userencoder(news_encoder)
        self.newsencoder = news_encoder

        user_vector = self.userencoder(history_input)
        candidate_vectors = layers.TimeDistributed(self.newsencoder)(candidates_input)
        single_candidate_vector = self.newsencoder(single_candidate_flat)

        # Training head: softmax over the candidate scores.
        train_scores = layers.Dot(axes=-1)([candidate_vectors, user_vector])
        train_scores = layers.Activation(activation="softmax")(train_scores)

        # Scoring head: sigmoid of the single candidate's score.
        single_score = layers.Dot(axes=-1)([single_candidate_vector, user_vector])
        single_score = layers.Activation(activation="sigmoid")(single_score)

        model = keras.Model([history_input, candidates_input], train_scores)
        scorer = keras.Model([history_input, single_candidate_input], single_score)

        return model, scorer

### Iterator

In [None]:
class BaseIterator:
    """Interface that data iterators are expected to follow.

    The methods carry ``abc.abstractmethod`` markers, but the class does not
    derive from ``abc.ABC``, so the markers are informational only: the class
    and its subclasses can be instantiated without overriding every method.
    """

    @abc.abstractmethod
    def parser_one_line(self, line):
        """Parse one string line into feature values.

        Args:
            line (str): A string indicating one instance.
        """

    @abc.abstractmethod
    def load_data_from_file(self, infile):
        """Read and parse data from a file.

        Args:
            infile (str): Text input file. Each line in this file is an instance.
        """

    @abc.abstractmethod
    def _convert_data(self, labels, features):
        """Placeholder; the base implementation does nothing and returns None."""

    @abc.abstractmethod
    def gen_feed_dict(self, data_dict):
        """Construct a dictionary that maps graph elements to values.

        Args:
            data_dict (dict): A dictionary that maps string name to numpy arrays.
        """

In [None]:
class MINDIterator(BaseIterator):
    """Batch loader for MIND-style news and behaviors files.

    Each instance contains a label, an impression index, a user index, the
    candidate news titles and the titles of the user's clicked news. News are
    represented by their title word indices only; the two category columns of
    the news file are parsed but not used.

    The news and behaviors files are parsed completely into memory the first
    time they are needed (``init_news`` / ``init_behaviors``) and are not
    re-read afterwards; the ``load_*`` generators then assemble mini-batches
    lazily.

    Attributes:
        col_spliter (str): column spliter in one line.
        ID_spliter (str): ID spliter in one line (stored, not used by this class).
        batch_size (int): the samples num in one batch.
        title_size (int): max word num in news title.
        his_size (int): max clicked news num in user click history.
        npratio (int): negative and positive ratio used in negative sampling.
            Values <= 0 (default -1) disable negative sampling.
        word_dict (object): word -> index mapping loaded from ``hparams.wordDict_file``.
        uid2index (object): user id -> index mapping loaded from ``hparams.userDict_file``.
    """

    def __init__(
        self,
        hparams,
        npratio=-1,
        col_spliter="\t",
        ID_spliter="%",
    ):
        """Store the batching settings and load the word and user dictionaries.
        Args:
            hparams (object): Global hyper-parameters; ``batch_size``, ``title_size``,
                ``his_size``, ``wordDict_file`` and ``userDict_file`` are read here.
            npratio (int): negative and positive ratio used in negative sampling.
                Values <= 0 (default -1) disable negative sampling.
            col_spliter (str): column spliter in one line.
            ID_spliter (str): ID spliter in one line.
        """
        self.col_spliter = col_spliter
        self.ID_spliter = ID_spliter
        self.batch_size = hparams.batch_size
        self.title_size = hparams.title_size
        self.his_size = hparams.his_size
        self.npratio = npratio

        self.word_dict = self.load_dict(hparams.wordDict_file)
        self.uid2index = self.load_dict(hparams.userDict_file)

    def load_dict(self, file_path):
        """Load a pickle file.
        Args:
            file_path (str): file path
        Returns:
            object: pickle loaded object
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(file_path, "rb") as f:
            return pkl.load(f)

    def init_news(self, news_file):
        """Parse the news file and build ``nid2index`` and ``news_title_index``.

        Each line must hold exactly four columns (news id, two category columns,
        title). News ids are numbered from 1 in order of first appearance; row 0
        of ``news_title_index`` stays all-zero and serves as the padding entry.

        Args:
            news_file: path of news file
        """

        self.nid2index = {}
        # Slot 0 is an empty title so that index 0 can be used as padding.
        news_title = [""]

        with tf.io.gfile.GFile(news_file, "r") as rd:
            for line in rd:
                nid, vert, subvert, title = line.strip("\n").split(
                    self.col_spliter
                )

                # Keep only the first occurrence of each news id.
                if nid in self.nid2index:
                    continue

                self.nid2index[nid] = len(self.nid2index) + 1
                title = word_tokenize(title)
                news_title.append(title)

        self.news_title_index = np.zeros(
            (len(news_title), self.title_size), dtype="int32"
        )

        for news_index, title in enumerate(news_title):
            # Titles longer than title_size are truncated; shorter ones stay
            # zero-padded on the right.
            for word_index in range(min(self.title_size, len(title))):
                # Lower-case once so that the membership test and the lookup use
                # the same key (the original tested the raw token but indexed
                # with the lower-cased one).
                word = title[word_index].lower()
                if word in self.word_dict:
                    self.news_title_index[news_index, word_index] = self.word_dict[
                        word
                    ]

    def init_behaviors(self, behaviors_file):
        """Parse the behaviors file into per-impression lists.

        Requires ``self.nid2index`` (built by ``init_news``). Populates
        ``histories``, ``imprs``, ``labels``, ``impr_indexes`` and ``uindexes``;
        impression indexes are assigned sequentially from 0 in file order.

        Args:
            behaviors_file: path of behaviors file
        """
        self.histories = []
        self.imprs = []
        self.labels = []
        self.impr_indexes = []
        self.uindexes = []

        with tf.io.gfile.GFile(behaviors_file, "r") as rd:
            impr_index = 0
            for line in rd:
                # Only the last four columns are used: user id, time, click
                # history and impressions. The time column is not used.
                uid, _time, history, impr = line.strip("\n").split(
                    self.col_spliter
                )[-4:]

                history = [self.nid2index[i] for i in history.split()]
                # Left-pad with 0 up to his_size; longer histories keep their
                # first his_size entries.
                history = [0] * (self.his_size - len(history)) + history[
                    : self.his_size
                ]

                # Each impression token has the form "<news id>-<label>".
                impr_news = [self.nid2index[i.split("-")[0]] for i in impr.split()]
                label = [int(i.split("-")[1]) for i in impr.split()]
                # Users missing from the dictionary map to index 0.
                uindex = self.uid2index[uid] if uid in self.uid2index else 0

                self.histories.append(history)
                self.imprs.append(impr_news)
                self.labels.append(label)
                self.impr_indexes.append(impr_index)
                self.uindexes.append(uindex)
                impr_index += 1

    def parser_one_line(self, line):
        """Parse one behavior sample into feature values.

        If npratio is larger than 0, one sample is yielded per clicked news,
        combined with ``npratio`` negatives drawn by ``newsample``. Otherwise one
        sample is yielded per candidate news of the impression.

        Args:
            line (int): sample index.
        Yields:
            tuple: (label, impression index, user index,
            candidate_title_index, clicked_title_index).
        """
        if self.npratio > 0:
            impr_label = self.labels[line]
            impr = self.imprs[line]

            poss = []
            negs = []

            for news, click in zip(impr, impr_label):
                if click == 1:
                    poss.append(news)
                else:
                    negs.append(news)

            for p in poss:
                impr_index = []
                user_index = []
                # The positive is always placed first, followed by the negatives.
                label = [1] + [0] * self.npratio

                n = newsample(negs, self.npratio)
                candidate_title_index = self.news_title_index[[p] + n]
                click_title_index = self.news_title_index[self.histories[line]]
                impr_index.append(self.impr_indexes[line])
                user_index.append(self.uindexes[line])

                yield (
                    label,
                    impr_index,
                    user_index,
                    candidate_title_index,
                    click_title_index,
                )

        else:
            impr_label = self.labels[line]
            impr = self.imprs[line]

            for news, label in zip(impr, impr_label):
                candidate_title_index = []
                impr_index = []
                user_index = []
                label = [label]

                candidate_title_index.append(self.news_title_index[news])
                click_title_index = self.news_title_index[self.histories[line]]
                impr_index.append(self.impr_indexes[line])
                user_index.append(self.uindexes[line])

                yield (
                    label,
                    impr_index,
                    user_index,
                    candidate_title_index,
                    click_title_index,
                )

    def load_data_from_file(self, news_file, behavior_file):
        """Read and parse data from news file and behavior file.

        The files are parsed only if this iterator has not parsed news /
        behaviors before. Impressions are shuffled when npratio > 0.

        Args:
            news_file (str): A file contains several informations of news.
            behavior_file (str): A file contains information of user impressions.
        Yields:
            dict: one batch as produced by ``_convert_data``; the last batch may
            hold fewer than ``batch_size`` samples.
        """

        if not hasattr(self, "news_title_index"):
            self.init_news(news_file)

        if not hasattr(self, "impr_indexes"):
            self.init_behaviors(behavior_file)

        label_list = []
        imp_indexes = []
        user_indexes = []
        candidate_title_indexes = []
        click_title_indexes = []
        cnt = 0

        indexes = np.arange(len(self.labels))

        if self.npratio > 0:
            np.random.shuffle(indexes)

        for index in indexes:
            for (
                label,
                imp_index,
                user_index,
                candidate_title_index,
                click_title_index,
            ) in self.parser_one_line(index):
                candidate_title_indexes.append(candidate_title_index)
                click_title_indexes.append(click_title_index)
                imp_indexes.append(imp_index)
                user_indexes.append(user_index)
                label_list.append(label)

                cnt += 1
                if cnt >= self.batch_size:
                    yield self._convert_data(
                        label_list,
                        imp_indexes,
                        user_indexes,
                        candidate_title_indexes,
                        click_title_indexes,
                    )
                    label_list = []
                    imp_indexes = []
                    user_indexes = []
                    candidate_title_indexes = []
                    click_title_indexes = []
                    cnt = 0

        # Flush the final, partially filled batch.
        if cnt > 0:
            yield self._convert_data(
                label_list,
                imp_indexes,
                user_indexes,
                candidate_title_indexes,
                click_title_indexes,
            )

    def _convert_data(
        self,
        label_list,
        imp_indexes,
        user_indexes,
        candidate_title_indexes,
        click_title_indexes,
    ):
        """Convert data into numpy arrays that are good for further model operation.
        Args:
            label_list (list): a list of ground-truth labels.
            imp_indexes (list): a list of impression indexes.
            user_indexes (list): a list of user indexes.
            candidate_title_indexes (list): the candidate news titles' words indices.
            click_title_indexes (list): words indices for user's clicked news titles.
        Returns:
            dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
        """

        labels = np.asarray(label_list, dtype=np.float32)
        imp_indexes = np.asarray(imp_indexes, dtype=np.int32)
        user_indexes = np.asarray(user_indexes, dtype=np.int32)
        candidate_title_index_batch = np.asarray(
            candidate_title_indexes, dtype=np.int64
        )
        click_title_index_batch = np.asarray(click_title_indexes, dtype=np.int64)
        return {
            "impression_index_batch": imp_indexes,
            "user_index_batch": user_indexes,
            "clicked_title_batch": click_title_index_batch,
            "candidate_title_batch": candidate_title_index_batch,
            "labels": labels,
        }

    def load_user_from_file(self, news_file, behavior_file):
        """Read and parse user data from news file and behavior file.

        One entry is produced per impression (not per unique user).

        Args:
            news_file (str): A file contains several informations of news.
            behavior_file (str): A file contains information of user impressions.
        Yields:
            dict: one batch of user features as produced by ``_convert_user_data``.
        """

        if not hasattr(self, "news_title_index"):
            self.init_news(news_file)

        if not hasattr(self, "impr_indexes"):
            self.init_behaviors(behavior_file)

        user_indexes = []
        impr_indexes = []
        click_title_indexes = []
        cnt = 0

        for index in range(len(self.impr_indexes)):
            click_title_indexes.append(self.news_title_index[self.histories[index]])
            user_indexes.append(self.uindexes[index])
            impr_indexes.append(self.impr_indexes[index])

            cnt += 1
            if cnt >= self.batch_size:
                yield self._convert_user_data(
                    user_indexes,
                    impr_indexes,
                    click_title_indexes,
                )
                user_indexes = []
                impr_indexes = []
                click_title_indexes = []
                cnt = 0

        # Flush the final, partially filled batch.
        if cnt > 0:
            yield self._convert_user_data(
                user_indexes,
                impr_indexes,
                click_title_indexes,
            )

    def _convert_user_data(
        self,
        user_indexes,
        impr_indexes,
        click_title_indexes,
    ):
        """Convert data into numpy arrays that are good for further model operation.
        Args:
            user_indexes (list): a list of user indexes.
            impr_indexes (list): a list of impression indexes.
            click_title_indexes (list): words indices for user's clicked news titles.
        Returns:
            dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
        """

        user_indexes = np.asarray(user_indexes, dtype=np.int32)
        impr_indexes = np.asarray(impr_indexes, dtype=np.int32)
        click_title_index_batch = np.asarray(click_title_indexes, dtype=np.int64)

        return {
            "user_index_batch": user_indexes,
            "impr_index_batch": impr_indexes,
            "clicked_title_batch": click_title_index_batch,
        }

    def load_news_from_file(self, news_file):
        """Read and parse news data from news file.

        Every row of ``news_title_index`` is emitted, including the padding row 0.

        Args:
            news_file (str): A file contains several informations of news.
        Yields:
            dict: one batch of news features as produced by ``_convert_news_data``.
        """
        if not hasattr(self, "news_title_index"):
            self.init_news(news_file)

        news_indexes = []
        candidate_title_indexes = []
        cnt = 0

        for index in range(len(self.news_title_index)):
            news_indexes.append(index)
            candidate_title_indexes.append(self.news_title_index[index])

            cnt += 1
            if cnt >= self.batch_size:
                yield self._convert_news_data(
                    news_indexes,
                    candidate_title_indexes,
                )
                news_indexes = []
                candidate_title_indexes = []
                cnt = 0

        # Flush the final, partially filled batch.
        if cnt > 0:
            yield self._convert_news_data(
                news_indexes,
                candidate_title_indexes,
            )

    def _convert_news_data(
        self,
        news_indexes,
        candidate_title_indexes,
    ):
        """Convert data into numpy arrays that are good for further model operation.
        Args:
            news_indexes (list): a list of news indexes.
            candidate_title_indexes (list): the candidate news titles' words indices.
        Returns:
            dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
        """

        news_indexes_batch = np.asarray(news_indexes, dtype=np.int32)
        candidate_title_index_batch = np.asarray(
            candidate_title_indexes, dtype=np.int32
        )

        return {
            "news_index_batch": news_indexes_batch,
            "candidate_title_batch": candidate_title_index_batch,
        }

    def load_impression_from_file(self, behaivors_file):
        """Read and parse impression data from behaviors file.

        The file is parsed only if this iterator has not parsed behaviors before.

        Args:
            behaivors_file (str): A file contains several informations of behaviors.
        Yields:
            tuple: (impression index, candidate news indexes as int32 array,
            user index, labels as int32 array), one tuple per impression.
        """

        if not hasattr(self, "histories"):
            self.init_behaviors(behaivors_file)

        indexes = np.arange(len(self.labels))

        for index in indexes:
            impr_label = np.array(self.labels[index], dtype="int32")
            impr_news = np.array(self.imprs[index], dtype="int32")

            yield (
                self.impr_indexes[index],
                impr_news,
                self.uindexes[index],
                impr_label,
            )

In [None]:
# Model init: build the NRMS model. MINDIterator is passed as the iterator class
# (not an instance) and `seed` is forwarded to the model constructor.

model = NRMSModel(hparams, MINDIterator, seed=seed)

  super(Adam, self).__init__(name, **kwargs)


## Train

In [None]:
# Evaluate on the validation files before fitting, as a baseline for the trained metrics.
print(model.run_eval(valid_items_file, valid_behaviors_file))

  updates=self.state_updates,
305it [00:03, 97.95it/s] 
699it [00:23, 29.58it/s]
22365it [00:01, 21730.98it/s]


{'group_auc': 0.4577, 'mean_mrr': 0.1762, 'ndcg@5': 0.1373, 'ndcg@10': 0.1465}


In [None]:
%%time
# Train on the training files; the validation files are passed to fit() as well.
model.fit(train_items_file, train_behaviors_file, valid_items_file, valid_behaviors_file)

step 2450 , total_loss: 1.3350, data_loss: 1.2976: : 2453it [06:37,  6.17it/s]
305it [00:00, 704.47it/s]
699it [00:23, 30.37it/s]
22365it [00:01, 21398.57it/s]


at epoch 1
train info: logloss loss:1.3350297638556445
eval info: group_auc:0.6644, mean_mrr:0.3592, ndcg@10:0.3512, ndcg@5:0.35
at epoch 1 , train time: 397.6 eval time: 43.5


step 2450 , total_loss: 1.2690, data_loss: 1.2711: : 2453it [06:29,  6.30it/s]
305it [00:00, 690.32it/s]
699it [00:22, 30.39it/s]
22365it [00:01, 20910.29it/s]


at epoch 2
train info: logloss loss:1.2690721255295432
eval info: group_auc:0.6681, mean_mrr:0.3684, ndcg@10:0.358, ndcg@5:0.3567
at epoch 2 , train time: 389.4 eval time: 43.2


step 2450 , total_loss: 1.2453, data_loss: 1.2538: : 2453it [06:29,  6.30it/s]
305it [00:00, 699.06it/s]
699it [00:23, 30.35it/s]
22365it [00:01, 21778.38it/s]


at epoch 3
train info: logloss loss:1.2452655574811802
eval info: group_auc:0.6705, mean_mrr:0.3739, ndcg@10:0.3625, ndcg@5:0.3611
at epoch 3 , train time: 389.6 eval time: 43.3


step 2450 , total_loss: 1.2332, data_loss: 1.2333: : 2453it [06:28,  6.31it/s]
305it [00:00, 728.24it/s]
699it [00:22, 30.51it/s]
22365it [00:01, 21900.11it/s]


at epoch 4
train info: logloss loss:1.2331620984985252
eval info: group_auc:0.6726, mean_mrr:0.3779, ndcg@10:0.3652, ndcg@5:0.3637
at epoch 4 , train time: 389.0 eval time: 42.9


step 2450 , total_loss: 1.2182, data_loss: 1.1276: : 2453it [06:28,  6.32it/s]
305it [00:00, 684.34it/s]
699it [00:22, 30.50it/s]
22365it [00:01, 21491.37it/s]


at epoch 5
train info: logloss loss:1.2181353556395451
eval info: group_auc:0.6754, mean_mrr:0.3799, ndcg@10:0.3666, ndcg@5:0.3642
at epoch 5 , train time: 388.3 eval time: 43.2


step 2450 , total_loss: 1.2064, data_loss: 1.1869: : 2453it [06:27,  6.33it/s]
305it [00:00, 732.31it/s]
699it [00:22, 30.46it/s]
22365it [00:00, 22536.07it/s]


at epoch 6
train info: logloss loss:1.2061731059649405
eval info: group_auc:0.6776, mean_mrr:0.3832, ndcg@10:0.3695, ndcg@5:0.3671
at epoch 6 , train time: 387.5 eval time: 43.1


step 2450 , total_loss: 1.1943, data_loss: 1.2410: : 2453it [06:27,  6.32it/s]
305it [00:00, 703.52it/s]
699it [00:22, 30.43it/s]
22365it [00:00, 22605.47it/s]


at epoch 7
train info: logloss loss:1.1945748708221895
eval info: group_auc:0.6791, mean_mrr:0.3837, ndcg@10:0.37, ndcg@5:0.3667
at epoch 7 , train time: 387.9 eval time: 42.9


step 2450 , total_loss: 1.1859, data_loss: 1.2450: : 2453it [06:27,  6.33it/s]
305it [00:00, 705.81it/s]
699it [00:22, 30.49it/s]
22365it [00:01, 20996.87it/s]


at epoch 8
train info: logloss loss:1.1858221716750557
eval info: group_auc:0.6807, mean_mrr:0.3842, ndcg@10:0.3709, ndcg@5:0.3677
at epoch 8 , train time: 387.4 eval time: 43.2


step 2450 , total_loss: 1.1801, data_loss: 1.1520: : 2453it [06:27,  6.32it/s]
305it [00:00, 713.57it/s]
699it [00:23, 30.38it/s]
22365it [00:00, 22465.28it/s]


at epoch 9
train info: logloss loss:1.1798990880330145
eval info: group_auc:0.68, mean_mrr:0.3864, ndcg@10:0.3729, ndcg@5:0.3701
at epoch 9 , train time: 387.9 eval time: 43.2


step 2450 , total_loss: 1.1737, data_loss: 1.0101: : 2453it [06:28,  6.32it/s]
305it [00:00, 709.05it/s]
699it [00:22, 30.43it/s]
22365it [00:00, 22475.40it/s]


at epoch 10
train info: logloss loss:1.173926398844511
eval info: group_auc:0.6808, mean_mrr:0.3861, ndcg@10:0.3723, ndcg@5:0.3684
at epoch 10 , train time: 388.2 eval time: 43.0
CPU times: user 36min 47s, sys: 2min 1s, total: 38min 48s
Wall time: 1h 12min 4s


<__main__.NRMSModel at 0x7f0060dfadd0>

In [None]:
%%time
# Re-run the validation evaluation on the fitted model and keep the metrics in res_syn.
res_syn = model.run_eval(valid_items_file, valid_behaviors_file)
print(res_syn)

305it [00:00, 551.15it/s]
699it [00:20, 33.41it/s]
22365it [00:00, 22945.51it/s]


{'group_auc': 0.6808, 'mean_mrr': 0.3861, 'ndcg@5': 0.3684, 'ndcg@10': 0.3723}
CPU times: user 42 s, sys: 845 ms, total: 42.8 s
Wall time: 40.8 s


## Prediction

### Calculation

In [None]:
class NewsRecCal:
    """Score every (user, news) pair with a trained model and rank the news per user.

    The constructor runs the model once to obtain news and user vectors;
    ``score`` turns them into a user x news score table and ``recommend``
    turns that table into a per-user ranking of news ids.
    """

    def __init__(self, model, hparams, news_file, behaviors_file, k=18720):
        """Load the input files and compute the news / user vectors.

        Args:
            model (object): Pre-trained NewsRec model; must provide ``run_news`` and ``run_user``.
            hparams (object): Pre-defined hyperparameters & configurations, forwarded to ``MINDIterator``.
            news_file (str): Path of the news tsv file (MIND dataset format).
            behaviors_file (str): Path of the behaviors tsv file (MIND dataset format).
            k (int): Maximum rank to keep in ``recommend``. Ranks are labelled from 1 and
                every rank label up to and including ``k`` is returned, so a ``k`` larger
                than the number of news simply returns the full ranking.
        """
        self.model = model
        self.hparams = hparams
        self.news_file = news_file
        self.behaviors_file = behaviors_file
        # Raw tables, read without a header row so columns are addressed by position.
        # `self.news` is not used by the methods below; it is kept as a public attribute.
        self.news = pd.read_csv(self.news_file, sep="\t", header=None)
        self.behav = pd.read_csv(self.behaviors_file, sep="\t", header=None)
        self.k = k

        # The iterator's nid2index mapping is needed to translate the positional
        # news numbers back into the original news ids.
        self.test_iterator = MINDIterator(hparams, col_spliter="\t")
        self.test_iterator.init_news(self.news_file)

        # Vectors produced by the model for every news item and every behaviors row.
        self.news_vecs = self.model.run_news(self.news_file)
        self.user_vecs = self.model.run_user(self.news_file, self.behaviors_file)

    def preprocess(self):
        """Reduce the user vectors to one per user and stack both vector sets as arrays.

        Per the original author's note, ``user_vecs`` holds one vector per impression
        (behaviors row) rather than one per unique user, so it contains duplicates.
        Only the first row of each user (column 0 of the behaviors file) is kept.

        Returns:
            tuple: ``(user_vecs_arr, news_vecs_arr, user_keys, news_keys)`` where the
            arrays have one row per kept user / per entry of ``news_vecs``, ``user_keys``
            are the user ids of the kept rows and ``news_keys`` are the keys of
            ``test_iterator.nid2index``.
        """
        # Row labels of the first occurrence of every user.
        # assumes `user_vecs` is keyed by behaviors row label - TODO confirm against model.run_user
        first_row_of_user = self.behav[0].drop_duplicates().index.to_numpy()
        user_vecs_arr = pd.DataFrame(self.user_vecs)[first_row_of_user].transpose().values

        # news_vecs is likewise converted from a dictionary to an array (one row per key).
        news_vecs_arr = pd.DataFrame(self.news_vecs).transpose().values

        # Labels used by `score` to build the DataFrame.
        user_keys = self.behav.loc[first_row_of_user, 0].to_list()
        news_keys = list(self.test_iterator.nid2index.keys())

        return user_vecs_arr, news_vecs_arr, user_keys, news_keys

    def score(self):
        """Compute the dot-product score of every unique user against every news item.

        Side effect: stores the results of ``preprocess`` on the instance
        (``user_vecs_arr``, ``news_vecs_arr``, ``user_keys``, ``news_keys``).

        Returns:
            pandas.DataFrame: rows are users (``user_keys``), columns are ``news_keys``.
        """
        (
            self.user_vecs_arr,
            self.news_vecs_arr,
            self.user_keys,
            self.news_keys,
        ) = self.preprocess()

        # All user/news inner products at once, as a single matrix product.
        user_news_score = np.matmul(self.user_vecs_arr, self.news_vecs_arr.transpose())

        # Column 0 is dropped: the original author marks the 0th news vector as a
        # garbage value, so it has no counterpart in `news_keys`.
        return pd.DataFrame(
            user_news_score[:, 1:],
            index=pd.Index(self.user_keys),
            columns=self.news_keys,
        )

    def recommend(self):
        """Rank the news for every user from highest to lowest score.

        Side effect: stores the score table on the instance as ``score_df``.

        Returns:
            pandas.DataFrame: rows are users, columns are the ranks ``1..k`` (1 is the
            best-scored news) and the values are news ids taken from ``news_keys``.
        """
        self.score_df = self.score()

        # Sort on the plain ndarray with an explicit axis instead of relying on pandas
        # re-wrapping the result of np.argsort(DataFrame). argsort is ascending, so the
        # column order is reversed to get the best-scored news first.
        best_first = np.argsort(self.score_df.to_numpy(), axis=1)[:, ::-1]
        ranked_ids = np.asarray(self.news_keys)[best_first]

        # Column labels are the recommendation ranks, starting at 1.
        rank_df = pd.DataFrame(
            ranked_ids,
            index=self.score_df.index,
            columns=pd.RangeIndex(1, ranked_ids.shape[1] + 1),
        )

        # Label-based slice: keeps every rank up to and including k.
        return rank_df.loc[:, :self.k]

In [None]:
%%time
# Build the calculator on the validation split; the constructor already runs the
# model to produce the news and user vectors, so this is the expensive step.
calculator = NewsRecCal(
    model=model,
    hparams=hparams,
    news_file=valid_items_file,
    behaviors_file=valid_behaviors_file,
)

305it [00:00, 558.40it/s]
699it [00:21, 32.96it/s]

CPU times: user 23.7 s, sys: 513 ms, total: 24.2 s
Wall time: 22.6 s





In [None]:
%%time
# Original note (translated): "As with the function version, the whole recommendation
# result is produced in just 19 seconds!"
# NOTE(review): this call returns the user x news score table, not the ranking, and the
# recorded wall time for this cell is well under a second - the note looks copy-pasted.
score_df = calculator.score()
score_df

CPU times: user 465 ms, sys: 3.26 ms, total: 469 ms
Wall time: 451 ms


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
489,2.667581,2.397735,1.283872,-0.591967,0.308801,-0.591967,0.174539,-0.591967,-0.591967,-0.591967,...,-0.020322,-0.591967,-0.591967,-0.591967,-0.591967,-0.591967,-0.591967,-0.591967,-0.591967,-0.591967
490,3.446257,2.831138,0.354795,-0.251179,-0.132704,-0.251179,0.141553,-0.251179,-0.251179,-0.251179,...,0.057519,-0.251179,-0.251179,-0.251179,-0.251179,-0.251179,-0.251179,-0.251179,-0.251179,-0.251179
491,3.334001,2.862745,0.823628,-0.429460,0.042003,-0.429460,0.179201,-0.429460,-0.429460,-0.429460,...,-0.060634,-0.429460,-0.429460,-0.429460,-0.429460,-0.429460,-0.429460,-0.429460,-0.429460,-0.429460
492,3.853889,1.779387,2.242132,-1.056541,0.931124,-1.056541,0.620227,-1.056541,-1.056541,-1.056541,...,-0.663384,-1.056541,-1.056541,-1.056541,-1.056541,-1.056541,-1.056541,-1.056541,-1.056541,-1.056541
493,4.102018,3.203485,1.826670,-0.925672,0.451614,-0.925672,0.624760,-0.925672,-0.925672,-0.925672,...,-0.577664,-0.925672,-0.925672,-0.925672,-0.925672,-0.925672,-0.925672,-0.925672,-0.925672,-0.925672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.604634,2.606436,1.966890,-0.775747,1.020938,-0.775747,0.459397,-0.775747,-0.775747,-0.775747,...,0.247178,-0.775747,-0.775747,-0.775747,-0.775747,-0.775747,-0.775747,-0.775747,-0.775747,-0.775747
607,4.160751,2.620541,2.551383,-1.216149,0.975861,-1.216149,0.712862,-1.216149,-1.216149,-1.216149,...,-0.720383,-1.216149,-1.216149,-1.216149,-1.216149,-1.216149,-1.216149,-1.216149,-1.216149,-1.216149
608,3.322134,2.998920,1.523911,-0.748143,0.550090,-0.748143,0.439682,-0.748143,-0.748143,-0.748143,...,-0.239732,-0.748143,-0.748143,-0.748143,-0.748143,-0.748143,-0.748143,-0.748143,-0.748143,-0.748143
609,4.508835,1.210237,2.785821,-1.364195,1.401688,-1.364195,0.885334,-1.364195,-1.364195,-1.364195,...,-1.281911,-1.364195,-1.364195,-1.364195,-1.364195,-1.364195,-1.364195,-1.364195,-1.364195,-1.364195


In [None]:
%%time
# Original note (translated): "As with the function version, the whole recommendation
# result is produced in just 19 seconds!"
# NOTE(review): the recorded wall time for this cell is well under a second - confirm
# whether the 19 s figure refers to an earlier implementation.
result_df = calculator.recommend()
result_df

CPU times: user 663 ms, sys: 186 ms, total: 850 ms
Wall time: 661 ms


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,9733,9734,9735,9736,9737,9738,9739,9740,9741,9742
489,356,296,2959,5952,593,4878,110,5445,293,608,...,6770,127172,6232,148881,7061,30892,26717,176419,153070,57502
490,79132,5952,2959,260,72998,99114,32,7153,5445,356,...,4046,6232,176419,30892,6669,26717,153070,6770,7061,57502
491,2959,296,260,356,1196,5952,32,79132,110,293,...,8938,6669,148881,26717,6770,30892,7061,153070,176419,57502
492,480,380,2355,356,1196,296,593,377,589,1210,...,176419,122912,135536,104863,112421,187593,6818,119145,68848,138036
493,480,260,1196,589,380,356,1210,593,296,32,...,187593,104863,99917,57502,6818,68848,176419,119145,112421,138036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,296,356,593,2959,293,1196,110,608,4878,480,...,156605,30892,104863,68848,26717,6818,148881,153070,176419,57502
607,480,380,356,593,1196,296,589,2355,260,1210,...,135536,122912,112421,176419,104863,187593,6818,68848,119145,138036
608,480,296,356,260,1196,110,2959,589,293,593,...,6818,112421,187593,148881,68848,57502,30892,104863,153070,176419
609,2355,480,380,587,1,185,356,593,296,805,...,167746,104863,158966,6818,135536,122912,187593,68848,119145,138036


### Result

In [None]:
# Lookup table from movieId to title, read from movies.csv under data_path.
movies = pd.read_csv(os.path.join(data_path, 'movies.csv'))
movie_dict = dict(zip(movies['movieId'], movies['title']))

# Same layout as result_df, but every recommended id is replaced by its title.
# An id that is missing from movie_dict raises KeyError.
result_df_real = result_df.astype(int).applymap(lambda movie_id: movie_dict[movie_id])

In [None]:
# Persist the title-level recommendations. The target folder is created first so that
# a fresh run does not fail when `result` does not exist yet.
# NOTE(review): the recorded output of this cell already contains a `history` column,
# which is only added by a later cell - the notebook was executed out of order. On a
# top-to-bottom run the CSV written here will not include `history`.
result_dir = os.path.join(data_path, 'result')
os.makedirs(result_dir, exist_ok=True)
result_df_real.to_csv(os.path.join(result_dir, 'NRMS_result.csv'))
result_df_real

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,9734,9735,9736,9737,9738,9739,9740,9741,9742,history
489,Forrest Gump (1994),Pulp Fiction (1994),Fight Club (1999),"Lord of the Rings: The Two Towers, The (2002)","Silence of the Lambs, The (1991)",Donnie Darko (2001),Braveheart (1995),Minority Report (2002),Léon: The Professional (a.k.a. The Professiona...,Fargo (1996),...,A Story of Children and Film (2013),Born Free (1966),World of Tomorrow (2015),Dark Victory (1939),In the Realms of the Unreal (2004),Begotten (1990),Mother! (2017),Rabbits (2002),Cat Soup (Nekojiru-so) (2001),"[Dracula (Bram Stoker's Dracula) (1992), Lady ..."
490,Inception (2010),"Lord of the Rings: The Two Towers, The (2002)",Fight Club (1999),Star Wars: Episode IV - A New Hope (1977),Avatar (2009),Django Unchained (2012),Twelve Monkeys (a.k.a. 12 Monkeys) (1995),"Lord of the Rings: The Return of the King, The...",Minority Report (2002),Forrest Gump (1994),...,Born Free (1966),Mother! (2017),In the Realms of the Unreal (2004),Ikiru (1952),Begotten (1990),Rabbits (2002),My Life Without Me (2003),Dark Victory (1939),Cat Soup (Nekojiru-so) (2001),"[Melancholia (2011), Simpsons Movie, The (2007..."
491,Fight Club (1999),Pulp Fiction (1994),Star Wars: Episode IV - A New Hope (1977),Forrest Gump (1994),Star Wars: Episode V - The Empire Strikes Back...,"Lord of the Rings: The Two Towers, The (2002)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Inception (2010),Braveheart (1995),Léon: The Professional (a.k.a. The Professiona...,...,Ikiru (1952),World of Tomorrow (2015),Begotten (1990),My Life Without Me (2003),In the Realms of the Unreal (2004),Dark Victory (1939),Rabbits (2002),Mother! (2017),Cat Soup (Nekojiru-so) (2001),"[Forrest Gump (1994), Shrek (2001), Truman Sho..."
492,Jurassic Park (1993),True Lies (1994),"Bug's Life, A (1998)",Forrest Gump (1994),Star Wars: Episode V - The Empire Strikes Back...,Pulp Fiction (1994),"Silence of the Lambs, The (1991)",Speed (1994),Terminator 2: Judgment Day (1991),Star Wars: Episode VI - Return of the Jedi (1983),...,Avengers: Infinity War - Part I (2018),Suicide Squad (2016),What If (2013),Frank (2014),Deadpool 2 (2018),Come and See (Idi i smotri) (1985),Kingsman: The Secret Service (2015),"Brothers Bloom, The (2008)",The Man from U.N.C.L.E. (2015),"[Independence Day (a.k.a. ID4) (1996), Twelve ..."
493,Jurassic Park (1993),Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back...,Terminator 2: Judgment Day (1991),True Lies (1994),Forrest Gump (1994),Star Wars: Episode VI - Return of the Jedi (1983),"Silence of the Lambs, The (1991)",Pulp Fiction (1994),Twelve Monkeys (a.k.a. 12 Monkeys) (1995),...,What If (2013),Upstream Color (2013),Cat Soup (Nekojiru-so) (2001),Come and See (Idi i smotri) (1985),"Brothers Bloom, The (2008)",Mother! (2017),Kingsman: The Secret Service (2015),Frank (2014),The Man from U.N.C.L.E. (2015),"[Princess Bride, The (1987), Mercury Rising (1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,Pulp Fiction (1994),Forrest Gump (1994),"Silence of the Lambs, The (1991)",Fight Club (1999),Léon: The Professional (a.k.a. The Professiona...,Star Wars: Episode V - The Empire Strikes Back...,Braveheart (1995),Fargo (1996),Donnie Darko (2001),Jurassic Park (1993),...,In the Realms of the Unreal (2004),What If (2013),"Brothers Bloom, The (2008)",Begotten (1990),Come and See (Idi i smotri) (1985),World of Tomorrow (2015),Rabbits (2002),Mother! (2017),Cat Soup (Nekojiru-so) (2001),"[Unknown (2006), Whatever Works (2009), Alice ..."
607,Jurassic Park (1993),True Lies (1994),Forrest Gump (1994),"Silence of the Lambs, The (1991)",Star Wars: Episode V - The Empire Strikes Back...,Pulp Fiction (1994),Terminator 2: Judgment Day (1991),"Bug's Life, A (1998)",Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode VI - Return of the Jedi (1983),...,Avengers: Infinity War - Part I (2018),Frank (2014),Mother! (2017),What If (2013),Deadpool 2 (2018),Come and See (Idi i smotri) (1985),"Brothers Bloom, The (2008)",Kingsman: The Secret Service (2015),The Man from U.N.C.L.E. (2015),"[True Crime (1996), Gone with the Wind (1939),..."
608,Jurassic Park (1993),Pulp Fiction (1994),Forrest Gump (1994),Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back...,Braveheart (1995),Fight Club (1999),Terminator 2: Judgment Day (1991),Léon: The Professional (a.k.a. The Professiona...,"Silence of the Lambs, The (1991)",...,Frank (2014),Deadpool 2 (2018),World of Tomorrow (2015),"Brothers Bloom, The (2008)",Cat Soup (Nekojiru-so) (2001),In the Realms of the Unreal (2004),What If (2013),Rabbits (2002),Mother! (2017),"[Island, The (2005), It's All Gone Pete Tong (..."
609,"Bug's Life, A (1998)",Jurassic Park (1993),True Lies (1994),Ghost (1990),Toy Story (1995),"Net, The (1995)",Forrest Gump (1994),"Silence of the Lambs, The (1991)",Pulp Fiction (1994),"Time to Kill, A (1996)",...,What If (2013),Captain Fantastic (2016),Come and See (Idi i smotri) (1985),Suicide Squad (2016),Avengers: Infinity War - Part I (2018),Deadpool 2 (2018),"Brothers Bloom, The (2008)",Kingsman: The Secret Service (2015),The Man from U.N.C.L.E. (2015),"[Apollo 13 (1995), Pulp Fiction (1994), Dances..."


In [None]:
# Column 1 of result_df is the rank-1 recommendation of each user; count how often
# each id is the top pick to see how concentrated the top recommendations are.
result_df[1].value_counts()

480      38
296      21
79132    16
356      11
260      10
2355     10
5952      7
2959      6
593       2
58559     1
Name: 1, dtype: int64

In [None]:
# One behaviors row per user (first occurrence of each value in column 0).
# NOTE(review): this reads `behaviors_file`, while the calculator above was built from
# `valid_behaviors_file` - confirm both cover the same users.
behav_unique = (
    pd.read_csv(behaviors_file, sep='\t', header=None)
    .drop_duplicates(subset=[0])
)
# Column 2 is a space-separated string of ids; column 4 holds it as a list.
behav_unique[4] = behav_unique[2].map(lambda history: history.split(' '))

def history_to_movies(history_list, title_by_id=None):
    """Translate a list of movie ids into the corresponding movie titles.

    Args:
        history_list (list): Movie ids; each one is converted with ``int`` before lookup.
        title_by_id (dict, optional): Mapping from integer movie id to title.
            Defaults to the notebook-level ``movie_dict``.

    Returns:
        list: The titles, in the same order as ``history_list``. A new list is
        returned; ``history_list`` itself is left untouched.

    Raises:
        KeyError: If an id is not present in the mapping.
    """
    if title_by_id is None:
        title_by_id = movie_dict
    return [title_by_id[int(movie_id)] for movie_id in history_list]

# Replace the id lists in column 4 by title lists.
behav_unique[4] = behav_unique[4].apply(history_to_movies)

# user id -> list of titles in that user's history.
user_history = behav_unique.set_index(0)[4].to_dict()

# Attach each user's history to the recommendation table. The Series is built directly
# on result_df_real's own index, so this works whether or not that index has a name.
# A user missing from user_history raises KeyError.
result_df_real['history'] = pd.Series(
    [user_history[user] for user in result_df_real.index],
    index=result_df_real.index,
)

In [None]:
# Spot-check one randomly drawn user: show the last `history_num` entries of their
# history next to their first `rec_num` recommendations.
# No seed is set here, so a different user is drawn on every run.
user_nth = np.random.randint(len(result_df_real))
history_num = 10
rec_num = 10

print(
    result_df_real['history'].iloc[user_nth][-history_num:],
    '\n\n',
    result_df_real.iloc[user_nth].iloc[:rec_num],
)

['Heartbreak Kid, The (2007)', 'Before Sunrise (1995)', 'Hangover, The (2009)', 'Legally Blonde (2001)', 'Harry Potter and the Chamber of Secrets (2002)', 'Harry Potter and the Deathly Hallows: Part 1 (2010)', 'Harry Potter and the Deathly Hallows: Part 2 (2011)', 'Harry Potter and the Half-Blood Prince (2009)', 'Harry Potter and the Order of the Phoenix (2007)', 'Avatar (2009)'] 

 1                                      Inception (2010)
2         Lord of the Rings: The Two Towers, The (2002)
3                                     Fight Club (1999)
4             Star Wars: Episode IV - A New Hope (1977)
5                                         Avatar (2009)
6                               Django Unchained (2012)
7             Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
8     Lord of the Rings: The Return of the King, The...
9                                Minority Report (2002)
10                                  Forrest Gump (1994)
Name: 490, dtype: object
