# Modelo Treinado

Neste notebook, encontramos um modelo já treinado e só estamos usando os pesos que obtivemos.

In [None]:
# Colab-only step: make Google Drive available under /content/drive so the
# following cells can reach the shared-drive project folder.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Move into the project folder on the shared drive; later cells read and write
# their CSV files and checkpoints through relative paths.
# The path is absolute (rooted at the '/content/drive' mount point) so the cell
# gives the same result when re-run or when the current directory is not /content.
import os

os.chdir('/content/drive/Shareddrives/Filmes Pra TI - Machine Learning/Modelo_Marcela')
os.getcwd()

In [None]:
import time
import os
import shutil
# import papermill as pm
import pandas as pd
import numpy as np
import tensorflow as tf
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_chrono_split
from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from reco_utils.common.constants import SEED as DEFAULT_SEED
import csv

In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.compat.v1 as v1
# The NCF class below builds its graph with v1 placeholders and runs it through
# a v1.Session, so eager execution is switched off before anything is built.
tf.compat.v1.disable_eager_execution()
# Rebinds the name `time` to the function time.time(); the module bound by the
# earlier `import time` is shadowed from this point on.
from time import time
import logging

# Module-level logger; NCF.fit reports the per-epoch training loss through it
# at INFO level.
logger = logging.getLogger(__name__)

class NCF:
    """Neural Collaborative Filtering model (GMF, MLP or NeuMF) on the TF1 graph API.

    The constructor builds the complete graph (embedding tables, GMF branch,
    MLP branch, output layer, log-loss and Adam train op), opens a session and
    initializes every variable. Use `fit` to train, `predict` to score
    (user, item) pairs and `save` / `load` to write / read checkpoints.
    """

    def __init__(
        self,
        n_users,
        n_items,
        model_type="NeuMF",
        random_state=0,
        n_factors=8,
        layer_sizes=[16, 8, 4],
        n_epochs=50,
        batch_size=512,
        learning_rate=5e-3,
        verbose=1,
        save=False,
        pretrain=False,
        seed=42,
        ckpt_file='model.ckpt'
    ):
        """Build the graph, create the session and initialize all variables.

        Args:
            n_users (int): number of users; row count of the user embedding tables.
            n_items (int): number of items; row count of the item embedding tables.
            model_type (str): "GMF", "MLP" or "NeuMF" (case-insensitive).
            random_state (int): accepted but not used by this class.
            n_factors (int): width of the GMF embeddings.
            layer_sizes (list): MLP sizes; each MLP embedding is
                `layer_sizes[0] / 2` wide and the remaining entries are the
                widths of the dense layers.
            n_epochs (int): number of epochs run by `fit`.
            batch_size (int): batch size passed to the dataset's train loader.
            learning_rate (float): learning rate of the Adam optimizer.
            verbose (int): log the training loss every `verbose` epochs; 0 disables it.
            save (bool): accepted but not used by this class.
            pretrain (bool): accepted but not used by this class.
            seed (int or None): seed for NumPy and for the TensorFlow graph.
            ckpt_file (str): checkpoint file name used inside the save/load directory.

        Raises:
            ValueError: if `model_type` is not one of "gmf", "mlp", "neumf".
        """
        # number of users / items in the dataset
        self.n_users = n_users
        self.n_items = n_items
        # model type, compared case-insensitively
        self.model_type = model_type.lower()

        # file name of the checkpoint inside the save/load directory
        self.ckpt_file = ckpt_file

        # reject unknown model types before any graph is built
        model_options = ["gmf", "mlp", "neumf"]
        if self.model_type not in model_options:
            raise ValueError(
                "Wrong model type, please select one of this list: {}".format(
                    model_options
                )
            )

        # The TensorFlow seed is applied inside _create_model, after the default
        # graph has been reset: with eager execution disabled the seed is stored
        # on the default graph, so setting it here would be thrown away.
        self.seed = seed
        np.random.seed(seed)

        # dimension of the GMF latent space
        self.n_factors = n_factors
        # private copy so later changes to the caller's (or the default) list
        # cannot alter this model
        self.layer_sizes = list(layer_sizes)
        # number of epochs for training
        self.n_epochs = n_epochs
        # logging period in epochs (0 = silent)
        self.verbose = verbose
        # batch size
        self.batch_size = batch_size
        # learning rate
        self.learning_rate = learning_rate
        # width of the concatenated GMF + MLP vector
        self.ncf_layer_size = n_factors + self.layer_sizes[-1]

        # build the graph
        self._create_model()
        # let the GPU allocation grow on demand instead of grabbing everything
        gpu_options = v1.GPUOptions(allow_growth=True)
        # session bound to the graph that was just built
        self.sess = v1.Session(config=v1.ConfigProto(gpu_options=gpu_options))
        # initialize every variable
        self.sess.run(v1.global_variables_initializer())

    def _create_model(self,):
        """Reset the default graph and build inputs, embeddings, branches, loss and train op."""
        # start from an empty default graph, then seed it so that variable
        # initialization is reproducible for a given `seed`
        v1.reset_default_graph()
        v1.set_random_seed(self.seed)

        with v1.variable_scope("input_data", reuse=v1.AUTO_REUSE):

            # inputs: user index, item index and ground-truth label, one per row
            self.user_input = v1.placeholder(tf.int32, shape=[None, 1])
            self.item_input = v1.placeholder(tf.int32, shape=[None, 1])
            self.labels = v1.placeholder(tf.float32, shape=[None, 1])

        with v1.variable_scope("embedding", reuse=v1.AUTO_REUSE):

            # GMF embedding tables (users: P, items: Q)
            self.embedding_gmf_P = tf.Variable(
                v1.truncated_normal(
                    shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01
                ),
                name="embedding_gmf_P",
                dtype=tf.float32,
            )

            self.embedding_gmf_Q = tf.Variable(
                v1.truncated_normal(
                    shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01
                ),
                name="embedding_gmf_Q",
                dtype=tf.float32,
            )

            # MLP embedding tables; each one is half of the first MLP layer size
            # because user and item vectors are concatenated below
            self.embedding_mlp_P = tf.Variable(
                v1.truncated_normal(
                    shape=[self.n_users, int(self.layer_sizes[0] / 2)],
                    mean=0.0,
                    stddev=0.01,
                ),
                name="embedding_mlp_P",
                dtype=tf.float32,
            )

            self.embedding_mlp_Q = tf.Variable(
                v1.truncated_normal(
                    shape=[self.n_items, int(self.layer_sizes[0] / 2)],
                    mean=0.0,
                    stddev=0.01,
                ),
                name="embedding_mlp_Q",
                dtype=tf.float32,
            )

        with v1.variable_scope("gmf", reuse=v1.AUTO_REUSE):

            # look up the embeddings; reduce_sum over axis 1 removes the
            # length-1 axis introduced by the [None, 1] index placeholders
            self.gmf_p = tf.reduce_sum(
                tf.nn.embedding_lookup(self.embedding_gmf_P, self.user_input), 1
            )
            self.gmf_q = tf.reduce_sum(
                tf.nn.embedding_lookup(self.embedding_gmf_Q, self.item_input), 1
            )

            # GMF vector: element-wise product of user and item embeddings
            self.gmf_vector = self.gmf_p * self.gmf_q

        with v1.variable_scope("mlp", reuse=v1.AUTO_REUSE):

            # look up user and item embeddings for the MLP branch
            self.mlp_p = tf.reduce_sum(
                tf.nn.embedding_lookup(self.embedding_mlp_P, self.user_input), 1
            )
            self.mlp_q = tf.reduce_sum(
                tf.nn.embedding_lookup(self.embedding_mlp_Q, self.item_input), 1
            )

            # concatenate user and item vectors and push them through the
            # ReLU dense layers given by layer_sizes[1:]
            output = tf.concat([self.mlp_p, self.mlp_q], 1)
            for layer_size in self.layer_sizes[1:]:
                output = v1.layers.dense(
                    output, layer_size, activation=tf.nn.relu
                )
            self.mlp_vector = output

        with v1.variable_scope("ncf", reuse=v1.AUTO_REUSE):

            # pick the vector that feeds the single-unit output layer
            if self.model_type == "gmf":
                final_input = self.gmf_vector
            elif self.model_type == "mlp":
                final_input = self.mlp_vector
            else:
                # "neumf": concatenate the GMF and MLP vectors
                self.ncf_vector = tf.concat([self.gmf_vector, self.mlp_vector], 1)
                final_input = self.ncf_vector

            # NOTE(review): `bias_initializer=None` does not switch the bias off
            # in v1.layers.dense (that would be `use_bias=False`); confirm whether
            # a bias-free output layer was intended before changing it, because
            # checkpoints written by this graph contain the variables it creates.
            output = v1.layers.dense(
                final_input,
                1,
                activation=None,
                bias_initializer=None,
            )
            # predicted score in (0, 1)
            self.output = tf.sigmoid(output)

        with v1.variable_scope("loss", reuse=v1.AUTO_REUSE):

            # log loss between labels and predicted scores
            self.loss = v1.losses.log_loss(self.labels, self.output)

        with v1.variable_scope("optimizer", reuse=v1.AUTO_REUSE):

            # Adam train op minimizing the loss
            self.optimizer = v1.train.AdamOptimizer(
                learning_rate=self.learning_rate
            ).minimize(self.loss)

    def save(self, dir_name):
        """Save the model parameters in `dir_name`.

        Args:
            dir_name (str): directory (not file) name; it is created when it
                does not exist. The checkpoint is written there as `self.ckpt_file`.
        """
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        saver = v1.train.Saver()
        saver.save(self.sess, os.path.join(dir_name, self.ckpt_file))

    def load(self, gmf_dir=None, mlp_dir=None, neumf_dir=None, alpha=0.5):
        """Load model parameters from a checkpoint directory.

        GMF model   --> parameters in `gmf_dir`
        MLP model   --> parameters in `mlp_dir`
        NeuMF model --> parameters in `neumf_dir`, or in `gmf_dir` and `mlp_dir`

        Args:
            gmf_dir, mlp_dir, neumf_dir (str or None): checkpoint directory names.
            alpha (float): weight of the GMF output layer when a NeuMF model is
                assembled from separate GMF and MLP checkpoints.

        Raises:
            NotImplementedError: if no directory matching `model_type` was given.
        """
        if self.model_type == "gmf" and gmf_dir is not None:
            source_dir = gmf_dir
        elif self.model_type == "mlp" and mlp_dir is not None:
            source_dir = mlp_dir
        elif self.model_type == "neumf" and neumf_dir is not None:
            source_dir = neumf_dir
        elif self.model_type == "neumf" and gmf_dir is not None and mlp_dir is not None:
            # assemble NeuMF from pre-trained GMF and MLP checkpoints
            self._load_neumf(gmf_dir, mlp_dir, alpha)
            return
        else:
            raise NotImplementedError(
                "No checkpoint directory was given for model type '{}'".format(
                    self.model_type
                )
            )

        # restore every variable of the current graph from the checkpoint
        saver = v1.train.Saver()
        saver.restore(self.sess, os.path.join(source_dir, self.ckpt_file))

    def _load_neumf(self, gmf_dir, mlp_dir, alpha):
        """Initialize a NeuMF model from separately trained GMF and MLP checkpoints.

        Args:
            gmf_dir, mlp_dir (str): checkpoint directory names.
            alpha (float): weight applied to the GMF output-layer weights; the
                MLP output-layer weights are weighted by `1 - alpha`.
        """
        # restore every variable whose name contains "gmf" (output layer excluded)
        gmf_vars = [
            var for var in v1.global_variables()
            if "gmf" in var.name and "ncf" not in var.name
        ]
        saver = v1.train.Saver(gmf_vars)
        saver.restore(self.sess, os.path.join(gmf_dir, self.ckpt_file))

        # restore every variable whose name contains "mlp" (output layer excluded)
        mlp_vars = [
            var for var in v1.global_variables()
            if "mlp" in var.name and "ncf" not in var.name
        ]
        saver = v1.train.Saver(mlp_vars)
        saver.restore(self.sess, os.path.join(mlp_dir, self.ckpt_file))

        # the output layer is rebuilt by concatenating the two pre-trained ones
        vars_list = v1.get_collection(v1.GraphKeys.GLOBAL_VARIABLES, scope="ncf")

        # NOTE(review): this expects the "ncf" scope to hold exactly one variable
        # (the output-layer weights); if the dense layer also creates a bias the
        # check fails - confirm against the layer definition in _create_model.
        assert len(vars_list) == 1
        ncf_fc = vars_list[0]

        # read the output-layer weights straight from both checkpoints
        gmf_fc = tf.train.load_variable(gmf_dir, ncf_fc.name)
        mlp_fc = tf.train.load_variable(mlp_dir, ncf_fc.name)

        # weighted concatenation along the input axis
        assign_op = v1.assign(
            ncf_fc, tf.concat([alpha * gmf_fc, (1 - alpha) * mlp_fc], axis=0)
        )
        self.sess.run(assign_op)

    def fit(self, data):
        """Train the model for `self.n_epochs` epochs.

        Args:
            data (NCFDataset): initialized dataset; it must provide the
                user2id / item2id / id2user / id2item mappings,
                `negative_sampling()` and `train_loader(batch_size)`.
        """
        # keep the ID <-> index mappings of the training data
        self.user2id = data.user2id
        self.item2id = data.item2id
        self.id2user = data.id2user
        self.id2item = data.id2item

        for epoch_count in range(1, self.n_epochs + 1):

            # draw fresh negative samples for this epoch
            train_begin = time()
            data.negative_sampling()

            train_loss = []

            # one optimization step per batch
            for user_input, item_input, labels in data.train_loader(self.batch_size):

                # raw IDs -> embedding row indices
                user_input = np.array([data.user2id[x] for x in user_input])
                item_input = np.array([data.item2id[x] for x in item_input])
                labels = np.array(labels)

                # the placeholders are [None, 1], hence the extra trailing axis
                feed_dict = {
                    self.user_input: user_input[..., None],
                    self.item_input: item_input[..., None],
                    self.labels: labels[..., None],
                }

                loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict)
                train_loss.append(loss)
            train_time = time() - train_begin

            # report every `self.verbose` epochs; an epoch without any batch has
            # no loss to average, so it is skipped instead of dividing by zero
            if self.verbose and epoch_count % self.verbose == 0 and train_loss:
                logger.info(
                    "Epoch %d [%.2fs]: train_loss = %.6f "
                    % (epoch_count, train_time, sum(train_loss) / len(train_loss))
                )

    def predict(self, user_input, item_input, user2id, item2id, is_list=False):
        """Score one (user, item) pair or aligned lists of pairs.

        Args:
            user_input (list or element of list): userID or list of userIDs.
            item_input (list or element of list): itemID or list of itemIDs.
            user2id (dict): userID -> embedding row index.
            item2id (dict): itemID -> embedding row index.
            is_list (bool): True when the inputs are lists; list-wise prediction
                is faster than element-wise prediction.

        Returns:
            list or float: list of predicted scores, or a single predicted score.
        """
        if is_list:
            output = self._predict(user_input, item_input, user2id, item2id)
            return list(output.reshape(-1))

        # single pair: wrap both IDs in one-element arrays
        output = self._predict(np.array([user_input]), np.array([item_input]), user2id, item2id)
        return float(output.reshape(-1)[0])

    def _predict(self, user_input, item_input, user2id, item2id):
        """Run the output op for aligned sequences of user and item IDs."""
        # raw IDs -> embedding row indices
        user_input = np.array([user2id[x] for x in user_input])
        item_input = np.array([item2id[x] for x in item_input])

        # the placeholders are [None, 1], hence the extra trailing axis
        feed_dict = {
            self.user_input: user_input[..., None],
            self.item_input: item_input[..., None],
        }

        # predicted scores, one row per pair
        output = self.sess.run(self.output, feed_dict)
        return output

In [None]:
# Ratings file whose fields are separated by "::"; the column names are supplied
# here through `names`.
# A separator longer than one character is only handled by pandas' Python parser,
# so it is requested explicitly instead of relying on the automatic fallback
# (which emits a ParserWarning).
ml_1m = pd.read_csv(
    'ratings_1M.dat',
    sep="::",
    names=["userID", "itemID", "rating", "timestamp"],
    engine="python",
)
ml_1m.head()

In [None]:
# Load the 100k ratings table and preview its first rows.
RATINGS_100K_CSV = 'ratings_100k.csv'
rating_100k = pd.read_csv(RATINGS_100K_CSV)
rating_100k.head()

In [None]:
# Align the column names with the userID / itemID names used by the later cells.
column_aliases = {'movieId': 'itemID', 'userId': 'userID'}
rating_100k.rename(columns=column_aliases, inplace=True)

In [None]:
# Split the ratings with python_chrono_split (ratio 0.75, first part is train),
# then turn the rating into a 0/1 label: 1 when the rating is above the
# threshold, 0 otherwise.
RATING_THRESHOLD = 2.5
train, test = python_chrono_split(rating_100k, 0.75)
for split_df in (train, test):
    split_df['rating'] = np.where(split_df['rating'] > RATING_THRESHOLD, 1, 0)
train

In [None]:
# Persist the binarized training split (without the index column) in the current directory.
train.to_csv('train_ratings_100k.csv', index=False)

In [None]:
# Experiment configuration.
seed = 710  # Set None for non-deterministic results
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Model parameters
EPOCHS, BATCH_SIZE = 100, 512

# Wrap the train / test splits in the NCF dataset helper used by NCF.fit.
data = NCFDataset(train=train, test=test, seed=seed)

In [None]:
# Sanity check: distinct label values in the training split after binarization
# (only 0 and/or 1 are possible at this point).
set(train['rating'])

In [None]:
# NeuMF model for the 100k data.
# n_factors (int): dimension of the GMF latent space.
# layer_sizes (list): MLP sizes - the first entry is the width of the concatenated
# user + item MLP embeddings, the remaining entries are dense layer widths.
model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=seed,
    ckpt_file='model_100k.ckpt',
)

In [None]:
# Optional: restore previously saved weights instead of starting from the random
# initialization. The checkpoint is read from model_100k/model_100k.ckpt
# (directory given here, file name taken from the model's ckpt_file).
model.load(neumf_dir='model_100k')

In [None]:
# Train for the configured number of epochs, starting from whatever weights the
# model currently holds (restored ones if the load cell above was run).
# With verbose=10 the loss is reported through the module logger every 10 epochs.
model.fit(data)

In [None]:
# Score every (user, item) pair of the test split in one list-wise call: a single
# session run for the whole split instead of one per row.
test_users = test['userID'].tolist()
test_items = test['itemID'].tolist()
test_scores = model.predict(test_users, test_items, data.user2id, data.item2id, is_list=True)

# saving the predictions in a dataframe
predictions = pd.DataFrame(
    {
        'userID': test_users,
        'itemID': test_items,
        # plain float64 column, as produced by the element-wise predict path
        'prediction': np.asarray(test_scores, dtype=float),
    },
    columns=['userID', 'itemID', 'prediction'],
)
predictions.head()

In [None]:
# Score every training user against every item seen in training, one list-wise
# predict call per user.
users, items, preds = [], [], []
item = list(train.itemID.unique())
for user_id in train.userID.unique():
    user = [user_id] * len(item)
    users += user
    items += item
    preds += model.predict(user, item, data.user2id, data.item2id, is_list=True)

all_predictions = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})

# Outer-join with train: rows whose `rating` is missing are (user, item) pairs
# that do not occur in train, and only those are kept.
merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
unseen_mask = merged.rating.isnull()
all_predictions = merged[unseen_mask].drop(columns='rating')

In [None]:
# Alternative source: reload a previously exported file instead of recomputing.
# all_predictions_100k = pd.read_csv('all_predictions_100k.csv')
# Plain assignment: this is a second name for the same DataFrame, not a copy, so
# columns added through either name show up on both.
all_predictions_100k = all_predictions

In [None]:
# Preview the scored (user, item) pairs.
all_predictions_100k.head()

In [None]:
# Hard 0/1 label: 1 when the predicted score is above 0.5, otherwise 0.
above_half = all_predictions_100k.prediction > 0.5
all_predictions_100k['binary_prediction'] = above_half.astype(int)

In [None]:
# Preview the first rows of the prediction table once more.
all_predictions_100k.head()

In [None]:
# Export the prediction table (without the index column) to the current directory.
all_predictions_100k.to_csv('all_predictions_100k.csv', index=False)

In [None]:
# Cut-off k used by the ranking metrics below; replaces the TOP_K = 10 assigned
# in the configuration cell.
TOP_K = 3

In [None]:
# Ranking metrics of the 100k model at cut-off TOP_K, computed against the test split.
# NOTE(review): items are ranked on the 0/1 `binary_prediction` column, so many
# items share the same score - confirm whether the raw `prediction` was intended.
ranking_kwargs = dict(col_prediction='binary_prediction', k=TOP_K)
eval_map = map_at_k(test, all_predictions_100k, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions_100k, **ranking_kwargs)
eval_precision = precision_at_k(test, all_predictions_100k, **ranking_kwargs)
eval_recall = recall_at_k(test, all_predictions_100k, **ranking_kwargs)

In [None]:
# Display the four ranking metrics in the order MAP, NDCG, precision, recall.
eval_map, eval_ndcg, eval_precision, eval_recall

Trazendo recomendações

In [None]:
# Candidate recommendations for one user: every scored pair belonging to that userID.
TARGET_USER_ID = 1
is_target_user = all_predictions.userID == TARGET_USER_ID
recs = all_predictions[is_target_user]

In [None]:
# The 30 candidates with the highest predicted score, best first.
recs.sort_values(by='prediction', ascending=False).head(30)

Saving model

In [None]:
# Write the model's variables to model_100k/model_100k.ckpt; the directory is
# created by NCF.save when it does not exist.
model.save(dir_name="model_100k")

Loading model

In [None]:
# Show the TensorFlow version in use (the NCF class depends on tf.compat.v1).
tf.__version__

In [None]:
# Rebuild the network with the same architecture as the model that wrote the
# model_100k checkpoint (n_factors=4, layer_sizes=[16, 8, 4]). The constructor
# default n_factors=8 would create embedding and output-layer variables with
# different shapes, and restoring the checkpoint into them would fail.
model2 = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    ckpt_file='model_100k.ckpt',
)

In [None]:
# Restore every variable of model2 from model_100k/model_100k.ckpt.
model2.load(neumf_dir='model_100k')

In [None]:
# Score the whole test split with the restored model in one list-wise call
# (a single session run) instead of one predict call per row.
teste_users = test['userID'].tolist()
teste_items = test['itemID'].tolist()
teste_scores = model2.predict(teste_users, teste_items, data.user2id, data.item2id, is_list=True)

# same layout as before: one [userID, itemID, score] triple per test row
predictions_teste = [
    [user_id, item_id, float(score)]
    for user_id, item_id, score in zip(teste_users, teste_items, teste_scores)
]

In [None]:
# Show only the first triples; the full list has one entry per test row and
# would flood the cell output.
predictions_teste[:10]

Metrics of 1M model

In [None]:
# Load the exported predictions of the 1M model; no cell visible in this notebook
# writes this file, so it has to exist in the current directory already.
all_predictions_1M = pd.read_csv('all_predictions_1M.csv')

In [None]:
# Hard 0/1 label: 1 when the predicted score is above 0.5, otherwise 0.
above_half_1M = all_predictions_1M.prediction > 0.5
all_predictions_1M['binary_prediction'] = above_half_1M.astype(int)

In [None]:
# Preview the 1M prediction table.
all_predictions_1M.head()

In [None]:
# Ranking metrics of the 1M predictions at cut-off TOP_K.
# NOTE(review): `test` is the split built from the 100k ratings earlier in this
# notebook, while these predictions were read from all_predictions_1M.csv -
# confirm that both use the same user / item IDs.
# NOTE(review): this ranks on the raw `prediction` score, whereas the 100k
# evaluation ranks on `binary_prediction`; confirm which one is intended.
metric_kwargs = dict(col_prediction='prediction', k=TOP_K)
eval_map = map_at_k(test, all_predictions_1M, **metric_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions_1M, **metric_kwargs)
eval_precision = precision_at_k(test, all_predictions_1M, **metric_kwargs)
eval_recall = recall_at_k(test, all_predictions_1M, **metric_kwargs)

In [None]:
# Display the four ranking metrics in the order MAP, NDCG, precision, recall.
eval_map, eval_ndcg, eval_precision, eval_recall