# 1 - Install

In [None]:
!pip install merlin-models
!pip install nvtabular
!pip install implicit
!pip install cython

# 2 - Imports

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from google.colab import drive
import os
import shutil
import numpy as np
from numpy.ma import count
import cython
from scipy.sparse import coo_matrix, csr_matrix
from tqdm.auto import tqdm

import nvtabular as nvt
from nvtabular.ops import AddMetadata

from merlin.core.dispatch import get_lib
from merlin.schema.tags import Tags
from merlin.io.dataset import Dataset
from merlin.models.utils.dataset import dataset_to_coo
from merlin.models.implicit import AlternatingLeastSquares

# 3 - Método com recall

In [23]:
from numpy import double


class AlternatingLeastSquaresWithRecall(AlternatingLeastSquares):
  """ALS recommender whose evaluation additionally reports recall@K.

  Extends the ``AlternatingLeastSquares`` class imported from
  ``merlin.models.implicit`` with :meth:`evaluate_recall`, which returns
  precision, MAP, NDCG, AUC and recall for the top-K recommendations.

  ``evaluate_recall`` reads ``self.implicit_model`` and ``self.train_data``;
  both are expected to be provided by the base class (presumably after
  ``fit`` has been called -- confirm against the merlin implementation).
  """

  @staticmethod
  def ranking_metrics_at_k_recall(model, train_user_items, test_user_items, K=10,
                        show_progress=True, num_threads=1):
    """Calculates ranking metrics (including recall) for a trained model.

    Parameters
    ----------
    model : object
        Fitted recommender exposing
        ``recommend(userids, user_items, N=K) -> (ids, scores)`` where ``ids``
        can be indexed as ``ids[row, rank]``.
    train_user_items : csr_matrix
        Sparse user-by-item matrix with the interactions used for training.
        Anything with a ``tocsr()`` method is converted.
    test_user_items : csr_matrix
        Sparse user-by-item matrix with the withheld interactions to test on.
        Anything with a ``tocsr()`` method is converted.
    K : int
        Number of recommendations requested and scored per user.
    show_progress : bool, optional
        Whether to show a progress bar.
    num_threads : int, optional
        Accepted for signature compatibility only; this pure-Python
        implementation does not use it.

    Returns
    -------
    dict
        ``{"precision", "map", "ndcg", "auc", "recall"}`` mapped to floats.
        ``recall`` is the number of held-out items that appear in the top-K
        lists divided by the total number of held-out items, over all
        evaluated users. Only users with at least one held-out item are
        evaluated; if there are none, every metric is ``nan``.
    """
    if not isinstance(train_user_items, csr_matrix):
      train_user_items = train_user_items.tocsr()

    if not isinstance(test_user_items, csr_matrix):
      test_user_items = test_user_items.tocsr()

    users, items = test_user_items.shape

    # Per-rank discount 1 / log2(rank + 1) and its running sum; the running sum
    # at position n - 1 is the ideal DCG for a user with n relevant items.
    cg = 1.0 / np.log2(np.arange(2, K + 2))
    cg_sum = np.cumsum(cg)

    test_indptr = test_user_items.indptr
    test_indices = test_user_items.indices

    # Accumulators over all evaluated users.
    relevant = 0.0      # held-out items found in the top-K lists
    pr_div = 0.0        # sum of min(K, #held-out items): precision denominator
    total = 0.0         # number of evaluated users
    total_recall = 0.0  # sum of #held-out items: recall denominator
    mean_ap = 0.0
    ndcg = 0.0
    mean_auc = 0.0

    batch_size = 1000

    # get an array of userids that have at least one item in the test set
    to_generate = np.arange(users, dtype="int32")
    to_generate = to_generate[np.ediff1d(test_indptr) > 0]

    progress = tqdm(total=len(to_generate), disable=not show_progress)
    try:
      for start_idx in range(0, len(to_generate), batch_size):
        batch = to_generate[start_idx: start_idx + batch_size]
        ids, _ = model.recommend(batch, train_user_items[batch], N=K)

        for batch_idx, u in enumerate(batch):
          held_out = test_indices[test_indptr[u]:test_indptr[u + 1]]
          # A set gives O(1) membership tests for the K lookups below.
          likes = set(held_out.tolist())

          # Always >= 1 because users without held-out items were filtered out.
          num_pos_items = len(held_out)
          num_neg_items = items - num_pos_items
          best_possible_hits = min(K, num_pos_items)

          pr_div += best_possible_hits
          idcg = cg_sum[best_possible_hits - 1]

          ap = 0.0
          hit = 0.0
          miss = 0.0
          auc = 0.0
          for rank in range(K):
            if int(ids[batch_idx, rank]) in likes:
              relevant += 1
              hit += 1
              ap += hit / (rank + 1)
              ndcg += cg[rank] / idcg
            else:
              miss += 1
              auc += hit
          # Negatives that were not recommended are credited as if they were
          # ranked uniformly among the remaining (unseen) positions.
          auc += ((hit + num_pos_items) / 2.0) * (num_neg_items - miss)

          mean_ap += ap / best_possible_hits
          # AUC is undefined for a user whose held-out set covers the whole
          # catalogue (no negatives); such a user contributes 0 instead of
          # dividing by zero.
          if num_neg_items > 0:
            mean_auc += auc / (num_pos_items * num_neg_items)
          total += 1
          total_recall += num_pos_items

        progress.update(len(batch))
    finally:
      # Close the bar even when recommend() or the scoring loop raises.
      progress.close()

    def _ratio(numerator, denominator):
      # With no evaluable users every denominator is 0 and the metric is undefined.
      return float(numerator / denominator) if denominator else float("nan")

    return {
      "precision": _ratio(relevant, pr_div),
      "map": _ratio(mean_ap, total),
      "ndcg": _ratio(ndcg, total),
      "auc": _ratio(mean_auc, total),
      # Fixed: recall is hits over held-out items, not the AUC sum over that count.
      "recall": _ratio(relevant, total_recall),
    }

  def evaluate_recall(self, test_dataset: Dataset, k=10):
    """Evaluates the model, including recall.

    Converts the dataset to a sparse user-by-item matrix and scores the
    wrapped ``implicit_model`` against it with
    :meth:`ranking_metrics_at_k_recall`.

    Parameters
    ----------
    test_dataset : merlin.io.Dataset
        The validation dataset to evaluate
    k : int
        How many items to return per prediction. By default this method will
        return metrics like 'map@10' , but by increasing k you can generate
        different versions

    Returns
    -------
    dict
        ``{f"{metric}@{k}": value}`` for precision, map, ndcg, auc and recall.
    """
    test = dataset_to_coo(test_dataset).tocsr()
    ret = self.ranking_metrics_at_k_recall(
        self.implicit_model,
        self.train_data,
        test,
        K=k,
    )
    return {metric + f"@{k}": value for metric, value in ret.items()}

# 4 - Diretórios

## 4.1 - Remover diretórios

In [4]:
# Start from a clean working directory: drop the Parquet files and the
# transformed-output directories that later cells of this notebook write.
for stale_file in ('train.parquet', 'test.parquet'):
  if os.path.exists(stale_file):
    os.remove(stale_file)
for stale_dir in ('train', 'test'):
  # ignore_errors=True turns a missing directory into a no-op.
  shutil.rmtree(stale_dir, ignore_errors=True)

## 4.2 - Google Drive


In [None]:
# Mount Google Drive under /content/drive/ (`drive` comes from google.colab,
# so this cell only works inside Colab).
drive.mount(r'/content/drive/')
# List the contents of the dataset folder on the mounted drive.
!ls "/content/drive/MyDrive/Dataset"

# 5 - Preliminaries



In [6]:
# Column names; user/item/rating are passed as `names=` when the CSVs are read.
COL_USER = "user_id"
COL_ITEM = "item_id"
COL_RATING = "rating"
# NOTE(review): the timestamp and prediction column names are not referenced
# by any cell visible in this notebook.
COL_TIMESTAMP = "timestamp"
COL_PREDICTION = "prediction"
# Presumably the cut-off K for the top-K recommendations / @K metrics --
# confirm it is actually passed wherever K matters.
TOP_K = 10
# Folder on the mounted Google Drive that holds the train/test CSV files.
INPUT_DATA_DIR = '/content/drive/MyDrive/Dataset/'
# DataFrame library selected by merlin (presumably cudf when a GPU is
# available, pandas otherwise -- confirm); used below for read_csv.
df_lib = get_lib()

# 6 - Dataset

In [7]:
# Read the train and test splits; the three column names are supplied
# explicitly through `names=`.
columns = [COL_USER, COL_ITEM, COL_RATING]
train_csv_path = os.path.join(INPUT_DATA_DIR, "SMDI-500E_train.csv")
test_csv_path = os.path.join(INPUT_DATA_DIR, "SMDI-500E_test.csv")
train = df_lib.read_csv(train_csv_path, names=columns)
test = df_lib.read_csv(test_csv_path, names=columns)

In [8]:
# Persist both splits as Parquet files in the working directory; the next
# cell loads them back as merlin Datasets.
train.to_parquet("train.parquet")
test.to_parquet("test.parquet")

In [None]:
# Wrap the Parquet files as merlin Datasets. cpu=True presumably requests
# CPU-backed dataframes instead of GPU ones -- confirm against merlin.io.Dataset.
train_dataset = Dataset("train.parquet", cpu=True)
test_dataset = Dataset("test.parquet", cpu=True)

In [10]:
# Tag each column so downstream merlin code can tell the user id, item id and
# target apart. The columns are selected through the same COL_* constants that
# name them when the CSVs are read, so the two cannot drift apart.
user_id = [COL_USER] >> AddMetadata(tags=[Tags.USER_ID, "user_id"])
item_id = [COL_ITEM] >> AddMetadata(tags=[Tags.ITEM_ID, "item_id"])
rating = [COL_RATING] >> AddMetadata(tags=[Tags.TARGET])

In [None]:
# Build one workflow from the three tagged column groups.
workflow = nvt.Workflow(user_id + item_id + rating)
# Fit on the training split only; the same fitted workflow is then applied to
# both splits.
workflow.fit(train_dataset)
# Write the transformed splits into the 'train' and 'test' directories, which
# the next cell reads back with a *.parquet glob.
workflow.transform(train_dataset).to_parquet("train")
workflow.transform(test_dataset).to_parquet("test")

In [None]:
# Reload the transformed splits from the Parquet parts written under train/
# and test/ by the workflow cell.
train_transformed = Dataset("train/*.parquet", cpu=True)
test_transformed = Dataset("test/*.parquet", cpu=True)

# 7 - Treinando

In [None]:
# Train the recall-aware ALS model on the transformed training split (CPU only).
# NOTE(review): factors=1 presumably means a single latent factor, which is an
# unusually small model -- confirm this is intended and not a leftover.
# NOTE(review): no random seed is passed, so results may differ between runs.
als = AlternatingLeastSquaresWithRecall(
    regularization=0.01, 
    iterations=500, 
    factors=1,
    use_gpu=False
  )
als.fit(train_transformed)

# 8 - Avaliando

In [None]:
# Score the held-out split at the configured cut-off; TOP_K is passed
# explicitly so the metric names (e.g. 'recall@10') follow the notebook-level
# setting instead of the method default.
implicit_metrics = als.evaluate_recall(test_transformed, k=TOP_K)
implicit_preds = als.predict(test_transformed)
# Show the metrics as the cell output; they were previously computed but never displayed.
implicit_metrics

## 8.1 - Teste usuário 17

In [None]:
# Restrict the test split to user 17 and predict for that user only.
# The variable is deliberately not called `filter`: that name would shadow
# Python's builtin filter() for the rest of the kernel session.
user_filters = [("user_id", "in", [17])]
user_predicted = als.predict(Dataset("test/*.parquet", cpu=True, filters=user_filters))

In [None]:
# First element of the prediction result; its first row is used as the
# recommended item ids in the summary table at the end of the notebook.
user_predicted[0]

In [None]:
# Second element of the prediction result; its first row is used as the
# 'rating' (score) column in the summary table at the end of the notebook.
user_predicted[1]

In [56]:
# NOTE(review): this import lives in the last cell; moving it to the imports
# cell at the top would keep all dependencies in one place.
import pandas as pd
# Tabulate the recommendations for user 17: row 0 of the first returned array
# supplies the item ids and row 0 of the second supplies their scores. The
# scalar 17 is broadcast to every row by pandas.
pd.DataFrame({'user_id': 17, 'item_id': user_predicted[0][0], 'rating' : user_predicted[1][0]})

Unnamed: 0,user_id,item_id,rating
0,17,132,0.950725
1,17,64,0.745661
2,17,55,0.719344
3,17,53,0.654296
4,17,65,0.606072
5,17,123,0.586571
6,17,2324,0.568952
7,17,808,0.564506
8,17,2987,0.460434
9,17,24,0.428644
