In [3]:

import pandas as pd
import numpy as np

In [41]:
# Toy ratings table: one row per observed (user, item, rating) triple.
# "user" and "item" are 0-indexed ids; "rating" is stored as float32.
df_ratings: pd.DataFrame = pd.DataFrame(
    data={
        "user": [0, 0, 1, 1, 2, 3, 3, 4, 4, 5],
        "item": [0, 2, 0, 1, 3, 0, 2, 4, 1, 2],
        "rating": [5, 4, 5, 1, 5, 5, 5, 2, 3, 4],
    }
).astype({"rating": "float32"})

df_ratings.head()

Unnamed: 0,user,item,rating
0,0,0,5.0
1,0,2,4.0
2,1,0,5.0
3,1,1,1.0
4,2,3,5.0


In [42]:
import math
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import SparseTensor
from lib.similarity_measures import *


def buildSparseTensorRatings(df_ratings, dense_shape) -> SparseTensor:
    """
    Build the sparse rating matrix (the matrix to predict) from a ratings table.

    Only the observed (user, item) pairs are stored, so a large and mostly
    empty user x item matrix is represented compactly.

    Args:
      df_ratings: DataFrame with columns "user", "item" and "rating". "user"
        and "item" are used as the row / column positions of each rating.
      dense_shape: [num_users, num_items] shape of the full rating matrix.
    Returns:
      A SparseTensor with int64 indices of shape [num_ratings, 2] and float32
      values of shape [num_ratings].
    """
    # SparseTensor indices are int64. Cast explicitly instead of relying on an
    # implicit conversion: callers in this file pass frames whose id columns
    # were converted to float32.
    indices = df_ratings[["user", "item"]].to_numpy(dtype="int64")
    # The embeddings trained against these ratings are float32, so keep the
    # ratings in the same dtype.
    values = df_ratings["rating"].to_numpy(dtype="float32")
    return SparseTensor(
        indices=indices,
        values=values,
        dense_shape=dense_shape)

def sparse_mean_square_error(user_embeddings, item_embeddings, sparse_ratings):
    """
    Mean squared error computed only over the observed ratings.

    Args:
    user_embeddings: A dense Tensor U of shape [N, k] where k is the embedding
      dimension, such that U_i is the embedding of user i.
    item_embeddings: A dense Tensor V of shape [M, k] where k is the embedding
      dimension, such that V_j is the embedding of item j.
    sparse_ratings: A SparseTensor rating matrix, of dense_shape [N, M]
    Returns:
    A scalar Tensor representing the MSE between the true ratings and the
      model's predictions.
    """
    user_ids = sparse_ratings.indices[:, 0]
    item_ids = sparse_ratings.indices[:, 1]

    # For every observed (user, item) pair, the predicted rating is the dot
    # product of the corresponding user and item embeddings.
    predictions = tf.reduce_sum(
        tf.gather(user_embeddings, user_ids) * tf.gather(item_embeddings, item_ids),
        axis=1)

    # reduce_mean takes the number of ratings from the tensor itself, so this
    # does not depend on the static shape `predictions.shape[0]`, which is
    # None when the number of ratings is unknown at trace time.
    return tf.reduce_mean(tf.square(sparse_ratings.values - predictions))



class CFModel(object):
    """Simple class that represents a collaborative filtering model."""

    def __init__(self, embedding_vars):
        """Initializes a CFModel.

        Args:
          embedding_vars: A dictionary of tf.Variables. `train` reads the
            "user" and "item" entries.
        """
        self._embedding_vars = embedding_vars
        # NumPy snapshots of the variables; filled in at the end of `train`.
        self._embeddings = {k: None for k in embedding_vars}

    @property
    def embeddings(self):
        """The embeddings dictionary (every value is None until `train` has run)."""
        return self._embeddings

    def train(self, tensor_train, num_iterations=20, learning_rate=0.01, optimizer=tf.keras.optimizers.SGD, verbosity=1):
        """Trains the model.

        Args:
          tensor_train: SparseTensor of observed ratings used as training target.
          num_iterations: the loop runs for i = 0..num_iterations, i.e. it
            performs num_iterations + 1 optimizer steps.
          learning_rate: optimizer learning rate.
          optimizer: the optimizer class to use; it is called as
            optimizer(learning_rate=learning_rate). Defaults to
            tf.keras.optimizers.SGD (the TensorFlow 2 counterpart of the
            TensorFlow 1 tf.train.GradientDescentOptimizer).
          verbosity: 0 disables progress output; any other value prints the
            training error about every 10% of the iterations and after the
            last one.
        Returns:
          The tuple (U, V) of trained user and item embedding variables.
        """
        U = self._embedding_vars['user']
        V = self._embedding_vars['item']
        opt = optimizer(learning_rate=learning_rate)
        var_list = [U, V]

        # At least 1, so the modulo below cannot divide by zero when
        # num_iterations is smaller than 10.
        debug_step = max(1, num_iterations // 10)

        for i in range(num_iterations + 1):
            # One gradient step: record the loss on a tape, differentiate it
            # with respect to both embedding tables and apply the update.
            # This only requires `apply_gradients` from the optimizer.
            with tf.GradientTape() as tape:
                loss = sparse_mean_square_error(U, V, tensor_train)
            gradients = tape.gradient(loss, var_list)
            opt.apply_gradients(zip(gradients, var_list))

            if verbosity and (i % debug_step == 0 or i == num_iterations):
                # Reported error is measured after the update of this step.
                print("Training error in iteration %i" % i, sparse_mean_square_error(U, V, tensor_train))

        for k, v in self._embedding_vars.items():
            self._embeddings[k] = v.numpy()

        return U, V

    def candidateGeneration(self, user, k=3):
        """Returns the ids of the `k` best-scoring items for a user.

        Call `train` first: before that the stored embeddings are None.

        Args:
          user: user id, 0-indexed.
          k: maximum number of item ids to return.
        Returns:
          A NumPy array with up to `k` item ids ordered by decreasing score,
          or an empty list when there is no embedding for `user`.
        """
        # A negative integer would silently wrap around to another user's
        # embedding row, so treat it like any other unknown user.
        if isinstance(user, (int, np.integer)) and user < 0:
            return []

        try:
            # Scoring helper is not defined in this file; presumably it comes
            # from the `lib.similarity_measures` star import.
            scores = dot_product_with_norms_controlled(
                self.embeddings["user"][user], self.embeddings["item"].T
            )
        except IndexError:
            # There are no recommendations for that user.
            return []

        ranking = pd.DataFrame({
            "score": list(scores),
            'item': list(range(len(scores)))
        })

        return ranking.sort_values(["score"], ascending=False)["item"].values[0:k]

    def predict(self, U, V):
        """Returns the matrix product U * V^T, one predicted score per (row of U, row of V)."""
        return tf.matmul(U, V, transpose_b=True)



def buildModel(df_ratings:pd.DataFrame, embedding_dim=30, init_stddev=1, num_iterations=500, learning_rate=0.03, verbosity=1, random_state=None):
    """
    Train a collaborative filtering model on a ratings table.

    Args:
      df_ratings: have columns ["user","item","rating"]. "user" and "item" are
        0-indexed ids used as row / column positions of the rating matrix.
      embedding_dim: number of dimensions of the user and item embeddings.
      init_stddev: standard deviation of the normal distribution used to
        initialize the embeddings.
      num_iterations: forwarded to `CFModel.train`.
      learning_rate: forwarded to `CFModel.train`.
      verbosity: forwarded to `CFModel.train`.
      random_state: optional seed for the train/test split (passed to
        `DataFrame.sample`). It does not seed the embedding initialization;
        use `tf.random.set_seed` for that.
    Returns:
      The trained CFModel.
    Raises:
      ValueError: if `df_ratings` has no rows.
    """
    if len(df_ratings) == 0:
        raise ValueError("df_ratings must contain at least one rating")

    num_users = int(df_ratings["user"].max()) + 1
    num_items = int(df_ratings["item"].max()) + 1
    dense_shape = [num_users, num_items]

    # Only the ratings are converted to float32. Converting the id columns as
    # well would corrupt ids above 2**24, which float32 cannot represent exactly.
    ratings = df_ratings.astype({"rating": "float32"})

    # Split the ratings DataFrame into train (80%) and test (20%).
    X_test = ratings.sample(frac=0.2, replace=False, random_state=random_state)
    X_train = ratings[~ratings.index.isin(X_test.index)]

    # SparseTensor representation of the train and test datasets.
    # It is used for optimization.
    tensor_train = buildSparseTensorRatings(X_train, dense_shape)
    tensor_test = buildSparseTensorRatings(X_test, dense_shape)

    # Initialize the embeddings using a normal distribution. stddev controls
    # how spread out the initial values are: the higher, the more dispersed.
    U = tf.Variable(tf.random.normal([num_users, embedding_dim], stddev=init_stddev), dtype="float32")
    V = tf.Variable(tf.random.normal([num_items, embedding_dim], stddev=init_stddev), dtype="float32")

    embeddings = {
      "user": U,
      "item": V
    }

    model = CFModel(embeddings)

    U, V = model.train(tensor_train, num_iterations=num_iterations, learning_rate=learning_rate, verbosity=verbosity)

    print("TEST LOSS", sparse_mean_square_error(U, V, tensor_test))

    return model


In [44]:
# Fit the model on the toy ratings, then show the top candidates for user 1.
model = buildModel(
    df_ratings,
    embedding_dim=30,
    init_stddev=1,
    num_iterations=100,
    learning_rate=0.001,
)

model.candidateGeneration(user=1)

Training error in iteration 0 tf.Tensor(16.976624, shape=(), dtype=float32)
Training error in iteration 10 tf.Tensor(12.1664715, shape=(), dtype=float32)
Training error in iteration 20 tf.Tensor(8.755703, shape=(), dtype=float32)
Training error in iteration 30 tf.Tensor(6.3272185, shape=(), dtype=float32)
Training error in iteration 40 tf.Tensor(4.59147, shape=(), dtype=float32)
Training error in iteration 50 tf.Tensor(3.346208, shape=(), dtype=float32)
Training error in iteration 60 tf.Tensor(2.4494772, shape=(), dtype=float32)
Training error in iteration 70 tf.Tensor(1.8012483, shape=(), dtype=float32)
Training error in iteration 80 tf.Tensor(1.3307941, shape=(), dtype=float32)
Training error in iteration 90 tf.Tensor(0.987953, shape=(), dtype=float32)
Training error in iteration 100 tf.Tensor(0.73703706, shape=(), dtype=float32)
TEST LOSS tf.Tensor(27.920563, shape=(), dtype=float32)


array([2, 0, 1])