In [1]:
# hide
import datetime
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import sparse
from sklearn.model_selection import train_test_split

2023-04-20 17:49:28.605234: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-20 17:49:28.606099: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-20 17:49:28.624201: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-20 17:49:28.624649: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# hide
print(f"Tensorflow version: {tf.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")

Tensorflow version: 2.12.0
Pandas version: 1.5.2
Numpy version: 1.23.5


In [3]:
# hide
TOP_K = 5
N_EPOCHS = 10

In [4]:
from typing import List


def wide_to_long(wide: np.array, possible_ratings: List[int]) -> np.array:
    """Go from wide table to long.
    :param wide: wide array with user-item interactions
    :param possible_ratings: list of possible ratings that we may have."""

    def _get_ratings(arr: np.array, rating: int) -> np.array:
        """Generate long array for the rating provided
        :param arr: wide array with user-item interactions
        :param rating: the rating that we are interested"""
        idx = np.where(arr == rating)
        return np.vstack(
            (idx[0], idx[1], np.ones(idx[0].size, dtype="int8") * rating)
        ).T

    long_arrays = []
    for r in possible_ratings:
        long_arrays.append(_get_ratings(wide, r))

    return np.vstack(long_arrays)

In [5]:
dataset = pd.read_csv('../data/dataset.csv')
X, y = dataset.drop(columns=["interaction"]), dataset.interaction
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42
                                                    )
df_train = X_train.join(y_train).astype(int)
df_test = X_test.join(y_test).astype(int)

In [6]:
len(df_train.item_id.unique())

109

In [7]:
# hide_input
print("Only positive interactions:")
df_train[df_train["interaction"] > 0].head()

Only positive interactions:


Unnamed: 0,user_id,item_id,interaction
613,13,9,1
6255,30,318,1
8127,27,402,1
3027,27,99,1
2653,28,85,1


# The model (Neural Collaborative Filtering)

<center><img src="https://raw.githubusercontent.com/murilo-cunha/inteligencia-superficial/master/images/2020-09-11-neural_collaborative_filter/ncf_all_with_alpha.png" width="70%" url="https://developers.google.com/machine-learning/recommendation/collaborative/basics" description="Fonte: https://developers.google.com/machine-learning/recommendation/collaborative/basics" /> </center>

In [8]:
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Concatenate,
    Dense,
    Embedding,
    Flatten,
    Input,
    Multiply,
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2


def create_ncf(
    number_of_users: int,
    number_of_items: int,
    latent_dim_mf: int = 4,
    latent_dim_mlp: int = 32,
    reg_mf: int = 0,
    reg_mlp: int = 0.01,
    dense_layers: List[int] = [8, 4],
    reg_layers: List[int] = [0.01, 0.01],
    activation_dense: str = "relu",
) -> keras.Model:

    # input layer
    user = Input(shape=(), dtype="int32", name="user_id")
    item = Input(shape=(), dtype="int32", name="item_id")

    # embedding layers
    mf_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mf,
        name="mf_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )
    mf_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mf,
        name="mf_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )

    mlp_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mlp,
        name="mlp_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )
    mlp_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mlp,
        name="mlp_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )

    # MF vector
    mf_user_latent = Flatten()(mf_user_embedding(user))
    mf_item_latent = Flatten()(mf_item_embedding(item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP vector
    mlp_user_latent = Flatten()(mlp_user_embedding(user))
    mlp_item_latent = Flatten()(mlp_item_embedding(item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])

    mlp_vector = mlp_cat_latent

    # build dense layers for model
    for i in range(len(dense_layers)):
        layer = Dense(
            dense_layers[i],
            activity_regularizer=l2(reg_layers[i]),
            activation=activation_dense,
            name="layer%d" % i,
        )
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    result = Dense(
        1, activation="sigmoid", kernel_initializer="lecun_uniform", name="interaction"
    )

    output = result(predict_layer)

    model = Model(
        inputs=[user, item],
        outputs=[output],
    )

    return model

In [16]:
pd.pivot_table(dataset, values='interaction', index='user_id', columns='item_id').fillna(0).shape

(75, 109)

In [17]:
# collapse
from tensorflow.keras.optimizers import Adam

n_users, n_items = pd.pivot_table(dataset, values='interaction', index='user_id', columns='item_id').fillna(0).shape
ncf_model = create_ncf(n_users, n_items)

ncf_model.compile(
    optimizer=Adam(),
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.TruePositives(name="tp"),
        tf.keras.metrics.FalsePositives(name="fp"),
        tf.keras.metrics.TrueNegatives(name="tn"),
        tf.keras.metrics.FalseNegatives(name="fn"),
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc"),
    ],
)
ncf_model._name = "neural_collaborative_filtering"
ncf_model.summary()

Model: "neural_collaborative_filtering"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_id (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 item_id (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 32)          2400        ['user_id[0][0]']                
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 32)          3488        ['item_id[0][0]']                
                                                                     

In [23]:
def make_tf_dataset(
    df: pd.DataFrame,
    targets: List[str],
    val_split: float = 0.0,
    batch_size: int = 32,
    seed=42,
):
    """Make TensorFlow dataset from Pandas DataFrame.
    :param df: input DataFrame - only contains features and target(s)
    :param targets: list of columns names corresponding to targets
    :param val_split: fraction of the data that should be used for validation
    :param batch_size: batch size for training
    :param seed: random seed for shuffling data - `None` won't shuffle the data"""

    n_val = round(df.shape[0] * val_split)
    if seed:
        # shuffle all the rows
        x = df.sample(frac=1, random_state=seed).to_dict("series")
    else:
        x = df.to_dict("series")
    y = dict()
    for t in targets:
        y[t] = x.pop(t)
    ds = tf.data.Dataset.from_tensor_slices((x, y))

    ds_val = ds.take(n_val).batch(batch_size)
    ds_train = ds.skip(n_val).batch(batch_size)
    return ds_train, ds_val

In [24]:
# create train and validation datasets
ds_train, ds_val = make_tf_dataset(df_train, ["interaction"])

In [25]:
iterator = iter(ds_train)
iterator.get_next()

2023-04-20 17:56:12.382256: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int64 and shape [5477]
	 [[{{node Placeholder/_2}}]]


({'user_id': <tf.Tensor: shape=(32,), dtype=int64, numpy=
  array([ 5, 35, 21, 50,  4, 32, 42, 65, 63, 66, 37, 42, 48, 39, 74,  2, 56,
         37, 21, 13, 33, 54, 40, 51, 15,  4, 32, 14, 30, 51, 27,  7])>,
  'item_id': <tf.Tensor: shape=(32,), dtype=int64, numpy=
  array([327,   5, 132, 147, 255, 364,  99, 259, 246,  78, 364, 363, 220,
           2,   5, 387,   8,  20, 247, 272, 402,  78, 365, 382,  59, 336,
         341,   6, 153, 259,  21, 143])>},
 {'interaction': <tf.Tensor: shape=(32,), dtype=int64, numpy=
  array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>})

In [27]:
%%time
# define logs and callbacks
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=0
)

train_hist = ncf_model.fit(
    ds_train,
    # validation_data=ds_val,
    epochs=N_EPOCHS,
    callbacks=[tensorboard_callback, early_stopping_callback],
    verbose=1,
)

Epoch 1/10


2023-04-20 17:56:17.192887: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: indices[0] = 327 is not in [0, 109)
	 [[{{node neural_collaborative_filtering/mlp_item_embedding/embedding_lookup}}]]


InvalidArgumentError: Graph execution error:

Detected at node 'neural_collaborative_filtering/mlp_item_embedding/embedding_lookup' defined at (most recent call last):
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/traitlets/config/application.py", line 992, in launch_instance
      app.start()
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 711, in start
      self.io_loop.start()
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
      handle._run()
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 530, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_24348/4140083700.py", line 1, in <module>
      get_ipython().run_cell_magic('time', '', '# define logs and callbacks\nlogdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))\ntensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)\nearly_stopping_callback = tf.keras.callbacks.EarlyStopping(\n    monitor="val_loss", patience=0\n)\n\ntrain_hist = ncf_model.fit(\n    ds_train,\n    validation_data=ds_val,\n    epochs=N_EPOCHS,\n    callbacks=[tensorboard_callback, early_stopping_callback],\n    verbose=1,\n)\n')
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2475, in run_cell_magic
      result = fn(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/IPython/core/magics/execution.py", line 1325, in time
      exec(code, glob, local_ns)
    File "<timed exec>", line 8, in <module>
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/training.py", line 1050, in train_step
      y_pred = self(x, training=True)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/training.py", line 558, in __call__
      return super().__call__(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/functional.py", line 512, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/functional.py", line 669, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/proton/anaconda3/envs/proton/lib/python3.10/site-packages/keras/layers/core/embedding.py", line 272, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'neural_collaborative_filtering/mlp_item_embedding/embedding_lookup'
indices[0] = 327 is not in [0, 109)
	 [[{{node neural_collaborative_filtering/mlp_item_embedding/embedding_lookup}}]] [Op:__inference_train_function_4017]

In [None]:
# long_test = wide_to_long(data["train"], unique_ratings)
# df_test = pd.DataFrame(long_test, columns=["user_id", "item_id", "interaction"])
ds_test, _ = make_tf_dataset(df_test, ["interaction"], val_split=0, seed=None)

In [None]:
%%time
ncf_predictions = ncf_model.predict(ds_test)
df_test["ncf_predictions"] = ncf_predictions

CPU times: user 53.3 ms, sys: 4.04 ms, total: 57.4 ms
Wall time: 57.7 ms


In [None]:
# hide_input
df_test.head()

Unnamed: 0,user_id,item_id,interaction,ncf_predictions
4332,57,202,0,0.326148
5696,71,284,0,0.323948
3611,11,143,0,0.316193
6868,43,347,0,0.339022
5165,65,255,0,0.335409


In [None]:
# hide
# sanity checks
# stop execution if low standard deviation (all recommendations are the same)
std = df_test.describe().loc["std", "ncf_predictions"]
if std < 0.01:
    raise ValueError("Model predictions have standard deviation of less than 1e-2.")

In [None]:
# collapse
df_test["ncf_predictions"] = df_test.pivot(
    index="user_id", columns="item_id", values="ncf_predictions"
).values
print("Neural collaborative filtering predictions")
print(df_test["ncf_predictions"][:10, :4])

Neural collaborative filtering predictions
[[0.33595034 0.33798942        nan        nan]
 [       nan        nan 0.33383664        nan]
 [       nan        nan        nan 0.35074568]
 [       nan        nan        nan 0.34153646]
 [       nan 0.3360399         nan        nan]
 [       nan 0.32989767        nan 0.33652166]
 [       nan        nan 0.3322635  0.34178746]
 [       nan        nan 0.32652438        nan]
 [0.33181915        nan        nan 0.3418683 ]
 [       nan 0.3389582         nan        nan]]


In [None]:
df_test["interaction"].shape

(2698,)

In [None]:
precision_ncf = tf.keras.metrics.Precision(top_k=TOP_K)
recall_ncf = tf.keras.metrics.Recall(top_k=TOP_K)

precision_ncf.update_state(df_test["interaction"], df_test["ncf_predictions"])
recall_ncf.update_state(df_test["interaction"], df_test["ncf_predictions"])
print(
    f"At K = {TOP_K}, we have a precision of {precision_ncf.result().numpy():.5f}",
    f"and a recall of {recall_ncf.result().numpy():.5f}",
)

At K = 5, we have a precision of 0.20000 and a recall of 0.00592
