# 02-02 : LSTM with custom metrics

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

import tensorflow as tf
import tensorflow_recommenders as tfrs

from keras import layers
from keras import losses
from keras import optimizers
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

2024-03-04 14:05:48.636366: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-04 14:05:48.636391: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-04 14:05:48.637148: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-04 14:05:48.641050: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. The dataset

We will use the RetailRocket source dataset as prepared for the GRU4Rec paper:
https://github.com/JohnnyFoulds/GRU4Rec/blob/master/notebooks/01_%20preprocess/01-02_retailrocket.ipynb

The dataset is already split into training, validation, and test sets as tab separated files. The columns are:

- `SessionId` - the id of the session. In one session there are one or many items.
- `ItemId` - the id of the item.
- `Time` - the event time.

In [2]:
data_path = '../../data/RetailRocket'
model_path = '../../models/RetailRocket'

# file paths for the data files
train_path = f'{data_path}/retailrocket_processed_view_train_tr.tsv'
validation_path = f'{data_path}/retailrocket_processed_view_train_valid.tsv'
test_path = f'{data_path}/retailrocket_processed_view_test.tsv'

In [3]:
# load the datasets
df_train = pd.read_csv(train_path, sep='\t').sample(frac=0.3, random_state=42)
df_validation = pd.read_csv(validation_path, sep='\t')
df_test = pd.read_csv(test_path, sep='\t')

In [4]:
# head of the training set
display(df_train.head())

Unnamed: 0,SessionId,ItemId,Time
150593,363804,451942,1433134557529
700326,1684469,441756,1440134153810
673447,1615632,357925,1434672250853
48855,117260,2129,1435772219980
515183,1231963,7804,1432004462904


## 2. Preparing the dataset

In [5]:
# convert the items ids to strings for tokenization
df_train['ItemId'] = df_train['ItemId'].astype(str)
df_validation['ItemId'] = df_validation['ItemId'].astype(str)
df_test['ItemId'] = df_test['ItemId'].astype(str)

### 2.1 Sequence Creation

The first step involves creating sequences of item interactions for each session. This requires grouping the data by SessionId and ordering it within each group based on the Time column. Each sequence represents a series of item interactions within a session.

In [6]:
# Sort by SessionId and Time to ensure the order is correct
df_train_sorted = df_train.sort_values(by=['SessionId', 'Time'])
df_validation_sorted = df_validation.sort_values(by=['SessionId', 'Time'])
df_test_sorted = df_test.sort_values(by=['SessionId', 'Time'])

# Create sequences of ItemIds grouped by SessionId
train_sequences = df_train_sorted.groupby('SessionId')['ItemId'].apply(list)
validation_sequences = df_validation_sorted.groupby('SessionId')['ItemId'].apply(list)
test_sequences = df_test_sorted.groupby('SessionId')['ItemId'].apply(list)

In [7]:
# drop the sessions with only one item
train_sequences = train_sequences[train_sequences.map(len) > 1]

### 2.2 Tokenization (Categorical Features Encoding)

We need to ensure that ItemIds are treated as categorical inputs.

Create a tokenizer to encode ItemIds as integers, 0 and 1 are special values, where 0 should be for padding and 1 for out of vocabulary items.

In [8]:
# get a list of the unique item ids across all datasets
unique_items = pd.concat([df_train, df_validation, df_test]).ItemId.unique()

# use keras to map the item ids to a sequential list of integer values,
# 0 should be for padding and 1 for out of vocabulary items
tokenizer = Tokenizer(num_words=len(unique_items) + 2, oov_token=1)
tokenizer.fit_on_texts(unique_items)

# save the tokenizer
tokenizer_path = f'{model_path}/item_id_tokenizer.json'
with open(tokenizer_path, 'w') as file:
    file.write(tokenizer.to_json())

In [9]:
# tokenize the sequences
train_sequences_tokenized = tokenizer.texts_to_sequences(train_sequences)
validation_sequences_tokenized = tokenizer.texts_to_sequences(validation_sequences)
test_sequences_tokenized = tokenizer.texts_to_sequences(test_sequences)

### 2.3 Padding

To handle sessions of varying lengths, we'll need to pad the sequences so that they all have the same length, making them suitable for batch processing.

In [10]:
# use the last item as the target and the rest as the input
def split_input_target(sequence):
    return sequence[:-1], sequence[-1]

train_sequences_input = list(map(split_input_target, train_sequences_tokenized))
validation_sequences_input = list(map(split_input_target, validation_sequences_tokenized))
test_sequences_input = list(map(split_input_target, test_sequences_tokenized))

In [11]:
# separate into input and target arrays
train_input, y_train = map(list, zip(*train_sequences_input))
validation_input, y_validation = map(list, zip(*validation_sequences_input))
test_input, y_test = map(list, zip(*test_sequences_input))

In [12]:
# Determine the maximum sequence length for padding
max_sequence_length = 10

# pad the sequences
X_train = pad_sequences(train_input, maxlen=max_sequence_length, padding='pre')
X_validation = pad_sequences(validation_input, maxlen=max_sequence_length, padding='pre')
X_test = pad_sequences(test_input, maxlen=max_sequence_length, padding='pre')

### 2.4 Target Encoding

The target values needs to be one-hot encoded.

In [13]:
# get the number of classes from the tokenizer
num_classes = len(tokenizer.word_index) + 1

In [14]:
# convert y values to categorical
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_validation_categorical = to_categorical(y_validation, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)

## 3. Model Architecture

In [15]:
# Define the model architecture
model = Sequential([
    layers.Embedding(input_dim=num_classes, output_dim=64), # Set input_dim to the number of unique items
    layers.LSTM(128),
    layers.Dense(units=num_classes, activation='softmax') # Set the number of units to the number of unique items
])

2024-03-04 14:05:51.783069: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-04 14:05:51.808764: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-04 14:05:51.808955: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [16]:
# Custom Recall@k metric
def recall_at_k(y_true, y_pred, k):
    y_true_reshaped = tf.reshape(y_true, (len(y_true), -1))
    _, top_k_indices = tf.nn.top_k(y_pred, k)
    y_true_one_hot = tf.one_hot(tf.cast(y_true_reshaped, tf.int32), depth=y_pred.shape[1])
    top_k_preds = tf.batch_gather(y_true_one_hot, top_k_indices)
    recall_score = tf.math.reduce_any(tf.cast(top_k_preds, tf.bool), axis=-1)
    return tf.reduce_mean(tf.cast(recall_score, tf.float32))

# Custom MRR@k metric
def mrr_at_k(y_true, y_pred, k):
    y_true_reshaped = tf.reshape(y_true, (len(y_true), -1))
    _, top_k_indices = tf.nn.top_k(y_pred, k)
    true_indices = tf.where(tf.equal(top_k_indices, tf.cast(y_true_reshaped, tf.int64)))
    reciprocal_ranks = 1 / (tf.cast(true_indices[:, -1], tf.float32) + 1)
    valid_ranks_mask = tf.less(true_indices[:, -1], k)
    valid_reciprocal_ranks = tf.boolean_mask(reciprocal_ranks, valid_ranks_mask)
    return tf.reduce_mean(valid_reciprocal_ranks)

In [17]:
optimizer = optimizers.Adam()
loss_object = losses.CategoricalCrossentropy()

@tf.function
def train_step(model, x, y):
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = loss_object(y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    recall = recall_at_k(y, predictions, k=5)  # Example for k=5
    mrr = mrr_at_k(y, predictions, k=5)
    return loss, recall, mrr

In [18]:
# Convert X_train and y_train_categorical to a TensorFlow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train_categorical))
train_dataset = train_dataset.batch(16)  # Example batch size, adjust as necessary

2024-03-04 14:06:08.080199: W external/local_tsl/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 6.28GiB (rounded to 6739148800)requested by op _EagerConst
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-03-04 14:06:08.080229: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-03-04 14:06:08.080244: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 12, Chunks in use: 12. 3.0KiB allocated for chunks. 3.0KiB in use in bin. 52B client-requested in use in bin.
2024-03-04 14:06:08.080254: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2024-03-04 14:06:08.080264: I external/l

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [None]:
# Example training loop (adjust according to your dataset)
epochs = 10
for epoch in range(epochs):
    # Assuming train_dataset is a tf.data.Dataset object prepared by you
    # This will iterate over batches (x_batch, y_batch)
    for x_batch, y_batch in train_dataset:
        loss, recall, mrr = train_step(model, x_batch, y_batch)
        print(f"Epoch {epoch}, Loss: {loss.numpy()}, Recall@5: {recall.numpy()}, MRR@5: {mrr.numpy()}")