In [None]:
# Copyright 2022 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

Uncomment and run the cell below once in a new TF 22.09 container.

In [None]:
# %%bash

# cd /models && git pull && pip install .
# cd /nvtabular && git pull && pip install .
# cd /core && git pull && pip install .
# cd /systems && git pull && pip install .

# pip install tensorflow
# pip install transformers==4.21

In [None]:
# I do not need this on my machine, but can be helpful if you encounter issues

# import os
# os.environ["FORCE_TF_AVAILABLE"]="True"

<img src="https://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_models_entertainment-with-pretrained-embeddings/nvidia_logo.png" style="width: 90px; float: right;">

# Transformer-based architecture for next-item prediction task

## Overview

In this use case we will train a Transformer-based architecture for the next-item prediction task.

We will use the [booking.com dataset](https://github.com/bookingcom/ml-dataset-mdt) to train a session-based model. The dataset contains 1,166,835 anonymized hotel reservations in the train set and 378,667 in the test set. Each reservation is a part of a customer's trip (identified by `utrip_id`) which includes consecutive reservations.

We will reshape the data to organize it into 'sessions'. Each session will be a full customer itinerary in chronological order. The goal will be to predict the city_id of the final reservation of each trip.


### Learning objectives

- Training a Transformer-based architecture for next-item prediction task

## Downloading and preparing the dataset

You can download the full dataset from GitHub [here](https://github.com/bookingcom/ml-dataset-mdt). Please place it alongside this notebook (or alternatively, change the `DATAPATH` to point to where it is located).

In [None]:
from merlin.core.dispatch import get_lib
import numpy as np

# Folder holding the extracted booking.com dataset, relative to this notebook.
DATAPATH = 'ml-dataset-mdt'

# Read the training CSV with the dataframe module returned by merlin's
# get_lib() (presumably cuDF when a GPU is available, pandas otherwise),
# parsing the check-in column as dates.
df_lib = get_lib()
train_csv_path = f'{DATAPATH}/train_set.csv'
itineraries = df_lib.read_csv(train_csv_path, parse_dates=['checkin'])

Each reservation belongs to a trip identified by its `utrip_id`. During each trip a customer visits several destinations.

In [None]:
# Preview the first rows of the raw reservations table.
itineraries.head()

We will limit the sequence length to between 2 and 10 stops per trip. That will capture upwards of 95% of the datapoints!

We don't want to train on trips that are shorter than two hops -- our model would not be able to learn much from such sequences. Additionally, such short sequences are uncharacteristically short for this dataset.

Besides, training on unusually long or short sequences, that are far outside of the most common sequence length, might not be the best use of our compute resources.

In [None]:
# Upper bound on stops kept per trip; used as the ListSlice end index in the
# preprocessing workflow.
MAX_TRIP_LENGTH = 10
# Intended lower bound on stops per trip.
# NOTE(review): no cell visible in this notebook references MIN_TRIP_LENGTH --
# confirm whether a minimum-length filter is missing from the workflow.
MIN_TRIP_LENGTH = 2

Let us now split the data into a train and validation set based on trip ID.

In [None]:
# Shuffle the distinct trip IDs so the train/validation split is random.
# - drop_duplicates() keeps the result a Series on both cuDF and pandas
#   (pandas' Series.unique() returns a NumPy array, which has no .sample()).
# - A fixed random_state makes the split reproducible across kernel restarts.
# - reset_index(drop=True) gives a clean 0..n-1 index for later slicing.
SPLIT_SEED = 42
utrip_ids = (
    itineraries.utrip_id
    .drop_duplicates()
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)
len(utrip_ids)

In [None]:
# The first 160,000 shuffled trip IDs form the training split; the remainder
# is held out for validation.
N_TRAIN_TRIPS = 160_000
train_set_utrip_ids = utrip_ids[:N_TRAIN_TRIPS]
validation_set_utrip_ids = utrip_ids[N_TRAIN_TRIPS:]

# Route every reservation to the split that owns its trip ID.
in_train = itineraries.utrip_id.isin(train_set_utrip_ids)
in_validation = itineraries.utrip_id.isin(validation_set_utrip_ids)
train_set = itineraries[in_train]
validation_set = itineraries[in_validation]

We can now begin with data preprocessing.

We will combine trips into "sessions", discard trips that are either too short or too long and calculate total trip length in stops.

We will use nvtabular for this work. It offers optimized tabular data preprocessing operators that run on the GPU. If you would like to learn more about this software library, please take a look [here](https://github.com/NVIDIA-Merlin/NVTabular).

In [None]:
from nvtabular import *
from nvtabular import ops
from merlin.models.tf import Loader

from merlin.schema.tags import Tags

In [None]:
# Wrap each split in a Dataset (brought in by the nvtabular star import) so it
# can be passed to the workflow.
train_set_dataset = Dataset(train_set)
validation_set_dataset = Dataset(validation_set)

In [None]:
# Collapse the reservations of each trip (utrip_id) into a single row:
# per-trip lists of city_id and booker_country sorted by check-in, plus the
# number of city_id entries in the trip.
trip_source_columns = ['city_id', 'booker_country', 'utrip_id', 'checkin']
trip_aggregations = {
    'city_id': ['list', 'count'],
    'booker_country': ['list'],
}
groupby_features = trip_source_columns >> ops.Groupby(
    groupby_cols=['utrip_id'],
    sort_cols=['checkin'],
    aggs=trip_aggregations,
)

# City sequence: categorify, slice to [0, MAX_TRIP_LENGTH) with padding, and
# tag as the item-id sequence.
groupby_features_truncated_city = (
    groupby_features['city_id_list']
    >> ops.Categorify()
    >> ops.ListSlice(0, MAX_TRIP_LENGTH, pad=True)
    >> ops.AddTags([Tags.SEQUENCE, Tags.ITEM, Tags.ITEM_ID])
)

# Country sequence: same treatment, tagged as a sequential item feature.
groupby_features_truncated_country = (
    groupby_features['booker_country_list']
    >> ops.Categorify()
    >> ops.ListSlice(0, MAX_TRIP_LENGTH, pad=True)
    >> ops.AddTags([Tags.SEQUENCE, Tags.ITEM])
)

# Per-trip count, tagged as a continuous context feature.
city_id_count = (
    groupby_features['city_id_count']
    >> ops.AddTags([Tags.CONTEXT, Tags.ITEM, Tags.CONTINUOUS])
)

In [None]:
# Combine the three output column groups into a single workflow.
wf = Workflow(groupby_features_truncated_city + groupby_features_truncated_country + city_id_count)

In [None]:
# Fit the workflow statistics (e.g. the Categorify vocabularies) on the
# training split only, then apply the already-fitted workflow to validation.
# Calling fit_transform on the validation data would re-fit the workflow and
# re-learn the category encodings from validation rows, so its integer ids
# would no longer line up with the ones the model is trained on.
train_set_processed = wf.fit_transform(train_set_dataset)
validation_set_processed = wf.transform(validation_set_dataset)

Our data consists of a sequence of visited `city_ids`, a sequence of `booker_countries` (represented as integer categories) and a `city_id_count` column (which contains the count of visited cities in a trip).

In [None]:
# Materialize the processed training data and preview its first rows.
train_set_processed.compute().head()

We are now ready to train our model.

In [None]:
import merlin.models.tf as mm

Let's identify two schemas. The first one for sequential features, the other for context features (`city_id_count`) that we will broadcast to the entire sequence.

In [None]:
# Columns tagged SEQUENCE by the workflow (the per-trip list columns).
seq_schema = train_set_processed.schema.select_by_tag(Tags.SEQUENCE)
# Columns tagged CONTEXT by the workflow (`city_id_count`).
context_schema = train_set_processed.schema.select_by_tag(Tags.CONTEXT)

Let's also identify the target column.

In [None]:
# Pick the target through its ITEM_ID tag (attached to the city sequence in
# the workflow) rather than assuming it is the first SEQUENCE column, so the
# choice does not depend on column ordering.
target = train_set_processed.schema.select_by_tag(Tags.ITEM_ID).column_names[0]

In [None]:
# Materialize and display the processed training frame.
# NOTE(review): unlike the earlier preview this has no .head(), so the whole
# frame is computed and handed to the display machinery.
train_set_processed.compute()

In [None]:
# Inspect the SEQUENCE-tagged part of the processed schema.
train_set_processed.schema.select_by_tag(Tags.SEQUENCE)

In [None]:
# Inspect the CONTEXT-tagged part of the processed schema.
train_set_processed.schema.select_by_tag(Tags.CONTEXT)

In [None]:
# Training data loader over the processed training set (batch_size=1024,
# shuffling enabled).
loader = Loader(train_set_processed, batch_size=1024, shuffle=True)

In [None]:
# next(iter(loader))

In [None]:
from merlin.models.tf.transforms.features import BroadcastToSequence

In [None]:
# Display the context schema selected above.
context_schema

In [None]:
# Display the sequence schema selected above.
seq_schema

## Without broadcasting of context features

In [None]:
# Baseline model: only the sequential features (seq_schema) are used as input.
categorical_schema = train_set_processed.schema.select_by_tag(Tags.CATEGORICAL)
target_schema = train_set_processed.schema.select_by_name(target)

# Embeddings for the categorical columns with sequence_combiner=None
# (presumably keeps one embedding per sequence position -- confirm).
sequence_inputs = mm.InputBlockV2(
    seq_schema,
    embeddings=mm.Embeddings(categorical_schema, sequence_combiner=None),
)

# Two-layer, four-head GPT-2 block with d_model=40; masked positions are
# handled by ReplaceMaskedEmbeddings as its pre-step.
transformer_body = mm.GPT2Block(
    d_model=40,
    n_head=4,
    n_layer=2,
    pre=mm.ReplaceMaskedEmbeddings(),
)

# Categorical prediction head over the target column.
item_prediction = mm.CategoricalOutput(
    target_schema,
    default_loss="categorical_crossentropy",
)

model = mm.Model(sequence_inputs, transformer_body, item_prediction)

In [None]:
# Compile in graph mode (run_eagerly=False) and train. SequenceMaskRandom is
# passed as the `pre` step, built from seq_schema with masking_prob=0.3 on
# the target column.
model.compile(run_eagerly=False, optimizer='adam', loss="categorical_crossentropy")
model.fit(loader, pre=mm.SequenceMaskRandom(schema=seq_schema, target=target, masking_prob=0.3))

## Specifying correct input_dimensions (before broadcasting) in model constructor (d_model=40)

In [None]:
# Variant with context features: the full schema is fed in and
# BroadcastToSequence is attached as the input block's post-step, while the
# transformer keeps d_model=40.
full_schema = train_set_processed.schema
categorical_schema = full_schema.select_by_tag(Tags.CATEGORICAL)
target_schema = full_schema.select_by_name(target)

broadcast_inputs = mm.InputBlockV2(
    full_schema,
    embeddings=mm.Embeddings(categorical_schema, sequence_combiner=None),
    post=BroadcastToSequence(context_schema, seq_schema),
)

transformer_body = mm.GPT2Block(
    d_model=40,
    n_head=4,
    n_layer=2,
    pre=mm.ReplaceMaskedEmbeddings(),
)

# Categorical prediction head over the target column.
item_prediction = mm.CategoricalOutput(
    target_schema,
    default_loss="categorical_crossentropy",
)

model = mm.Model(broadcast_inputs, transformer_body, item_prediction)

In [None]:
# Compile in eager mode (run_eagerly=True) and train. SequenceMaskRandom is
# passed as the `pre` step, built from the full processed schema with
# masking_prob=0.3 on the target column.
model.compile(run_eagerly=True, optimizer='adam', loss="categorical_crossentropy")
model.fit(loader, pre=mm.SequenceMaskRandom(schema=train_set_processed.schema, target=target, masking_prob=0.3))

## Specifying correct input_dimensions (after broadcasting) in model constructor (d_model=41)

In [None]:
# Variant with context features and d_model=41: the full schema is fed in and
# BroadcastToSequence is attached as the input block's post-step.
full_schema = train_set_processed.schema
categorical_schema = full_schema.select_by_tag(Tags.CATEGORICAL)
target_schema = full_schema.select_by_name(target)

broadcast_inputs = mm.InputBlockV2(
    full_schema,
    embeddings=mm.Embeddings(categorical_schema, sequence_combiner=None),
    post=BroadcastToSequence(context_schema, seq_schema),
)

transformer_body = mm.GPT2Block(
    d_model=41,
    n_head=4,
    n_layer=2,
    pre=mm.ReplaceMaskedEmbeddings(),
)

# Categorical prediction head over the target column.
item_prediction = mm.CategoricalOutput(
    target_schema,
    default_loss="categorical_crossentropy",
)

model = mm.Model(broadcast_inputs, transformer_body, item_prediction)

In [None]:
# Compile in eager mode (run_eagerly=True) and train. SequenceMaskRandom is
# passed as the `pre` step, built from the full processed schema with
# masking_prob=0.3 on the target column.
model.compile(run_eagerly=True, optimizer='adam', loss="categorical_crossentropy")
model.fit(loader, pre=mm.SequenceMaskRandom(schema=train_set_processed.schema, target=target, masking_prob=0.3))

In [None]:
# First stab at evaluation: a loader over the processed validation set with
# the same batch size as training and shuffling disabled.

loader_eval = Loader(validation_set_processed, batch_size=1024, shuffle=False)

In [None]:
# Evaluate on the validation loader with SequenceMaskLast as the `pre` step
# (presumably masks the final item of each sequence so it becomes the
# prediction target -- confirm against the Merlin Models docs).
model.evaluate(loader_eval, batch_size=1024, pre=mm.SequenceMaskLast(schema=train_set_processed.schema, target=target))