In [1]:
# Copyright 2022 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="https://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_models_entertainment-with-pretrained-embeddings/nvidia_logo.png" style="width: 90px; float: right;">

# Transformer-based architecture for next-item prediction task

## Overview

In this use case we will train a Transformer-based architecture for the next-item prediction task.

We will use the [booking.com dataset](https://github.com/bookingcom/ml-dataset-mdt) to train a session-based model. The dataset contains 1,166,835 anonymized hotel reservations in the train set and 378,667 in the test set. Each reservation is part of a customer's trip (identified by `utrip_id`), which consists of consecutive reservations.

We will reshape the data to organize it into 'sessions'. Each session will be a full customer itinerary in chronological order. The goal will be to predict the city_id of the final reservation of each trip.


### Learning objectives

- Training a Transformer-based architecture for next-item prediction task

## Downloading and preparing the dataset

You can download the full dataset from GitHub [here](https://github.com/bookingcom/ml-dataset-mdt). Please place it alongside this notebook (or alternatively, change the `DATAPATH` to point to where it is located).

In [1]:
from merlin.core.dispatch import get_lib
import numpy as np

# Folder holding the booking.com CSV files; change it if the data lives elsewhere.
DATAPATH = 'ml-dataset-mdt'

# Read the training reservations with the dataframe library returned by
# `get_lib()`, parsing only the `checkin` column as dates.
train_csv_path = f'{DATAPATH}/train_set.csv'
dataframe_lib = get_lib()
itineraries = dataframe_lib.read_csv(train_csv_path, parse_dates=['checkin'])

In [2]:
# Order the reservations trip by trip.
# NOTE(review): only `checkin` is parsed as a date when the CSV is loaded, so
# `checkout` is compared in its raw CSV form here — confirm this is intended.
trip_sort_keys = ['utrip_id', 'checkout']
itineraries = itineraries.sort_values(by=trip_sort_keys)

In [10]:
# Shuffle the unique trip ids with a fixed seed so that the train/validation
# split is reproducible across "Restart & Run All".
# `drop_duplicates()` is used instead of `unique()` so the result is always a
# Series (pandas' `unique()` returns a NumPy array, which has no `.sample`).
# The index is reset so that the positional slices taken from `utrip_ids`
# later on are unambiguous.
SHUFFLE_SEED = 42
utrip_ids = (
    itineraries.utrip_id
    .drop_duplicates()
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
len(utrip_ids)

217686

In [15]:
# The first 160,000 shuffled trip ids form the training split; every
# remaining trip id goes to the validation split.
n_train_trips = 160_000
train_set_utrip_ids = utrip_ids[:n_train_trips]
validation_set_utrip_ids = utrip_ids[n_train_trips:]

In [31]:
# Boolean masks marking which reservations belong to each split's trips.
in_train_split = itineraries.utrip_id.isin(train_set_utrip_ids)
in_validation_split = itineraries.utrip_id.isin(validation_set_utrip_ids)

# Keep whole trips together: a reservation follows its trip id into a split.
train_set = itineraries[in_train_split]
validation_set = itineraries[in_validation_split]

In [32]:
from nvtabular import *
from nvtabular import ops

In [68]:
# Wrap both dataframes in `Dataset` objects, the input type passed to the
# workflow's fit/transform calls below.
train_set_dataset, validation_set_dataset = (
    Dataset(train_set),
    Dataset(validation_set),
)

In [59]:
# Share of trips whose per-trip `user_id` count is below 11 (i.e. at most 10).
reservations_per_trip = itineraries.groupby('utrip_id')['user_id'].count()
(reservations_per_trip < 11).mean()

0.9712246079215016

In [58]:
# Share of trips whose per-trip `user_id` count is below 3.
trip_sizes = itineraries.groupby('utrip_id')['user_id'].count()
is_short_trip = trip_sizes < 3
is_short_trip.mean()

0.0013184127596630008

In [70]:
# Sequence features are truncated to the first MAX_TRIP_LENGTH (10) items of
# each trip; the previous comment said 20, which contradicted the constant.
# According to the statistic computed earlier in this notebook, about 97% of
# trips have at most 10 reservations.
MAX_TRIP_LENGTH = 10
# NOTE(review): presumably the minimum number of reservations a trip needs in
# order to be kept — its use is not visible in this part of the notebook.
MIN_TRIP_LENGTH = 2

In [71]:
# Group the reservations by trip (`utrip_id`), order them by `checkin`, and
# aggregate the `city_id` values of each trip into a list.
trip_aggregations = {'city_id': ['list']}
collect_trip_cities = ops.Groupby(
    groupby_cols=['utrip_id'],
    sort_cols=['checkin'],
    aggs=trip_aggregations,
)
groupby_features = train_set.columns >> collect_trip_cities

In [99]:
# Keep list positions 0..MAX_TRIP_LENGTH of every `city_id_list`, padding
# shorter lists (pad=True).
truncate_and_pad = ops.ListSlice(0, MAX_TRIP_LENGTH, pad=True)
city_id_lists = groupby_features['city_id_list']
groupby_features_truncated = city_id_lists >> truncate_and_pad

In [100]:
# Build the workflow from the truncated/padded `city_id_list` node.
wf = Workflow(groupby_features_truncated)

In [101]:
# Fit the workflow on the training split only, then apply the already-fitted
# workflow to the validation split. Calling `fit_transform` on the validation
# data would re-fit the workflow on it, so any statistics-based op added to
# the graph would be fitted on validation data and the two splits would be
# processed inconsistently.
train_set_processed = wf.fit_transform(train_set_dataset)
validation_set_processed = wf.transform(validation_set_dataset)

In [102]:
# Materialize the processed training set to inspect the resulting
# `city_id_list` column (the lists shown are padded with zeros).
train_set_processed.compute()

Unnamed: 0,city_id_list
0,"[8183, 15626, 60902, 30628, 0, 0, 0, 0, 0, 0]"
1,"[38677, 52089, 21328, 27485, 38677, 0, 0, 0, 0..."
2,"[64876, 55128, 9608, 31817, 36170, 58178, 3606..."
3,"[17127, 31088, 40521, 55128, 21033, 6306, 6788..."
4,"[62541, 42482, 20345, 33540, 32627, 0, 0, 0, 0..."
...,...
159995,"[64876, 1766, 50797, 1766, 0, 0, 0, 0, 0, 0]"
159996,"[36063, 43306, 11481, 15626, 36063, 0, 0, 0, 0..."
159997,"[17775, 66634, 17775, 17775, 0, 0, 0, 0, 0, 0]"
159998,"[382, 38509, 18930, 38509, 51145, 11179, 61881..."
