In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# External dependencies
import os

import cudf  # cuDF is an implementation of Pandas-like Dataframe on GPU
# import rmm

import numpy as np
import nvtabular as nvt

from sklearn.model_selection import train_test_split

In [3]:
# Directory holding the parquet files read and written by this notebook.
# Taken from the INPUT_DATA_DIR environment variable when it is set,
# otherwise falls back to the local "./data/" directory.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", os.path.expanduser("./data/"))

## Read Training Examples

In [4]:
# Load the training examples from INPUT_DATA_DIR into a cuDF DataFrame
# and preview the first rows.
training_examples_path = os.path.join(INPUT_DATA_DIR, "training_examples.parquet")
training_examples = cudf.read_parquet(training_examples_path)
training_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,target_item
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[2124, 48808, 31732, 44035, 6150, 71933, 0, 15...","[9, 12, 18, 4, 5, 10, 14, 8, 9, 9, 12, 18, 9, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,7361
2,4071,"[23266, 64668, 39517, 53849, 9506, 10820, 6627...","[9, 12, 9, 7, 9, 16, 18, 7, 9, 9, 12, 17, 9, 1...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2150
3,7521,"[54707, 46964, 40789, 41791, 18575, 8371, 1483...","[2, 6, 2, 6, 7, 3, 9, 3, 4, 5, 6, 10, 2, 9, 17...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",220,37729
3,7688,"[35375, 60889, 15821, 30192, 0, 15866, 64283, ...","[9, 2, 17, 18, 6, 6, 9, 9, 8, 1, 8, 9, 10, 12,...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 51662, 55820, 40815...",18,4344
3,8045,"[56053, 50289, 58994, 0, 71594, 68845, 61483, ...","[9, 9, 8, 9, 2, 3, 6, 7, 9, 16, 6, 7, 9, 18, 7...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,27773


## User Feature Engineering

In [5]:
# Start the user feature table from the training examples, minus the
# `target_item` column (a new frame is returned; `training_examples` is untouched).
user_features = training_examples.drop(columns="target_item")

In [6]:
from datetime import datetime, timezone

# Use the last entry of each row's `timestamps` list as that row's event time.
# `datetime.utcfromtimestamp` is deprecated since Python 3.12; converting with an
# explicit UTC tzinfo and then stripping it yields the same naive-UTC datetimes.
last_event_times = [
    datetime.fromtimestamp(values[-1], tz=timezone.utc).replace(tzinfo=None)
    for values in user_features["timestamps"].to_pandas()
]
user_features["datetime"] = np.array(last_event_times, dtype=object)

# `created` carries the same values as `datetime`.
user_features["created"] = user_features["datetime"]

In [7]:
# Display the frame to check the newly added `datetime` and `created` columns.
user_features

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,datetime,created
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4146,"[2124, 48808, 31732, 44035, 6150, 71933, 0, 15...","[9, 12, 18, 4, 5, 10, 14, 8, 9, 9, 12, 18, 9, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,2006-05-17 15:34:04,2006-05-17 15:34:04
2,4071,"[23266, 64668, 39517, 53849, 9506, 10820, 6627...","[9, 12, 9, 7, 9, 16, 18, 7, 9, 9, 12, 17, 9, 1...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2006-03-03 20:32:30,2006-03-03 20:32:30
3,7521,"[54707, 46964, 40789, 41791, 18575, 8371, 1483...","[2, 6, 2, 6, 7, 3, 9, 3, 4, 5, 6, 10, 2, 9, 17...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",220,2015-08-13 14:13:50,2015-08-13 14:13:50
3,7688,"[35375, 60889, 15821, 30192, 0, 15866, 64283, ...","[9, 2, 17, 18, 6, 6, 9, 9, 8, 1, 8, 9, 10, 12,...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 51662, 55820, 40815...",18,2016-01-27 14:16:15,2016-01-27 14:16:15
3,8045,"[56053, 50289, 58994, 0, 71594, 68845, 61483, ...","[9, 9, 8, 9, 2, 3, 6, 7, 9, 16, 6, 7, 9, 18, 7...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,2017-01-18 16:19:12,2017-01-18 16:19:12
...,...,...,...,...,...,...,...,...
162538,7513,"[41239, 47665, 31465, 43799, 50816, 51176, 478...","[2, 6, 2, 6, 7, 3, 9, 5, 9, 2, 3, 5, 10, 15, 1...","[1438780751, 1438780754, 1438780759, 143878083...","[356, 593, 1270, 527, 2324, 2858, 2762, 4993, ...",77,2015-08-05 14:38:26,2015-08-05 14:38:26
162539,2378,"[49414, 39232, 16948, 18757, 72236, 43054, 354...","[6, 7, 9, 15, 18, 3, 5, 9, 9, 8, 6, 9, 9, 16, ...","[995149720, 995149760, 995149788, 995149788, 9...","[1721, 480, 1784, 1198, 1282, 608, 3363, 2022,...",42,2001-07-14 22:46:04,2001-07-14 22:46:04
162540,5315,"[62625, 0, 65488, 0, 0, 22885, 43579, 2816, 0,...","[6, 9, 16, 6, 9, 2, 7, 18, 1, 6, 16, 9, 6, 6, ...","[1248854959, 1248855507, 1248855584, 124885572...","[3005, 49530, 5400, 54001, 49278, 4167, 4372, ...",43,2009-07-29 09:59:52,2009-07-29 09:59:52
162540,5317,"[15181, 38580, 59724, 54481, 0]","[9, 7, 9, 18, 6, 9, 7, 9, 18, 5, 9]","[1249028584, 1249028593, 1249029673, 124902967...","[6377, 8961, 4720, 8957, 53953]",5,2009-07-31 08:41:59,2009-07-31 08:41:59


In [8]:
# Turn the index levels into regular columns, then remove `day`.
# `DataFrame.drop` returns a new frame rather than modifying in place, so the
# result has to be assigned back; otherwise `day` would only be missing from the
# displayed output and would still be present in the frame that gets saved.
# NOTE(review): re-running this cell on the already-flattened frame raises a
# KeyError for `day`; re-run from the cell that creates `user_features` instead.
user_features = user_features.reset_index().drop(labels="day", axis=1)
user_features

Unnamed: 0,user_id,search_terms,genres,timestamps,movie_ids,movie_id_count,datetime,created
0,1,"[2124, 48808, 31732, 44035, 6150, 71933, 0, 15...","[9, 12, 18, 4, 5, 10, 14, 8, 9, 9, 12, 18, 9, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,2006-05-17 15:34:04,2006-05-17 15:34:04
1,2,"[23266, 64668, 39517, 53849, 9506, 10820, 6627...","[9, 12, 9, 7, 9, 16, 18, 7, 9, 9, 12, 17, 9, 1...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2006-03-03 20:32:30,2006-03-03 20:32:30
2,3,"[54707, 46964, 40789, 41791, 18575, 8371, 1483...","[2, 6, 2, 6, 7, 3, 9, 3, 4, 5, 6, 10, 2, 9, 17...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",220,2015-08-13 14:13:50,2015-08-13 14:13:50
3,3,"[35375, 60889, 15821, 30192, 0, 15866, 64283, ...","[9, 2, 17, 18, 6, 6, 9, 9, 8, 1, 8, 9, 10, 12,...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 51662, 55820, 40815...",18,2016-01-27 14:16:15,2016-01-27 14:16:15
4,3,"[56053, 50289, 58994, 0, 71594, 68845, 61483, ...","[9, 9, 8, 9, 2, 3, 6, 7, 9, 16, 6, 7, 9, 18, 7...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,2017-01-18 16:19:12,2017-01-18 16:19:12
...,...,...,...,...,...,...,...,...
532977,162538,"[41239, 47665, 31465, 43799, 50816, 51176, 478...","[2, 6, 2, 6, 7, 3, 9, 5, 9, 2, 3, 5, 10, 15, 1...","[1438780751, 1438780754, 1438780759, 143878083...","[356, 593, 1270, 527, 2324, 2858, 2762, 4993, ...",77,2015-08-05 14:38:26,2015-08-05 14:38:26
532978,162539,"[49414, 39232, 16948, 18757, 72236, 43054, 354...","[6, 7, 9, 15, 18, 3, 5, 9, 9, 8, 6, 9, 9, 16, ...","[995149720, 995149760, 995149788, 995149788, 9...","[1721, 480, 1784, 1198, 1282, 608, 3363, 2022,...",42,2001-07-14 22:46:04,2001-07-14 22:46:04
532979,162540,"[62625, 0, 65488, 0, 0, 22885, 43579, 2816, 0,...","[6, 9, 16, 6, 9, 2, 7, 18, 1, 6, 16, 9, 6, 6, ...","[1248854959, 1248855507, 1248855584, 124885572...","[3005, 49530, 5400, 54001, 49278, 4167, 4372, ...",43,2009-07-29 09:59:52,2009-07-29 09:59:52
532980,162540,"[15181, 38580, 59724, 54481, 0]","[9, 7, 9, 18, 6, 9, 7, 9, 18, 5, 9]","[1249028584, 1249028593, 1249029673, 124902967...","[6377, 8961, 4720, 8957, 53953]",5,2009-07-31 08:41:59,2009-07-31 08:41:59


In [9]:
# Write the user feature table to INPUT_DATA_DIR as a parquet file.
user_features_path = os.path.join(INPUT_DATA_DIR, "user_features.parquet")
user_features.to_parquet(user_features_path)