In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# External dependencies
import os
from datetime import datetime, timezone

import cudf  # cuDF is an implementation of Pandas-like Dataframe on GPU
# import rmm

import numpy as np
import nvtabular as nvt

from sklearn.model_selection import train_test_split

In [3]:
# Directory holding the input parquet files; the INPUT_DATA_DIR environment
# variable takes precedence over the local "./data/" default.
_default_data_dir = os.path.expanduser("./data/")
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", _default_data_dir)

## Read Training Examples

In [4]:
# Load the retrieval training examples from INPUT_DATA_DIR into a cuDF frame
# and preview the first rows.
training_path = os.path.join(INPUT_DATA_DIR, "retrieval_training.parquet")
training_examples = cudf.read_parquet(training_path)
training_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,target_item
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[40280, 34864, 60921, 55756, 53948, 968, 2649,...","[3, 10, 9, 17, 18, 3, 9, 19, 3, 4, 5, 6, 6, 9,...","[1147868053, 1147868097, 1147868414, 114786846...","[5841, 1592, 1218, 6259, 3353, 1062, 6589, 384...",38,7237
2,4071,"[70987, 21602, 55885, 217, 26361, 361, 38094, ...","[3, 10, 6, 16, 2, 3, 17, 18, 9, 17, 18, 2, 3, ...","[1141415528, 1141415566, 1141415576, 114141558...","[5841, 493, 1339, 1592, 2550, 150, 234, 4781, ...",112,2061
3,7521,"[63271, 34088, 581, 28225, 491, 34656, 41947, ...","[6, 9, 16, 19, 7, 12, 18, 3, 4, 5, 6, 10, 2, 1...","[1439472199, 1439472203, 1439472215, 143947222...","[352, 586, 1, 2481, 258, 315, 1167, 523, 12217...",247,24542
3,7688,"[69734, 68604, 71594, 56924, 32424, 31525, 668...","[7, 9, 17, 18, 2, 9, 19, 6, 16, 8, 20, 6, 10, ...","[1453904021, 1453904031, 1453904046, 145390404...","[1176, 1178, 10678, 9777, 11446, 11930, 10407,...",17,4240
3,8045,"[38644, 43776, 43440, 57466, 45557, 13968, 170...","[7, 15, 18, 2, 18, 6, 7, 18, 4, 17, 6, 7, 9, 7...","[1484753654, 1484753762, 1484753766, 148475380...","[1063, 29365, 3908, 726, 763, 110, 213, 29375,...",30,9335


## User Feature Engineering

In [5]:
# User features are the training examples without the `target_item` label column.
user_features = training_examples.drop(columns=["target_item"])

In [6]:
# Derive a `datetime` column from the last element of each row's `timestamps`
# list, interpreted as epoch seconds and expressed as a naive UTC datetime.
#
# `datetime.utcfromtimestamp` is deprecated since Python 3.12, so build a
# timezone-aware UTC value and strip the tzinfo; the resulting naive datetimes
# are the same values `utcfromtimestamp` produced.
#
# NOTE(review): `values[-1]` raises IndexError on an empty list — this assumes
# every row has at least one timestamp; confirm against the upstream data prep.
last_event_times = [
    datetime.fromtimestamp(values[-1], tz=timezone.utc).replace(tzinfo=None)
    for values in user_features["timestamps"].to_pandas()
]
user_features["datetime"] = np.array(last_event_times, dtype=object)

# `created` is a copy of the `datetime` column.
user_features["created"] = user_features["datetime"]

In [7]:
# Display the frame to check the newly added `datetime` and `created` columns.
user_features

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,datetime,created
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4146,"[40280, 34864, 60921, 55756, 53948, 968, 2649,...","[3, 10, 9, 17, 18, 3, 9, 19, 3, 4, 5, 6, 6, 9,...","[1147868053, 1147868097, 1147868414, 114786846...","[5841, 1592, 1218, 6259, 3353, 1062, 6589, 384...",38,2006-05-17 15:34:04,2006-05-17 15:34:04
2,4071,"[70987, 21602, 55885, 217, 26361, 361, 38094, ...","[3, 10, 6, 16, 2, 3, 17, 18, 9, 17, 18, 2, 3, ...","[1141415528, 1141415566, 1141415576, 114141558...","[5841, 493, 1339, 1592, 2550, 150, 234, 4781, ...",112,2006-03-03 20:32:30,2006-03-03 20:32:30
3,7521,"[63271, 34088, 581, 28225, 491, 34656, 41947, ...","[6, 9, 16, 19, 7, 12, 18, 3, 4, 5, 6, 10, 2, 1...","[1439472199, 1439472203, 1439472215, 143947222...","[352, 586, 1, 2481, 258, 315, 1167, 523, 12217...",247,2015-08-13 14:14:06,2015-08-13 14:14:06
3,7688,"[69734, 68604, 71594, 56924, 32424, 31525, 668...","[7, 9, 17, 18, 2, 9, 19, 6, 16, 8, 20, 6, 10, ...","[1453904021, 1453904031, 1453904046, 145390404...","[1176, 1178, 10678, 9777, 11446, 11930, 10407,...",17,2016-01-27 14:16:35,2016-01-27 14:16:35
3,8045,"[38644, 43776, 43440, 57466, 45557, 13968, 170...","[7, 15, 18, 2, 18, 6, 7, 18, 4, 17, 6, 7, 9, 7...","[1484753654, 1484753762, 1484753766, 148475380...","[1063, 29365, 3908, 726, 763, 110, 213, 29375,...",30,2017-01-18 16:19:12,2017-01-18 16:19:12
...,...,...,...,...,...,...,...,...
162538,7513,"[33543, 69107, 56595, 55603, 11943, 57331, 442...","[6, 9, 16, 19, 7, 12, 18, 3, 6, 17, 9, 19, 6, ...","[1438780751, 1438780754, 1438780759, 143878083...","[352, 586, 1238, 523, 2234, 2767, 2671, 4888, ...",82,2015-08-05 14:39:05,2015-08-05 14:39:05
162539,2378,"[28232, 44028, 71223, 35598, 42387, 39902, 466...","[9, 16, 2, 3, 17, 18, 6, 9, 16, 4, 5, 10, 14, ...","[995149720, 995149760, 995149788, 995149788, 9...","[1656, 476, 1708, 1250, 1169, 601, 3270, 1934,...",42,2001-07-14 22:46:04,2001-07-14 22:46:04
162540,5315,"[69574, 64554, 48548, 17376, 18806, 28681, 198...","[18, 6, 9, 18, 6, 10, 3, 4, 5, 6, 13, 18, 2, 9...","[1248854959, 1248855507, 1248855584, 124885572...","[2914, 11237, 5293, 11698, 11209, 4064, 10658,...",50,2009-07-29 09:59:28,2009-07-29 09:59:28
162540,5317,"[63961, 43529, 56350, 28566, 41905, 33135, 44182]","[2, 3, 9, 12, 18, 3, 4, 5, 6, 2, 3, 4, 5, 6, 9...","[1249028189, 1249028584, 1249028593, 124902967...","[13456, 6259, 8247, 4616, 8243, 11688, 10245]",7,2009-07-31 08:42:04,2009-07-31 08:42:04


In [8]:
# Turn the (user_id, day) index levels back into regular columns.
user_features = user_features.reset_index()
# NOTE(review): the result of this drop is not assigned back, so it only
# affects the displayed preview; `user_features` itself keeps the `day` column.
user_features.drop(columns="day")

Unnamed: 0,user_id,search_terms,genres,timestamps,movie_ids,movie_id_count,datetime,created
0,1,"[40280, 34864, 60921, 55756, 53948, 968, 2649,...","[3, 10, 9, 17, 18, 3, 9, 19, 3, 4, 5, 6, 6, 9,...","[1147868053, 1147868097, 1147868414, 114786846...","[5841, 1592, 1218, 6259, 3353, 1062, 6589, 384...",38,2006-05-17 15:34:04,2006-05-17 15:34:04
1,2,"[70987, 21602, 55885, 217, 26361, 361, 38094, ...","[3, 10, 6, 16, 2, 3, 17, 18, 9, 17, 18, 2, 3, ...","[1141415528, 1141415566, 1141415576, 114141558...","[5841, 493, 1339, 1592, 2550, 150, 234, 4781, ...",112,2006-03-03 20:32:30,2006-03-03 20:32:30
2,3,"[63271, 34088, 581, 28225, 491, 34656, 41947, ...","[6, 9, 16, 19, 7, 12, 18, 3, 4, 5, 6, 10, 2, 1...","[1439472199, 1439472203, 1439472215, 143947222...","[352, 586, 1, 2481, 258, 315, 1167, 523, 12217...",247,2015-08-13 14:14:06,2015-08-13 14:14:06
3,3,"[69734, 68604, 71594, 56924, 32424, 31525, 668...","[7, 9, 17, 18, 2, 9, 19, 6, 16, 8, 20, 6, 10, ...","[1453904021, 1453904031, 1453904046, 145390404...","[1176, 1178, 10678, 9777, 11446, 11930, 10407,...",17,2016-01-27 14:16:35,2016-01-27 14:16:35
4,3,"[38644, 43776, 43440, 57466, 45557, 13968, 170...","[7, 15, 18, 2, 18, 6, 7, 18, 4, 17, 6, 7, 9, 7...","[1484753654, 1484753762, 1484753766, 148475380...","[1063, 29365, 3908, 726, 763, 110, 213, 29375,...",30,2017-01-18 16:19:12,2017-01-18 16:19:12
...,...,...,...,...,...,...,...,...
541867,162538,"[33543, 69107, 56595, 55603, 11943, 57331, 442...","[6, 9, 16, 19, 7, 12, 18, 3, 6, 17, 9, 19, 6, ...","[1438780751, 1438780754, 1438780759, 143878083...","[352, 586, 1238, 523, 2234, 2767, 2671, 4888, ...",82,2015-08-05 14:39:05,2015-08-05 14:39:05
541868,162539,"[28232, 44028, 71223, 35598, 42387, 39902, 466...","[9, 16, 2, 3, 17, 18, 6, 9, 16, 4, 5, 10, 14, ...","[995149720, 995149760, 995149788, 995149788, 9...","[1656, 476, 1708, 1250, 1169, 601, 3270, 1934,...",42,2001-07-14 22:46:04,2001-07-14 22:46:04
541869,162540,"[69574, 64554, 48548, 17376, 18806, 28681, 198...","[18, 6, 9, 18, 6, 10, 3, 4, 5, 6, 13, 18, 2, 9...","[1248854959, 1248855507, 1248855584, 124885572...","[2914, 11237, 5293, 11698, 11209, 4064, 10658,...",50,2009-07-29 09:59:28,2009-07-29 09:59:28
541870,162540,"[63961, 43529, 56350, 28566, 41905, 33135, 44182]","[2, 3, 9, 12, 18, 3, 4, 5, 6, 2, 3, 4, 5, 6, 9...","[1249028189, 1249028584, 1249028593, 124902967...","[13456, 6259, 8247, 4616, 8243, 11688, 10245]",7,2009-07-31 08:42:04,2009-07-31 08:42:04


In [9]:
# Inspect the column dtypes of `user_features` (note that `day` is still a column here).
user_features.dtypes

user_id                    int32
day                        int64
search_terms                list
genres                      list
timestamps                  list
movie_ids                   list
movie_id_count             int32
datetime          datetime64[us]
created           datetime64[us]
dtype: object

In [10]:
# Persist the engineered user features as a parquet file under INPUT_DATA_DIR.
user_features_path = os.path.join(INPUT_DATA_DIR, "user_features.parquet")
user_features.to_parquet(user_features_path)