In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# External dependencies
import os

import cudf  # cuDF is an implementation of Pandas-like Dataframe on GPU
# import rmm

import numpy as np
import nvtabular as nvt

from sklearn.model_selection import train_test_split

In [3]:
# Directory holding the input parquet files. The INPUT_DATA_DIR environment
# variable takes precedence; otherwise a default under the home directory is used.
_DEFAULT_DATA_DIR = os.path.expanduser("~/nvt-examples/end-to-end-poc/data/")
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", _DEFAULT_DATA_DIR)

## Read Movie and Rating Features

In [4]:
# Load the per-movie features (tag and genre lists) from parquet with cuDF.
movie_features_path = os.path.join(INPUT_DATA_DIR, "movie_features.parquet")
movie_features = cudf.read_parquet(movie_features_path)
movie_features.head()

Unnamed: 0,movieId,tags_unique,genres,tags_nunique
0,1,"[477, 581, 640, 1857, 2175, 2817, 3538, 4395, ...","[3, 4, 5, 6, 10]",126
1,2,"[1206, 4448, 5069, 5213, 7883, 8912, 9116, 925...","[3, 5, 10]",44
2,3,"[2196, 4356, 4448, 6484, 11460, 12274, 17647, ...","[6, 16]",23
3,4,"[4448, 31525, 34749, 34981, 39134, 48169, 6086...","[6, 9, 16]",8
4,5,"[4448, 5693, 6977, 8912, 23756, 25354, 28604, ...",[6],20


In [5]:
# Load the per-rating features (user, movie, rating, timestamp, day) with cuDF.
ratings_features_path = os.path.join(INPUT_DATA_DIR, "ratings_features.parquet")
ratings_features = cudf.read_parquet(ratings_features_path)
ratings_features.head()

Unnamed: 0,day,interaction,userId,movieId,rating,timestamp
0,4146,True,1,296,5.0,1147880044
1,4146,True,1,306,3.5,1147868817
2,4146,True,1,307,5.0,1147868828
3,4146,True,1,665,5.0,1147878820
4,4146,True,1,899,3.5,1147868510


## Join Ratings With Features

In [6]:
# Attach each movie's features to every rating of that movie.
# The join key is spelled out: without `on`, merge() joins on every column name
# the two frames share, so a new same-named column in either input would
# silently change which rows match.
joined_features = movie_features.merge(ratings_features, on="movieId", how="inner")
joined_features.head()

Unnamed: 0,movieId,tags_unique,genres,tags_nunique,day,interaction,userId,rating,timestamp
0,1,"[477, 581, 640, 1857, 2175, 2817, 3538, 4395, ...","[3, 4, 5, 6, 10]",126,4449,True,160,4.5,1174081897
1,10,"[187, 188, 189, 2817, 3416, 3862, 3911, 4448, ...","[2, 3, 18]",66,4449,True,160,4.0,1174082605
2,27156,"[7739, 18057]","[3, 5, 6, 7]",2,8378,True,235,5.0,1513481086
3,31658,"[1135, 2649, 5791, 8723, 22401, 23840, 23943]",[15],7,8358,True,235,5.0,1511746293
4,33679,[57264],[9],1,8372,True,235,5.0,1512965368


In [7]:
# For every row, draw a random position in [0, tags_nunique) that is used to
# pick one entry from that row's `tags_unique` list.
# A seeded generator makes the sampled tags reproducible across runs, and one
# vectorized draw replaces a Python-level randint call per row.
# Like the original, this raises ValueError if any `tags_nunique` is 0.
SAMPLING_SEED = 42
sampled_indices = np.random.default_rng(SAMPLING_SEED).integers(
    0, joined_features["tags_nunique"].to_pandas().to_numpy(), dtype=np.int32
)

In [8]:
# Look up, row by row, the tag stored at the sampled position of `tags_unique`.
sampled_tags = np.array(
    [
        row_tags[pick]
        for row_tags, pick in zip(joined_features["tags_unique"].to_pandas(), sampled_indices)
    ]
)

In [9]:
# Store the tag drawn for each row as a new `sampled_tag` column.
joined_features["sampled_tag"] = sampled_tags

In [10]:
# Remove the columns that are not used from here on: the full tag list and its
# length (replaced by `sampled_tag`) plus the `interaction` and `rating` columns.
unused_columns = ["tags_unique", "tags_nunique", "interaction", "rating"]
joined_features = joined_features.drop(labels=unused_columns, axis=1)

In [11]:
# Order the rows by ascending timestamp before they are grouped.
joined_features = joined_features.sort_values(by="timestamp")

In [12]:
# Preview the first rows of the frame after the timestamp sort.
joined_features.head()

Unnamed: 0,movieId,genres,day,userId,timestamp,sampled_tag
153541,47,"[15, 18]",0,2262,789652009,25510
999205,57,"[9, 15]",385,13424,822873600,256
999227,11,"[6, 9, 16]",385,13424,822873600,62731
3288114,32,"[15, 17, 18]",385,42937,822873600,69574
7845659,18,[6],385,102689,822873600,67234


In [13]:
# Build one row per (userId, day) pair: every feature is collected into a list,
# and `movieId` is additionally counted to give the number of rows in the group.
session_aggregations = {
    "sampled_tag": "collect",
    "genres": "collect",
    "timestamp": "collect",
    "movieId": ["collect", "count"],
}
grouped_examples = joined_features.groupby(["userId", "day"]).agg(session_aggregations)

In [14]:
# Preview the first grouped rows; the frame is indexed by (userId, day).
grouped_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sampled_tag,genres,timestamp,movieId,movieId
Unnamed: 0_level_1,Unnamed: 1_level_1,collect,collect,collect,collect,count
userId,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,4146,"[43870, 30243, 31462, 25871, 28458, 44763, 371...","[[6], [6], [3, 6, 17], [9, 16], [3, 9], [2, 3,...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",53
2,4071,"[47665, 65105, 38902, 64349, 1452, 57559, 6574...","[[6], [6], [6], [6], [6, 9], [9], [7, 9, 15], ...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",125
3,7521,"[21601, 35319, 30185, 10741, 52728, 39667, 661...","[[6, 9, 16], [9], [2, 6, 7, 9, 17], [3, 4, 5, ...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",179
3,7688,"[28185, 51431, 45744, 50807, 67439, 56292, 467...","[[12], [12, 18], [7, 9, 18], [7, 9], [6, 9], [...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 40815, 36529, 45186...",10
3,8045,"[15910, 13153, 52882, 41183, 63025, 72409, 107...","[[7, 9, 17, 18], [7, 9, 18], [7, 9], [9], [7, ...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",22


In [15]:
# TODO: Rename the aggregated MultiIndex columns to plain, single-level names.

In [16]:
# Inspect the column labels: agg() produced (column, aggregation) tuples.
grouped_examples.columns

MultiIndex([('sampled_tag', 'collect'),
            (     'genres', 'collect'),
            (  'timestamp', 'collect'),
            (    'movieId', 'collect'),
            (    'movieId',   'count')],
           )

In [17]:
# Preview the grouped frame again; nothing has modified it since the previous preview.
grouped_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sampled_tag,genres,timestamp,movieId,movieId
Unnamed: 0_level_1,Unnamed: 1_level_1,collect,collect,collect,collect,count
userId,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,4146,"[43870, 30243, 31462, 25871, 28458, 44763, 371...","[[6], [6], [3, 6, 17], [9, 16], [3, 9], [2, 3,...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",53
2,4071,"[47665, 65105, 38902, 64349, 1452, 57559, 6574...","[[6], [6], [6], [6], [6, 9], [9], [7, 9, 15], ...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",125
3,7521,"[21601, 35319, 30185, 10741, 52728, 39667, 661...","[[6, 9, 16], [9], [2, 6, 7, 9, 17], [3, 4, 5, ...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",179
3,7688,"[28185, 51431, 45744, 50807, 67439, 56292, 467...","[[12], [12, 18], [7, 9, 18], [7, 9], [6, 9], [...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 40815, 36529, 45186...",10
3,8045,"[15910, 13153, 52882, 41183, 63025, 72409, 107...","[[7, 9, 17, 18], [7, 9, 18], [7, 9], [9], [7, ...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",22


In [18]:
# Count the (userId, day) groups that contain more than one movie.
has_multiple_movies = grouped_examples[("movieId", "count")] > 1
len(grouped_examples[has_multiple_movies])

498253

In [19]:
# TODO: Rename the columns from (name, aggregation) tuples to plain strings.

In [20]:
# Hold out the last collected movie of each (userId, day) group as the target.
# This reads the full `movieId` list, so it has to run before the cells that
# drop the last element from the collected lists.
# NOTE(review): presumably the last element is the most recent movie, given the
# earlier timestamp sort — confirm that "collect" preserves row order.
grouped_examples["target_item"] = grouped_examples[("movieId", "collect")].list.get(-1)

In [21]:
# Drop the last element of every collected tag list.
# The object array is allocated first and filled row by row because
# np.array(..., dtype=object) builds a 2-D array instead of an array of lists
# whenever all trimmed lists happen to have the same length.
_tag_histories = grouped_examples[("sampled_tag", "collect")].to_pandas()
_tag_inputs = np.empty(len(_tag_histories), dtype=object)
for _tag_position, _tag_history in enumerate(_tag_histories):
    _tag_inputs[_tag_position] = _tag_history[:-1]
grouped_examples[("sampled_tag", "collect")] = _tag_inputs
del _tag_histories, _tag_inputs  # release the host-side copies

In [22]:
# Drop the last element (one movie's genre list) of every collected genres list.
# The object array is allocated first and filled row by row because
# np.array(..., dtype=object) builds a multi-dimensional array instead of an
# array of lists whenever all trimmed lists happen to have the same length.
_genre_histories = grouped_examples[("genres", "collect")].to_pandas()
_genre_inputs = np.empty(len(_genre_histories), dtype=object)
for _genre_position, _genre_history in enumerate(_genre_histories):
    _genre_inputs[_genre_position] = _genre_history[:-1]
grouped_examples[("genres", "collect")] = _genre_inputs
del _genre_histories, _genre_inputs  # release the host-side copies

In [23]:
# Drop the last element of every collected timestamp list.
# The object array is allocated first and filled row by row because
# np.array(..., dtype=object) builds a 2-D array instead of an array of lists
# whenever all trimmed lists happen to have the same length.
_timestamp_histories = grouped_examples[("timestamp", "collect")].to_pandas()
_timestamp_inputs = np.empty(len(_timestamp_histories), dtype=object)
for _timestamp_position, _timestamp_history in enumerate(_timestamp_histories):
    _timestamp_inputs[_timestamp_position] = _timestamp_history[:-1]
grouped_examples[("timestamp", "collect")] = _timestamp_inputs
del _timestamp_histories, _timestamp_inputs  # release the host-side copies

In [24]:
# Drop the last element of every collected movieId list; that element was
# already copied into `target_item` by an earlier cell.
# The object array is allocated first and filled row by row because
# np.array(..., dtype=object) builds a 2-D array instead of an array of lists
# whenever all trimmed lists happen to have the same length.
_movie_histories = grouped_examples[("movieId", "collect")].to_pandas()
_movie_inputs = np.empty(len(_movie_histories), dtype=object)
for _movie_position, _movie_history in enumerate(_movie_histories):
    _movie_inputs[_movie_position] = _movie_history[:-1]
grouped_examples[("movieId", "collect")] = _movie_inputs
del _movie_histories, _movie_inputs  # release the host-side copies

In [25]:
# One element was removed from each collected list, so the count drops by one.
# Not idempotent: re-running this cell decrements the count again.
grouped_examples[("movieId", "count")] = grouped_examples[("movieId", "count")] - 1

In [26]:
def flatten_genres(g):
    """Concatenate a sequence of genre lists into one flat sequence.

    Args:
        g: Sequence of 1-D sequences (one per movie).

    Returns:
        A 1-D NumPy array holding all values in order, or an empty Python
        list when ``g`` itself is empty (np.concatenate needs at least one
        input).
    """
    if len(g) == 0:
        return []
    merged = np.concatenate(g)
    return merged.ravel()

# Replace each row's list of per-movie genre lists with one flat sequence.
grouped_examples[("genres", "collect")] = (
    grouped_examples[("genres", "collect")]
    .to_pandas()
    .map(flatten_genres)
)

In [27]:
# Preview the result: the collected lists have lost their last element, the
# genres are flattened to one list per row, and `target_item` has been added.
grouped_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sampled_tag,genres,timestamp,movieId,movieId,target_item
Unnamed: 0_level_1,Unnamed: 1_level_1,collect,collect,collect,collect,count,Unnamed: 7_level_1
userId,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,4146,"[43870, 30243, 31462, 25871, 28458, 44763, 371...","[6, 6, 3, 6, 17, 9, 16, 3, 9, 2, 3, 6, 10, 3, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,7361
2,4071,"[47665, 65105, 38902, 64349, 1452, 57559, 6574...","[6, 6, 6, 6, 6, 9, 9, 7, 9, 15, 7, 9, 2, 6, 6,...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2150
3,7521,"[21601, 35319, 30185, 10741, 52728, 39667, 661...","[6, 9, 16, 9, 2, 6, 7, 9, 17, 3, 4, 5, 6, 10, ...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",178,37729
3,7688,"[28185, 51431, 45744, 50807, 67439, 56292, 467...","[12, 12, 18, 7, 9, 18, 7, 9, 6, 9, 1, 1, 12, 2...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 40815, 36529, 45186...",9,4344
3,8045,"[15910, 13153, 52882, 41183, 63025, 72409, 107...","[7, 9, 17, 18, 7, 9, 18, 7, 9, 9, 7, 9, 9, 18,...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,27773


In [28]:
# Inspect the labels before flattening; `target_item` has an empty second level.
grouped_examples.columns

MultiIndex([('sampled_tag', 'collect'),
            (     'genres', 'collect'),
            (  'timestamp', 'collect'),
            (    'movieId', 'collect'),
            (    'movieId',   'count'),
            ('target_item',        '')],
           )

In [29]:
# Flatten the (name, aggregation) column tuples into plain strings: join the
# levels with "_" and remove every "_collect" occurrence. `target_item` has an
# empty second level, so it comes out as "target_item_" here.
grouped_examples.columns = [
    "_".join(list(level_names)).replace("_collect", "")
    for level_names in grouped_examples.columns
]

In [30]:
# Strip the trailing underscore left over from flattening the column names:
# copy the column under its final name, then drop the old one.
grouped_examples["target_item"] = grouped_examples["target_item_"]
grouped_examples = grouped_examples.drop(labels="target_item_", axis=1)
grouped_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sampled_tag,genres,timestamp,movieId,movieId_count,target_item
userId,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[43870, 30243, 31462, 25871, 28458, 44763, 371...","[6, 6, 3, 6, 17, 9, 16, 3, 9, 2, 3, 6, 10, 3, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,7361
2,4071,"[47665, 65105, 38902, 64349, 1452, 57559, 6574...","[6, 6, 6, 6, 6, 9, 9, 7, 9, 15, 7, 9, 2, 6, 6,...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2150
3,7521,"[21601, 35319, 30185, 10741, 52728, 39667, 661...","[6, 9, 16, 9, 2, 6, 7, 9, 17, 3, 4, 5, 6, 10, ...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",178,37729
3,7688,"[28185, 51431, 45744, 50807, 67439, 56292, 467...","[12, 12, 18, 7, 9, 18, 7, 9, 6, 9, 1, 1, 12, 2...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 40815, 36529, 45186...",9,4344
3,8045,"[15910, 13153, 52882, 41183, 63025, 72409, 107...","[7, 9, 17, 18, 7, 9, 18, 7, 9, 9, 7, 9, 9, 18,...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,27773


In [31]:
# Rename `genres` to `genre` by copying the column and dropping the original.
# Copy-then-drop appends the new column, so `genre` ends up as the last column.
grouped_examples["genre"] = grouped_examples["genres"]
grouped_examples = grouped_examples.drop(labels="genres", axis=1)
grouped_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sampled_tag,timestamp,movieId,movieId_count,target_item,genre
userId,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[43870, 30243, 31462, 25871, 28458, 44763, 371...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,7361,"[6, 6, 3, 6, 17, 9, 16, 3, 9, 2, 3, 6, 10, 3, ..."
2,4071,"[47665, 65105, 38902, 64349, 1452, 57559, 6574...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2150,"[6, 6, 6, 6, 6, 9, 9, 7, 9, 15, 7, 9, 2, 6, 6,..."
3,7521,"[21601, 35319, 30185, 10741, 52728, 39667, 661...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",178,37729,"[6, 9, 16, 9, 2, 6, 7, 9, 17, 3, 4, 5, 6, 10, ..."
3,7688,"[28185, 51431, 45744, 50807, 67439, 56292, 467...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 40815, 36529, 45186...",9,4344,"[12, 12, 18, 7, 9, 18, 7, 9, 6, 9, 1, 1, 12, 2..."
3,8045,"[15910, 13153, 52882, 41183, 63025, 72409, 107...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,27773,"[7, 9, 17, 18, 7, 9, 18, 7, 9, 9, 7, 9, 9, 18,..."


In [32]:
# Persist the grouped examples as parquet in the input data directory.
grouped_examples_path = os.path.join(INPUT_DATA_DIR, "grouped_examples.parquet")
grouped_examples.to_parquet(grouped_examples_path)