In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# External dependencies
import os

import cudf  # cuDF is an implementation of Pandas-like Dataframe on GPU
# import rmm

import numpy as np
import nvtabular as nvt

from sklearn.model_selection import train_test_split

In [3]:
# Directory that holds the input parquet files (and receives the output);
# the INPUT_DATA_DIR environment variable overrides the local default.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", os.path.expanduser("./data/"))

## Read Movie and Rating Features

In [4]:
# Load the per-movie feature table and drop the `datetime` and `created`
# columns, which are not used in the rest of this notebook.
movie_features_path = os.path.join(INPUT_DATA_DIR, "movie_features.parquet")
movie_features = cudf.read_parquet(movie_features_path).drop(
    labels=["datetime", "created"], axis=1
)
movie_features.head()

Unnamed: 0,tags_unique,movie_id,genres,tags_nunique
0,"[40345, 59079]",27265,[9],2
1,[28414],27273,[8],1
2,[32292],27266,[9],1
3,"[3365, 33048, 43053, 46467, 50807, 51310, 5739...",27282,"[9, 18]",10
4,"[31336, 34155, 42699, 48919, 48957, 51656, 602...",27290,[1],8


In [5]:
# Load the per-rating feature table (one row per user/movie interaction).
ratings_features_path = os.path.join(INPUT_DATA_DIR, "ratings_features.parquet")
ratings_features = cudf.read_parquet(ratings_features_path)
ratings_features.head()

Unnamed: 0,day,interaction,user_id,movie_id,rating,timestamp
0,4146,True,1,296,5.0,1147880044
1,4146,True,1,306,3.5,1147868817
2,4146,True,1,307,5.0,1147868828
3,4146,True,1,665,5.0,1147878820
4,4146,True,1,899,3.5,1147868510


## Join Ratings With Movie Features

In [6]:
# Attach each movie's tag/genre features to every rating of that movie.
# The join key and join type are spelled out: without `on`, merge joins on
# every column name the two frames share, so a column later added to both
# inputs would silently become part of the key.
joined_features = movie_features.merge(ratings_features, on="movie_id", how="inner")
joined_features.head()

Unnamed: 0,tags_unique,movie_id,genres,tags_nunique,day,interaction,user_id,rating,timestamp
0,"[2835, 10385, 13866, 17871, 18452, 20661, 2257...",3033,[9],20,8089,True,30,3.5,1488512045
1,"[1906, 29183, 39286, 44188, 45816, 55603, 5574...",1291,[12],8,8089,True,30,3.5,1488510832
2,[],33004,"[2, 12, 18]",0,8089,True,30,4.5,1488511515
3,"[1286, 3140, 5646, 14326, 19500, 31570, 36137,...",410,[6],15,8089,True,30,4.5,1488511992
4,"[21424, 25305, 29953, 31005, 31115, 37281, 375...",3175,"[2, 7, 18, 20]",18,8089,True,30,4.5,1488512028


In [7]:
# For every interaction row, draw a uniformly random position into that row's
# tag list; rows without any tags receive the sentinel -1.
SAMPLING_SEED = 42  # fixed so that Restart & Run All regenerates the same samples
rng = np.random.default_rng(SAMPLING_SEED)
tag_counts = joined_features["tags_nunique"].to_pandas().to_numpy()
# `integers` requires high > low, so rows with zero tags are clamped to an
# upper bound of 1 here and overwritten with the sentinel by `np.where`.
random_positions = rng.integers(0, np.maximum(tag_counts, 1))
sampled_indices = np.where(tag_counts > 0, random_positions, -1).astype(np.int32)

In [8]:
# Look up the tag at each sampled position; a negative position (row without
# tags) maps to the placeholder value 0.
tag_lists = joined_features["tags_unique"].to_pandas()
sampled_tags = np.array(
    [
        0 if position < 0 else row_tags[position]
        for position, row_tags in zip(sampled_indices, tag_lists)
    ]
)

In [9]:
# Store the sampled tag of each interaction as its `search_term` column
# (0 marks rows whose movie has no tags).
joined_features["search_term"] = sampled_tags

In [10]:
# Copy `genres` under the singular name `genre`; the plural column is dropped
# in a later cell, and the "_collect" -> "s" renaming after the groupby turns
# the collected `genre` column back into `genres`.
joined_features["genre"] = joined_features["genres"]

In [11]:
# Remove the columns that are no longer needed once `search_term` and `genre`
# have been derived.
columns_to_discard = ["tags_unique", "tags_nunique", "interaction", "rating", "genres"]
joined_features = joined_features.drop(labels=columns_to_discard, axis=1)

In [12]:
# Order all interactions chronologically before they are grouped per user and day.
# NOTE(review): the later `list.get(-1)` target selection presumably depends on
# this order surviving the groupby "collect" — confirm.
joined_features = joined_features.sort_values("timestamp")

In [13]:
# Preview the time-sorted interactions with their sampled search term and genre list.
joined_features.head()

Unnamed: 0,movie_id,day,user_id,timestamp,search_term,genre
171900,47,0,2262,789652009,6434,"[15, 18]"
1067604,57,385,13424,822873600,43856,[9]
1067626,11,385,13424,822873600,1818,"[6, 9, 16]"
3511943,32,385,42937,822873600,36380,"[15, 17, 18]"
8418463,18,385,102689,822873600,46652,[6]


In [15]:
# Build one row per (user_id, day) pair: every feature is gathered into a
# list, and the number of movies in the group is recorded alongside.
session_aggregations = {
    "search_term": "collect",
    "genre": "collect",
    "timestamp": "collect",
    "movie_id": ["collect", "count"],
}
training_examples = joined_features.groupby(["user_id", "day"]).agg(session_aggregations)

In [16]:
# Preview the grouped frame; its columns are (feature, aggregation) pairs.
training_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_term,genre,timestamp,movie_id,movie_id
Unnamed: 0_level_1,Unnamed: 1_level_1,collect,collect,collect,collect,count
user_id,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,4146,"[2124, 48808, 31732, 44035, 6150, 71933, 0, 15...","[[9, 12], [18], [4, 5, 10, 14], [8], [9], [9, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",53
2,4071,"[23266, 64668, 39517, 53849, 9506, 10820, 6627...","[[9, 12], [9], [7, 9, 16], [18], [7, 9], [9, 1...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",125
3,7521,"[54707, 46964, 40789, 41791, 18575, 8371, 1483...","[[2, 6], [2, 6, 7], [3, 9], [3, 4, 5, 6, 10], ...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",221
3,7688,"[35375, 60889, 15821, 30192, 0, 15866, 64283, ...","[[9], [2, 17, 18], [6], [6, 9], [9], [8], [1],...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 51662, 55820, 40815...",19
3,8045,"[56053, 50289, 58994, 0, 71594, 68845, 61483, ...","[[9], [9], [8], [9], [2, 3, 6, 7], [9, 16], [6...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",22


In [17]:
# How many (user_id, day) groups contain more than one movie?
has_multiple_items = training_examples[("movie_id", "count")] > 1
len(training_examples[has_multiple_items])

532982

In [18]:
# Keep only groups with at least two movies, so that one can serve as the
# target while at least one remains as input.
has_multiple_items = training_examples[("movie_id", "count")] > 1
training_examples = training_examples[has_multiple_items]

In [19]:
# The last movie collected for each (user_id, day) group becomes the prediction target.
# NOTE(review): this assumes the collected lists keep the earlier timestamp sort order — confirm.
training_examples["target_item"] = training_examples[("movie_id", "collect")].list.get(-1)

In [20]:
# Re-register the target under a one-element tuple key, then drop the column
# addressed by the plain string key.
# NOTE(review): presumably this keeps the flattened name free of a trailing "_"
# when the column levels are joined with "_" in the next cell — confirm.
training_examples[("target_item",)] = training_examples["target_item"]
training_examples = training_examples.drop(labels="target_item", axis=1)

In [21]:
# Collapse the multi-level column keys into flat names joined by "_", then
# turn every "<feature>_collect" into the plural "<feature>s".
joined_level_names = ["_".join(level_parts) for level_parts in training_examples.columns]
training_examples.columns = [
    flat_name.replace("_collect", "s") for flat_name in joined_level_names
]

In [22]:
# Preview the frame after the column names have been flattened and renamed.
training_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,target_item
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[2124, 48808, 31732, 44035, 6150, 71933, 0, 15...","[[9, 12], [18], [4, 5, 10, 14], [8], [9], [9, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",53,7361
2,4071,"[23266, 64668, 39517, 53849, 9506, 10820, 6627...","[[9, 12], [9], [7, 9, 16], [18], [7, 9], [9, 1...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",125,2150
3,7521,"[54707, 46964, 40789, 41791, 18575, 8371, 1483...","[[2, 6], [2, 6, 7], [3, 9], [3, 4, 5, 6, 10], ...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",221,37729
3,7688,"[35375, 60889, 15821, 30192, 0, 15866, 64283, ...","[[9], [2, 17, 18], [6], [6, 9], [9], [8], [1],...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 51662, 55820, 40815...",19,4344
3,8045,"[56053, 50289, 58994, 0, 71594, 68845, 61483, ...","[[9], [9], [8], [9], [2, 3, 6, 7], [9, 16], [6...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",22,27773


In [23]:
def remove_last_from_col(df, field_name):
    """Drop the final element from every sequence in a list-valued column.

    The column is read through ``.to_pandas()``, each row's sequence is
    sliced with ``[:-1]``, and the shortened sequences are written back to
    ``df[field_name]`` as a one-dimensional object array (one entry per row).

    Parameters
    ----------
    df : DataFrame-like
        Frame whose ``df[field_name]`` exposes ``to_pandas()``; it is
        modified in place.
    field_name : str
        Name of the list-valued column to shorten.

    Returns
    -------
    The same ``df`` object, so the call site can reassign it.
    """
    shortened = [values[:-1] for values in df[field_name].to_pandas()]
    # Fill a pre-allocated 1-D object array element by element. Handing the
    # list straight to ``np.array(..., dtype=object)`` yields a 2-D array
    # whenever every row happens to have the same length, instead of one
    # sequence per row.
    trimmed = np.empty(len(shortened), dtype=object)
    for position, sequence in enumerate(shortened):
        trimmed[position] = sequence
    df[field_name] = trimmed
    return df

In [24]:
# The last element of every collected sequence belongs to the target item,
# so strip it from each of the list-valued input columns.
for sequence_column in ("search_terms", "genres", "timestamps", "movie_ids"):
    training_examples = remove_last_from_col(training_examples, sequence_column)

In [25]:
# One movie per group was moved out as the target, so the input count shrinks by one.
items_per_group = training_examples["movie_id_count"]
training_examples["movie_id_count"] = items_per_group - 1

In [26]:
def flatten_genres(g):
    """Concatenate a sequence of per-movie genre lists into one flat array.

    Returns an empty Python list when ``g`` has no elements; otherwise the
    result is the flattened NumPy array of all genre ids in order.
    """
    if len(g) == 0:
        return []
    return np.concatenate(g).ravel()

# Flatten each group's list of genre lists into a single genre sequence.
genres_as_pandas = training_examples["genres"].to_pandas()
training_examples["genres"] = genres_as_pandas.map(flatten_genres)

In [27]:
# Preview the final examples: shortened input sequences, flattened genres,
# decremented count, and the held-out target item.
training_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,target_item
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[2124, 48808, 31732, 44035, 6150, 71933, 0, 15...","[9, 12, 18, 4, 5, 10, 14, 8, 9, 9, 12, 18, 9, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,7361
2,4071,"[23266, 64668, 39517, 53849, 9506, 10820, 6627...","[9, 12, 9, 7, 9, 16, 18, 7, 9, 9, 12, 17, 9, 1...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2150
3,7521,"[54707, 46964, 40789, 41791, 18575, 8371, 1483...","[2, 6, 2, 6, 7, 3, 9, 3, 4, 5, 6, 10, 2, 9, 17...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",220,37729
3,7688,"[35375, 60889, 15821, 30192, 0, 15866, 64283, ...","[9, 2, 17, 18, 6, 6, 9, 9, 8, 1, 8, 9, 10, 12,...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 51662, 55820, 40815...",18,4344
3,8045,"[56053, 50289, 58994, 0, 71594, 68845, 61483, ...","[9, 9, 8, 9, 2, 3, 6, 7, 9, 16, 6, 7, 9, 18, 7...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,27773


In [28]:
# Persist the training examples next to the input data.
training_examples_path = os.path.join(INPUT_DATA_DIR, "training_examples.parquet")
training_examples.to_parquet(training_examples_path)