In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# External dependencies
import os

import cudf  # cuDF is an implementation of Pandas-like Dataframe on GPU
import rmm

import numpy as np
import nvtabular as nvt

from sklearn.model_selection import train_test_split

In [3]:
# Directory that holds the input parquet files and receives this notebook's output.
# It can be overridden through the INPUT_DATA_DIR environment variable.
# expanduser() is applied to the final value so that a "~"-prefixed override is
# resolved too; wrapping only the "./data/" default (which has no "~") did nothing.
INPUT_DATA_DIR = os.path.expanduser(
    os.environ.get("INPUT_DATA_DIR", "./data/")
)

In [4]:
# Re-initialize the RAPIDS Memory Manager with managed memory switched on.
# NOTE(review): presumably this is so the large frames built below can exceed
# physical GPU memory — confirm against the RMM documentation.
rmm.reinitialize(managed_memory=True)

## Read Movie and Rating Features

In [5]:
# Load the per-movie features and discard the "datetime" and "created" columns,
# which are not referenced anywhere later in this notebook.
movie_features_path = os.path.join(INPUT_DATA_DIR, "movie_features.parquet")
unused_movie_columns = ["datetime", "created"]
movie_features = cudf.read_parquet(movie_features_path).drop(
    labels=unused_movie_columns, axis=1
)
movie_features.head()

Unnamed: 0,tags_unique,movie_id,genres,tags_nunique
0,"[40345, 59079]",27265,[9],2
1,[28414],27273,[8],1
2,[32292],27266,[9],1
3,"[3365, 33048, 43053, 46467, 50807, 51310, 5739...",27282,"[9, 18]",10
4,"[31336, 34155, 42699, 48919, 48957, 51656, 602...",27290,[1],8


In [6]:
# Load the ratings table stored in "positive_ratings.parquet".
ratings_path = os.path.join(INPUT_DATA_DIR, "positive_ratings.parquet")
ratings_features = cudf.read_parquet(ratings_path)
ratings_features.head()

Unnamed: 0,day,interaction,user_id,movie_id,rating,timestamp
0,4146,True,1,293,5.0,1147880044
2,4146,True,1,304,5.0,1147868828
3,4146,True,1,655,5.0,1147878820
5,4146,True,1,1062,4.0,1147868495
8,4146,True,1,1206,5.0,1147868839


## Join Ratings With Movie Features

In [7]:
# Attach the movie features to every rating row.
# The join key and join type are spelled out: "movie_id" is the only column the
# two frames share, so this matches the implicit default while protecting the
# join from silently changing if either input later gains another shared column.
joined_features = movie_features.merge(ratings_features, on="movie_id", how="inner")
joined_features.head()

Unnamed: 0,tags_unique,movie_id,genres,tags_nunique,day,interaction,user_id,rating,timestamp
0,"[968, 969, 2817, 3219, 3545, 6191, 6224, 8906,...",1226,[12],141,8358,True,235,5.0,1511746457
1,"[8179, 8750, 9676, 10495, 14493, 19001, 19815,...",3780,[6],21,8369,True,235,5.0,1512712388
2,"[1136, 3906, 4088, 5084, 5212, 5693, 6224, 810...",2183,"[2, 6, 7, 18]",48,8369,True,235,5.0,1512712359
3,"[966, 3086, 3116, 3708, 5328, 6191, 6224, 7006...",4890,"[9, 16]",119,8371,True,235,5.0,1512901853
4,"[395, 397, 401, 429, 431, 434, 640, 685, 1266,...",1238,"[3, 6, 17]",162,8358,True,235,5.0,1511746769


In [8]:
# For every joined row, draw one random position inside that row's tag list;
# rows without any tags (tags_nunique == 0) receive the sentinel -1.
#
# A seeded Generator makes the sampled search terms reproducible across
# Restart & Run All, and the draw is done in one vectorised call instead of a
# Python-level loop over every row.
SAMPLING_SEED = 42
tag_counts = joined_features["tags_nunique"].fillna(0).to_pandas().to_numpy()
rng = np.random.default_rng(SAMPLING_SEED)
sampled_indices = np.where(
    tag_counts > 0,
    # The exclusive upper bound must be >= 1, so empty rows are clamped here
    # and then overwritten with -1 by the surrounding np.where.
    rng.integers(0, np.maximum(tag_counts, 1)),
    -1,
).astype(np.int32)

In [9]:
# Look up the tag at each sampled position; rows whose sampled position is
# negative (no tags available) get 0 instead.
# NOTE(review): 0 acts as a "no search term" placeholder — confirm it cannot
# collide with a real tag id.
chosen_tags = []
for position, row_tags in zip(sampled_indices, joined_features["tags_unique"].to_pandas()):
    if position >= 0:
        chosen_tags.append(row_tags[position])
    else:
        chosen_tags.append(0)
sampled_tags = np.array(chosen_tags, dtype=np.int32)

In [10]:
# Sanity check: the sampled tags were built with dtype=np.int32.
sampled_tags.dtype

dtype('int32')

In [11]:
# Store the randomly sampled tag of each row as its "search_term" column.
joined_features["search_term"] = sampled_tags

In [12]:
# Copy the per-movie genre list under the singular name "genre"; the original
# "genres" column is dropped a few cells further down.
joined_features["genre"] = joined_features["genres"]

In [13]:
# Preview the joined frame now that "search_term" and "genre" have been added.
joined_features.head()

Unnamed: 0,tags_unique,movie_id,genres,tags_nunique,day,interaction,user_id,rating,timestamp,search_term,genre
0,"[968, 969, 2817, 3219, 3545, 6191, 6224, 8906,...",1226,[12],141,8358,True,235,5.0,1511746457,62082,[12]
1,"[8179, 8750, 9676, 10495, 14493, 19001, 19815,...",3780,[6],21,8369,True,235,5.0,1512712388,19001,[6]
2,"[1136, 3906, 4088, 5084, 5212, 5693, 6224, 810...",2183,"[2, 6, 7, 18]",48,8369,True,235,5.0,1512712359,9603,"[2, 6, 7, 18]"
3,"[966, 3086, 3116, 3708, 5328, 6191, 6224, 7006...",4890,"[9, 16]",119,8371,True,235,5.0,1512901853,3086,"[9, 16]"
4,"[395, 397, 401, 429, 431, 434, 640, 685, 1266,...",1238,"[3, 6, 17]",162,8358,True,235,5.0,1511746769,35598,"[3, 6, 17]"


In [14]:
# Remove the columns that are no longer needed: the raw tag data has been
# condensed into "search_term" and "genres" has been copied to "genre".
columns_to_drop = ["tags_unique", "tags_nunique", "interaction", "rating", "genres"]
joined_features = joined_features.drop(labels=columns_to_drop, axis=1)

In [15]:
# Order all interactions chronologically before they are grouped.
# NOTE(review): the later groupby "collect" and the "last element is the target"
# logic appear to rely on this order being kept inside each group — confirm that
# the cuDF collect aggregation guarantees it.
joined_features = joined_features.sort_values("timestamp")

In [16]:
# Preview the frame after dropping the intermediate columns and sorting by timestamp.
joined_features.head()

Unnamed: 0,movie_id,day,user_id,timestamp,search_term,genre
171547,47,0,2262,789652009,20317,"[15, 18]"
1008717,11,385,13424,822873600,38219,"[6, 9, 16]"
1008725,57,385,13424,822873600,72236,[9]
3283303,32,385,42937,822873600,40197,"[15, 17, 18]"
7880224,1,385,102689,822873600,70854,"[3, 4, 5, 6, 10]"


In [17]:
# Sanity check: inspect the dtype of the "search_term" column inside the frame.
joined_features["search_term"].dtype

dtype('int32')

In [18]:
# Build one row per (user_id, day) pair: every feature is gathered into a list,
# and movie_id is additionally counted so the group size is available.
session_aggregations = {
    "search_term": "collect",
    "genre": "collect",
    "timestamp": "collect",
    "movie_id": ["collect", "count"],
}
sessions = joined_features.groupby(["user_id", "day"])
training_examples = sessions.agg(session_aggregations)

In [19]:
# Preview the grouped frame; its columns are (feature, aggregation) pairs.
training_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_term,genre,timestamp,movie_id,movie_id
Unnamed: 0_level_1,Unnamed: 1_level_1,collect,collect,collect,collect,count
user_id,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,4146,"[40280, 34864, 60921, 55756, 53948, 968, 2649,...","[[3, 10], [9, 17, 18], [3, 9, 19], [3, 4, 5, 6...","[1147868053, 1147868097, 1147868414, 114786846...","[5841, 1592, 1218, 6259, 3353, 1062, 6589, 384...",39
2,4071,"[70987, 21602, 55885, 217, 26361, 361, 38094, ...","[[3, 10], [6, 16], [2, 3, 17, 18], [9, 17, 18]...","[1141415528, 1141415566, 1141415576, 114141558...","[5841, 493, 1339, 1592, 2550, 150, 234, 4781, ...",113
3,7521,"[63271, 34088, 581, 28225, 491, 34656, 41947, ...","[[6, 9, 16, 19], [7, 12, 18], [3, 4, 5, 6, 10]...","[1439472199, 1439472203, 1439472215, 143947222...","[352, 586, 1, 2481, 258, 315, 1167, 523, 12217...",248
3,7688,"[69734, 68604, 71594, 56924, 32424, 31525, 668...","[[7, 9, 17, 18], [2, 9, 19], [6, 16], [8], [20...","[1453904021, 1453904031, 1453904046, 145390404...","[1176, 1178, 10678, 9777, 11446, 11930, 10407,...",18
3,8045,"[38644, 43776, 43440, 57466, 45557, 13968, 170...","[[7, 15, 18], [2, 18], [6, 7, 18], [4, 17], [6...","[1484753654, 1484753762, 1484753766, 148475380...","[1063, 29365, 3908, 726, 763, 110, 213, 29375,...",31


In [20]:
# Number of (user_id, day) groups that contain more than one movie, i.e. the
# groups that survive the filter applied in the next cell.
len(training_examples[training_examples[("movie_id", "count")] > 1])

541872

In [21]:
# Keep only the groups with more than one movie, so that one movie can serve as
# the target while at least one other remains in the history.
has_multiple_movies = training_examples[("movie_id", "count")] > 1
training_examples = training_examples[has_multiple_movies]

In [22]:
# The prediction target of each group is the last entry of its collected
# movie_id list (list.get(-1)).
# NOTE(review): this is the most recent movie only if the collect aggregation
# kept the timestamp order established by the earlier sort_values — confirm.
training_examples["target_item"] = training_examples[("movie_id", "collect")].list.get(-1)

In [23]:
# Re-insert the target under the one-element tuple key ("target_item",) and then
# drop the "target_item" label.
# NOTE(review): presumably a workaround for the two-level column index, so that
# the "_".join flattening in the next cell yields exactly "target_item" — confirm
# against cuDF's MultiIndex column behaviour before changing this.
training_examples[("target_item",)] = training_examples["target_item"]
training_examples = training_examples.drop(labels="target_item", axis=1)

In [24]:
# Collapse the multi-level column labels into plain strings: join the levels
# with "_" and turn the "_collect" suffix into a plural "s"
# (e.g. ("movie_id", "collect") -> "movie_ids", ("movie_id", "count") -> "movie_id_count").
flattened_names = []
for column_name in training_examples.columns:
    joined_label = "_".join(column_name)
    flattened_names.append(joined_label.replace("_collect", "s"))
training_examples.columns = flattened_names

In [25]:
# Preview the frame with flattened column names and the new target_item column.
training_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,target_item
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[40280, 34864, 60921, 55756, 53948, 968, 2649,...","[[3, 10], [9, 17, 18], [3, 9, 19], [3, 4, 5, 6...","[1147868053, 1147868097, 1147868414, 114786846...","[5841, 1592, 1218, 6259, 3353, 1062, 6589, 384...",39,7237
2,4071,"[70987, 21602, 55885, 217, 26361, 361, 38094, ...","[[3, 10], [6, 16], [2, 3, 17, 18], [9, 17, 18]...","[1141415528, 1141415566, 1141415576, 114141558...","[5841, 493, 1339, 1592, 2550, 150, 234, 4781, ...",113,2061
3,7521,"[63271, 34088, 581, 28225, 491, 34656, 41947, ...","[[6, 9, 16, 19], [7, 12, 18], [3, 4, 5, 6, 10]...","[1439472199, 1439472203, 1439472215, 143947222...","[352, 586, 1, 2481, 258, 315, 1167, 523, 12217...",248,24542
3,7688,"[69734, 68604, 71594, 56924, 32424, 31525, 668...","[[7, 9, 17, 18], [2, 9, 19], [6, 16], [8], [20...","[1453904021, 1453904031, 1453904046, 145390404...","[1176, 1178, 10678, 9777, 11446, 11930, 10407,...",18,4240
3,8045,"[38644, 43776, 43440, 57466, 45557, 13968, 170...","[[7, 15, 18], [2, 18], [6, 7, 18], [4, 17], [6...","[1484753654, 1484753762, 1484753766, 148475380...","[1063, 29365, 3908, 726, 763, 110, 213, 29375,...",31,9335


In [26]:
def remove_last_from_col(df, field_name):
    """Drop the final element from every list stored in ``df[field_name]``.

    The column is replaced in place on ``df`` and ``df`` is also returned, so
    the call can be used either for its side effect or in an assignment.

    Parameters
    ----------
    df : DataFrame-like
        Object supporting ``df[field_name]`` get/set whose column exposes
        ``to_pandas()`` (a cuDF DataFrame in this notebook).
    field_name : str
        Name of the list-valued column to trim.

    Returns
    -------
    The same ``df`` object, with ``field_name`` holding the trimmed lists.
    """
    rows = df[field_name].to_pandas()
    # Fill a pre-allocated 1-D object array slot by slot. Handing the nested
    # lists straight to np.array(..., dtype=object) would silently build a 2-D
    # array whenever every trimmed list happens to have the same length.
    trimmed = np.empty(len(rows), dtype=object)
    for position, values in enumerate(rows):
        trimmed[position] = values[:-1]
    df[field_name] = trimmed
    return df

In [27]:
# The last entry of every collected list belongs to the target interaction,
# so strip it from each history column.
for list_column in ("search_terms", "genres", "timestamps", "movie_ids"):
    training_examples = remove_last_from_col(training_examples, list_column)

In [28]:
# One movie per group was removed from the history lists, so reduce the count
# by one to match.
# NOTE: re-running this cell on its own subtracts one again.
training_examples["movie_id_count"] = training_examples["movie_id_count"] - 1

In [29]:
def flatten_genres(g):
    """Merge a sequence of genre lists into one flat 1-D array.

    Returns an empty Python list when ``g`` contains no elements; otherwise
    returns the concatenation of all inner sequences as a NumPy array.
    """
    if len(g) == 0:
        return []
    return np.concatenate(g).ravel()

# Each row of "genres" is a list of per-movie genre lists; flatten every row
# into a single list of genre ids.
nested_genres = training_examples["genres"].to_pandas()
training_examples["genres"] = nested_genres.map(flatten_genres)

In [30]:
# Preview the final training examples: trimmed history lists, flattened genres,
# the decremented movie count and the target item.
training_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,target_item
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[40280, 34864, 60921, 55756, 53948, 968, 2649,...","[3, 10, 9, 17, 18, 3, 9, 19, 3, 4, 5, 6, 6, 9,...","[1147868053, 1147868097, 1147868414, 114786846...","[5841, 1592, 1218, 6259, 3353, 1062, 6589, 384...",38,7237
2,4071,"[70987, 21602, 55885, 217, 26361, 361, 38094, ...","[3, 10, 6, 16, 2, 3, 17, 18, 9, 17, 18, 2, 3, ...","[1141415528, 1141415566, 1141415576, 114141558...","[5841, 493, 1339, 1592, 2550, 150, 234, 4781, ...",112,2061
3,7521,"[63271, 34088, 581, 28225, 491, 34656, 41947, ...","[6, 9, 16, 19, 7, 12, 18, 3, 4, 5, 6, 10, 2, 1...","[1439472199, 1439472203, 1439472215, 143947222...","[352, 586, 1, 2481, 258, 315, 1167, 523, 12217...",247,24542
3,7688,"[69734, 68604, 71594, 56924, 32424, 31525, 668...","[7, 9, 17, 18, 2, 9, 19, 6, 16, 8, 20, 6, 10, ...","[1453904021, 1453904031, 1453904046, 145390404...","[1176, 1178, 10678, 9777, 11446, 11930, 10407,...",17,4240
3,8045,"[38644, 43776, 43440, 57466, 45557, 13968, 170...","[7, 15, 18, 2, 18, 6, 7, 18, 4, 17, 6, 7, 9, 7...","[1484753654, 1484753762, 1484753766, 148475380...","[1063, 29365, 3908, 726, 763, 110, 213, 29375,...",30,9335


In [31]:
# Check the column dtypes before the frame is written to disk.
training_examples.dtypes

search_terms       list
genres             list
timestamps         list
movie_ids          list
movie_id_count    int32
target_item       int32
dtype: object

In [32]:
# Persist the training examples next to the input data.
retrieval_training_path = os.path.join(INPUT_DATA_DIR, "retrieval_training.parquet")
training_examples.to_parquet(retrieval_training_path)

In [33]:
# Re-initialize the RAPIDS Memory Manager with managed memory switched off again,
# undoing the managed_memory=True setting made at the top of the notebook.
rmm.reinitialize(managed_memory=False)