In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

### Download and Extract Dataset

In [2]:
# External dependencies
import os

import cudf  # cuDF is an implementation of Pandas-like Dataframe on GPU
import inflection

from nvtabular.utils import download_file

In [3]:
# Folder that holds the downloaded and derived data. The INPUT_DATA_DIR
# environment variable takes precedence; otherwise fall back to ./data/.
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", os.path.expanduser("./data/"))

In [4]:
# Source archive and the local path it is saved under.
movielens_url = "http://files.grouplens.org/datasets/movielens/ml-25m.zip"
movielens_archive = os.path.join(INPUT_DATA_DIR, "ml-25m.zip")

# NOTE(review): redownload=False presumably skips the transfer when the archive
# is already on disk -- confirm against nvtabular.utils.download_file.
download_file(movielens_url, movielens_archive, redownload=False)

downloading ml-25m.zip: 262MB [00:12, 20.7MB/s]                            
unzipping files: 100%|██████████| 8/8 [00:05<00:00,  1.52files/s]


### Movie Genres

In [5]:
# Read the movie metadata into a GPU dataframe.
movies_csv_path = os.path.join(INPUT_DATA_DIR, "ml-25m/movies.csv")
movies = cudf.read_csv(movies_csv_path)

# Convert every column name with inflection.underscore (snake_case).
movies.columns = list(map(inflection.underscore, movies.columns))
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Turn the pipe-delimited genre string into a list column.
genre_lists = movies["genres"].str.split("|")
movies["genres"] = genre_lists

# The title is not used as a feature; key the frame by movie_id for the join below.
movies = movies.drop("title", axis=1).set_index("movie_id")
movies.head()

Unnamed: 0_level_0,genres
movie_id,Unnamed: 1_level_1
1,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,"[Adventure, Children, Fantasy]"
3,"[Comedy, Romance]"
4,"[Comedy, Drama, Romance]"
5,[Comedy]


### Movie Tags

In [7]:
# Read the user-supplied tags into a GPU dataframe.
tags_csv_path = os.path.join(INPUT_DATA_DIR, "ml-25m/tags.csv")
tags = cudf.read_csv(tags_csv_path)

# Convert every column name with inflection.underscore (snake_case).
tags.columns = list(map(inflection.underscore, tags.columns))
tags.head()

Unnamed: 0,user_id,movie_id,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [8]:
# Only movie_id and tag are needed; drop the per-user columns in one call.
tags = tags.drop(["user_id", "timestamp"], axis=1)

# Per movie: the set of distinct tags and how many distinct tags there are.
# The grouped result is already indexed by movie_id, so no reset_index /
# set_index round trip is needed before joining on that index.
tags = tags.groupby("movie_id").agg({"tag": ["unique", "nunique"]})

# The aggregation yields two-level column labels ("tag", "unique") and
# ("tag", "nunique"); joining each pair with "s_" flattens them to
# "tags_unique" and "tags_nunique".
tags.columns = tags.columns.map("s_".join)
tags.head()

Unnamed: 0_level_0,tags_unique,tags_nunique
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[2009 reissue in Stereoscopic 3-D, 3D, 55 movi...",126
2,"[Adaptation of Book, CLV, Children, Chris Van ...",44
3,"[Ann Margaret, Burgess Meredith, CLV, Daryl Ha...",23
4,"[CLV, based on novel or book, characters, chic...",8
5,"[CLV, Comedy, Diane Keaton, Fantasy, Steve Mar...",20


In [9]:
# Both frames are indexed by movie_id; an outer join keeps every movie_id that
# appears in either frame.
raw_movie_features = movies.join(
    tags,
    how="outer",
)

In [10]:
# Move movie_id from the index back into a regular column so it is stored
# alongside the features.
raw_movie_features = raw_movie_features.reset_index()
raw_movie_features.head()

Unnamed: 0,movie_id,genres,tags_unique,tags_nunique
0,128721,[Drama],"[emigration, prostitute]",2
1,128740,[Documentary],[adoption],1
2,128723,[Drama],[biography],1
3,128762,"[Drama, Thriller]","[Berlinale, bourgeoisie, foreigner, holiday, l...",10
4,128780,[(no genres listed)],"[bank robbery, car chase, fishing, johann seba...",8


In [11]:
# Write the combined genre/tag features to a parquet file in INPUT_DATA_DIR.
raw_features_path = os.path.join(INPUT_DATA_DIR, "raw_movie_features.parquet")
raw_movie_features.to_parquet(raw_features_path)