In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Getting Started Outbrain: Download and Convert

## Overview

The Outbrain dataset was published in the [Kaggle Outbrain click prediction](https://www.kaggle.com/c/outbrain-click-prediction) competition, where the ‘Kagglers’ were challenged to predict on which ads and other forms of sponsored content its global users would click. The preprocessing and feature engineering pipeline of one of the top finishers is taken into consideration here, and this pipeline was restructured using NVTabular and cuDF. The Kaggle Outbrain click prediction challenge datasets can be downloaded from [here](https://www.kaggle.com/c/outbrain-click-prediction/data).

## Getting Started

In [2]:
import os
import glob
import time
import datetime
import math

import cupy
import cudf
import rmm
from numba import cuda
import numpy as np

import nvtabular as nvt
from nvtabular.io import Shuffle
from nvtabular.ops import Normalize, FillMedian, FillMissing, Categorify, LogOp, JoinExternal, Dropna, LambdaOp, JoinGroupby, HashBucket, TargetEncoding, get_embedding_sizes, Rename
from nvtabular.ops.column_similarity import ColumnSimilarity

from nvtabular import ColumnGroup

First, we set where the dataset should be saved once processed (OUTPUT_BUCKET_FOLDER), as well as where the dataset originally resides (DATA_BUCKET_FOLDER).

In [3]:
# Destination for the processed dataset; the OUTPUT_DATA_DIR environment
# variable overrides the default.
OUTPUT_BUCKET_FOLDER = os.getenv("OUTPUT_DATA_DIR", "./preprocessed/")

# Location of the original Outbrain CSV files; the INPUT_DATA_DIR environment
# variable overrides the default.
DATA_BUCKET_FOLDER = os.getenv("INPUT_DATA_DIR", "/datasets/outbrain/")

## Preparing Our Dataset

Here, we merge the component tables of our dataset into a single data frame, using [cuDF](https://github.com/rapidsai/cudf), which is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. We do this because NVTabular applies a workflow to a single table. We also re-initialize managed memory. `rmm.reinitialize()` provides an easy way to initialize RMM (RAPIDS Memory Manager) with specific memory resource options across multiple devices. The reason we re-initialize managed memory here is to allow us to perform a memory-intensive merge operation. Note that dask-cudf can also be used here.

In [4]:
from cudf import read_csv

# Use managed memory for device memory allocation so that the
# memory-intensive merges below can run.
rmm.reinitialize(managed_memory=True)

# Tokens that are parsed as missing values in every input table.
_NA_VALUES = ['\\N', '']


def _read_table(filename):
    """Read one CSV file from DATA_BUCKET_FOLDER into a cuDF DataFrame.

    The path is built with os.path.join so it is correct whether or not
    DATA_BUCKET_FOLDER ends with a path separator (plain string
    concatenation yields a wrong path when the INPUT_DATA_DIR environment
    variable is set without a trailing slash).

    Parameters
    ----------
    filename : str
        Name of the CSV file inside DATA_BUCKET_FOLDER.

    Returns
    -------
    The DataFrame returned by cudf.read_csv, with '\\N' and '' treated as
    missing values.
    """
    return read_csv(os.path.join(DATA_BUCKET_FOLDER, filename), na_values=_NA_VALUES)


# documents_meta is joined twice (once for the displayed document and once
# for the promoted document), so it is read a single time and reused.
documents_meta = _read_table('documents_meta.csv')

# Merge all the CSV files together. The other tables are read inline so no
# extra references to them are kept once the merge chain has finished.
merged = (
    _read_table('clicks_train.csv')
    # attach the event row for each display
    .merge(_read_table('events.csv'), on="display_id", how="left", suffixes=('', '_event'))
    # attach the promoted-content row for each ad
    .merge(_read_table('promoted_content.csv'), on="ad_id", how="left", suffixes=('', '_promo'))
    # metadata of the document referenced by the event
    .merge(documents_meta, on="document_id", how="left")
    # metadata of the promoted document; clashing columns get the "_promo" suffix
    .merge(documents_meta, left_on="document_id_promo", right_on="document_id", how="left", suffixes=('', "_promo"))
)

## Splitting into train and validation datasets

We use a time-stratified sample to create a validation set that is more recent, and save both our train and validation sets to parquet files to be read by NVTabular. Note that you should run the cell below only once, then save your `train` and `valid` data frames as parquet files. If you want to rerun this notebook you might end up with a different train-validation split each time because samples are drawn from a uniform distribution.

In [6]:
# Split the merged table into a training and a validation partition.

# Bucket each event by day: the timestamp is divided down by
# 1000 / 60 / 60 / 24 and truncated to an integer
# (presumably milliseconds -> whole days; confirm against the dataset docs).
merged['day_event'] = (merged['timestamp'] / 1000 / 60 / 60 / 24).astype(int)

# One uniform random draw per row, used to sample rows out of the early days.
random_state = cudf.Series(cupy.random.uniform(size=merged.shape[0]))

# Partition 1 (train): rows with day_event <= 10 whose draw is above 0.2.
# Partition 0 (valid): every other row, i.e. all later days plus the
# remaining sample of the early days.
valid_set, train_set = merged.scatter_by_map(
    ((merged['day_event'] <= 10) & (random_state > 0.2)).astype(int)
)

In [7]:
# Preview the first rows of the training partition as a sanity check.
train_set.head(n=5)

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,document_id_promo,campaign_id,advertiser_id,source_id,publisher_id,publish_time,source_id_promo,publisher_id_promo,publish_time_promo,day_event
0,52774,99112,0,528f0b082d474d,1788099,4141173,2.0,US>TX>600,1146355,12918,2722,10345.0,440.0,2016-06-13 21:00:00,3445.0,,,0
1,64722,155559,0,c8757cadd92f60,1794963,5272727,2.0,US>CA>807,1382920,79,53,1675.0,236.0,2016-06-14 00:00:00,3826.0,,2016-04-29 00:00:00,0
2,9341,240836,0,8abc1ee256a7ae,1783066,645068,3.0,US>MN>702,747296,25854,1809,105.0,206.0,2016-06-13 19:00:00,6523.0,,2015-06-19 09:00:00,0
3,11810,137369,0,cdad8696ea0329,1764997,821919,2.0,IN>25,1225363,17657,2988,482.0,65.0,2016-06-13 12:00:00,11604.0,,,0
4,60146,211592,0,cb29e30bcbcaa2,1783281,4826603,1.0,US>CA>807,1535449,24176,2352,6238.0,740.0,2016-06-18 13:00:00,11097.0,,,0


We save the dataset to disk.

In [8]:
# Make sure the output directory exists before writing; on a fresh run the
# default "./preprocessed/" folder has not been created yet. The check is
# skipped for an empty value and for URL-style targets (containing "://"),
# where creating a local directory would be wrong.
if OUTPUT_BUCKET_FOLDER and "://" not in OUTPUT_BUCKET_FOLDER:
    os.makedirs(OUTPUT_BUCKET_FOLDER, exist_ok=True)

train_filename = os.path.join(OUTPUT_BUCKET_FOLDER, "train_gdf.parquet")
valid_filename = os.path.join(OUTPUT_BUCKET_FOLDER, "valid_gdf.parquet")

# Write both partitions as uncompressed parquet files.
train_set.to_parquet(train_filename, compression=None)
valid_set.to_parquet(valid_filename, compression=None)

# Drop the references to the large data frames so their memory can be
# reclaimed; the names stay defined but now hold None.
merged = train_set = valid_set = None

In [9]:
# The memory-intensive merge and split are finished, so re-initialize RMM
# with managed memory turned off again.
rmm.reinitialize(
    managed_memory=False,
)