In [1]:
# Input/output locations: the raw CSV files are read from DATA_BUCKET_FOLDER
# and the intermediate parquet files are written under OUTPUT_BUCKET_FOLDER.
DATA_BUCKET_FOLDER = "/dataset/"
OUTPUT_BUCKET_FOLDER = "./preprocessed/"

# Evaluation switches. Neither flag is read by the cells visible in this part
# of the notebook -- NOTE(review): confirm whether later cells depend on them.
evaluation = evaluation_verbose = False

In [2]:
import numpy as np
import pandas as pd
import math
import datetime
import time
import glob

import cudf
import cupy
from numba import cuda
import rmm

import nvtabular as nvt
from nvtabular import io as nvt_io
from nvtabular import ops as ops
from nvtabular.ops import Normalize, FillMissing, Categorify, LogOp, JoinExternal, Dropna, LambdaOp, JoinGroupby, Filter, HashBucket, FillMedian
from nvtabular.column_similarity import ColumnSimilarity

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


In [4]:
%%time
import cupy
from cudf import read_csv
import rmm
rmm.reinitialize(managed_memory=True)  
# Merge all the CSV files together
documents_meta = read_csv(DATA_BUCKET_FOLDER + 'documents_meta.csv')
merged = (read_csv(DATA_BUCKET_FOLDER+'clicks_train.csv')
             .merge(read_csv(DATA_BUCKET_FOLDER + 'events.csv'), on="display_id", how="left", suffixes=('', '_event'))
             .merge(read_csv(DATA_BUCKET_FOLDER+'promoted_content.csv'), on="ad_id", how="left", suffixes=('', '_promo'))
             .merge(documents_meta, on="document_id", how="left")
             .merge(documents_meta, left_on="document_id_promo", right_on="document_id", how="left", suffixes=('', "_promo"))
            )

CPU times: user 38 µs, sys: 55 µs, total: 93 µs
Wall time: 106 µs


In [7]:
# Split the merged dataset into training and validation sets.
# This is a time-based + random hold-out (not a stratified split): a row goes
# to the training set only if its event day is <= 10 AND its random draw is
# > 0.2; every other row (all later days plus ~20% of the earlier ones) goes
# to the validation set.
SPLIT_SEED = 42
cupy.random.seed(SPLIT_SEED)  # make the random part of the split reproducible

# `timestamp` is divided down as milliseconds -> whole days (truncated by the
# int cast). NOTE(review): assumes the raw timestamp is in milliseconds.
merged['day_event'] = (merged['timestamp'] / 1000 / 60 / 60 / 24).astype(int)
random_state = cudf.Series(cupy.random.uniform(size=len(merged)))

# 1 -> training row, 0 -> validation row; partition 0 is unpacked first.
is_train = ((merged.day_event <= 10) & (random_state > 0.2)).astype(int)
# Ask for exactly two partitions so the unpacking below does not depend on
# both labels actually occurring in the data.
valid_set, train_set = merged.scatter_by_map(is_train, map_size=2)

train_set.to_parquet(OUTPUT_BUCKET_FOLDER+"train_gdf.parquet", compression=None)
valid_set.to_parquet(OUTPUT_BUCKET_FOLDER+"valid_gdf.parquet", compression=None)

# Drop every reference to the large GPU frames created here (the original
# kept `valid_set` and `random_state` alive); later cells read the parquet
# files back instead of using these objects.
merged = train_set = valid_set = random_state = is_train = None

In [17]:
# Release GPU objects that were allocated while RMM was in managed-memory
# mode before re-initialising it: using memory from a previous RMM
# initialisation is documented as undefined behaviour. None of these names is
# read again by the cells visible in this notebook section.
# NOTE(review): confirm no later cell still needs `documents_meta`.
documents_meta = valid_set = random_state = None
rmm.reinitialize(managed_memory=False)

In [18]:
# Load the three per-document confidence tables (categories, topics,
# entities) in that order; each becomes its own cuDF DataFrame.
(documents_categories_cudf,
 documents_topics_cudf,
 documents_entities_cudf) = (
    cudf.read_csv(DATA_BUCKET_FOLDER + csv_name)
    for csv_name in ('documents_categories.csv',
                     'documents_topics.csv',
                     'documents_entities.csv')
)

In [19]:
# Helper used to turn the document categories/topics/entities tables into
# cupy sparse matrices.
def df_to_coo(df, row="document_id", col=None, data="confidence_level"):
    """Build a ``cupy.sparse.coo_matrix`` from three columns of ``df``.

    Parameters
    ----------
    df : DataFrame-like
        Table indexed by column name; ``.values`` of each selected column is
        passed straight to ``coo_matrix``.
    row : str
        Name of the column holding the row indices.
    col : str
        Name of the column holding the column indices. The ``None`` default
        is only a placeholder: every call in this notebook passes it.
    data : str
        Name of the column holding the matrix values.

    Returns
    -------
    cupy.sparse.coo_matrix
        Matrix built from ``(values, (row_indices, col_indices))``. No
        ``shape`` is passed, so the shape is left for ``coo_matrix`` to
        determine.
    """
    values = df[data].values
    row_indices = df[row].values
    col_indices = df[col].values
    return cupy.sparse.coo_matrix((values, (row_indices, col_indices)))

# Category and topic ids are used directly as the column indices.
categories = df_to_coo(documents_categories_cudf, col="category_id")
topics = df_to_coo(documents_topics_cudf, col="topic_id")

# Entity ids are first replaced (in place) by their categorical codes so
# they can serve as integer column indices.
documents_entities_cudf['entity_id'] = (
    documents_entities_cudf['entity_id'].astype("category").cat.codes
)
entities = df_to_coo(documents_entities_cudf, col="entity_id")

# The source frames are no longer needed once the matrices exist; drop the
# references to them.
documents_categories_cudf = documents_topics_cudf = documents_entities_cudf = None

In [20]:
# Number of hash buckets per column; this mapping is handed to HashBucket
# in the preprocessing stage of the workflow.
HASH_BUCKET_SIZES = dict(
    document_id=300_000,
    ad_id=250_000,
    document_id_promo=100_000,
    source_id_promo=4_000,
    source_id=4_000,
    advertiser_id=2_500,
    publisher_id_promo=1_000,
    publisher_id=1_000,
)

In [21]:
# Offset added to the `timestamp` column before it is interpreted as
# milliseconds since the Unix epoch.
TIMESTAMP_DELTA = 1465876799998


def calculate_delta(col, gdf):
    """Return the number of whole days between an event and ``col``.

    Parameters
    ----------
    col : Series
        Publish times; cast to ``datetime64[ns]``.
    gdf : DataFrame
        Frame providing the ``timestamp`` column; ``timestamp`` plus
        ``TIMESTAMP_DELTA`` is cast to ``datetime64[ms]`` to get the event
        time.

    Returns
    -------
    Series
        ``(event time - col)`` in days, with every value that is negative or
        larger than ``10 * 365`` replaced by 0 (via multiplication with the
        two boolean masks).
    """
    event_time = (gdf['timestamp'] + TIMESTAMP_DELTA).astype('datetime64[ms]')
    published_at = col.astype('datetime64[ns]')
    age_in_days = (event_time - published_at).dt.days

    # Multiplying by the masks zeroes out-of-range values instead of
    # dropping the rows.
    not_negative = age_in_days >= 0
    under_cap = age_in_days <= 10 * 365
    return age_in_days * not_negative * under_cap

In [None]:
%%time
CATEGORICAL_COLUMNS =['display_id', 'ad_id', 'uuid', 'document_id','platform', 'geo_location', 'document_id_promo', 'campaign_id',
       'advertiser_id', 'source_id', 'publisher_id', 'publish_time','source_id_promo', 'publisher_id_promo', 'publish_time_promo']
CONTINUOUS_COLUMNS = ['timestamp']

workflow = nvt.Workflow(
    cat_names=CATEGORICAL_COLUMNS,
    cont_names= CONTINUOUS_COLUMNS,
    label_name=['clicked'])

# Keys for which click statistics (sum and count of `clicked`) are joined in
# and a smoothed click-through rate is derived.
CTR_GROUPBY_KEYS = ['ad_id', 'source_id', 'document_id_promo',
                    'publisher_id', 'advertiser_id', 'campaign_id']

# The statistic columns referenced below: "<key>_count" followed by
# "<key>_clicked_sum" for each key, in key order.
CTR_STAT_COLUMNS = [key + suffix
                    for key in CTR_GROUPBY_KEYS
                    for suffix in ('_count', '_clicked_sum')]

# Smoothing used for the CTR: (clicks + prior * weight) / (views + weight).
CTR_PRIOR = 0.1
CTR_PRIOR_WEIGHT = 10


def _smoothed_ctr_op(key):
    """Return the LambdaOp computing the smoothed CTR for one group-by key.

    The op reads ``<key>_clicked_sum`` as its input column and
    ``<key>_count`` from the frame, and evaluates
    ``(clicked_sum + CTR_PRIOR * CTR_PRIOR_WEIGHT) / (count + CTR_PRIOR_WEIGHT)``.
    ``replace=False`` keeps the input column in place.
    """
    count_col = key + '_count'
    return LambdaOp(
        op_name='ctr',
        f=lambda col, gdf: (col + CTR_PRIOR * CTR_PRIOR_WEIGHT) / (gdf[count_col] + CTR_PRIOR_WEIGHT),
        columns=[key + '_clicked_sum'], replace=False)


workflow.add_feature([
    # Prefixes of geo_location: first 2 characters and first 5 characters.
    # NOTE(review): presumably country and country+state codes -- confirm
    # against the geo_location format.
    LambdaOp(
        op_name='country',
        f=lambda col, gdf: col.str.slice(0,2),
        columns=['geo_location'], replace=False),
    LambdaOp(
        op_name='state',
        f=lambda col, gdf: col.str.slice(0,5),
        columns=['geo_location'], replace=False),

    # Days between the event and each publish time (see calculate_delta).
    LambdaOp(
        op_name='days_since_published',
        f=calculate_delta,
        columns=['publish_time','publish_time_promo'], replace=False),

    FillMedian(columns=['publish_time_days_since_published','publish_time_promo_days_since_published']),

    Dropna(columns=['geo_location', 'platform']),

    # Sum and count of `clicked` per key.
    JoinGroupby(columns=list(CTR_GROUPBY_KEYS),
        cont_names=['clicked'], stats=['sum','count']),

    # Smoothed CTR per key, computed before the statistics are
    # log-transformed and normalized below.
    *[_smoothed_ctr_op(key) for key in CTR_GROUPBY_KEYS],

    # Take the log of the views and clicks.
    LogOp(columns=list(CTR_STAT_COLUMNS)),

    # Normalize the views and clicks.
    Normalize(columns=list(CTR_STAT_COLUMNS)),

    Dropna(columns=list(CTR_GROUPBY_KEYS)),
    ])

# Preprocessing stage: categorify the location/user columns (with
# freq_threshold=10), then hash-bucket the id columns listed in
# HASH_BUCKET_SIZES.
categorify_op = Categorify(
    columns=['geo_location_country','geo_location','geo_location_state', 'uuid'],
    freq_threshold=10)
hash_bucket_op = HashBucket(HASH_BUCKET_SIZES)

workflow.add_preprocess([categorify_op, hash_bucket_op])

# One tf-idf similarity feature between `document_id` and
# `document_id_promo` per sparse matrix, added in the order
# categories, topics, entities.
for sim_name, sim_matrix in (
        ("doc_event_doc_ad_sim_categories", categories),
        ("doc_event_doc_ad_sim_topics", topics),
        ("doc_event_doc_ad_sim_entities", entities)):
    op = ColumnSimilarity(sim_name, "document_id", sim_matrix, "document_id_promo",
                          metric='tfidf', on_device=False)
    workflow.add_feature(op)

workflow.finalize()

# Inputs are the parquet files written by the split cell under
# OUTPUT_BUCKET_FOLDER.
train_dataset = nvt.Dataset(OUTPUT_BUCKET_FOLDER+'train_gdf.parquet', part_mem_fraction=0.12)
valid_dataset = nvt.Dataset(OUTPUT_BUCKET_FOLDER+'valid_gdf.parquet', part_mem_fraction=0.12)

# Outputs go under the same configurable folder instead of a hard-coded
# './preprocessed/' (identical paths with the current configuration).
# Statistics are recorded on the training set only (record_stats=True); the
# validation pass runs with record_stats=False and is not shuffled.
workflow.apply(train_dataset, record_stats=True, output_path=OUTPUT_BUCKET_FOLDER + 'train/', shuffle=True, out_files_per_proc=1)
workflow.apply(valid_dataset, record_stats=False, output_path=OUTPUT_BUCKET_FOLDER + 'valid/', shuffle=False, out_files_per_proc=1)

In [None]:
# Read the processed training set back; the location is derived from
# OUTPUT_BUCKET_FOLDER rather than hard-coded (same path with the current
# configuration, './preprocessed/train/*.parquet').
train_df = cudf.read_parquet(OUTPUT_BUCKET_FOLDER + 'train/*.parquet')