In [1]:
import os
# Pin the notebook to a single GPU. Both variables are set before any
# CUDA-backed library is imported further down in the notebook.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # enumerate devices by PCI bus id
    "CUDA_VISIBLE_DEVICES": "3",         # expose only device index 3 to this process
})

In [2]:
# Run-mode switches (not referenced by any cell visible in this notebook section).
evaluation = evaluation_verbose = False

# Folder locations, relative to the notebook's working directory.
DATA_BUCKET_FOLDER = "./dataset/"         # raw input CSV files
OUTPUT_BUCKET_FOLDER = "./preprocessed/"  # preprocessed output

In [3]:
import numpy as np
import pandas as pd
import math
import datetime
import time
import glob

import cudf
import cupy
from numba import cuda
import rmm

import nvtabular as nvt
from nvtabular.ops import Normalize, FillMissing, Categorify, LogOp, JoinExternal, Dropna, LambdaOp, JoinGroupby, Filter, HashBucket, FillMedian
from nvtabular.column_similarity import ColumnSimilarity

In [4]:
%%time
from cudf import read_csv
import rmm
# rmm.reinitialize(managed_memory=True)  

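# One-off preparation step, kept disabled for reference: it joins the clicks,
# events, promoted-content and document-metadata CSVs into a single table and
# writes that table to parquet.
# NOTE(review): this writes "train_pdf.parquet", whereas the loading cell further
# down reads 'train_df.parquet' -- confirm which file name is intended.
# documents_meta = read_csv(DATA_BUCKET_FOLDER + 'documents_meta.csv')
# train_set = (read_csv(DATA_BUCKET_FOLDER+'clicks_train.csv')
#              .merge(read_csv(DATA_BUCKET_FOLDER + 'events.csv'), on="display_id", how="left", suffixes=('', '_event'))
#              .merge(read_csv(DATA_BUCKET_FOLDER+'promoted_content.csv'), on="ad_id", how="left", suffixes=('', '_promo'))
#              .merge(documents_meta, on="document_id", how="left")
#              .merge(documents_meta, left_on="document_id_promo", right_on="document_id", how="left", suffixes=('', "_promo"))
#             )
# train_set.to_parquet("train_pdf.parquet", compression=None)
# train_set = None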

CPU times: user 13 µs, sys: 8 µs, total: 21 µs
Wall time: 31.5 µs


In [5]:
def _read_document_confidences(file_name, id_column, id_dtype):
    """Read one (document_id, <id_column>, confidence_level) CSV into a cudf DataFrame.

    The file's own header row is skipped (``skiprows=1``) and replaced by the
    explicit column names and dtypes given here.

    Args:
        file_name: CSV file name, resolved relative to ``DATA_BUCKET_FOLDER``.
        id_column: Name to give the second column.
        id_dtype: dtype to parse the second column as.

    Returns:
        The DataFrame returned by ``cudf.read_csv``.
    """
    column_names = ['document_id', id_column, 'confidence_level']
    column_dtypes = {'document_id': 'int64', id_column: id_dtype, 'confidence_level': 'float32'}
    return cudf.read_csv(
        DATA_BUCKET_FOLDER + file_name,
        names=column_names,
        dtype=column_dtypes,
        skiprows=1)


documents_categories_cudf = _read_document_confidences(
    'documents_categories.csv', 'category_id', 'int64')

documents_topics_cudf = _read_document_confidences(
    'documents_topics.csv', 'topic_id', 'int64')

# Entity ids are read as strings; they are turned into integer codes later on.
documents_entities_cudf = _read_document_confidences(
    'documents_entities.csv', 'entity_id', 'str')

In [30]:
# read in document categories/topics/entities as cupy sparse matrices
def df_to_coo(df, row="document_id", col=None, data="confidence_level"):
    """Build a cupy sparse COO matrix from three columns of ``df``.

    Args:
        df: DataFrame holding the coordinates and values (the callers in this
            notebook pass cudf DataFrames).
        row: Name of the column that supplies the matrix row indices.
        col: Name of the column that supplies the matrix column indices.
            Required -- the ``None`` default is only a placeholder.
        data: Name of the column that supplies the stored values.

    Returns:
        ``cupy.sparse.coo_matrix`` built from ``(values, (rows, cols))``. No
        explicit shape is passed, so the shape is left for cupy to infer.

    Raises:
        ValueError: If ``col`` is not given.
    """
    # Fail early with a clear message instead of the opaque lookup error that
    # ``df[None]`` would otherwise produce.
    if col is None:
        raise ValueError(
            "df_to_coo() requires `col`: the name of the column holding the matrix column indices"
        )
    values = df[data].values
    coordinates = (df[row].values, df[col].values)
    return cupy.sparse.coo_matrix((values, coordinates))

# Entity ids were read as strings; swap them (in place) for their integer
# category codes so they can serve as sparse-matrix column indices.
documents_entities_cudf['entity_id'] = (
    documents_entities_cudf['entity_id'].astype("category").cat.codes
)

# One document-by-feature matrix of confidence levels per side table.
categories = df_to_coo(documents_categories_cudf, col="category_id")
topics = df_to_coo(documents_topics_cudf, col="topic_id")
entities = df_to_coo(documents_entities_cudf, col="entity_id")

In [29]:
# Preview the first two rows of the entities table.
documents_entities_cudf.head(2)

Unnamed: 0,document_id,entity_id,confidence_level
0,1524246,367427,0.672865
1,1524246,126453,0.399114


In [None]:
# Bucket count per categorical column, handed to the HashBucket op further down.
HASH_BUCKET_SIZES = dict(
    document_id=300_000,
    ad_id=250_000,
    document_id_promo=100_000,
    source_id_promo=4_000,
    source_id=4_000,
    advertiser_id=2_500,
    publisher_id_promo=1_000,
    publisher_id=1_000,
)

In [7]:
# Load a one-million-row sample of the merged training table from parquet.
train_df = cudf.read_parquet(
    'train_df.parquet',
    num_rows=1_000_000,
)

In [12]:
# Preview the first two rows of the raw training sample.
train_df.head(2)

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,document_id_promo,campaign_id,advertiser_id,source_id,publisher_id,publish_time,source_id_promo,publisher_id_promo,publish_time_promo
0,6041,238431,0,6f3f3d1fefd794,681170,413687,2,IN>36,1598289,25757,3803,822.0,253.0,2011-05-17 00:00:00,13241.0,,
1,6041,262396,0,6f3f3d1fefd794,681170,413687,2,IN>36,1153962,13268,2757,822.0,253.0,2011-05-17 00:00:00,10982.0,,


In [13]:
# Offset added to the `timestamp` column before it is read as milliseconds.
TIMESTAMP_DELTA = 1465876799998


def calculate_delta(col, gdf):
    """Whole days between each row's event time and the date held in ``col``.

    The event time is ``gdf['timestamp'] + TIMESTAMP_DELTA`` interpreted as
    ``datetime64[ms]``; ``col`` is cast to ``datetime64[ns]``.

    Args:
        col: Column of publish dates for the rows of ``gdf``.
        gdf: DataFrame that contains the ``timestamp`` column.

    Returns:
        The day difference per row, with any value that is negative or larger
        than ``10*365`` days replaced by 0.
    """
    event_time = (gdf['timestamp'] + TIMESTAMP_DELTA).astype('datetime64[ms]')
    published_at = col.astype('datetime64[ns]')
    days = (event_time - published_at).dt.days

    # Multiplying by the boolean masks zeroes out-of-range values.
    not_negative = days >= 0
    within_ten_years = days <= 10 * 365
    return days * not_negative * within_ten_years

In [14]:
# Columns handed to the first workflow as categorical inputs.
CATEGORICAL_COLUMNS = [
    'display_id',
    'ad_id',
    'uuid',
    'document_id',
    'platform',
    'geo_location',
    'document_id_promo',
    'campaign_id',
    'advertiser_id',
    'source_id',
    'publisher_id',
    'publish_time',
    'source_id_promo',
    'publisher_id_promo',
    'publish_time_promo',
]

# Columns handed to the first workflow as continuous inputs.
CONTINUOUS_COLUMNS = ['timestamp']

In [16]:
# First workflow: derive day / country / state / days-since-published columns,
# encode `uuid`, and attach per-id click sum and count statistics. The result is
# written to ./preprocessed/pop_count/.
proc = nvt.Workflow(
    cat_names=CATEGORICAL_COLUMNS,
    cont_names= CONTINUOUS_COLUMNS,
    label_name=['clicked'])

proc.add_preprocess(
    [
    # `timestamp` divided down by 1000*60*60*24 and truncated to an integer
    # (milliseconds -> days, assuming `timestamp` is in ms -- TODO confirm).
    # replace=False keeps `timestamp`; the result appears as `timestamp_day`.
    LambdaOp(
        op_name='day',
        f=lambda col, gdf: (col / 1000 / 60 / 60 / 24).astype(int),
        columns=['timestamp'],
        replace=False
    ),
    # First two characters of `geo_location` -> `geo_location_country`.
    LambdaOp(
        op_name='country',
        f=lambda col, gdf: col.str.slice(0,2),
        columns=['geo_location'],
        replace=False
    ),
    # First five characters of `geo_location` -> `geo_location_state`.
    LambdaOp(
        op_name='state',
        f=lambda col, gdf: col.str.slice(0,5),
        columns=['geo_location'],
        replace=False
    ),
    Categorify(columns=['uuid']),
    
        # calculate_delta is applied to each publish-time column separately and
        # yields `<column>_days_since_published`.
        LambdaOp(
         op_name='days_since_published',
         f=calculate_delta,
         columns=['publish_time','publish_time_promo'],
         replace=False),
    FillMedian(columns=['publish_time_days_since_published','publish_time_promo_days_since_published'])
    ])

proc.add_feature(
    [
    Dropna(['geo_location', 'platform']),
    # Group statistics of the label `clicked` for each listed id column; the
    # resulting columns are named `<id>_clicked_sum` and `<id>_count`.
    JoinGroupby(columns=['ad_id', 'source_id', 'document_id_promo', 'publisher_id', 'advertiser_id', 'campaign_id'], 
        cont_names=['clicked'],
        stats=['sum','count'])
    ])

proc.finalize()

# Passed to nvt.Dataset as part_mem_fraction.
GPU_MEMORY_FRAC = 0.15

train_dataset = nvt.Dataset(train_df, engine='parquet',  part_mem_fraction=GPU_MEMORY_FRAC)

# Runs the workflow and writes shuffled parquet output, which a later cell
# reads back from this same folder.
proc.apply(train_dataset, apply_offline=True, record_stats=True, output_path='./preprocessed/pop_count/', shuffle=True, out_files_per_proc=1)



In [17]:
# Read back every parquet file written by the first workflow.
# Note: this rebinds `train_df`, so from here on the name holds the workflow
# output rather than the raw sample loaded earlier.
train_df = cudf.read_parquet(
    './preprocessed/pop_count/*.parquet'
)

In [18]:
# Preview the first-workflow output, including the new *_count / *_clicked_sum
# and *_days_since_published columns.
train_df.head()

Unnamed: 0,timestamp,display_id,ad_id,uuid,document_id,platform,geo_location,document_id_promo,campaign_id,advertiser_id,...,document_id_promo_count,document_id_promo_clicked_sum,publisher_id_count,publisher_id_clicked_sum,advertiser_id_count,advertiser_id_clicked_sum,campaign_id_count,campaign_id_clicked_sum,publish_time_days_since_published,publish_time_promo_days_since_published
0,12223172,128863,12032,76740,1763064,1,US>PA>504,174953,2036,1365,...,388,82,440,74,388,82,173,37,0,1041
1,19377462,182101,153193,50286,1745955,2,NZ>E7,1233659,19459,2808,...,1449,447,3754,719,3238,806,1359,435,1,74
2,15226848,150409,227875,102996,1803770,2,US,1541772,25136,3680,...,205,88,8710,2226,720,324,116,48,0,17
3,2180141,29827,90550,24176,1627419,1,FR>C1,1119697,11725,2653,...,13,4,641,120,20,6,13,4,4,74
4,9641926,107179,201778,47154,708485,2,US>MI>505,1331825,23406,3016,...,376,95,7365,1358,832,182,376,95,367,0


In [42]:
# Second workflow: add click-through-rate ratios, log-transform and normalize the
# count statistics, encode / hash the categorical columns and add three
# document-similarity features. The result is written to ./preprocessed/ctr/.
#
# NOTE(review): the `publish_time*_days_since_published` columns produced by the
# first workflow are not listed in cat_names / cont_names below; the saved output
# has 39 columns (16 + 13 + 1 label + 6 ctr + 3 similarity), so they presumably do
# not reach the final table -- confirm this is intended.
proc = nvt.Workflow(
    cat_names=['display_id', 'ad_id', 'uuid', 'document_id', 'platform', 'geo_location', 'document_id_promo', 'campaign_id', 'advertiser_id',
       'source_id', 'publisher_id', 'source_id_promo','publisher_id_promo', 'timestamp_day','geo_location_country', 'geo_location_state'],
    cont_names=['timestamp', 'ad_id_clicked_sum', 'ad_id_count', 'source_id_count', 'source_id_clicked_sum',
       'document_id_promo_count', 'document_id_promo_clicked_sum', 'publisher_id_count', 'publisher_id_clicked_sum', 'advertiser_id_count',
       'advertiser_id_clicked_sum', 'campaign_id_count','campaign_id_clicked_sum'],
    label_name=['clicked'])

# One ratio per id: `<id>_clicked_sum` divided by `<id>_count`, stored as
# `<id>_clicked_sum_ctr` (replace=False keeps the source column).
#
# NOTE(review): the saved preview of the output shows *_ctr values of roughly
# 0.64-0.85, while the first-workflow preview has sum/count pairs such as 82/388.
# That looks as if the ratio is taken after LogOp has already transformed both
# columns -- confirm the execution order of add_feature vs add_preprocess ops.
proc.add_preprocess(
    [
     LambdaOp(
         op_name='ctr',
         f=lambda col, gdf: col/gdf['ad_id_count'],
         columns=['ad_id_clicked_sum'],
         replace=False
     ),
     LambdaOp(
         op_name='ctr',
         f=lambda col, gdf: col/gdf['source_id_count'],
         columns=['source_id_clicked_sum'],
         replace=False
     ),
      LambdaOp(
         op_name='ctr',
         f=lambda col, gdf: col/gdf['document_id_promo_count'],
         columns=['document_id_promo_clicked_sum'],
         replace=False),
     LambdaOp(
        op_name='ctr',
        f=lambda col, gdf: col/gdf['publisher_id_count'],
        columns=['publisher_id_clicked_sum'],
        replace=False),
    LambdaOp(
        op_name='ctr',
        f=lambda col, gdf: col/gdf['advertiser_id_count'],
        columns=['advertiser_id_clicked_sum'],
        replace=False),
     LambdaOp(
        op_name='ctr',
        f=lambda col, gdf: col/gdf['campaign_id_count'],
        columns=['campaign_id_clicked_sum'],
        replace=False)   
    ])
    
# Drop rows with a missing id, then log-transform the twelve count / sum columns.
proc.add_feature([
    Dropna(['ad_id', 'source_id', 'document_id_promo', 
            'publisher_id', 'advertiser_id', 'campaign_id']),
    LogOp(['ad_id_count', 'ad_id_clicked_sum','source_id_count', 'source_id_clicked_sum', 
           'document_id_promo_count', 'document_id_promo_clicked_sum','publisher_id_count','publisher_id_clicked_sum', 
           'advertiser_id_count', 'advertiser_id_clicked_sum', 'campaign_id_count', 'campaign_id_clicked_sum'])
    ])
# Normalize the same twelve columns (`timestamp` is not in the list).
proc.add_cont_preprocess(
    Normalize(['ad_id_count', 'ad_id_clicked_sum','source_id_count', 'source_id_clicked_sum', 
           'document_id_promo_count', 'document_id_promo_clicked_sum','publisher_id_count','publisher_id_clicked_sum', 
           'advertiser_id_count', 'advertiser_id_clicked_sum', 'campaign_id_count', 'campaign_id_clicked_sum']))

# NOTE(review): HASH_BUCKET_SIZES comes from a cell saved without an execution
# count (`In [None]`); make sure that cell is run before this one.
proc.add_preprocess(
    [
     Categorify(columns=['geo_location_country','geo_location','geo_location_state'],freq_threshold=10),
     HashBucket(HASH_BUCKET_SIZES)
    ])

# Similarity between the viewed document (`document_id`) and the promoted document
# (`document_id_promo`), one feature per sparse matrix built earlier
# (categories, topics, entities), using the 'tfidf' metric.
op = ColumnSimilarity("doc_event_doc_ad_sim_categories", "document_id", categories, "document_id_promo", metric='tfidf', on_device=True)
proc.add_feature(op)

op = ColumnSimilarity("doc_event_doc_ad_sim_topics", "document_id", topics, "document_id_promo", metric='tfidf', on_device=True)
proc.add_feature(op)

op = ColumnSimilarity("doc_event_doc_ad_sim_entities", "document_id", entities, "document_id_promo", metric='tfidf', on_device=True)
proc.add_feature(op)

proc.finalize()

# Passed to nvt.Dataset as part_mem_fraction (rebinds the value set for the first workflow).
GPU_MEMORY_FRAC = 0.2
train_dataset = nvt.Dataset(train_df, engine='parquet',  part_mem_fraction=GPU_MEMORY_FRAC)
proc.apply(train_dataset, apply_offline=True, record_stats=True, output_path='./preprocessed/ctr/', shuffle=True, out_files_per_proc=1)



In [43]:
# Read back every parquet file written by the second workflow.
train_gdf = cudf.read_parquet(
    './preprocessed/ctr/*.parquet'
)

In [44]:
# (rows, columns) of the final table. Comparing the row count with the 1,000,000
# rows requested at load time shows how many rows the Dropna ops removed
# (assuming the source file held at least that many rows).
train_gdf.shape

(999964, 39)

In [34]:
# Preview the final feature table.
# NOTE(review): this cell's saved execution count (34) is lower than that of the
# workflow cell (42), so the stored output may predate the latest run -- re-run.
train_gdf.head(2)

Unnamed: 0,timestamp,ad_id_clicked_sum,ad_id_count,source_id_count,source_id_clicked_sum,document_id_promo_count,document_id_promo_clicked_sum,publisher_id_count,publisher_id_clicked_sum,advertiser_id_count,...,clicked,doc_event_doc_ad_sim_categories,doc_event_doc_ad_sim_entities,doc_event_doc_ad_sim_topics,ad_id_clicked_sum_ctr,source_id_clicked_sum_ctr,document_id_promo_clicked_sum_ctr,publisher_id_clicked_sum_ctr,advertiser_id_clicked_sum_ctr,campaign_id_clicked_sum_ctr
0,2429977,-0.451191,-0.607827,-0.698735,-0.633166,-0.602908,-0.647168,-0.994551,-0.877643,-0.867009,...,0,0.99384,0.0,0.0,0.833793,0.799341,0.833793,0.799341,0.839728,0.833793
1,16276752,-0.466567,-0.610636,-0.69863,-0.632842,-0.603497,-0.652831,-0.994445,-0.877309,-0.866892,...,0,0.0,0.0,0.0,0.643706,0.773911,0.764033,0.775754,0.85261,0.649525


In [35]:
# Per-column check for remaining missing values (False = no nulls in that column).
# NOTE(review): saved execution count (35) predates the workflow cell (42); re-run
# to make sure the stored output reflects the current table.
train_gdf.isnull().any()

timestamp                            False
ad_id_clicked_sum                    False
ad_id_count                          False
source_id_count                      False
source_id_clicked_sum                False
document_id_promo_count              False
document_id_promo_clicked_sum        False
publisher_id_count                   False
publisher_id_clicked_sum             False
advertiser_id_count                  False
advertiser_id_clicked_sum            False
campaign_id_count                    False
campaign_id_clicked_sum              False
display_id                           False
ad_id                                False
uuid                                 False
document_id                          False
platform                             False
geo_location                         False
document_id_promo                    False
campaign_id                          False
advertiser_id                        False
source_id                            False
publisher_i

In [36]:
# Column dtypes of the final table.
# NOTE(review): the saved output lists `platform` as `object`; it is declared as a
# categorical column but is neither in the Categorify list nor in
# HASH_BUCKET_SIZES -- confirm whether it should be encoded as well.
train_gdf.dtypes

timestamp                              int64
ad_id_clicked_sum                    float32
ad_id_count                          float32
source_id_count                      float32
source_id_clicked_sum                float32
document_id_promo_count              float32
document_id_promo_clicked_sum        float32
publisher_id_count                   float32
publisher_id_clicked_sum             float32
advertiser_id_count                  float32
advertiser_id_clicked_sum            float32
campaign_id_count                    float32
campaign_id_clicked_sum              float32
display_id                             int64
ad_id                                  int32
uuid                                   int32
document_id                            int32
platform                              object
geo_location                           int64
document_id_promo                      int32
campaign_id                            int64
advertiser_id                          int32
source_id 