In [None]:
import os

# Pick the GPUs this kernel may use. This cell comes before the cell that
# imports cudf/cupy so the settings are already in place when the CUDA
# libraries are loaded. Devices are enumerated in PCI bus order, and only
# devices 2 and 3 are exposed to the process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2,3",
})

In [None]:
# Folder that preprocessing outputs are written to, and folder holding the raw CSVs.
OUTPUT_BUCKET_FOLDER = "./preprocessed/"
DATA_BUCKET_FOLDER = "./dataset/"

# Evaluation-mode switches; both are off.
evaluation = evaluation_verbose = False

In [3]:
import numpy as np
import pandas as pd
import math
import datetime
import time
import glob

import cudf
import cupy
from numba import cuda
import rmm

import nvtabular as nvt
from nvtabular import io as nvt_io
from nvtabular import ops as ops
from nvtabular.ops import Normalize, FillMissing, Categorify, LogOp, JoinExternal, Dropna, LambdaOp, JoinGroupby, Filter, HashBucket, FillMedian
from nvtabular.column_similarity import ColumnSimilarity

In [4]:
# %%time
# from cudf import read_csv
# import rmm
# rmm.reinitialize(managed_memory=True)  

# documents_meta = read_csv(DATA_BUCKET_FOLDER + 'documents_meta.csv')
# train_set = (read_csv(DATA_BUCKET_FOLDER+'clicks_train.csv')
#              .merge(read_csv(DATA_BUCKET_FOLDER + 'events.csv'), on="display_id", how="left", suffixes=('', '_event'))
#              .merge(read_csv(DATA_BUCKET_FOLDER+'promoted_content.csv'), on="ad_id", how="left", suffixes=('', '_promo'))
#              .merge(documents_meta, on="document_id", how="left")
#              .merge(documents_meta, left_on="document_id_promo", right_on="document_id", how="left", suffixes=('', "_promo"))
#             )
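# NOTE(review): the next line writes "train_pdf.parquet" under OUTPUT_BUCKET_FOLDER, while the
# workflow cell further down loads 'train_df.parquet' from the working directory -- confirm which
# file name / location is intended before re-enabling this cell.
# train_set.to_parquet(OUTPUT_BUCKET_FOLDER+"train_pdf.parquet", compression=None)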
# train_set = None

In [5]:
# The smoothed CTR function that the LambdaOp features in the workflow below are adapted from:
# def smoothed_ctr(clicks, displays, prior_ctr=0.1, prior_weight=10):
#     return (clicks + prior_ctr * prior_weight) / (displays + prior_weight)

In [6]:
# Read the three per-document side tables (categories, topics, entities) with cuDF.
documents_categories_cudf, documents_topics_cudf, documents_entities_cudf = [
    cudf.read_csv(DATA_BUCKET_FOLDER + csv_name)
    for csv_name in ('documents_categories.csv', 'documents_topics.csv', 'documents_entities.csv')
]

In [7]:
# read in document categories/topics/entities as cupy sparse matrices
def df_to_coo(df, row="document_id", col=None, data="confidence_level"):
    """Build a CuPy sparse COO matrix from three columns of a dataframe.

    Parameters
    ----------
    df : dataframe whose columns expose ``.values`` (the notebook passes cuDF frames).
    row : name of the column holding the row index of each entry.
    col : name of the column holding the column index of each entry. Required;
        it only has a ``None`` default so it can be passed by keyword after ``row``.
    data : name of the column holding the value of each entry.

    Returns
    -------
    cupy.sparse.coo_matrix with one stored entry per dataframe row.

    Raises
    ------
    ValueError
        If ``col`` is not given. Without this check the call would fail later
        with an opaque lookup error on ``df[None]``.
    """
    if col is None:
        raise ValueError("df_to_coo() requires `col`: the name of the column holding the column indices")
    # NOTE(review): newer CuPy releases expose this class as
    # cupyx.scipy.sparse.coo_matrix -- confirm against the installed version.
    return cupy.sparse.coo_matrix((df[data].values, (df[row].values, df[col].values)))

# Replace entity_id by its integer category code so that it can be used as a
# column index of the sparse matrix (this overwrites the column in place).
documents_entities_cudf['entity_id'] = documents_entities_cudf['entity_id'].astype("category").cat.codes

# One document-by-feature matrix per side table; the stored values are confidence levels.
categories = df_to_coo(documents_categories_cudf, row="document_id", col="category_id", data="confidence_level")
topics = df_to_coo(documents_topics_cudf, row="document_id", col="topic_id", data="confidence_level")
entities = df_to_coo(documents_entities_cudf, row="document_id", col="entity_id", data="confidence_level")

In [9]:
#train_df = cudf.read_parquet('train_df.parquet', n_rows=)

In [10]:
#CATEGORICAL_COLUMNS=['document_id', 'ad_id', 'document_id_promo', 'source_id_promo', 'source_id', 'advertiser_id', 'publisher_id_promo', 'publisher_id']

In [11]:
# #find the categorical column with highest cardinality
# count_uniques={}
# for col in CATEGORICAL_COLUMNS:
#     count_uniques[col] = len(train_df[col].value_counts())

# highest_card_col = max(count_uniques, key=count_uniques.get) 
# print(highest_card_col) 
# count_uniques[highest_card_col]

In [None]:
# Offset added to the `timestamp` column; the sum is interpreted as epoch milliseconds.
TIMESTAMP_DELTA = 1465876799998


def calculate_delta(col, gdf):
    """Return the whole number of days between each row's event time and `col`.

    `col` is cast to datetime64[ns] and subtracted from the event time derived
    from ``gdf['timestamp']``. Differences that are negative or larger than
    10 * 365 days are replaced by 0.
    """
    event_time = (gdf['timestamp'] + TIMESTAMP_DELTA).astype('datetime64[ms]')
    reference_time = col.astype('datetime64[ns]')
    age_in_days = (event_time - reference_time).dt.days

    # Multiplying by the boolean masks zeroes out-of-range values instead of dropping rows.
    not_in_future = age_in_days >= 0
    within_ten_years = age_in_days <= 10 * 365
    return age_in_days * not_in_future * within_ten_years

In [None]:
# Column roles handed to the NVTabular workflow.
CATEGORICAL_COLUMNS = ['display_id', 'ad_id', 'uuid', 'document_id', 'platform', 'geo_location', 'document_id_promo', 'campaign_id',
                       'advertiser_id', 'source_id', 'publisher_id', 'publish_time', 'source_id_promo', 'publisher_id_promo', 'publish_time_promo']
CONTINUOUS_COLUMNS = ['timestamp']

# Keys for which click statistics (sum and count of `clicked`) are joined onto
# every row; one smoothed CTR feature is derived per key.
CTR_GROUPBY_COLUMNS = ['ad_id', 'source_id', 'document_id_promo', 'publisher_id', 'advertiser_id', 'campaign_id']

# Statistic columns consumed below: "<key>_count" and "<key>_clicked_sum" for each key.
CLICK_STAT_COLUMNS = [key + suffix for key in CTR_GROUPBY_COLUMNS for suffix in ('_count', '_clicked_sum')]

# Smoothing prior: ctr = (clicks + CTR_PRIOR * CTR_PRIOR_WEIGHT) / (displays + CTR_PRIOR_WEIGHT)
CTR_PRIOR = 0.1
CTR_PRIOR_WEIGHT = 10


def _smoothed_ctr_op(key):
    """Build the LambdaOp that derives the smoothed CTR for one group-by key.

    The op reads "<key>_clicked_sum" as its input column and "<key>_count" from
    the dataframe. `count_column` is bound inside this factory so that each op
    keeps its own column name (a lambda written directly in a loop would see
    only the last key).
    """
    count_column = key + '_count'
    return LambdaOp(
        op_name='ctr',
        f=lambda col, gdf: (col + CTR_PRIOR * CTR_PRIOR_WEIGHT) / (gdf[count_column] + CTR_PRIOR_WEIGHT),
        columns=[key + '_clicked_sum'], replace=False)


workflow = nvt.Workflow(
    cat_names=CATEGORICAL_COLUMNS,
    cont_names=CONTINUOUS_COLUMNS,
    label_name=['clicked'])

feature_ops = [
    LambdaOp(
        op_name='day',
        f=lambda col, gdf: (col / 1000 / 60 / 60 / 24).astype(int),
        columns=['timestamp'], replace=False),
    LambdaOp(
        op_name='country',
        f=lambda col, gdf: col.str.slice(0, 2),
        columns=['geo_location'], replace=False),
    LambdaOp(
        op_name='state',
        f=lambda col, gdf: col.str.slice(0, 5),
        columns=['geo_location'], replace=False),
    LambdaOp(
        op_name='days_since_published',
        f=calculate_delta,
        columns=['publish_time', 'publish_time_promo'], replace=False),

    FillMedian(columns=['publish_time_days_since_published', 'publish_time_promo_days_since_published']),

    Dropna(columns=['geo_location', 'platform']),

    JoinGroupby(columns=list(CTR_GROUPBY_COLUMNS),
                cont_names=['clicked'], stats=['sum', 'count']),
]

# calculate the smoothed ctr, one op per group-by key (same order as the keys)
feature_ops += [_smoothed_ctr_op(key) for key in CTR_GROUPBY_COLUMNS]

feature_ops += [
    # take the log of the views and clicks
    LogOp(columns=list(CLICK_STAT_COLUMNS)),

    # normalize the views and clicks
    Normalize(columns=list(CLICK_STAT_COLUMNS)),

    Dropna(columns=list(CTR_GROUPBY_COLUMNS)),
]

workflow.add_feature(feature_ops)


HASH_BUCKET_SIZES = {'document_id': 300000, 'ad_id': 250000,
                     'document_id_promo': 100000, 'source_id_promo': 4000,
                     'source_id': 4000, 'advertiser_id': 2500,
                     'publisher_id_promo': 1000, 'publisher_id': 1000}

workflow.add_preprocess([
    Categorify(columns=['geo_location_country', 'geo_location', 'geo_location_state', 'uuid'], freq_threshold=10),

    HashBucket(HASH_BUCKET_SIZES)])

# One similarity feature between the event document and the promoted document
# per side-table matrix (categories, topics, entities), added in that order.
for feature_name, document_features in (
        ("doc_event_doc_ad_sim_categories", categories),
        ("doc_event_doc_ad_sim_topics", topics),
        ("doc_event_doc_ad_sim_entities", entities)):
    workflow.add_feature(
        ColumnSimilarity(feature_name, "document_id", document_features, "document_id_promo",
                         metric='tfidf', on_device=False))

workflow.finalize()

train_dataset = nvt.Dataset('train_df.parquet')

# Write the processed data below the configured output folder.
workflow.apply(train_dataset, apply_offline=True, record_stats=True,
               output_path=OUTPUT_BUCKET_FOLDER + 'files/', shuffle=True, out_files_per_proc=2)

In [18]:
# Load every parquet file under ./preprocessed/files/, the directory the workflow above writes to.
train_df= cudf.read_parquet('./preprocessed/files/*.parquet')

In [20]:
# Sanity check: (rows, columns) of the preprocessed frame.
train_df.shape

(10000, 43)

In [22]:
# Preview the first rows, including the derived *_count, *_clicked_sum and *_ctr columns.
train_df.head()

Unnamed: 0,timestamp,display_id,ad_id,uuid,document_id,platform,geo_location,document_id_promo,campaign_id,advertiser_id,...,advertiser_id_count,advertiser_id_clicked_sum,campaign_id_count,campaign_id_clicked_sum,ad_id_clicked_sum_ctr,source_id_clicked_sum_ctr,document_id_promo_clicked_sum_ctr,publisher_id_clicked_sum_ctr,advertiser_id_clicked_sum_ctr,campaign_id_clicked_sum_ctr
0,998982,14325,141826,0,293707,2,104,35759,1423,1261,...,-0.025891,0.680721,0.922834,1.49686,0.214286,0.142857,0.214286,0.142857,0.232558,0.228571
1,509175,7424,158433,0,192947,2,139,42498,9236,2396,...,-0.05301,0.383915,-1.025028,-0.189249,0.090909,0.153846,0.1,0.188679,0.190476,0.166667
2,85861,1230,89380,0,220943,2,67,54015,10186,1967,...,0.773857,0.923229,-1.025028,-1.032304,0.090909,0.114286,0.083333,0.142857,0.131868,0.083333
3,433666,6324,141349,0,110525,1,44,42372,11203,1282,...,-1.76729,-1.460015,-1.390759,-1.032304,0.090909,0.165138,0.076923,0.175926,0.142857,0.090909
4,598438,8692,86165,0,27534,2,30,98204,2610,1713,...,0.728426,0.680721,-1.390759,-1.032304,0.090909,0.197802,0.090909,0.210526,0.114943,0.090909
