In [None]:
import os
import numpy as np
import glob
import math
import datetime
import time

import pandas as pd
import cudf
import cupy
from numba import cuda
import rmm

import nvtabular as nvt

from nvtabular.ops import Normalize, FillMedian,FillMissing, Categorify, LogOp, JoinExternal, Dropna, LambdaOp, JoinGroupby, HashBucket
from nvtabular.column_similarity import ColumnSimilarity

import tensorflow as tf
print(tf.__version__)
from tensorflow.python.feature_column import feature_column_v2 as fc
from nvtabular.tf_dataloader import KerasSequenceDataset

In [2]:
# Folder that receives the intermediate parquet files and the preprocessed
# train/valid outputs written by the cells below.
OUTPUT_BUCKET_FOLDER = "./preprocessed/"
# Folder the raw CSV files are read from.
DATA_BUCKET_FOLDER = "/dataset/"

In [3]:
from cudf import read_csv
# Managed memory is switched on for the large merge below.
rmm.reinitialize(managed_memory=True)

# Strings that the raw CSV files use for missing values.
_NA_MARKERS = ['\\N', '']


def _read_raw_csv(file_name):
    """Read one raw CSV from DATA_BUCKET_FOLDER, mapping the NA markers to nulls."""
    return read_csv(DATA_BUCKET_FOLDER + file_name, na_values=_NA_MARKERS)


# Document metadata is joined twice: once for the event's document and once
# for the promoted (ad) document.
documents_meta = _read_raw_csv('documents_meta.csv')

# Left-join everything onto the click log, one table at a time. Rebinding
# `merged` at every step lets the previous intermediate be released.
merged = _read_raw_csv('clicks_train.csv')
merged = merged.merge(_read_raw_csv('events.csv'),
                      on="display_id", how="left", suffixes=('', '_event'))
merged = merged.merge(_read_raw_csv('promoted_content.csv'),
                      on="ad_id", how="left", suffixes=('', '_promo'))
merged = merged.merge(documents_meta, on="document_id", how="left")
merged = merged.merge(documents_meta,
                      left_on="document_id_promo", right_on="document_id",
                      how="left", suffixes=('', "_promo"))

In [3]:
# Destination folders for the NVTabular-preprocessed train/validation files.
output_train_dir = os.path.join(OUTPUT_BUCKET_FOLDER, 'train/')
output_valid_dir = os.path.join(OUTPUT_BUCKET_FOLDER, 'valid/')

# Create the folders (and missing parents) from Python instead of a shell
# magic, so paths containing spaces work and the cell does not need a shell.
for output_dir in (output_train_dir, output_valid_dir):
    os.makedirs(output_dir, exist_ok=True)

In [5]:
# Split the merged dataset into training and validation sets.
# A row goes to the training set when its event happened on day 10 or earlier
# AND its random draw is above 0.2; every other row (all later days plus
# roughly 20% of the early days) goes to the validation set.
SPLIT_SEED = 42

# `timestamp` is in milliseconds; convert it to whole days.
merged['day_event'] = (merged['timestamp'] / 1000 / 60 / 60 / 24).astype(int)

# Seed the GPU generator so the split is the same on every Restart & Run All.
cupy.random.seed(SPLIT_SEED)
random_state = cudf.Series(cupy.random.uniform(size=len(merged)))

# scatter_by_map returns the partitions ordered by map value:
# 0 -> validation, 1 -> training.
valid_set, train_set = merged.scatter_by_map(((merged.day_event <= 10) & (random_state > 0.2)).astype(int))
train_set.to_parquet(OUTPUT_BUCKET_FOLDER+"train_gdf.parquet", compression=None)
valid_set.to_parquet(OUTPUT_BUCKET_FOLDER+"valid_gdf.parquet", compression=None)

# Drop the references so the GPU memory held by the frames can be released.
merged = train_set = valid_set= None

In [6]:
# Re-initialise RMM with managed memory turned off again (it was enabled for
# the merge above).
rmm.reinitialize(managed_memory=False) 

In [7]:
# Load the three document -> (category | topic | entity) tables.
documents_categories_cudf, documents_topics_cudf, documents_entities_cudf = (
    cudf.read_csv(DATA_BUCKET_FOLDER + csv_name)
    for csv_name in ('documents_categories.csv',
                     'documents_topics.csv',
                     'documents_entities.csv')
)

In [8]:
# read in document categories/topics/entities as cupy sparse matrices
def df_to_coo(df, row="document_id", col=None, data="confidence_level"):
    """Build a cupy sparse COO matrix from three columns of a dataframe.

    Args:
        df: dataframe holding the columns named by `row`, `col` and `data`.
        row: name of the column providing the row indices.
        col: name of the column providing the column indices; the default of
            None is only a placeholder, callers must pass a column name.
        data: name of the column providing the stored values.

    Returns:
        A `cupy.sparse.coo_matrix` built from (values, (row indices, column indices)).
    """
    values = df[data].values
    row_indices = df[row].values
    col_indices = df[col].values
    return cupy.sparse.coo_matrix((values, (row_indices, col_indices)))

# Sparse document x category and document x topic confidence matrices.
categories = df_to_coo(documents_categories_cudf, col="category_id")
topics = df_to_coo(documents_topics_cudf, col="topic_id")

# Replace entity_id by its category codes so it can serve as a column index,
# then build the document x entity matrix.
documents_entities_cudf['entity_id'] = (
    documents_entities_cudf['entity_id'].astype("category").cat.codes
)
entities = df_to_coo(documents_entities_cudf, col="entity_id")

# The source frames are no longer needed; drop the references.
documents_categories_cudf = documents_topics_cudf = documents_entities_cudf = None

In [15]:
# Number of hash buckets per hashed categorical column. Used by the HashBucket
# op and as the vocabulary size of the matching model inputs.
HASH_BUCKET_SIZES = {
    'document_id': 300000,
    'ad_id': 250000,
    'document_id_promo': 100000,
    'source_id_promo': 4000,
    'source_id': 4000,
    'advertiser_id': 2500,
    'publisher_id_promo': 1000,
    'publisher_id': 1000
}

# Vocabulary size of the categorical columns that are fed to the model
# without hashing.
IDENTITY_NUM_BUCKETS = {
    'platform': 5,
    'geo_location':2988,
    'geo_location_country':230,
    'geo_location_state': 2500
}

# Embedding width per categorical column in the deep tower. Columns missing
# here (e.g. 'platform') get an indicator column instead.
EMBEDDING_DIMENSIONS = {
    'document_id': 128,
    'ad_id': 128,
    'document_id_promo': 128,
    'source_id': 64,
    'source_id_promo': 64,
    'geo_location': 64,
    'advertiser_id': 64,
    'geo_location_state': 64,
    'publisher_id_promo': 64,
    'publisher_id': 64,
    'geo_location_country': 64,
}

# Model input dtype per feature. int32 entries are treated as categorical and
# float32 entries as numeric further down; the insertion order of this dict
# defines the order of those column lists, so keep it stable.
# (The key 'source_id_promo' used to be listed twice; the redundant second
# entry was removed, which leaves the resulting dict unchanged.)
dtypes = {
    'document_id':np.int32,
    'document_id_promo':np.int32,
    'source_id':np.int32,
    'source_id_promo':np.int32,
    'geo_location':np.int32,
    'geo_location_country':np.int32,
    'geo_location_state':np.int32,
    'publisher_id':np.int32,
    'platform':np.int32,
    'document_id_promo_clicked_sum_ctr':np.float32,
    'publisher_id_clicked_sum_ctr':np.float32,
    'source_id_clicked_sum_ctr':np.float32,
    'document_id_promo_count':np.float32,
    'publish_time_days_since_published':np.float32,
    'ad_id':np.int32,
    'advertiser_id':np.int32,
    'publisher_id_promo':np.int32,
    'ad_id_clicked_sum_ctr':np.float32,
    'advertiser_id_clicked_sum_ctr':np.float32,
    'campaign_id_clicked_sum_ctr':np.float32,
    'ad_id_count':np.float32,
    'publish_time_promo_days_since_published':np.float32,
}

In [10]:
# Columns handed to the NVTabular workflow as categorical features.
CATEGORICAL_COLUMNS = [
    'display_id',
    'ad_id',
    'uuid',
    'document_id',
    'platform',
    'geo_location',
    'document_id_promo',
    'campaign_id',
    'advertiser_id',
    'source_id',
    'publisher_id',
    'publish_time',
    'source_id_promo',
    'publisher_id_promo',
    'publish_time_promo',
    'day_event',
]
# Columns handed to the NVTabular workflow as continuous features.
CONTINUOUS_COLUMNS = ['timestamp']

# Offset added to the `timestamp` column before it is interpreted as
# milliseconds in calculate_delta.
# NOTE(review): presumably the dataset's timestamps are relative to this epoch-ms value - confirm.
TIMESTAMP_DELTA = 1465876799998

def calculate_delta(col,gdf):
    """Return the number of days between each event and a publish time.

    Args:
        col: column of publish times; empty strings are replaced by nulls
            IN PLACE before the column is cast to datetime64[ns].
        gdf: dataframe providing the `timestamp` column, which is shifted by
            TIMESTAMP_DELTA and read as milliseconds.

    Returns:
        The day difference (event time - publish time). Differences that are
        negative or larger than 10*365 days are set to 0.
    """
    col.loc[col == ""] = None
    published_at = col.astype('datetime64[ns]')
    event_time = (gdf['timestamp']+TIMESTAMP_DELTA).astype('datetime64[ms]')
    days = (event_time - published_at).dt.days
    # Multiplying by the boolean masks zeroes every out-of-range value.
    not_negative = (days >= 0)
    within_ten_years = (days <= 10*365)
    return days * not_negative * within_ten_years

In [11]:
workflow = nvt.Workflow(
    cat_names=CATEGORICAL_COLUMNS,
    cont_names=CONTINUOUS_COLUMNS,
    label_name=['clicked'])

# Keys whose click statistics are aggregated. The same order is reused for
# the CTR, log, normalisation and drop-na steps.
groupby_keys = ['ad_id', 'source_id', 'document_id_promo', 'publisher_id', 'advertiser_id', 'campaign_id']

# '<key>_count' followed by '<key>_clicked_sum' for every key, in key order.
stat_columns = [key + suffix for key in groupby_keys for suffix in ('_count', '_clicked_sum')]

# Smoothed CTR columns produced by the 'ctr' LambdaOps below.
ctr_columns = ['advertiser_id_clicked_sum_ctr', 'document_id_promo_clicked_sum_ctr',
               'publisher_id_clicked_sum_ctr', 'source_id_clicked_sum_ctr',
               'ad_id_clicked_sum_ctr', 'campaign_id_clicked_sum_ctr']


def smoothed_ctr_op(key):
    """Build the LambdaOp computing (<key>_clicked_sum + 0.1 * 10) / (<key>_count + 10)."""
    count_column = key + '_count'

    def smoothed_ctr(col, gdf):
        return (col + 0.1 * 10)/(gdf[count_column]+10)

    return LambdaOp(
        op_name='ctr',
        f=smoothed_ctr,
        columns=[key + '_clicked_sum'], replace=False)


feature_ops = [
    # first two / first five characters of geo_location
    LambdaOp(
        op_name='country',
        f=lambda col, gdf: col.str.slice(0,2),
        columns=['geo_location'], replace=False),
    LambdaOp(
        op_name='state',
        f=lambda col, gdf: col.str.slice(0,5),
        columns=['geo_location'], replace=False),
    # days between the event and the publish time of each document
    LambdaOp(
        op_name='days_since_published',
        f=calculate_delta,
        columns=['publish_time', 'publish_time_promo'], replace=False),

    FillMedian(columns=['publish_time_days_since_published', 'publish_time_promo_days_since_published']),

    # per-key sum and count of `clicked`
    JoinGroupby(columns=list(groupby_keys),
                cont_names=['clicked'], stats=['sum', 'count']),
]

# calculate the smoothed ctr for every key
feature_ops += [smoothed_ctr_op(key) for key in groupby_keys]

feature_ops += [
    # take the log of the views and clicks
    LogOp(columns=list(stat_columns)),
    # normalize the views and clicks
    Normalize(columns=list(stat_columns)),
    # fill missing values
    FillMissing(columns=stat_columns + ctr_columns),

    Dropna(columns=list(groupby_keys)),
]

workflow.add_feature(feature_ops)

workflow.add_preprocess([
    Categorify(columns=['geo_location_country', 'geo_location', 'geo_location_state', 'uuid', 'platform'], freq_threshold=10),

    HashBucket(HASH_BUCKET_SIZES)])

# TF-IDF similarity between the event document and the promoted document,
# computed once per sparse document matrix.
for feature_name, doc_matrix in (
        ("doc_event_doc_ad_sim_categories", categories),
        ("doc_event_doc_ad_sim_topics", topics),
        ("doc_event_doc_ad_sim_entities", entities)):
    op = ColumnSimilarity(feature_name, "document_id", doc_matrix, "document_id_promo", metric='tfidf', on_device=False)
    workflow.add_feature(op)

workflow.finalize()

train_dataset = nvt.Dataset(OUTPUT_BUCKET_FOLDER+'train_gdf.parquet', part_mem_fraction=0.12)
valid_dataset = nvt.Dataset(OUTPUT_BUCKET_FOLDER+'valid_gdf.parquet', part_mem_fraction=0.12)

# Statistics are recorded on the training set only and reused for validation.
workflow.apply(train_dataset, record_stats=True, output_path=output_train_dir, shuffle=True, out_files_per_proc=5)
workflow.apply(valid_dataset, record_stats=False, output_path=output_valid_dir, shuffle=False, out_files_per_proc=5)

  idf = np.log(N / np.bincount(X.col))


In [11]:
# Parquet files written by the workflow into the train / valid output folders.
train_paths, valid_paths = (
    glob.glob(os.path.join(output_dir, "*.parquet"))
    for output_dir in (output_train_dir, output_valid_dir)
)

In [16]:
# Model feature lists derived from `dtypes`: int32 -> categorical input,
# float32 -> numeric input.
# NOTE: this rebinds CATEGORICAL_COLUMNS, which an earlier cell defines as the
# workflow's categorical column list; re-running the workflow cell after this
# one would pick up the model list instead.
CATEGORICAL_COLUMNS = [name for name, dtype in dtypes.items() if dtype == np.int32]
NUMERIC_COLUMNS = [name for name, dtype in dtypes.items() if dtype == np.float32]

In [19]:
def get_feature_columns():
    """Create the TensorFlow feature columns for the wide and deep towers.

    Every name in CATEGORICAL_COLUMNS becomes an identity categorical column
    whose bucket count comes from HASH_BUCKET_SIZES (checked first) or
    IDENTITY_NUM_BUCKETS. The wide tower receives the raw categorical column;
    the deep tower receives an embedding when the name has an entry in
    EMBEDDING_DIMENSIONS and an indicator column otherwise. Every name in
    NUMERIC_COLUMNS becomes a float32 numeric column shared by both towers.

    Returns:
        (wide_columns, deep_columns): two lists of feature columns.

    Raises:
        ValueError: if a categorical column has no configured bucket count.
    """
    wide_columns = []
    deep_columns = []

    for column_name in CATEGORICAL_COLUMNS:
        # The values are already hashed / encoded upstream, so both cases use
        # an identity column and differ only in the number of buckets.
        if column_name in HASH_BUCKET_SIZES:
            bucket_count = HASH_BUCKET_SIZES[column_name]
        elif column_name in IDENTITY_NUM_BUCKETS:
            bucket_count = IDENTITY_NUM_BUCKETS[column_name]
        else:
            raise ValueError(f'Unexpected categorical column found {column_name}')

        categorical_column = tf.feature_column.categorical_column_with_identity(
            column_name, num_buckets=bucket_count)

        if column_name not in EMBEDDING_DIMENSIONS:
            wrapped_column = tf.feature_column.indicator_column(categorical_column)
        else:
            print(column_name)
            wrapped_column = tf.feature_column.embedding_column(
                categorical_column,
                dimension=EMBEDDING_DIMENSIONS[column_name],
                combiner='mean')

        wide_columns.append(categorical_column)
        deep_columns.append(wrapped_column)

    numerics = []
    for column_name in NUMERIC_COLUMNS:
        numerics.append(
            tf.feature_column.numeric_column(column_name, shape=(1,), dtype=tf.float32))

    wide_columns += numerics
    deep_columns += numerics

    return wide_columns, deep_columns

In [20]:
# Deep tower: five hidden layers of 1024 units each, and the dropout rate
# applied after each of them.
deep_hidden_units = [1024] * 5
deep_dropout = 0.1

In [21]:
wide_columns, deep_columns = get_feature_columns()

wide_weighted_outputs = []  # a list of (batch_size, 1) contributors to the linear weighted sum
numeric_dense_inputs = []  # NumericColumn inputs; to be concatenated and then fed to a dense layer
wide_columns_dict = {}  # key : column
deep_columns_dict = {}  # key : column
features = {}  # tf.keras.Input placeholders for each feature to be used

# construct input placeholders for wide features
for col in wide_columns:
    print(col.key)
    features[col.key] = tf.keras.Input(shape=(1,),
                                       batch_size=None, 
                                       name=col.key,
                                       dtype=dtypes[col.key],
                                       sparse=False)
    wide_columns_dict[col.key] = col

# register the deep features, creating a placeholder for any feature the wide
# tower did not already declare
for col in deep_columns:
    # embedding / indicator columns have no `key` of their own; it lives on
    # the categorical column they wrap
    is_embedding_column = ('key' not in dir(col))
    key = col.categorical_column.key if is_embedding_column else col.key

    if key not in features:
        # Look the dtype up by `key`: `col.key` does not exist on wrapped
        # columns and would raise AttributeError here.
        features[key] = tf.keras.Input(shape=(1,), 
                                       batch_size=None, 
                                       name=key, 
                                       dtype=dtypes[key], 
                                       sparse=False)
    deep_columns_dict[key] = col

# wide tower: each categorical feature contributes one learned scalar per
# bucket (an Embedding of width 1); numeric features are collected for a
# single Dense layer
for key in wide_columns_dict:
    if key in HASH_BUCKET_SIZES:  # values are already hashed and single-valued
        wide_weighted_outputs.append(tf.keras.layers.Flatten()(tf.keras.layers.Embedding(
            HASH_BUCKET_SIZES[key], 1, input_length=1)(features[key])))

    elif key in IDENTITY_NUM_BUCKETS:
        wide_weighted_outputs.append(tf.keras.layers.Flatten()(tf.keras.layers.Embedding(
            IDENTITY_NUM_BUCKETS[key], 1, input_length=1)(features[key])))
    else:
        numeric_dense_inputs.append(features[key])

categorical_output_contrib = tf.keras.layers.add(wide_weighted_outputs,
                                                 name='categorical_output')
numeric_dense_tensor = tf.keras.layers.concatenate(
    numeric_dense_inputs, name='numeric_dense')
deep_columns = list(deep_columns_dict.values())

# deep tower: DenseFeatures -> [Dense -> Dropout -> BatchNorm] * N -> Dense(1)
# NOTE(review): the hidden Dense layers are created without an activation, so
# the stack has no non-linearity between layers - confirm this is intended.
dnn = tf.keras.layers.DenseFeatures(deep_columns, name='deep_embedded')(features)
for unit_size in deep_hidden_units:
    dnn = tf.keras.layers.Dense(units=unit_size)(dnn)
    dnn = tf.keras.layers.Dropout(rate=deep_dropout)(dnn)
    dnn = tf.keras.layers.BatchNormalization(momentum=.999)(dnn)
dnn = tf.keras.layers.Dense(units=1)(dnn)
dnn_model = tf.keras.Model(inputs=features,
                           outputs=dnn)

# wide logit = summed categorical contributions + linear layer on the numerics
linear_output = categorical_output_contrib + tf.keras.layers.Dense(1)(numeric_dense_tensor)

linear_model = tf.keras.Model(inputs=features,
                              outputs=linear_output)

# combine both towers; the sigmoid is applied to the combined output
wide_and_deep_model = tf.keras.experimental.WideDeepModel(
    linear_model, dnn_model, activation='sigmoid')

document_id
document_id_promo
source_id
source_id_promo
geo_location
geo_location_country
geo_location_state
publisher_id
ad_id
advertiser_id
publisher_id_promo
document_id
document_id_promo
source_id
source_id_promo
geo_location
geo_location_country
geo_location_state
publisher_id
platform
ad_id
advertiser_id
publisher_id_promo
document_id_promo_clicked_sum_ctr
publisher_id_clicked_sum_ctr
source_id_clicked_sum_ctr
document_id_promo_count
publish_time_days_since_published
ad_id_clicked_sum_ctr
advertiser_id_clicked_sum_ctr
campaign_id_clicked_sum_ctr
ad_id_count
publish_time_promo_days_since_published


In [13]:
#print(deep_columns)

In [22]:
# Read the preprocessed files from the folders the workflow wrote to, rather
# than from a hard-coded copy of the path, so changing OUTPUT_BUCKET_FOLDER
# cannot make training silently read stale or missing data.
TRAIN_PATHS = sorted(glob.glob(os.path.join(output_train_dir, '*.parquet')))
VALID_PATHS = sorted(glob.glob(os.path.join(output_valid_dir, '*.parquet')))

# Rows per batch, shared by the training and validation loaders.
BATCH_SIZE = 131072

train_dataset_tf = KerasSequenceDataset(
    TRAIN_PATHS, # you could also use a glob pattern
    CATEGORICAL_COLUMNS+NUMERIC_COLUMNS,
    batch_size=BATCH_SIZE,
    label_name='clicked',
    shuffle=True,
    buffer_size=1 # how many batches to load at once
)

valid_dataset_tf = KerasSequenceDataset(
    VALID_PATHS, # you could also use a glob pattern
    CATEGORICAL_COLUMNS+NUMERIC_COLUMNS,
    batch_size=BATCH_SIZE,
    label_name='clicked',
    shuffle=False,
    buffer_size=1
)

In [24]:
# One optimizer per tower: FTRL for the wide (linear) model, Adam for the
# deep model.
wide_optimizer = tf.keras.optimizers.Ftrl(learning_rate=0.1)

# NOTE(review): 0.2 is far above Adam's default learning rate - confirm it is intended.
deep_optimizer = tf.keras.optimizers.Adam(learning_rate=0.2)

In [None]:
# Metrics reported for both the training and the validation data.
training_metrics = [tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()]

# The optimizer list is ordered [wide, deep] to match the order in which the
# two sub-models were passed to WideDeepModel.
wide_and_deep_model.compile(
    optimizer=[wide_optimizer, deep_optimizer],
    loss='binary_crossentropy',
    metrics=training_metrics,
    experimental_run_tf_function=False,
)

# Train for two epochs, evaluating on the validation loader after each one.
wide_and_deep_model.fit(train_dataset_tf, validation_data=valid_dataset_tf, epochs=2)