In [1]:
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
# Evaluation toggles; neither flag is read by the cells shown in this part of the notebook.
evaluation = False
evaluation_verbose = False
# Presumably the root folder for preprocessed outputs — the workflow cells shown here
# hard-code './preprocessed/...' paths instead of using it; TODO confirm.
OUTPUT_BUCKET_FOLDER = "./preprocessed/"
# Prefix prepended to the raw CSV file names passed to read_csv.
DATA_BUCKET_FOLDER = "/dataset/"

In [88]:
import numpy as np

import math
import datetime
import time

import pandas as pd
import cudf
import cupy
from numba import cuda
import rmm

import nvtabular as nvt

from nvtabular.ops import Normalize, FillMissing, Categorify, LogOp, JoinExternal, Dropna, LambdaOp, JoinGroupby, Filter
from nvtabular.column_similarity import ColumnSimilarity

In [13]:
%%time
import cudf
import cupy
#from pandas import read_csv
from cudf import read_csv
import rmm
# rmm.reinitialize(managed_memory=True)  

# documents_meta = read_csv(DATA_BUCKET_FOLDER + 'documents_meta.csv')
# train_set = (read_csv(DATA_BUCKET_FOLDER+'clicks_train.csv')
#              .merge(read_csv(DATA_BUCKET_FOLDER + 'events.csv'), on="display_id", how="left", suffixes=('', '_event'))
#              .merge(read_csv(DATA_BUCKET_FOLDER+'promoted_content.csv'), on="ad_id", how="left", suffixes=('', '_promo'))
#              .merge(documents_meta, on="document_id", how="left")
#              .merge(documents_meta, left_on="document_id_promo", right_on="document_id", how="left", suffixes=('', "_promo"))
#             )
# train_set.to_parquet("train_pdf.parquet", compression=None)
# train_set = None

CPU times: user 29 µs, sys: 25 µs, total: 54 µs
Wall time: 65.6 µs


In [8]:
# country_utc_dst_cudf = cudf.read_csv('./data/country_codes_utc_dst_tz_delta.csv', keep_default_na=False)
# us_states_utc_dst_cudf = cudf.read_csv('./data/us_states_abbrev_bst.csv', keep_default_na=False)
# ca_states_utc_dst_cudf = cudf.read_csv('./data/ca_states_abbrev_bst.csv', keep_default_na=False)

In [26]:
# Read at most 10,000 rows of the training parquet so the following cells run on a small sample.
train_df = cudf.read_parquet('train_df.parquet', num_rows=10000)

In [27]:
train_df.columns

Index(['display_id', 'ad_id', 'clicked', 'uuid', 'document_id', 'timestamp',
       'platform', 'geo_location', 'document_id_promo', 'campaign_id',
       'advertiser_id', 'source_id', 'publisher_id', 'publish_time',
       'source_id_promo', 'publisher_id_promo', 'publish_time_promo'],
      dtype='object')

In [28]:
# Column roles handed to nvt.Workflow: every train_df column except 'timestamp' and the
# 'clicked' label is treated as categorical (including the two publish_time columns).
CATEGORICAL_COLUMNS =['display_id', 'ad_id', 'uuid', 'document_id',
       'platform', 'geo_location', 'document_id_promo', 'campaign_id',
       'advertiser_id', 'source_id', 'publisher_id', 'publish_time',
       'source_id_promo', 'publisher_id_promo', 'publish_time_promo']
# 'timestamp' is the only continuous input column.
CONTINUOUS_COLUMNS = ['timestamp']

In [90]:
# First NVTabular workflow: derives day / country / state columns and per-id click and view
# counts, then writes the result to ./preprocessed/pop_count/.
proc = nvt.Workflow(
    cat_names=CATEGORICAL_COLUMNS,
    cont_names= CONTINUOUS_COLUMNS,
    label_name=['clicked'])

proc.add_preprocess(
    [
    # timestamp / (1000*60*60*24) truncated to int; appears in the output as 'timestamp_day'.
    LambdaOp(
        op_name='day',
        f=lambda col, gdf: (col / 1000 / 60 / 60 / 24).astype(int),
        columns=['timestamp'],
        replace=False
    ),
    # First two characters of geo_location (e.g. 'US'); output column 'geo_location_country'.
    LambdaOp(
        op_name='country',
        f=lambda col, gdf: col.str.slice(0,2),
        columns=['geo_location'],
        replace=False
    ),
    # First five characters of geo_location (e.g. 'US>FL'); output column 'geo_location_state'.
    LambdaOp(
    op_name='state',
    f=lambda col, gdf: col.str.slice(0,5),
    columns=['geo_location'],
    replace=False
    ),
    Categorify(columns=['uuid'])
    ]
)
# Drop rows with missing geo_location / platform, then attach per-group sum and count of
# 'clicked' for each listed id column (output columns '<col>_clicked_sum' and '<col>_count').
proc.add_feature(
    [
    Dropna(['geo_location']),
    Dropna(['platform']),
    JoinGroupby(columns=['ad_id', 'source_id', 'document_id_promo', 'publisher_id', 'advertiser_id', 'campaign_id'], 
        cont_names=['clicked'],
        stats=['sum','count'])
    ])

proc.finalize()


# Passed to nvt.Dataset as part_mem_fraction.
GPU_MEMORY_FRAC = 0.2

train_dataset = nvt.Dataset(train_df, engine='parquet',  part_mem_fraction=GPU_MEMORY_FRAC)

# Runs the workflow and writes a single, unshuffled output file per process.
proc.apply(train_dataset, apply_offline=True, record_stats=True, output_path='./preprocessed/pop_count/', shuffle=False, out_files_per_proc=1)



In [91]:
# Reload the output of the first workflow (all parquet files under pop_count/).
train_gdf= cudf.read_parquet('./preprocessed/pop_count/*.parquet')

In [92]:
train_gdf.head(2)

Unnamed: 0,timestamp,display_id,ad_id,uuid,document_id,platform,geo_location,document_id_promo,campaign_id,advertiser_id,...,source_id_count,source_id_clicked_sum,document_id_promo_count,document_id_promo_clicked_sum,publisher_id_count,publisher_id_clicked_sum,advertiser_id_count,advertiser_id_clicked_sum,campaign_id_count,campaign_id_clicked_sum
0,27679,371,67292,761,1179111,2,US>MI>505,1043039,8711,1919,...,197,31,37,2,197,31,238,7,37,2
1,27679,371,80033,761,1179111,2,US>MI>505,1087952,10565,407,...,197,31,5,3,197,31,46,10,5,3


In [93]:
train_gdf.columns

Index(['timestamp', 'display_id', 'ad_id', 'uuid', 'document_id', 'platform',
       'geo_location', 'document_id_promo', 'campaign_id', 'advertiser_id',
       'source_id', 'publisher_id', 'publish_time', 'source_id_promo',
       'publisher_id_promo', 'publish_time_promo', 'clicked', 'timestamp_day',
       'geo_location_country', 'geo_location_state', 'ad_id_count',
       'ad_id_clicked_sum', 'source_id_count', 'source_id_clicked_sum',
       'document_id_promo_count', 'document_id_promo_clicked_sum',
       'publisher_id_count', 'publisher_id_clicked_sum', 'advertiser_id_count',
       'advertiser_id_clicked_sum', 'campaign_id_count',
       'campaign_id_clicked_sum'],
      dtype='object')

In [94]:
# Second NVTabular workflow: turns the '<col>_clicked_sum' / '<col>_count' pairs produced by
# the first workflow into CTR columns, drops rows with missing ids and filters out
# low-count groups. Output is written to ./preprocessed/ctr/.
proc = nvt.Workflow(
    cat_names=['display_id', 'ad_id', 'uuid', 'document_id', 'platform',
       'geo_location', 'document_id_promo', 'campaign_id', 'advertiser_id',
       'source_id', 'publisher_id', 'publish_time', 'source_id_promo',
       'publisher_id_promo', 'publish_time_promo', 'timestamp_day',
       'geo_location_country', 'geo_location_state'],
    cont_names=['timestamp', 'ad_id_clicked_sum', 'ad_id_count', 'source_id_count', 'source_id_clicked_sum',
       'document_id_promo_count', 'document_id_promo_clicked_sum', 'publisher_id_count', 'publisher_id_clicked_sum', 'advertiser_id_count',
       'advertiser_id_clicked_sum', 'campaign_id_count','campaign_id_clicked_sum'],
    label_name=['clicked'])

# Each op divides a '<col>_clicked_sum' column by the matching '<col>_count' column of the
# same frame; the results appear in the output as '<col>_clicked_sum_ctr'.
proc.add_preprocess(
    [
     LambdaOp(
         op_name='ctr',
         f=lambda col, gdf: col/gdf['ad_id_count'],
         columns=['ad_id_clicked_sum'],
         replace=False
     ),
     LambdaOp(
         op_name='ctr',
         f=lambda col, gdf: col/gdf['source_id_count'],
         columns=['source_id_clicked_sum'],
         replace=False
     ),
      LambdaOp(
         op_name='ctr',
         f=lambda col, gdf: col/gdf['document_id_promo_count'],
         columns=['document_id_promo_clicked_sum'],
         replace=False),
     LambdaOp(
        op_name='ctr',
        f=lambda col, gdf: col/gdf['publisher_id_count'],
        columns=['publisher_id_clicked_sum'],
        replace=False),
    LambdaOp(
        op_name='ctr',
        f=lambda col, gdf: col/gdf['advertiser_id_count'],
        columns=['advertiser_id_clicked_sum'],
        replace=False),
     LambdaOp(
        op_name='ctr',
        f=lambda col, gdf: col/gdf['campaign_id_count'],
        columns=['campaign_id_clicked_sum'],
        replace=False)   
    ])
    
# Drop rows where any of the id columns is null.
proc.add_feature([
    Dropna(['ad_id']),
    Dropna(['source_id']),
    Dropna(['document_id_promo']),
    Dropna(['publisher_id']),
    Dropna(['advertiser_id']),
    Dropna(['campaign_id'])
    ])

# Keep only rows whose group counts exceed the thresholds: > 5 for ad_id and
# document_id_promo, > 10 for source, publisher, advertiser and campaign.
proc.add_preprocess([
    Filter(f=lambda gdf: gdf[gdf.ad_id_count > 5]),
    Filter(f=lambda gdf: gdf[gdf.source_id_count > 10]),
    Filter(f=lambda gdf: gdf[gdf.document_id_promo_count > 5]),
    Filter(f=lambda gdf: gdf[gdf.publisher_id_count > 10]),
    Filter(f=lambda gdf: gdf[gdf.advertiser_id_count > 10]),
    Filter(f=lambda gdf: gdf[gdf.campaign_id_count > 10]),
])

proc.finalize()

# Passed to nvt.Dataset as part_mem_fraction.
GPU_MEMORY_FRAC = 0.2
train_dataset = nvt.Dataset(train_gdf, engine='parquet',  part_mem_fraction=GPU_MEMORY_FRAC)
proc.apply(train_dataset, apply_offline=True, record_stats=True, output_path='./preprocessed/ctr/', shuffle=False, out_files_per_proc=1)



In [95]:
# Reload the output of the second (CTR) workflow; this rebinds train_gdf.
train_gdf= cudf.read_parquet('./preprocessed/ctr/*.parquet')

In [96]:
train_gdf.shape

(2205, 38)

In [99]:
train_gdf.head(2)

Unnamed: 0,timestamp,ad_id_clicked_sum,ad_id_count,source_id_count,source_id_clicked_sum,document_id_promo_count,document_id_promo_clicked_sum,publisher_id_count,publisher_id_clicked_sum,advertiser_id_count,...,timestamp_day,geo_location_country,geo_location_state,clicked,ad_id_clicked_sum_ctr,source_id_clicked_sum_ctr,document_id_promo_clicked_sum_ctr,publisher_id_clicked_sum_ctr,advertiser_id_clicked_sum_ctr,campaign_id_clicked_sum_ctr
0,843199,3,21,194,36,47,7,218,41,52,...,0,US,US>FL,0,0.142857,0.185567,0.148936,0.188073,0.134615,0.159091
1,451121,9,44,87,19,47,10,97,22,50,...,0,US,US>NV,0,0.204545,0.218391,0.212766,0.226804,0.2,0.183673


In [97]:
train_gdf.columns

Index(['timestamp', 'ad_id_clicked_sum', 'ad_id_count', 'source_id_count',
       'source_id_clicked_sum', 'document_id_promo_count',
       'document_id_promo_clicked_sum', 'publisher_id_count',
       'publisher_id_clicked_sum', 'advertiser_id_count',
       'advertiser_id_clicked_sum', 'campaign_id_count',
       'campaign_id_clicked_sum', 'display_id', 'ad_id', 'uuid', 'document_id',
       'platform', 'geo_location', 'document_id_promo', 'campaign_id',
       'advertiser_id', 'source_id', 'publisher_id', 'publish_time',
       'source_id_promo', 'publisher_id_promo', 'publish_time_promo',
       'timestamp_day', 'geo_location_country', 'geo_location_state',
       'clicked', 'ad_id_clicked_sum_ctr', 'source_id_clicked_sum_ctr',
       'document_id_promo_clicked_sum_ctr', 'publisher_id_clicked_sum_ctr',
       'advertiser_id_clicked_sum_ctr', 'campaign_id_clicked_sum_ctr'],
      dtype='object')

In [98]:
# Sanity check: confirm that no nulls remain in the id columns
train_gdf['ad_id'].isnull().sum(), train_gdf['publisher_id'].isnull().sum(),  train_gdf['campaign_id'].isnull().sum(),  train_gdf['document_id_promo'].isnull().sum()

(0, 0, 0, 0)

In [101]:
# just to check the min `views`
train_gdf['ad_id_count'].min(), train_gdf['source_id_count'].min(), train_gdf['document_id_promo_count'].min()

(6, 11, 6)

In [30]:
# Offset added to the relative millisecond timestamp before it is interpreted as
# datetime64[ms].
TIMESTAMP_DELTA = 1465876799998
# Hour of day of each event. The previous version of this cell also evaluated the same
# datetime expression on a line of its own and discarded the result; that dead statement
# has been removed.
# NOTE(review): new_gdf is not assigned in any cell visible here — confirm where it is defined.
new_gdf['event_hour'] = (new_gdf['timestamp'] + TIMESTAMP_DELTA).astype('datetime64[ms]').dt.hour

In [31]:
new_gdf['event_hour']

0     5
1     5
2     4
3     4
4     4
     ..
95    5
96    5
97    5
98    5
99    4
Name: event_hour, Length: 100, dtype: int16

In [None]:
# page_views_cudf = cudf.read_csv(
#     DATA_BUCKET_FOLDER + 'page_views.csv',
#     names=['uuid_pv', 'document_id_pv', 'timestamp_pv', 'platform_pv', 'geo_location_pv', 'traffic_source_pv'],
#     dtype={'uuid_pv': 'str', 'document_id_pv': 'int64', 'timestamp_pv': 'int64',
#            'platform_pv': 'int64', 'geo_location_pv': 'str', 'traffic_source_pv': 'int64'},
#     skiprows=1,
#     keep_default_na=False)

# page_views_cudf['day_pv'] = (page_views_cudf['timestamp_pv'] / 1000 / 60 / 60 / 24).astype(int)

# page_views_users_cudf =page_views_cudf.groupby(by=['uuid_pv', 'document_id_pv'])['timestamp_pv'].agg('max').reset_index()


In [6]:
# Load the three document side tables. Each read skips the first row of the file and
# supplies explicit column names (suffixed _cat / _top / _ent) and dtypes.
documents_categories_cudf = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_categories.csv',
    names=['document_id_cat', 'category_id', 'confidence_level_cat'],
    dtype={'document_id_cat': 'int64','category_id': 'int64','confidence_level_cat': 'float32'},
    skiprows=1)

documents_topics_cudf = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_topics.csv',
    names=['document_id_top', 'topic_id', 'confidence_level_top'],
    dtype={'document_id_top': 'int64','topic_id': 'int64','confidence_level_top': 'float32'},
    skiprows=1)

# Unlike category_id and topic_id, entity_id is read as a string.
documents_entities_cudf = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_entities.csv',

    names=['document_id_ent', 'entity_id', 'confidence_level_ent'],
    dtype={'document_id_ent': 'int64','entity_id': 'str','confidence_level_ent': 'float32'},
    skiprows=1)

In [23]:
def get_category_field_values_counts(field, df, min_threshold=10):
    """Map each non-null value of `field` seen at least `min_threshold` times to its count.

    `df` is used through select / groupBy / count / rdd, i.e. a Spark-style DataFrame API.
    The returned dict also contains LESS_SPECIAL_CAT_VALUE mapped to -1, a placeholder
    entry for values that fall below the threshold.
    """
    value_count_pairs = (
        df.select(field)
        .groupBy(field)
        .count()
        .rdd.map(lambda x: (x[0], x[1]))
        .collect()
    )
    category_counts = {
        value: count
        for value, count in value_count_pairs
        if not is_null(value) and count >= min_threshold
    }
    # Placeholder category standing in for every value rarer than min_threshold.
    category_counts[LESS_SPECIAL_CAT_VALUE] = -1
    return category_counts


In [24]:
LESS_SPECIAL_CAT_VALUE = 'less'

In [25]:
 def get_category_field_values_counts_cudf(field, df, min_threshold=10):
    """Return a frame of `field` values occurring at least `min_threshold` times.

    The result has two columns, `field` and 'count', plus one trailing row with the
    value 'less' and count -1 that stands in for all values below the threshold.
    """
    counts = df.groupby(field)[field].agg('count').to_frame()
    counts.rename(columns={field: 'count'}, inplace=True)

    # Keep the frequent values and turn the group key back into a regular column.
    frequent = counts[counts['count'] >= min_threshold].reset_index()
    placeholder = cudf.DataFrame({field: ['less'], "count": [-1]})
    return cudf.concat([frequent, placeholder], ignore_index=True)

In [26]:
# Frequent-value tables (threshold 10) used later to decide which values are kept and
# which are replaced by 'less'.
# NOTE(review): events_cudf is not assigned in any cell visible here — confirm where it is defined.
event_country_values_counts = get_category_field_values_counts_cudf('event_country', events_cudf, min_threshold=10)
#All non-null categories: 230

event_country_state_values_counts = get_category_field_values_counts_cudf('event_country_state', events_cudf, min_threshold=10)

event_geo_location_values_counts = get_category_field_values_counts_cudf('geo_location_event', events_cudf, min_threshold=10)
#All non-null categories: 2988

doc_entity_id_values_counts = get_category_field_values_counts_cudf('entity_id', documents_entities_cudf, min_threshold=10)

In [21]:
#documents_total = len(documents_meta_cudf)

In [59]:
def get_ctr(df):
    """Return the click-through rate: the 'clicks' column divided element-wise by 'views'."""
    clicks = df['clicks']
    views = df['views']
    return clicks / views

In [60]:
def gather_stats(gdf):
    """Summarise a popularity frame that has 'ctr' and 'views' columns.

    Returns a 4-tuple:
        (sum of ctr divided by row count,
         views-weighted average ctr,
         median of views,
         mean of views)
    """
    ctr = gdf['ctr']
    views = gdf['views']

    avg_ctr = ctr.sum() / len(ctr)
    weighted_avg_ctr = (ctr * views).sum() / views.sum()
    return avg_ctr, weighted_avg_ctr, views.median(), views.mean()

In [65]:
def get_popularity_cudf(gdf, fields, get_distinct_ad_ids=True, additional_output=None):
    """Aggregate clicks, views and CTR of `gdf` per group of `fields`.

    Parameters
    ----------
    gdf : DataFrame with a 'clicked' column (and 'ad_id' when get_distinct_ad_ids is True).
    fields : list of column names to group by.
    get_distinct_ad_ids : when True, add 'distinct_ad_ids', the number of unique ad_id
        values in each group.
    additional_output : optional list of [output_column, source_column, aggregation]
        triples; each adds `output_column` as `aggregation` of `source_column` per group.

    Returns
    -------
    DataFrame with one row per group and the columns
    fields + ['clicks', 'views', 'ctr'] (+ 'distinct_ad_ids') (+ additional outputs).
    """
    # None instead of a shared mutable [] default.
    if additional_output is None:
        additional_output = []

    # Build the groupby once and reuse it for every aggregation.
    grouped = gdf.groupby(fields)
    stats = grouped['clicked'].agg(['sum', 'count']).rename(columns={'sum': 'clicks', 'count': 'views'})
    if get_distinct_ad_ids:
        stats['distinct_ad_ids'] = grouped['ad_id'].nunique()
    stats['ctr'] = get_ctr(stats)
    for output_column, source_column, aggregation in additional_output:
        stats[output_column] = grouped[source_column].agg([aggregation])
    stats.reset_index(inplace=True)

    # `fields + [...]` builds a new list, so the caller's `fields` is never modified.
    final_fields = fields + ['clicks', 'views', 'ctr']
    if get_distinct_ad_ids:
        final_fields.append('distinct_ad_ids')
    final_fields += [entry[0] for entry in additional_output]
    return stats[final_fields]

In [66]:
get_popularity_cudf(train_df,['ad_id'])

Unnamed: 0,ad_id,clicks,views,ctr,distinct_ad_ids
0,1174,0,1,0.0,1
1,1222,0,1,0.0,1
2,1266,0,1,0.0,1
3,1605,0,1,0.0,1
4,2138,0,1,0.0,1
...,...,...,...,...,...
4382,347979,0,2,0.0,1
4383,348052,0,1,0.0,1
4384,348078,1,2,0.5,1
4385,348721,0,1,0.0,1


In [68]:
#train_set_cudf = clicks_train_joined_cudf

In [69]:
def get_confidence_sample_size(sample, max_for_reference=100000):
    """Scale a series of sample sizes into a confidence score.

    The score is ln(1 + sample) divided by log2(1 + max_for_reference); entries where
    `sample` is greater than `max_for_reference` are replaced by 1.0.
    """
    reference_log2 = float(math.log(1 + max_for_reference, 2))
    scaled = np.log(1 + sample) / reference_log2
    # Samples above the reference size get full confidence.
    return scaled.where(sample <= max_for_reference, 1.0)

In [97]:
#ad_id_popularity_cudf = get_popularity_cudf(train_set_cudf,['ad_id'])
# Per-ad clicks, views and CTR computed from the sampled train_df.
ad_id_popularity_cudf = get_popularity_cudf(train_df,['ad_id'])
ad_id_popularity_cudf.head(2)

Unnamed: 0,ad_id,clicks,views,ctr,distinct_ad_ids
0,3358,0,1,0.0,1
1,3510,0,1,0.0,1


In [98]:
# Keep only ads with more than 5 views, then record summary statistics of the survivors.
ad_id_popularity_cudf = ad_id_popularity_cudf[ad_id_popularity_cudf['views'] > 5]
ad_id_avg_ctr_cudf, ad_id_weighted_avg_ctr_cudf, ad_id_views_median_cudf, ad_id_views_mean_cudf = gather_stats(ad_id_popularity_cudf)
# Forced to 1 so the views-per-ad ratio below equals the raw view count.
ad_id_popularity_cudf['distinct_ad_ids'] = 1
# Confidence grows with the number of views per distinct ad.
ad_id_popularity_cudf['confidence'] = get_confidence_sample_size(
    ad_id_popularity_cudf['views'] / ad_id_popularity_cudf['distinct_ad_ids'].astype(float))

In [99]:
ad_id_popularity_cudf

Unnamed: 0,ad_id,clicks,views,ctr,distinct_ad_ids,confidence
337,139682,5,9,0.555556,1,0.138629
470,173005,2,6,0.333333,1,0.117155
481,174547,0,6,0.0,1,0.117155


In [70]:

#document_id_popularity_cudf = get_popularity_cudf(train_set_cudf,['document_id_promo'])
# Popularity per promoted document, restricted to documents with more than 5 views.
document_id_popularity_cudf = get_popularity_cudf(train_df,['document_id_promo'])
document_id_popularity_cudf = document_id_popularity_cudf[document_id_popularity_cudf['views'] >5]
#document_id_avg_ctr_cudf,document_id_weighted_avg_ctr_cudf,document_id_views_median_cudf,document_id_views_mean_cudf = gather_stats(document_id_popularity_cudf)
#document_id_popularity_cudf = document_id_popularity_cudf
# Confidence is derived from views per distinct ad shown for the document.
document_id_popularity_cudf['confidence'] = get_confidence_sample_size( \
            document_id_popularity_cudf['views'] / \
            document_id_popularity_cudf['distinct_ad_ids'].astype(float))

In [71]:
document_id_popularity_cudf

Unnamed: 0,document_id_promo,clicks,views,ctr,distinct_ad_ids,confidence
5,25191,1,7,0.142857,1,0.125195
14,136145,0,7,0.000000,1,0.125195
21,153509,5,22,0.227273,3,0.127652
36,208091,2,29,0.068966,12,0.073973
42,257734,2,13,0.153846,6,0.069398
...,...,...,...,...,...,...
2704,1748636,2,6,0.333333,1,0.117155
2705,1749429,1,15,0.066667,5,0.083463
2716,1760291,7,49,0.142857,2,0.194988
2730,1778599,2,15,0.133333,2,0.128845


In [35]:

# doc_event_doc_ad_avg_ctr_cudf = get_popularity_cudf(train_set_cudf,['document_id_event','document_id_promo'])
# doc_event_doc_ad_avg_ctr_cudf = doc_event_doc_ad_avg_ctr_cudf[doc_event_doc_ad_avg_ctr_cudf['views'] >5]
# doc_event_doc_ad_avg_ctr_cudf =doc_event_doc_ad_avg_ctr_cudf.set_index(['document_id_event','document_id_promo'])

# source_id_by_country_popularity_cudf = get_popularity_cudf(train_set_cudf,['event_country','source_id'])
# source_id_by_country_popularity_cudf = source_id_by_country_popularity_cudf[source_id_by_country_popularity_cudf['views'] >5]
# source_id_by_country_avg_ctr_cudf, source_id_by_country_weighted_avg_ctr_cudf,source_id_by_country_views_median_cudf,source_id_by_country_views_mean_cudf = gather_stats(source_id_by_country_popularity_cudf)


In [73]:
# Popularity per source: keep sources with more than 10 views and a non-null source_id.
source_id_popularity_cudf = get_popularity_cudf(train_df,['source_id'])
source_id_popularity_cudf = source_id_popularity_cudf[source_id_popularity_cudf['views'] > 10]
source_id_popularity_cudf = source_id_popularity_cudf[source_id_popularity_cudf['source_id'].isnull() != True]
# Confidence is derived from views per distinct ad.
source_id_popularity_cudf['confidence'] = get_confidence_sample_size( \
            source_id_popularity_cudf['views'] / \
            source_id_popularity_cudf['distinct_ad_ids'].astype(float))

In [75]:
source_id_popularity_cudf['source_id'].isnull().any()

False

In [38]:
# Popularity per publisher: keep publishers with more than 10 views and a non-null id.
# NOTE(review): train_set_cudf is not assigned in any cell visible here — confirm where it is defined.
publisher_popularity_cudf = get_popularity_cudf(train_set_cudf,['publisher_id'])
publisher_popularity_cudf = publisher_popularity_cudf[publisher_popularity_cudf['views'] > 10]
publisher_popularity_cudf = publisher_popularity_cudf[publisher_popularity_cudf['publisher_id'].isnull() != True]
# Confidence is derived from views per distinct ad.
publisher_popularity_cudf['confidence'] = get_confidence_sample_size( \
            publisher_popularity_cudf['views'] / \
            publisher_popularity_cudf['distinct_ad_ids'].astype(float))

In [39]:
# Popularity per advertiser: keep advertisers with more than 10 views and a non-null id.
# NOTE(review): train_set_cudf is not assigned in any cell visible here — confirm where it is defined.
advertiser_id_popularity_cudf = get_popularity_cudf(train_set_cudf,['advertiser_id'])
advertiser_id_popularity_cudf = advertiser_id_popularity_cudf[advertiser_id_popularity_cudf['views'] > 10]
advertiser_id_popularity_cudf = advertiser_id_popularity_cudf[advertiser_id_popularity_cudf['advertiser_id'].isnull() != True]
# Confidence is derived from views per distinct ad.
advertiser_id_popularity_cudf['confidence'] = get_confidence_sample_size( \
            advertiser_id_popularity_cudf['views'] / \
            advertiser_id_popularity_cudf['distinct_ad_ids'].astype(float))

In [40]:
# Popularity per campaign: keep campaigns with more than 10 views and a non-null id.
# NOTE(review): train_set_cudf is not assigned in any cell visible here — confirm where it is defined.
campaign_id_popularity_cudf = get_popularity_cudf(train_set_cudf,['campaign_id'])
campaign_id_popularity_cudf = campaign_id_popularity_cudf[campaign_id_popularity_cudf['views'] > 10]
campaign_id_popularity_cudf = campaign_id_popularity_cudf[campaign_id_popularity_cudf['campaign_id'].isnull() != True]
# Confidence is derived from views per distinct ad.
campaign_id_popularity_cudf['confidence'] = get_confidence_sample_size( \
            campaign_id_popularity_cudf['views'] / \
            campaign_id_popularity_cudf['distinct_ad_ids'].astype(float))

In [41]:
# Attach the categories of each promoted document to the training rows (inner join, so rows
# whose document has no category are dropped).
train_aug_cat=train_set_cudf.merge(documents_categories_cudf,left_on='document_id_promo',right_on='document_id_cat',how='inner')

# Popularity per category, also carrying the mean category confidence level.
category_id_popularity_cudf = get_popularity_cudf(train_aug_cat,['category_id'],additional_output=[['avg_confidence_level_cat','confidence_level_cat','mean']])

category_id_popularity_cudf = category_id_popularity_cudf[category_id_popularity_cudf['views'] > 10]
category_id_popularity_cudf = category_id_popularity_cudf[category_id_popularity_cudf['category_id'].isnull() != True]
# Sample-size confidence scaled by the average category confidence level.
category_id_popularity_cudf['confidence'] = get_confidence_sample_size( \
            category_id_popularity_cudf['views'] / \
            category_id_popularity_cudf['distinct_ad_ids'].astype(float)) * \
            category_id_popularity_cudf['avg_confidence_level_cat']
# Per document: popularity = sum(ctr * weight) / sum(weight), where
# weight = confidence * confidence_level_cat; confidence = max(weight).
categories_weights = documents_categories_cudf.merge(category_id_popularity_cudf,on='category_id')
categories_weights['popularity_weighted'] = categories_weights['ctr'] * categories_weights['confidence'] * categories_weights['confidence_level_cat']
categories_weights['confidence_weighted'] = categories_weights['confidence'] * categories_weights['confidence_level_cat']
categories_weights = categories_weights.groupby('document_id_cat').agg({'popularity_weighted':['sum'],'confidence_weighted':['sum','max']})
categories_weights['popularity'] = categories_weights['popularity_weighted']['sum'] /categories_weights['confidence_weighted']['sum']
categories_weights['confidence'] = categories_weights['confidence_weighted']['max']
categories_weights = categories_weights[['popularity','confidence']].reset_index()
# Flatten the two-level column index left over from the multi-aggregation.
categories_weights.columns = categories_weights.columns.get_level_values(0)
categories_weights

Unnamed: 0,document_id_cat,popularity,confidence
0,1,0.194951,0.021813
1,2,0.237396,0.020293
2,3,0.121363,0.039605
3,4,0.120381,0.069690
4,5,0.213627,0.048685
...,...,...,...
2828608,2999330,0.222853,0.017828
2828609,2999331,0.208737,0.008119
2828610,2999332,0.229040,0.014847
2828611,2999333,0.238811,0.006728


In [42]:
# Category popularity split by event country: more than 10 views, non-null category_id,
# indexed by (event_country, category_id).
category_id_by_country_popularity_cudf = get_popularity_cudf(train_aug_cat,['event_country','category_id'],additional_output=[['avg_confidence_level_cat','confidence_level_cat','mean']])
category_id_by_country_popularity_cudf = category_id_by_country_popularity_cudf[category_id_by_country_popularity_cudf['views'] > 10]
category_id_by_country_popularity_cudf = category_id_by_country_popularity_cudf[category_id_by_country_popularity_cudf['category_id'].isnull() != True]
category_id_by_country_popularity_cudf = category_id_by_country_popularity_cudf.set_index(['event_country','category_id'])

In [43]:
# Attach the topics of each promoted document to the training rows (inner join).
train_aug_top = train_set_cudf.merge(documents_topics_cudf,left_on='document_id_promo',right_on='document_id_top',how='inner')
# Popularity per topic (more than 10 views), carrying the mean topic confidence level.
topic_id_popularity_cudf = get_popularity_cudf(train_aug_top,['topic_id'],additional_output=[['avg_confidence_level_top','confidence_level_top','mean']])
topic_id_popularity_cudf = topic_id_popularity_cudf[topic_id_popularity_cudf['views'] > 10]
# Sample-size confidence scaled by the average topic confidence level.
topic_id_popularity_cudf['confidence'] = get_confidence_sample_size( \
            topic_id_popularity_cudf['views'] / \
            topic_id_popularity_cudf['distinct_ad_ids'].astype(float)) * \
            topic_id_popularity_cudf['avg_confidence_level_top']
# Per document: popularity = sum(ctr * weight) / sum(weight), where
# weight = confidence * confidence_level_top; confidence = max(weight).
topics_weights = documents_topics_cudf.merge(topic_id_popularity_cudf,on='topic_id')
topics_weights['popularity_weighted'] = topics_weights['ctr'] * topics_weights['confidence'] * topics_weights['confidence_level_top']
topics_weights['confidence_weighted'] = topics_weights['confidence'] * topics_weights['confidence_level_top']
topics_weights = topics_weights.groupby('document_id_top').agg({'popularity_weighted':['sum'],'confidence_weighted':['sum','max']})
topics_weights['popularity'] = topics_weights['popularity_weighted']['sum'] /topics_weights['confidence_weighted']['sum']
topics_weights['confidence'] = topics_weights['confidence_weighted']['max']
topics_weights = topics_weights[['popularity','confidence']].reset_index()
# Flatten the two-level column index left over from the multi-aggregation.
topics_weights.columns = topics_weights.columns.get_level_values(0)
topics_weights


Unnamed: 0,document_id_top,popularity,confidence
0,1,0.214295,0.000221
1,2,0.276231,0.000047
2,3,0.184072,0.000840
3,4,0.185568,0.000828
4,5,0.188762,0.000851
...,...,...,...
2488494,2999325,0.201087,0.000357
2488495,2999327,0.203246,0.000427
2488496,2999330,0.152047,0.000177
2488497,2999332,0.232653,0.000136


In [44]:
# Topic popularity split by event country: more than 10 views, non-null event_country,
# indexed by (event_country, topic_id).
topic_id_by_country_popularity_cudf  = get_popularity_cudf(train_aug_top,['event_country','topic_id'],additional_output=[['avg_confidence_level_top','confidence_level_top','mean']])
topic_id_by_country_popularity_cudf  = topic_id_by_country_popularity_cudf[topic_id_by_country_popularity_cudf['views'] > 10]
topic_id_by_country_popularity_cudf = topic_id_by_country_popularity_cudf[topic_id_by_country_popularity_cudf['event_country'].isnull() != True]
topic_id_by_country_popularity_cudf = topic_id_by_country_popularity_cudf.set_index(['event_country','topic_id'])

In [45]:
# Attach the entities of each promoted document to the training rows (inner join).
train_aug_ent = train_set_cudf.merge(documents_entities_cudf,left_on='document_id_promo',right_on='document_id_ent',how='inner')

# Popularity per entity; the view threshold here is 5 (categories and topics use 10).
entity_id_popularity_cudf = get_popularity_cudf(train_aug_ent,['entity_id'],additional_output=[['avg_confidence_level_ent','confidence_level_ent','mean']])
entity_id_popularity_cudf  = entity_id_popularity_cudf[entity_id_popularity_cudf['views'] > 5]

# Sample-size confidence scaled by the average entity confidence level.
entity_id_popularity_cudf['confidence'] = get_confidence_sample_size( \
            entity_id_popularity_cudf['views'] / \
            entity_id_popularity_cudf['distinct_ad_ids'].astype(float)) * \
            entity_id_popularity_cudf['avg_confidence_level_ent']
# Per document: popularity = sum(ctr * weight) / sum(weight), where
# weight = confidence * confidence_level_ent; confidence = max(weight).
entities_weights = documents_entities_cudf.merge(entity_id_popularity_cudf,on='entity_id')
entities_weights['popularity_weighted'] = entities_weights['ctr'] * entities_weights['confidence'] * entities_weights['confidence_level_ent']
entities_weights['confidence_weighted'] = entities_weights['confidence'] * entities_weights['confidence_level_ent']
entities_weights = entities_weights.groupby('document_id_ent').agg({'popularity_weighted':['sum'],'confidence_weighted':['sum','max']})
entities_weights['popularity'] = entities_weights['popularity_weighted']['sum'] /entities_weights['confidence_weighted']['sum']
entities_weights['confidence'] = entities_weights['confidence_weighted']['max']
entities_weights = entities_weights[['popularity','confidence']].reset_index()
# Flatten the two-level column index left over from the multi-aggregation.
entities_weights.columns = entities_weights.columns.get_level_values(0)


In [46]:
# Entity popularity split by event country: more than 5 views, non-null event_country,
# indexed by (event_country, entity_id).
entity_id_by_country_popularity_cudf = get_popularity_cudf(train_aug_ent,['event_country','entity_id'],additional_output=[['avg_confidence_level_ent','confidence_level_ent','mean']])
entity_id_by_country_popularity_cudf  = entity_id_by_country_popularity_cudf[entity_id_by_country_popularity_cudf['views'] > 5]
entity_id_by_country_popularity_cudf = entity_id_by_country_popularity_cudf[entity_id_by_country_popularity_cudf['event_country'].isnull() != True]
entity_id_by_country_popularity_cudf = entity_id_by_country_popularity_cudf.set_index(['event_country','entity_id'])

In [47]:
def get_percentiles(cudf, field, quantiles_levels=None, max_error_rate=0.0):
    """Return the quantile of column `field` at the first requested level.

    Parameters
    ----------
    cudf : DataFrame holding `field`. The parameter name shadows the cudf module
        inside this function.
    field : name of the column to summarise.
    quantiles_levels : sequence of quantile levels; defaults to 0.0, 0.1, ..., 1.0.
        Only the first entry is used by the current implementation.
    max_error_rate : accepted for signature compatibility; not used.

    Returns
    -------
    The value of `cudf[field].quantile(quantiles_levels[0])`.
    """
    # `is None` rather than `== None`: an element-wise comparison on a numpy array
    # of levels would make the `if` raise "truth value is ambiguous".
    if quantiles_levels is None:
        quantiles_levels = np.arange(0.0, 1.1, 0.1).tolist()
    print(quantiles_levels)
    quantiles = cudf[field].quantile(quantiles_levels[0])
    return quantiles


In [48]:
# Distinct (promoted document, publish_time) pairs with a non-null publish_time.
publish_times_cudf = train_set_cudf[train_set_cudf['publish_time'].isnull()!=True]

publish_times_cudf = publish_times_cudf[['document_id_promo','publish_time']].drop_duplicates()
# Parse publish_time as a datetime, cast to integer and divide by 10**9.
# NOTE(review): the median displayed for this column in the notebook output is 1458.0, which
# looks too small for epoch seconds — confirm the resolution of the integer cast.
publish_times_cudf['publish_time']= (cudf.to_datetime(publish_times_cudf['publish_time']).astype(int) /10**9).astype(int)


In [49]:
publish_time_median_cudf = publish_times_cudf['publish_time'].median()

In [50]:
publish_time_median_cudf

1458.0

In [52]:
# Lookups from country / state code to 'utc_dst_time_offset_cleaned'. Despite the _dict
# suffix these are single columns indexed by the code, not Python dicts.
# NOTE(review): the read_csv calls that create the three *_utc_dst_cudf frames are commented
# out in the cell visible earlier in the notebook — confirm they are loaded before this runs.
countries_utc_dst_dict =country_utc_dst_cudf.set_index('country_code')['utc_dst_time_offset_cleaned']
us_states_utc_dst_dict =us_states_utc_dst_cudf.set_index('state_abb')['utc_dst_time_offset_cleaned']
ca_states_utc_dst_dict =ca_states_utc_dst_cudf.set_index('state_abb')['utc_dst_time_offset_cleaned']

In [53]:
# Not referenced by the cells visible in this part of the notebook.
TIME_DECAY_ALPHA = 0.0005
# Reference dates as integers; presumably Unix epoch seconds, with each trailing comment
# giving the look-back span the date stands for — TODO confirm.
ref_dates = [
                1476714880, # 7 days
                1474727680, # 30 days
                1469370880, # 90 days
                1461508480,  # 180 days
                1445697280, # 1 year
                1414161280 # 2 years
]
# ### Get local time
# Fallback offset returned by get_local_utc_bst_tz when the country is empty or unknown.
DEFAULT_TZ_EST = -7.0
def get_local_utc_bst_tz(event_country, event_country_state):
    """Return the time-zone offset for an event location as a float.

    Parameters
    ----------
    event_country : country code string (may be empty).
    event_country_state : country-and-state string such as 'US>FL'; characters 3:5
        are taken as the state abbreviation.

    Lookup order:
      1. DEFAULT_TZ_EST when the country is empty or absent from countries_utc_dst_dict.
      2. The country-level offset otherwise.
      3. For 'US' and 'CA', the state-level offset when the state is present in the
         corresponding state lookup.
    """
    local_tz = DEFAULT_TZ_EST
    if len(event_country) > 0:
        if event_country in countries_utc_dst_dict:
            local_tz = countries_utc_dst_dict[event_country]
            if len(event_country_state) > 2:
                state = event_country_state[3:5]
                if event_country == 'US':
                    if state in us_states_utc_dst_dict:
                        local_tz = us_states_utc_dst_dict[state]
                elif event_country == 'CA':
                    if state in ca_states_utc_dst_dict:
                        # Read from the same lookup that was just tested; the earlier
                        # version referenced an undefined name here and raised NameError.
                        local_tz = ca_states_utc_dst_dict[state]
    return float(local_tz)

def get_hour_bin(hour):
    """Map an hour of day to one of six bins.

    [5, 8) -> 0, [8, 11) -> 1, [11, 14) -> 2, [14, 19) -> 3, [19, 22) -> 4,
    anything else -> 5.
    """
    if 5 <= hour < 8:
        return 0
    if 8 <= hour < 11:
        return 1
    if 11 <= hour < 14:
        return 2
    if 14 <= hour < 19:
        return 3
    if 19 <= hour < 22:
        return 4
    return 5


In [102]:
ad_id_popularity_cudf

Unnamed: 0,ad_id,clicks,views,ctr,distinct_ad_ids,confidence
337,139682,5,9,0.555556,1,0.138629
470,173005,2,6,0.333333,1,0.117155
481,174547,0,6,0.0,1,0.117155


In [54]:
# Start the feature frame FV: left-join per-ad views, CTR and confidence onto the training
# rows as 'ad_views', 'pop_ad_id' and 'pop_ad_id_conf'.
FV = train_set_cudf.merge(ad_id_popularity_cudf[['ad_id','views','ctr','confidence']].rename(columns={'views':'ad_views','ctr':'pop_ad_id','confidence':'pop_ad_id_conf'}), left_on='ad_id',right_on='ad_id',how='left')

In [55]:
#Adding doc_views and popularity

In [56]:
# Left-join per-document views, CTR and confidence as 'doc_views', 'pop_document_id' and
# 'pop_document_id_conf'.
FV = FV.merge(document_id_popularity_cudf[['document_id_promo','views','ctr','confidence']].rename(columns={'views':'doc_views','ctr':'pop_document_id','confidence':'pop_document_id_conf'}), left_on='document_id_promo',right_on='document_id_promo',how='left')

In [57]:
#Note here that user_views comes from user profiles, so is not a viable option immediately

# doc_ad_days_since_published and doc_event_days_since_published need to be fixed

In [58]:
#Add doc_ad_days_since_published


In [59]:
# Offset added to the relative event timestamp before it is parsed as milliseconds.
TIMESTAMP_DELTA = 1465876799998
FV['dt_timestamp_event'] = cudf.to_datetime(FV['timestamp_event'].astype(int)+TIMESTAMP_DELTA,unit='ms')
# Day number of the event minus day number of publication.
# NOTE(review): the divisors assume the datetime casts to nanoseconds and publish_time is
# in milliseconds — confirm both units.
FV['doc_ad_days_since_published'] = FV['dt_timestamp_event'].astype(int) //  (1000000000 * 86400) - FV['publish_time'].astype(int) // (1000 * 86400)

In [60]:
# Add doc_event_days_since_published

In [61]:
# Same day difference as doc_ad_days_since_published, but using 'publish_time_doc_event'.
FV['doc_event_days_since_published'] = FV['dt_timestamp_event'].astype(int) //  (1000000000 * 86400) - FV['publish_time_doc_event'].astype(int) // (1000 * 86400)
# The two prints show the column sum before and after out-of-range values are zeroed.
print(FV['doc_event_days_since_published'].sum())
# Values outside [0, 365*10] days are replaced with 0.
FV['doc_event_days_since_published'] = FV['doc_event_days_since_published'].where((FV['doc_event_days_since_published'] >= 0) & (FV['doc_event_days_since_published'] <= 365*10),other=0)
print(FV['doc_event_days_since_published'].sum())

11897299.0
11006617.0


# Weekend and event_hour need to be fixed

In [62]:
#Check if weekend

In [63]:
# True when the event weekday index is greater than 4 (Saturday or Sunday with Monday = 0).
FV['event_weekend'] = FV['dt_timestamp_event'].dt.weekday > 4

In [64]:



# tz_area is the country code, except for CA and US rows where the country-state value is
# used instead.
FV['tz_area'] = FV['event_country']
FV['tz_area'] = FV['tz_area'].where(~FV['event_country'].isin(['CA','US']),other=FV['event_country_state'])

# NOTE(review): geo_code is not assigned in any cell visible here — confirm where it is defined.
FV = FV.merge(geo_code,left_on='tz_area',right_on='geo_code',how='left')



In [65]:
# Local hour = (event hour + time-zone offset) mod 24, then mapped to a bin index by
# get_hour_bin; the column therefore holds a bin number, not the hour itself.
FV['event_hour'] = ((FV['dt_timestamp_event'].dt.hour + FV['utc_dst_time_offset_cleaned'])%24).applymap(get_hour_bin)

In [66]:
# Changing categorical countries to less value if below threshold (CATEGORIFY)

In [67]:
# Replace countries that are not in the frequent-value table with 'less'. Assigning the
# whole column back (instead of chained FV[col].loc[mask] = ...) guarantees the change
# lands in FV rather than in a temporary.
FV['event_country'] = FV['event_country'].where(
    FV['event_country'].isin(event_country_values_counts['event_country']),
    other='less')

In [68]:
# Changing categorical states to less value if below threshold (CATEGORIFY)

In [69]:
# Replace country-state values that are not in the frequent-value table with 'less'.
# Assigning the whole column back (instead of chained FV[col].loc[mask] = ...) guarantees
# the change lands in FV rather than in a temporary.
FV['event_country_state'] = FV['event_country_state'].where(
    FV['event_country_state'].isin(event_country_state_values_counts['event_country_state']),
    other='less')

In [70]:
# Changing categorical geo_location to less value if below threshold (CATEGORIFY)

In [71]:
# Replace geo locations that are not in the frequent-value table with 'less'. Assigning
# the whole column back (instead of chained FV[col].loc[mask] = ...) guarantees the change
# lands in FV rather than in a temporary.
FV['geo_location_event'] = FV['geo_location_event'].where(
    FV['geo_location_event'].isin(event_geo_location_values_counts['geo_location_event']),
    other='less')
FV.rename(columns={'geo_location_event':'event_geo_location'},inplace=True)

In [72]:
# Readjust the traffic source value by subtracting 1

In [73]:
# traffic_source is traffic_source_pv shifted down by one.
# NOTE(review): presumably converts a 1-based code to 0-based — confirm against the source data.
FV['traffic_source'] = FV['traffic_source_pv'] -1

In [74]:
#Readjusting event platform

In [75]:
# event_platform is platform_event shifted down by one.
# NOTE(review): presumably converts a 1-based code to 0-based — confirm against the source data.
FV['event_platform'] = FV['platform_event'] -1

In [76]:
# column renaming

In [77]:
# Display the per-publisher popularity table (publisher_id, clicks, views, ctr,
# distinct_ad_ids, confidence) whose ctr/confidence columns are merged into FV further down.
publisher_popularity_cudf

Unnamed: 0,publisher_id,clicks,views,ctr,distinct_ad_ids,confidence
0,5,13,36,0.361111,7,0.109291
2,9,11,38,0.289474,30,0.049267
5,26,110,452,0.243363,29,0.169093
8,37,24,86,0.279070,23,0.093672
9,42,65,143,0.454545,4,0.216991
...,...,...,...,...,...,...
1572,5674,101,422,0.239336,31,0.161467
1581,5765,2,21,0.095238,1,0.186099
1583,5775,2,12,0.166667,9,0.051012
1592,5881,9,104,0.086538,2,0.239035


In [78]:
# Attach per-source popularity to FV: ctr becomes pop_source_id and confidence becomes
# pop_source_id_conf. Left join, so FV rows without a matching source_id are kept.
FV = FV.merge(
    source_id_popularity_cudf[['source_id', 'ctr', 'confidence']].rename(
        columns={'ctr': 'pop_source_id', 'confidence': 'pop_source_id_conf'}
    ),
    how='left',
    left_on='source_id',
    right_on='source_id',
)

In [79]:
# Attach per-publisher popularity to FV: ctr becomes pop_publisher_id and confidence becomes
# pop_publisher_id_conf. Left join, so FV rows without a matching publisher_id are kept.
FV = FV.merge(
    publisher_popularity_cudf[['publisher_id', 'ctr', 'confidence']].rename(
        columns={'ctr': 'pop_publisher_id', 'confidence': 'pop_publisher_id_conf'}
    ),
    how='left',
    left_on='publisher_id',
    right_on='publisher_id',
)

In [80]:
# Attach per-advertiser popularity to FV: ctr becomes pop_advertiser_id and confidence becomes
# pop_advertiser_id_conf. Left join, so FV rows without a matching advertiser_id are kept.
FV = FV.merge(
    advertiser_id_popularity_cudf[['advertiser_id', 'ctr', 'confidence']].rename(
        columns={'ctr': 'pop_advertiser_id', 'confidence': 'pop_advertiser_id_conf'}
    ),
    how='left',
    left_on='advertiser_id',
    right_on='advertiser_id',
)

In [81]:
# Attach per-campaign popularity to FV: ctr becomes pop_campain_id and confidence becomes
# pop_campain_id_conf. Left join, so FV rows without a matching campaign_id are kept.
# NOTE(review): the output columns are spelled 'campain' (not 'campaign'); the spelling is
# preserved here — confirm what downstream consumers expect before renaming.
FV = FV.merge(
    campaign_id_popularity_cudf[['campaign_id', 'ctr', 'confidence']].rename(
        columns={'ctr': 'pop_campain_id', 'confidence': 'pop_campain_id_conf'}
    ),
    how='left',
    left_on='campaign_id',
    right_on='campaign_id',
)
# Product of the campaign popularity and its confidence, stored as its own feature.
FV['pop_campain_id_conf_multipl'] = FV['pop_campain_id'] * FV['pop_campain_id_conf']

In [82]:
# WE EXCLUDE THESE FEATURES FOR NOW
# NOTE(review): despite the note above, this merge is still executed — confirm whether the
# pop_category_id / pop_category_id_conf columns are meant to be dropped later.

# Left-join categories_weights onto the promoted document (document_id_promo matched against
# document_id_cat), renaming popularity/confidence to the category-specific feature names.
FV = FV.merge(
    categories_weights.rename(
        columns={'popularity': 'pop_category_id', 'confidence': 'pop_category_id_conf'}
    ),
    how='left',
    left_on='document_id_promo',
    right_on='document_id_cat',
)

In [83]:
# Left-join topics_weights onto the promoted document (document_id_promo matched against
# document_id_top), renaming popularity/confidence to the topic-specific feature names.
FV = FV.merge(
    topics_weights.rename(
        columns={'popularity': 'pop_topic_id', 'confidence': 'pop_topic_id_conf'}
    ),
    how='left',
    left_on='document_id_promo',
    right_on='document_id_top',
)

In [84]:
# Left-join entities_weights onto the promoted document (document_id_promo matched against
# document_id_ent), renaming popularity/confidence to the entity-specific feature names.
FV = FV.merge(
    entities_weights.rename(
        columns={'popularity': 'pop_entity_id', 'confidence': 'pop_entity_id_conf'}
    ),
    how='left',
    left_on='document_id_promo',
    right_on='document_id_ent',
)

In [85]:
# Rename FV columns in place to their final feature names.
# NOTE(review): the key 'sourceentities_similarity_id' does not look like any column created
# in the visible cells — presumably 'source_id' was intended; confirm, since a rename key
# that matches no column is ignored without error.
# NOTE(review): 'publisher_doc_event' lacks the '_id' part that its sibling key
# 'source_id_doc_event' has — presumably 'publisher_id_doc_event'; confirm.
FV.rename(columns={'sourceentities_similarity_id':'doc_ad_source_id',
                   'publisher_id':'doc_ad_publisher_id',
                   'source_id_doc_event':'doc_event_source_id',
                   'publisher_doc_event':'doc_event_publisher_id',
                   'document_id_event':'doc_event_id',
                   'document_id_promo':'doc_id',
                   'advertiser_id':'ad_advertiser',
                   
                   },inplace=True)

In [86]:
# Load the reference feature table; checkdiff compares FV column sums against it.
reference = cudf.read_csv('/outbrain/train_base.csv')

# The cells that follow are test/validation code.

In [87]:
def checkdiff(field):
    """Print the sum of column `field` in FV, then the sum of the same column in reference.

    Reads the notebook-level frames `FV` and `reference`; the two printed totals are meant
    to be compared by eye. Returns None.
    """
    fv_total = FV[field].sum()
    print(fv_total)
    reference_total = reference[field].sum()
    print(reference_total)

In [88]:
# Load the reference feature table used by checkdiff.
# NOTE(review): this is the same read as in cell In [86]; the reload looks redundant.
reference = cudf.read_csv('/outbrain/train_base.csv')

In [89]:
# Compare the pop_document_id_conf column sum in FV against the reference table.
checkdiff('pop_document_id_conf')

15851.235199155537
15851.235199155537


In [92]:
# Sparse COO matrix with one entry of value 1 per row of df, placed at
# (that row's document_id_cat, that row's category_id).
# NOTE(review): df is not defined in any visible cell above this one — confirm which frame
# is expected here.
categories = cupy.sparse.coo_matrix(
    (
        cupy.ones(len(df)),
        (df['document_id_cat'].values, df['category_id'].values),
    ),
)

NameError: name 'cupy' is not defined

In [94]:
import cupy
import nvtabular as nvt

In [91]:
# Display the promoted/event document-id pairs; the same two columns are passed to the
# ColumnSimilarity op in a later cell.
train_set_cudf[['document_id_promo','document_id_event']]

Unnamed: 0,document_id_promo,document_id_event
0,1328059,1780636
1,1145458,1780636
2,1563558,1780636
3,54197,1780636
4,1647612,1780636
...,...,...
99994,1659018,1179111
99995,691081,1179111
99996,1160307,1722769
99997,864899,1722769


In [95]:
from nvtabular.column_similarity import ColumnSimilarity

# Sparse COO matrix with one entry of value 1 per row of documents_categories_cudf, placed at
# (that row's document_id_cat, that row's category_id).
categories = cupy.sparse.coo_matrix(
    (
        cupy.ones(len(documents_categories_cudf)),
        (
            documents_categories_cudf['document_id_cat'].values,
            documents_categories_cudf['category_id'].values,
        ),
    )
)

# Similarity op that writes a "cosine_similarity" column from the document_id_event /
# document_id_promo pair, using the categories matrix with the tfidf metric (on_device=False).
op = ColumnSimilarity(
    "cosine_similarity",
    "document_id_event",
    categories,
    "document_id_promo",
    metric='tfidf',
    on_device=False,
)

# Apply the op to just the two id columns; the result frame is kept in df.
df = op.apply_op(train_set_cudf[['document_id_promo','document_id_event']], None, None)


  idf = np.log(N / np.bincount(X.col))


In [99]:
# Total of the computed cosine_similarity column, for comparison with the reference table.
# NOTE(review): the recorded value (14539.23) does not match the recorded reference sum of
# doc_event_doc_ad_sim_categories (21723.04) — the discrepancy is still unexplained.
df['cosine_similarity'].sum()

14539.231520924599

In [100]:
# Total of the reference doc_event_doc_ad_sim_categories column, the value the computed
# cosine_similarity sum is being compared against.
reference['doc_event_doc_ad_sim_categories'].sum()

21723.042682176827

In [160]:
# Reload the raw document -> category table from CSV.
# NOTE(review): this rebinds df, which cell In [95] used for the similarity output — confirm
# that reusing the name is intended.
df = cudf.read_csv("/outbrain/orig_100k/documents_categories.csv")

# Sparse COO matrix with one entry of value 1 per row of df, placed at
# (that row's document_id, that row's category_id).
categories = cupy.sparse.coo_matrix(
    (
        cupy.ones(len(df)),
        (df['document_id'].values, df['category_id'].values),
    ),
)

In [210]:
# Count the rows whose cosine_similarity is exactly 0.
# NOTE(review): in file order df was just rebound to the documents_categories CSV, so this
# cell presumably ran against the similarity output from a different execution — confirm.
(df['cosine_similarity'] == 0).sum()

78585