<b>Author: Ronay Ak</b>

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

In [2]:
import cudf
import pandas as pd
import cupy 
import math
import time
from datetime import datetime
import numpy as np
import pickle

In [3]:
OUTPUT_BUCKET_FOLDER = "./preprocessed/"
DATA_BUCKET_FOLDER = "./dataset/"
#SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"

Here some csv files are subsets of the originals due to large dataset size.

In [4]:
# for now we skip the validation
evaluation = False

Start loading csv files..

In [5]:
documents_meta_df = cudf.read_csv(DATA_BUCKET_FOLDER+ 'documents_meta.csv', dtype=['int32','int32','int32','str'], 
                                names=['document_id_doc', 'source_id', 'publisher_id', 'publish_time'], header=0)
print(documents_meta_df.shape)
documents_meta_df.dtypes

(2999334, 4)


document_id_doc     int32
source_id           int32
publisher_id        int32
publish_time       object
dtype: object

In [6]:
# APPLY DROPNA
print('Drop rows with empty "source_id"...')
documents_meta_df = documents_meta_df.dropna(subset=['source_id'])

source_publishers_df = documents_meta_df[["source_id", "publisher_id"]].drop_duplicates()

#replace drop_duplicates with groupby
# source_publishers_df = documents_meta_df.groupby(['source_id', "publisher_id"], as_index = False, dropna=False).count().drop(['document_id_doc', 'publish_time'])
source_publishers_df.shape

Drop rows with empty "source_id"...


(14394, 2)

In [7]:
source_publishers_df.head(2)

Unnamed: 0,source_id,publisher_id
0,1,603
12867,2,85


In [8]:
# get list of source_ids without publisher_id
source_ids_without_publisher_df = source_publishers_df.loc[source_publishers_df.loc[:, 'publisher_id'].isnull()]
new_publishers_df = source_ids_without_publisher_df.loc[:, ['source_id']]
print(new_publishers_df.shape)
new_publishers_df.head(2)

(5058, 1)


Unnamed: 0,source_id
29846,7
29853,8


In [9]:
source_ids_without_publisher_df.head()

Unnamed: 0,source_id,publisher_id
29846,7,
29853,8,
30995,16,
33712,20,
36964,26,


In [10]:
print('Maximum value of publisher_id used so far...')
max_pub =source_publishers_df.dropna(subset='publisher_id')['publisher_id'].max()
max_pub

Maximum value of publisher_id used so far...


1263

NOTE: "source_id" column of new_publishers_df generated by spark is not in order. Here, with cudf it is in order.

In [11]:
# maximum value of publisher_id used so far
new_publisher_cnt = source_ids_without_publisher_df.shape[0]
# rows filled with new publisher_ids
new_publishers_df['publisher_id'] = np.arange(max_pub + 1, max_pub + 1 + new_publisher_cnt)
new_publishers_df.shape

(5058, 2)

In [12]:
new_publishers_df.head()

Unnamed: 0,source_id,publisher_id
29846,7,1264
29853,8,1265
30995,16,1266
33712,20,1267
36964,26,1268


In [13]:
# old and new publishers merged
fixed_source_publishers_df = source_publishers_df.dropna(subset='publisher_id')
fixed_source_publishers_df = cudf.concat([fixed_source_publishers_df, new_publishers_df])

In [14]:
fixed_source_publishers_df.head(2)

Unnamed: 0,source_id,publisher_id
0,1,603
12867,2,85


In [15]:
#no null values
fixed_source_publishers_df.isnull().any()

source_id       False
publisher_id    False
dtype: bool

In [16]:
print('Update documents_meta with new publishers...')

documents_meta_df = documents_meta_df.drop(columns='publisher_id').merge(fixed_source_publishers_df, on='source_id')
documents_meta_df.head()

Update documents_meta with new publishers...


Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id
0,1434716,12,2016-05-13 14:00:00,594
1,1470964,12,2016-05-20 03:00:00,594
2,1459969,12,2016-05-18 18:00:00,594
3,1316610,12,2015-06-18 00:00:00,594
4,1425170,12,2016-05-11 00:00:00,594


In [17]:
documents_meta_df[documents_meta_df['document_id_doc']==714935]

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id
1304507,714935,1595,2015-05-22 00:00:00,420


# Calculate IDF for categories

In [18]:
documents_categories_df =  cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_categories.csv',
    names=['document_id_cat', 'category_id', 'confidence_level_cat'],
    dtype=['int64', 'int64','float32'], header=0)

documents_categories_df.head(2)

Unnamed: 0,document_id_cat,category_id,confidence_level_cat
0,1595802,1611,0.92
1,1595802,1610,0.07


In [19]:
documents_categories_df.shape

(5481475, 3)

In [20]:
# calculate the number of documents in the corpus
documents_total = documents_meta_df.shape[0]
print(documents_total)

2996816


In our case TF is always 1. 

In [21]:
%%time
#Document Frequency for the category
TF =1 

# groupby on category_id, and count the num of categories

categories_docs_counts = documents_categories_df.groupby('category_id').size().rename('count').reset_index()

#calculate TF*IDFs
categories_docs_counts['categories']=np.log(documents_total/(categories_docs_counts['count']+1))

CPU times: user 20.8 ms, sys: 381 µs, total: 21.2 ms
Wall time: 20.4 ms


In [22]:
categories_docs_counts.head(5)

Unnamed: 0,category_id,count,categories
0,1000,5074,6.380979
1,1100,212249,2.647541
2,1200,7,12.833619
3,1202,3259,6.823578
4,1203,30511,4.587186


In [23]:
#merge doc_meta_categories_merged and categories_docs_counts 
doc_category_idf = documents_categories_df.merge(categories_docs_counts, how='inner', on='category_id')
print(doc_category_idf.shape)
doc_category_idf.head()

(5481475, 5)


Unnamed: 0,document_id_cat,category_id,confidence_level_cat,count,categories
0,1166221,1807,0.92,43922,4.222868
1,1166221,1608,0.07,57479,3.953869
2,1266801,1608,0.92,57479,3.953869
3,1266801,1807,0.07,43922,4.222868
4,1211101,1807,0.92,43922,4.222868


In [24]:
doc_category_idf.to_parquet(OUTPUT_BUCKET_FOLDER+'doc_category_idf.parquet')

# Calculate IDF for topics

In [25]:
#"Reading documents_topics...
documents_topics_df = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_topics.csv',
    names=['document_id_top', 'topic_id', 'confidence_level_top'],
    dtype={'document_id_top': 'int64','topic_id': 'int64','confidence_level_top': 'float32'},
    skiprows=1)

In [26]:
documents_topics_df.head()

Unnamed: 0,document_id_top,topic_id,confidence_level_top
0,1595802,140,0.073113
1,1595802,16,0.059416
2,1595802,143,0.045421
3,1595802,170,0.038867
4,1524246,113,0.19645


In [27]:
documents_topics_df.isnull().any()

document_id_top         False
topic_id                False
confidence_level_top    False
dtype: bool

In [28]:
documents_topics_df['document_id_top'].nunique()

184064

In [29]:
%%time
#Document Frequency for the category
TF =1 
# groupby on topic_id, and count the num of categories.
topics_docs_counts = documents_topics_df.groupby('topic_id').size().rename('count').reset_index()
#calculcate IDFs
topics_docs_counts['topics']=np.log(documents_total/(topics_docs_counts['count']+1))

CPU times: user 16.6 ms, sys: 290 µs, total: 16.9 ms
Wall time: 16.2 ms


In [30]:
topics_docs_counts.head()

Unnamed: 0,topic_id,count,topics
0,0,1734,7.454298
1,1,3400,6.781236
2,2,5881,6.233409
3,3,264,9.333331
4,4,819,8.203757


In [31]:
#merge doc_meta_categories_merged and categories_docs_counts 
doc_topics_idf = documents_topics_df.merge(topics_docs_counts, how='inner', on='topic_id')
doc_topics_idf.head()

Unnamed: 0,document_id_top,topic_id,confidence_level_top,count,topics
0,625973,140,0.026515,22665,4.88444
1,625973,198,0.010992,6103,6.196361
2,625973,108,0.008011,3194,6.843719
3,694406,113,0.193123,11599,5.554301
4,694406,260,0.165055,18135,5.107407


In [32]:
doc_topics_idf.to_parquet(OUTPUT_BUCKET_FOLDER+'doc_topics_idf.parquet')

# calcualte TF- IDF for entities

In [33]:
#Reading documents_entities..."
documents_entities_df = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_entities.csv',
    names=['document_id_ent', 'entity_id', 'confidence_level_ent'],
    dtype={'document_id_ent': 'int64','entity_id': 'str','confidence_level_ent': 'float32'},
    skiprows=1)

In [34]:
%%time
#Document Frequency for the category
TF =1 
# groupby on topic_id, and count the num of categories.
entities_docs_counts = documents_entities_df.groupby('entity_id').size().rename('count').reset_index()

#calculcate IDFs
entities_docs_counts['entities']=np.log(documents_total/(entities_docs_counts['count']+1))

CPU times: user 28.8 ms, sys: 28.6 ms, total: 57.4 ms
Wall time: 68 ms


In [35]:
entities_docs_counts.head()

Unnamed: 0,entity_id,count,entities
0,000009e46a0ce7ce4a1692b3ae016425,1,14.219914
1,000067b4ed70ce24f928cc4db2ec9c46,2,13.814449
2,00007d2d430fb184718f9beb183ca857,1,14.219914
3,0000c3af6a63e9c42c35215bf47f3ae5,8,12.715836
4,0000f6ee06a6525d954484dfae01ee82,18,11.968622


In [36]:
#merge doc_meta_categories_merged and categories_docs_counts 
doc_entities_idf = documents_entities_df.merge(entities_docs_counts, how='inner', on='entity_id')
doc_entities_idf.head()

Unnamed: 0,document_id_ent,entity_id,confidence_level_ent,count,entities
0,356480,191b73bc928fcdf21c29ba43ce90dd94,0.38864,1,14.219914
1,356480,b095704f91399e30c31d8ba5c72805e3,0.339668,7,12.833619
2,356480,e52fdc9244e521b17f54843db8873faf,0.253893,2,13.814449
3,356480,c19e072898f1d9c05ba08351636ada98,0.249627,14,12.205011
4,369988,f73f3befe92d0a167d3a0f50b7c696c1,0.532634,253,9.375727


In [37]:
doc_entities_idf.to_parquet(OUTPUT_BUCKET_FOLDER+'doc_entities_idf.parquet')

In [38]:
# ### Left join: each document has documents_meta, but some have no categories, topic or entity

# documents_df1 = documents_meta_df.merge(doc_category_idf, how='left', left_on='document_id_doc', right_on='document_id_cat')

# documents_df2 = documents_df1.merge(doc_topics_idf, how='left', left_on='document_id_doc', right_on='document_id_top')

# documents_df = documents_df2.merge(doc_entities_idf, how='left', left_on='document_id_doc', right_on='document_id_ent')

In [39]:
documents_df= documents_meta_df.copy(deep=False)

In [40]:
documents_df.shape

(2996816, 4)

In [41]:
documents_df.head()

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id
0,1434716,12,2016-05-13 14:00:00,594
1,1470964,12,2016-05-20 03:00:00,594
2,1459969,12,2016-05-18 18:00:00,594
3,1316610,12,2015-06-18 00:00:00,594
4,1425170,12,2016-05-11 00:00:00,594


In [42]:
evaluation = False
if evaluation:
    validation_set_df = cudf.read_parquet(args.output_directory+"validation_set.parquet")

    users_to_profile = validation_set_df.loc[:, ['uuid_event']].drop_duplicates()

    validation_user_docs_to_ignore = validation_set_df.loc[:, ['uuid_event', 'document_id_promo']].drop_duplicates().rename(
        columns={'uuid_event': 'uuid_pv', 'document_id_promo': 'document_id_pv'})
else:
    events_df = cudf.read_csv(DATA_BUCKET_FOLDER+'events.csv', dtype=['int32','str','int32','int32','int32','str'], 
                                names=['display_id', 'uuid_event', 'document_id_event', 'timestamp_event', 'platform_event', 'geo_location_event'], header=0, na_values=['\\N', ''],
    keep_default_na=False,
    skiprows=1)
   
    events_df['event_country'] = events_df['geo_location_event'].str.slice(0,2)
    events_df['day_event'] = (events_df['timestamp_event'] / 1000 / 60 / 60 / 24).astype(int)
    # Drop rows with empty "geo_location"
    events_df = events_df.dropna(subset="geo_location_event")
   
    # Drop rows with empty "platform"
    events_df = events_df.dropna(subset="platform_event")
    
    promoted_content_df = cudf.read_csv(DATA_BUCKET_FOLDER+'promoted_content.csv', dtype=['int32', 'int32','int32','int32'], 
                                names=['ad_id', 'document_id_promo', 'campaign_id', 'advertiser_id'], header=0)

    clicks_test_df = cudf.read_csv(DATA_BUCKET_FOLDER+'clicks_test.csv', dtype=['int32', 'int32'], 
                                names=['display_id', 'ad_id'], header=0)
    test_set_df1 = clicks_test_df.merge(promoted_content_df, on='ad_id', how='left')
    test_set_df = test_set_df1.merge(events_df, on='display_id', how='left')
    users_to_profile = test_set_df['uuid_event'].drop_duplicates().reset_index(drop=True).to_frame()
    test_users_docs_timestamp_to_ignore = test_set_df[['uuid_event', 'document_id_promo', 'timestamp_event']].drop_duplicates().reset_index(drop=True)

In [43]:
users_to_profile.head()

Unnamed: 0,uuid_event
0,
1,10000b6df44162
2,1001b1687f2e73
3,10020e2d1802e9
4,100275aabf71b3


In [44]:
events_df.head(2)

Unnamed: 0,display_id,uuid_event,document_id_event,timestamp_event,platform_event,geo_location_event,event_country,day_event
0,2,79a85fa78311b9,1794259,81,2,US>CA>807,US,0
1,3,822932ce3d8757,1179111,182,2,US>MI>505,US,0


In [45]:
#smaller set than the original size
page_views_df = cudf.read_csv(DATA_BUCKET_FOLDER + "page_views.csv",
    names=['uuid_pv', 'document_id_pv', 'timestamp_pv', 'platform_pv', 'geo_location_pv', 'traffic_source_pv'],
    dtype={'uuid_pv': 'str', 'document_id_pv': 'int64', 'timestamp_pv': 'int64',
           'platform_pv': 'int64', 'geo_location_pv': 'str', 'traffic_source_pv': 'int64'},
    skiprows=1,
    keep_default_na=False)

In [46]:
page_views_df.head(2)

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,1fd5f051fba643,120,31905835,1,RS,2
1,8557aa9004be3b,120,32053104,1,VN>44,2


In [47]:
# first apply (SELECT uuid_event FROM users_to_profile WHERE uuid_event = p.uuid_pv)
page_views_train_df1 = page_views_df.merge(users_to_profile, how='inner', left_on=['uuid_pv'], right_on = ['uuid_event']).drop(columns=['uuid_event'])
print(page_views_train_df1.shape)
page_views_train_df1.head(2)

(43810, 6)


Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,8f5685b4ea8de6,25792,42937578,1,US>CA>803,1
1,75278f75fd80ae,25792,61170710,1,US>NJ>504,1


In [48]:
page_views_train_df1['id']=range(1,page_views_train_df1.shape[0]+1)
page_views_train_df1 = page_views_train_df1.set_index('id').reset_index()
page_views_train_df1.head()

Unnamed: 0,id,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,1,8f5685b4ea8de6,25792,42937578,1,US>CA>803,1
1,2,75278f75fd80ae,25792,61170710,1,US>NJ>504,1
2,3,1f216b99537a63,25792,28582318,1,US>TX>618,1
3,4,f78fd19c40e614,25792,46714915,1,US>LA>716,2
4,5,8ae7e9f764b528,25792,34958035,3,US>OR>801,1


In [49]:
# apply SELECT uuid_event FROM test_users_docs_timestamp_to_ignore  WHERE uuid_event = p.uuid_pv 
# AND document_id_promo = p.document_id_pv
page_views_joined_df1 = page_views_train_df1.merge(test_users_docs_timestamp_to_ignore, how='inner', left_on=['uuid_pv','document_id_pv'], right_on = ['uuid_event','document_id_promo'])
print(page_views_joined_df1.shape)

page_views_joined_df1= page_views_joined_df1[page_views_joined_df1.timestamp_pv >= page_views_joined_df1.timestamp_event].drop(columns=['uuid_event','document_id_promo', 'timestamp_event'])
print(page_views_joined_df1.shape)
page_views_joined_df1.head()

(724, 10)
(723, 7)


Unnamed: 0,id,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,15046,1118d78752a6a0,1758447,53905549,1,US>CA>862,1
1,19896,18ad2d88dc0f5d,1649150,56591321,2,US>NJ>501,1
2,2304,1c2498998aed00,1552983,22328151,2,US>VA>511,1
3,19848,106e47d7a3d3c2,1649150,59539770,2,US>VA>511,1
4,27200,184c61fe87f265,1649400,3614613,1,US>FL>534,1


In [50]:
page_views_train_df2 = page_views_train_df1.merge(page_views_joined_df1, how='left', on='id')

page_views_train_df2 = page_views_train_df2[page_views_train_df2['uuid_pv_y'].isnull()]
page_views_train_df2=page_views_train_df2[['uuid_pv_x', 'document_id_pv_x', 'timestamp_pv_x', 'platform_pv_x', 'geo_location_pv_x', 'traffic_source_pv_x']].rename(
    columns={'uuid_pv_x': 'uuid_pv', 'document_id_pv_x': 'document_id_pv', 'timestamp_pv_x': 'timestamp_pv', 'platform_pv_x':'platform_pv', 'geo_location_pv_x':'geo_location_pv',
            'traffic_source_pv_x':'traffic_source_pv'})

page_views_train_df2.head(2)

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,b8d28005a8aa6c,60164,35341632,1,US>GA>525,1
1,924ae0f5cccb99,60164,2436947,1,US>GA>530,1


In [51]:
documents_df.head(2)

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id
0,1434716,12,2016-05-13 14:00:00,594
1,1470964,12,2016-05-20 03:00:00,594


In [52]:
page_views_train_df = page_views_train_df2.merge(documents_df,left_on='document_id_pv', right_on='document_id_doc', how='left') 
print(page_views_train_df.shape)
page_views_train_df.head(2)

(43087, 10)


Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,document_id_doc,source_id,publish_time,publisher_id
0,e77a43f2b28254,25792,36324077,1,US>NY>501,2,25792,10517,2011-08-04 10:00:00,240
1,914d976a9cc043,25792,43657733,1,US>FL>534,1,25792,10517,2011-08-04 10:00:00,240


In [53]:
page_views_train_df.columns

Index(['uuid_pv', 'document_id_pv', 'timestamp_pv', 'platform_pv',
       'geo_location_pv', 'traffic_source_pv', 'document_id_doc', 'source_id',
       'publish_time', 'publisher_id'],
      dtype='object')

In [54]:
print('Processing user profiles...')

null_to_minus_one = lambda x: -1 if np.isnan(x) else x
null_to_zero = lambda x: 0 if np.isnan(x) else x

Processing user profiles...


In [55]:
page_views_train_df.fillna({'timestamp_pv': -1}, inplace=True)

In [56]:
page_views_train_df.head()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,document_id_doc,source_id,publish_time,publisher_id
0,e77a43f2b28254,25792,36324077,1,US>NY>501,2,25792,10517,2011-08-04 10:00:00,240
1,914d976a9cc043,25792,43657733,1,US>FL>534,1,25792,10517,2011-08-04 10:00:00,240
2,aae35dd5ea3c33,25792,1167439,1,AU>04,1,25792,10517,2011-08-04 10:00:00,240
3,9057ee11931c5b,25792,40065247,1,US>CA>803,1,25792,10517,2011-08-04 10:00:00,240
4,2e92ea3f3c5941,25792,51216385,1,US>MI>553,2,25792,10517,2011-08-04 10:00:00,240


In [99]:
users_profile= cudf.DataFrame()
users_profile['uuid'] = page_views_train_df.loc[:, 'uuid_pv']
users_profile['doc_ids'] = page_views_train_df.loc[:, 'document_id_pv']

In [100]:
users_profile.head()

Unnamed: 0,uuid,doc_ids
0,e77a43f2b28254,25792
1,914d976a9cc043,25792
2,aae35dd5ea3c33,25792
3,9057ee11931c5b,25792
4,2e92ea3f3c5941,25792


In [114]:
views = page_views_train_df.groupby(['uuid_pv'], as_index=False)['document_id_pv'].count()
views = views.reset_index()
#users_profile=users_profile.rename(columns={'uuid_pv':'uuid'})
print(views.shape)

(35215, 2)


In [115]:
#we can keep the views as a seperate df.
views.head()

Unnamed: 0,uuid_pv,document_id_pv
0,100614d664a6c3,1
1,100ce200bea586,1
2,100d0ddfadd845,1
3,100fd99bcbcb85,1
4,100fdd39838d25,1


In [336]:
# ## Write user profiles to pickle

if evaluation:
    table_name = 'user_profiles_eval.parquet'
else:
    table_name = 'user_profiles.parquet'

users_profile_df.to_pickle(OUTPUT_BUCKET_FOLDER+table_name)