<b>Author: Ronay Ak</b>

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
import cudf
import pandas as pd
import cupy 
import math
import time
from datetime import datetime
import numpy as np
import pickle

In [3]:
OUTPUT_BUCKET_FOLDER = "./preprocessed/"
DATA_BUCKET_FOLDER = "./dataset/"
#SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"

Here some csv files are subsets of the originals due to large dataset size.

In [4]:
# for now we skip the validation
evaluation = False

Start loading csv files..

In [5]:
documents_meta_df = cudf.read_csv(DATA_BUCKET_FOLDER+ 'documents_meta.csv', dtype=['int32','int32','int32','str'], 
                                names=['document_id_doc', 'source_id', 'publisher_id', 'publish_time'], header=0)
print(documents_meta_df.shape)
documents_meta_df.dtypes

(2999334, 4)


document_id_doc     int32
source_id           int32
publisher_id        int32
publish_time       object
dtype: object

In [6]:
# APPLY DROPNA
print('Drop rows with empty "source_id"...')
documents_meta_df = documents_meta_df.dropna(subset=['source_id'])

source_publishers_df = documents_meta_df[["source_id", "publisher_id"]].drop_duplicates()

#replace drop_duplicates with groupby
# source_publishers_df = documents_meta_df.groupby(['source_id', "publisher_id"], as_index = False, dropna=False).count().drop(['document_id_doc', 'publish_time'])
source_publishers_df.shape

Drop rows with empty "source_id"...


(14394, 2)

In [7]:
source_publishers_df.head(2)

Unnamed: 0,source_id,publisher_id
0,1,603
12867,2,85


In [8]:
# get list of source_ids without publisher_id
source_ids_without_publisher_df = source_publishers_df.loc[source_publishers_df.loc[:, 'publisher_id'].isnull()]
new_publishers_df = source_ids_without_publisher_df.loc[:, ['source_id']]
print(new_publishers_df.shape)
new_publishers_df.head(2)

(5058, 1)


Unnamed: 0,source_id
29846,7
29853,8


In [9]:
source_ids_without_publisher_df.head()

Unnamed: 0,source_id,publisher_id
29846,7,
29853,8,
30995,16,
33712,20,
36964,26,


In [10]:
print('Maximum value of publisher_id used so far...')
max_pub =source_publishers_df.dropna(subset='publisher_id')['publisher_id'].max()
max_pub

Maximum value of publisher_id used so far...


1263

NOTE: "source_id" column of new_publishers_df generated by spark is not in order. Here, with cudf it is in order.

In [11]:
# maximum value of publisher_id used so far
new_publisher_cnt = source_ids_without_publisher_df.shape[0]
# rows filled with new publisher_ids
new_publishers_df['publisher_id'] = np.arange(max_pub + 1, max_pub + 1 + new_publisher_cnt)
new_publishers_df.shape

(5058, 2)

In [12]:
new_publishers_df.head()

Unnamed: 0,source_id,publisher_id
29846,7,1264
29853,8,1265
30995,16,1266
33712,20,1267
36964,26,1268


In [13]:
# old and new publishers merged
fixed_source_publishers_df = source_publishers_df.dropna(subset='publisher_id')
fixed_source_publishers_df = cudf.concat([fixed_source_publishers_df, new_publishers_df])

In [14]:
fixed_source_publishers_df.head(2)

Unnamed: 0,source_id,publisher_id
0,1,603
12867,2,85


In [15]:
#no null values
fixed_source_publishers_df.isnull().any()

source_id       False
publisher_id    False
dtype: bool

In [16]:
print('Update documents_meta with new publishers...')

documents_meta_df = documents_meta_df.drop(columns='publisher_id').merge(fixed_source_publishers_df, on='source_id')
documents_meta_df.head()

Update documents_meta with new publishers...


Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id
0,1128855,31,2016-03-02 12:00:00,535
1,1086925,31,2016-02-10 07:00:00,535
2,1086924,31,2016-02-10 07:00:00,535
3,1104686,31,2016-02-19 07:00:00,535
4,1027324,31,2016-01-05 12:00:00,535


In [17]:
documents_categories_df =  cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_categories.csv',
    names=['document_id_cat', 'category_id', 'confidence_level_cat'],
    dtype=['int64', 'int64','float32'], header=0)

documents_categories_df.head(2)

Unnamed: 0,document_id_cat,category_id,confidence_level_cat
0,1595802,1611,0.92
1,1595802,1610,0.07


In [18]:
# Merge documents_meta and documents_categories
doc_categories_merged = documents_meta_df.merge(documents_categories_df, how='inner', left_on='document_id_doc', right_on='document_id_cat')

In [19]:
doc_categories_merged.head(5)

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id,document_id_cat,category_id,confidence_level_cat
0,88851,12,2012-07-30 08:00:00,594,88851,1608,0.07
1,36742,12,2009-06-30 00:00:00,594,36742,1808,0.92
2,36742,12,2009-06-30 00:00:00,594,36742,1513,0.07
3,36857,12,2009-07-15 00:00:00,594,36857,1806,0.92
4,36857,12,2009-07-15 00:00:00,594,36857,1808,0.07


In [20]:
doc_categories_merged.shape, documents_categories_df.shape, documents_meta_df.shape

((5477347, 7), (5481475, 3), (2996816, 4))

In our case TF is always 1. 

In [22]:
#Document Frequency for the category
N_docs=doc_categories_merged['document_id_doc'].nunique()
print(N_docs)
TF =1 

# groupby on category_id, and count the num of categories.
categories_docs_counts = doc_categories_merged.groupby('category_id').size().rename('count').reset_index()

#calculcate IDFs
categories_docs_counts['idf']=np.log(N_docs/(categories_docs_counts['count']+1))

2826584


In [23]:
categories_docs_counts.head(5)

Unnamed: 0,category_id,count,idf
0,1000,5074,6.322498
1,1100,212094,2.58979
2,1200,6,12.908669
3,1202,3259,6.765097
4,1203,30491,4.52936


In [24]:
# N = doc_meta_categories_merged.document_id_doc.nunique()
# idf = cupy.log(N) - cupy.log1p(cupy.bincount(doc_meta_categories_merged.category_id.values))
# doc_meta_categories_merged['new_idf'] = idf[doc_meta_categories_merged.category_id.values]

In [31]:
doc_meta_categories_merged.head(2)

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id,category_id,confidence_level_cat,new_idf
0,88851,12,2012-07-30 08:00:00,594,1608,0.07,3.897808
1,36742,12,2009-06-30 00:00:00,594,1808,0.92,2.458759


In [29]:
#calculate norm_tf_idf
#merge doc_meta_categories_merged and categories_docs_counts 
doc_category_idf = doc_meta_categories_merged.merge(categories_docs_counts, how='inner', on='category_id').drop(columns=['publish_time', 'publisher_id'], inplace=True)
doc_category_idf.head()

Unnamed: 0,document_id_doc,source_id,category_id,confidence_level_cat,count,idf
0,447567,1,1807,0.92,43846,4.166118
1,447567,1,1608,0.07,57340,3.897808
2,447605,1,1807,0.92,43846,4.166118
3,447605,1,1608,0.07,57340,3.897808
4,447717,1,1609,0.92,63975,3.788316


In [114]:
click_train_landing_page_ads 

(5477347, 6)

In [109]:
doc_meta_categories_merged.head(10)

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id,category_id,confidence_level_cat
0,88851,12,2012-07-30 08:00:00,594,1608,0.07
1,36742,12,2009-06-30 00:00:00,594,1808,0.92
2,36742,12,2009-06-30 00:00:00,594,1513,0.07
3,36857,12,2009-07-15 00:00:00,594,1806,0.92
4,36857,12,2009-07-15 00:00:00,594,1808,0.07
5,36824,12,2011-08-01 00:00:00,594,1100,0.92
6,36824,12,2011-08-01 00:00:00,594,1510,0.07
7,36772,12,2009-01-14 00:00:00,594,1505,0.92
8,36772,12,2009-01-14 00:00:00,594,1809,0.07
9,36738,12,2006-11-15 00:00:00,594,2004,0.92


In [111]:
doc_meta_categories_merged.document_id_doc.max(), doc_meta_categories_merged.document_id_doc.min()

(2999334, 1)

# USE SPARSE MATRIX

In [25]:
doc_meta_categories_merged.head(2)

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id,category_id,confidence_level_cat
0,1430314,1,2016-05-23 00:00:00,603,1608,0.92
1,1430314,1,2016-05-23 00:00:00,603,1807,0.07


In [45]:
cols = doc_meta_categories_merged.category_id.values   
rows = doc_meta_categories_merged.document_id_doc.values
X = cupy.sparse.coo_matrix((cupy.ones(len(rows)), (rows, cols)))

In [46]:
# X should be a binary matrix
X.shape[0], len(cols), len(rows)

(2999335, 5477347, 5477347)

In [49]:
X.data, X.shape

(array([1., 1., 1., ..., 1., 1., 1.]), (2999335, 2101))

In [28]:
import cupy as cp
import numpy as np
#from numpy import bincount, log, log1p, sqrt
from cupy import bincount, log, log1p, sqrt

def tfidf_weight(X):
    """ Weights a Sparse Matrix by TF-IDF Weighted """
    #X = coo_matrix(X)

    # calculate IDF
    N = float(X.shape[0])
    idf = log(N) - log1p(bincount(X.col))

    # apply TF-IDF adjustment
    X.data = sqrt(X.data) * idf[X.col]
    return X

tf_idf_X = tfidf_weight(X)
                           
tf_idf_X.dot(tf_idf_X.T) / (tf_idf_X.data / sqrt(bincount(tf_idf_X.row, X.data ** 2))[tf_idf_X.row])               

In [29]:
%%time
cosine_similarity_nominator = tf_idf_X.dot(tf_idf_X.T)

CPU times: user 1min 3s, sys: 21 s, total: 1min 24s
Wall time: 1min 28s


In [34]:
doc_meta_categories_merged.head()

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id,category_id,confidence_level_cat
0,696318,1,2015-05-31 10:00:00,603,1807,0.92
1,696318,1,2015-05-31 10:00:00,603,1608,0.07
2,635188,1,2015-04-26 00:00:00,603,1807,0.92
3,635188,1,2015-04-26 00:00:00,603,1608,0.07
4,579489,1,2015-04-01 00:00:00,603,1807,0.92


In [38]:
# another way to do the calculations above. #calculate cosine here.
N = doc_meta_categories_merged.document_id_doc.nunique()
rows = doc_meta_categories_merged.document_id_doc.values
cols = doc_meta_categories_merged.category_id.values
idf = cupy.log(N) - cupy.log1p(cupy.bincount(rows))
data = idf[cols]
norms = cupy.sqrt(cupy.bincount(rows, data ** 2))[rows]
normalized = data / norms

In [57]:
norms.shape

(5477347,)

In [44]:
rows.max()

array(2999334, dtype=int32)

In [43]:
cupy.bincount(rows)

array([0, 2, 2, ..., 2, 2, 1])

In [39]:
idf

array([14.85457947, 13.75596718, 13.75596718, ..., 13.75596718,
       13.75596718, 14.16143229])

In [40]:
cols

array([1807, 1608, 1807, ..., 2003, 1205, 2003])

In [31]:
data.shape, idf.shape

((5477347,), (2999335,))

In [28]:
len(rows), len(cols)

(5477347, 5477347)

We need to convert cudf df to pandas df to be able to do these ops. this takes long time.

In [None]:
documents_categories_df.head()

In [22]:
#Grouping category_ids and confidence_levels..
documents_categories_df = documents_categories_df.to_pandas()
documents_categories_grouped_df = documents_categories_df.groupby('document_id_cat', 
    as_index=False)['category_id', 'confidence_level_cat'].agg(lambda x: list(x)).rename(
    columns={'category_id': 'category_id_list', 'confidence_level_cat': 'confidence_level_cat_list'})

  after removing the cwd from sys.path.


In [29]:
documents_categories_grouped_df.head()

Unnamed: 0,document_id_cat,category_id_list,confidence_level_cat_list
0,1,"[1706, 1603]","[0.47122037410736084, 0.03585372492671013]"
1,2,"[1705, 1708]","[0.6000000238418579, 0.4000000059604645]"
2,3,"[2003, 1403]","[0.5228370428085327, 0.039781082421541214]"
3,4,"[2003, 2006]","[0.9199999570846558, 0.07000000029802322]"
4,5,"[2006, 2003]","[0.9199999570846558, 0.07000000029802322]"


In [23]:
#"Reading documents_topics...
documents_topics_df = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_topics.csv',
    names=['document_id_top', 'topic_id', 'confidence_level_top'],
    dtype={'document_id_top': 'int64','topic_id': 'int64','confidence_level_top': 'float32'},
    skiprows=1)

In [None]:
#"Grouping topic_ids and confidence_levels..."
documents_topics_df = documents_topics_df.to_pandas()
documents_topics_grouped_df = documents_topics_df.groupby('document_id_top', 
    as_index=False)['topic_id', 'confidence_level_top'].agg(lambda x: list(x)).rename(
    columns={'topic_id': 'topic_id_list', 'confidence_level_top': 'confidence_level_top_list'})

In [31]:
documents_topics_grouped_df.head()

Unnamed: 0,document_id_top,topic_id_list,confidence_level_top_list
0,1,"[252, 35, 150, 276, 137]","[0.020726216956973076, 0.012773293070495129, 0..."
1,2,[216],[0.013028857298195362]
2,3,"[276, 107, 176, 160, 260, 283, 202, 156, 104, ...","[0.029913833364844322, 0.027589114382863045, 0..."
3,4,"[102, 75, 16, 244, 97, 239]","[0.08417084068059921, 0.08329997211694717, 0.0..."
4,5,"[75, 102, 244, 143, 130, 202, 16, 44, 64]","[0.10655508935451508, 0.08646664023399353, 0.0..."


In [24]:
#Reading documents_entities..."
documents_entities_df = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_entities.csv',
    names=['document_id_ent', 'entity_id', 'confidence_level_ent'],
    dtype={'document_id_ent': 'int64','entity_id': 'str','confidence_level_ent': 'float32'},
    skiprows=1)

In [289]:
#Grouping entity_ids and confidence_levels...
documents_entities_df = documents_entities_df.to_pandas()
documents_entities_grouped_df = documents_entities_df.groupby('document_id_ent', 
    as_index=False)['entity_id', 'confidence_level_ent'].agg(lambda x: list(x)).rename(
    columns={'entity_id': 'entity_id_list', 'confidence_level_ent': 'confidence_level_ent_list'})

  after removing the cwd from sys.path.


In [33]:
documents_entities_grouped_df.head()

Unnamed: 0,document_id_ent,entity_id_list,confidence_level_ent_list
0,2,"[e88e4cde6c6482ff01d4ad424904ffd1, a10e6138ae7...","[0.6539117097854614, 0.5391356348991394]"
1,3,[7f77b6d24f62a1ad1b469fdf4bc0a7f4],[0.3209395706653595]
2,12,[f5a390c1e5d342825b82bcc637a72e7a],[0.7968870401382446]
3,23,[d89b0bbdcbdb66475028b32eb8098dd4],[0.25853851437568665]
4,43,"[015f6154b8850c35a261314b4eb88846, 94101adfc2f...","[0.8686161041259766, 0.36814022064208984, 0.26..."


In [34]:
documents_categories_grouped_df.head()

Unnamed: 0,document_id_cat,category_id_list,confidence_level_cat_list
0,1,"[1706, 1603]","[0.47122037410736084, 0.03585372492671013]"
1,2,"[1705, 1708]","[0.6000000238418579, 0.4000000059604645]"
2,3,"[2003, 1403]","[0.5228370428085327, 0.039781082421541214]"
3,4,"[2003, 2006]","[0.9199999570846558, 0.07000000029802322]"
4,5,"[2006, 2003]","[0.9199999570846558, 0.07000000029802322]"


In [30]:
documents_categories_grouped_df.dtypes

document_id_cat               int64
category_id_list             object
confidence_level_cat_list    object
dtype: object

In [32]:
documents_categories_grouped_df.head()

Unnamed: 0,document_id_cat,category_id_list,confidence_level_cat_list
0,1,"[1706, 1603]","[0.47122037410736084, 0.03585372492671013]"
1,2,"[1705, 1708]","[0.6000000238418579, 0.4000000059604645]"
2,3,"[2003, 1403]","[0.5228370428085327, 0.039781082421541214]"
3,4,"[2003, 2006]","[0.9199999570846558, 0.07000000029802322]"
4,5,"[2006, 2003]","[0.9199999570846558, 0.07000000029802322]"


In [36]:
documents_topics_grouped_df.head()

Unnamed: 0,document_id_top,topic_id_list,confidence_level_top_list
0,1,"[252, 35, 150, 276, 137]","[0.020726216956973076, 0.012773293070495129, 0..."
1,2,[216],[0.013028857298195362]
2,3,"[276, 107, 176, 160, 260, 283, 202, 156, 104, ...","[0.029913833364844322, 0.027589114382863045, 0..."
3,4,"[102, 75, 16, 244, 97, 239]","[0.08417084068059921, 0.08329997211694717, 0.0..."
4,5,"[75, 102, 244, 143, 130, 202, 16, 44, 64]","[0.10655508935451508, 0.08646664023399353, 0.0..."


In [37]:
documents_topics_grouped_df.dtypes

document_id_top               int64
topic_id_list                object
confidence_level_top_list    object
dtype: object

In [30]:
documents_categories_grouped_df = pd.read_pickle('./preprocessed/documents_categories_grouped_df.pkl')
documents_topics_grouped_df = pd.read_pickle('./preprocessed/documents_topics_grouped_df.pkl')
documents_entities_grouped_df = pd.read_pickle('./preprocessed/documents_entities_grouped_df.pkl')

In [31]:
documents_meta_pdf = documents_meta_df.to_pandas()

In [154]:
# ### Left join: each document has documents_meta, but some have no categories, topic or entity

documents_df1 = documents_meta_pdf.merge(documents_categories_grouped_df, how='left', left_on='document_id_doc', right_on='document_id_cat')

documents_df2 = documents_df1.merge(documents_topics_grouped_df, how='left', left_on='document_id_doc', right_on='document_id_top')

documents_df = documents_df2.merge(documents_entities_grouped_df, how='left', left_on='document_id_doc', right_on='document_id_ent')

In [26]:
#documents_df.to_pickle('./preprocessed/documents_df.pkl')
documents_df= pd.read_pickle('./preprocessed/documents_df.pkl')

In [33]:
documents_df.head(2)

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id,document_id_cat,category_id_list,confidence_level_cat_list,document_id_top,topic_id_list,confidence_level_top_list,document_id_ent,entity_id_list,confidence_level_ent_list
0,178806,11,2013-08-21 10:00:00,3,178806.0,"[1515, 1609]","[0.9199999570846558, 0.07000000029802322]",178806.0,"[174, 117, 20, 143, 291, 286, 163, 285]","[0.10020265728235245, 0.06237642467021942, 0.0...",178806.0,"[30606a894b01a46e93434d9b33d0a7aa, 99ae30c61b0...","[0.5636787414550781, 0.3229350745677948]"
1,88615,11,2012-07-27 13:00:00,3,88615.0,"[1405, 1403]","[0.9199999570846558, 0.07000000029802322]",88615.0,"[250, 258, 252, 276, 176, 150, 219, 82]","[0.06101181358098984, 0.05922364443540573, 0.0...",88615.0,"[e0b84fb3101b2e4e62e5384757913d20, 65575b03fa5...","[0.8432426452636719, 0.34350281953811646, 0.32..."


In [27]:
evaluation = False
if evaluation:
    # TODO: no read need if whole script run 
    validation_set_df = pd.read_parquet(args.output_directory+"validation_set.parquet")

    users_to_profile = validation_set_df.loc[:, ['uuid_event']].drop_duplicates()

    validation_user_docs_to_ignore = validation_set_df.loc[:, ['uuid_event', 'document_id_promo']].drop_duplicates().rename(
        columns={'uuid_event': 'uuid_pv', 'document_id_promo': 'document_id_pv'})
else:
    events_df = cudf.read_csv(DATA_BUCKET_FOLDER+'events.csv', dtype=['int32','str','int32','int32','int32','str'], 
                                names=['display_id', 'uuid_event', 'document_id_event', 'timestamp_event', 'platform_event', 'geo_location_event'], header=0, na_values=['\\N', ''],
    keep_default_na=False,
    skiprows=1)
   
    events_df['event_country'] = events_df['geo_location_event'].str.slice(0,2)
    events_df['day_event'] = (events_df['timestamp_event'] / 1000 / 60 / 60 / 24).astype(int)
    # Drop rows with empty "geo_location"
    events_df = events_df.dropna(subset="geo_location_event")
   
    # Drop rows with empty "platform"
    events_df = events_df.dropna(subset="platform_event")
    
    promoted_content_df = cudf.read_csv(DATA_BUCKET_FOLDER+'promoted_content.csv', dtype=['int32', 'int32','int32','int32'], 
                                names=['ad_id', 'document_id_promo', 'campaign_id', 'advertiser_id'], header=0)

    clicks_test_df = cudf.read_csv(DATA_BUCKET_FOLDER+'clicks_test.csv', dtype=['int32', 'int32'], 
                                names=['display_id', 'ad_id'], header=0)
    test_set_df1 = clicks_test_df.merge(promoted_content_df, on='ad_id', how='left')
    test_set_df = test_set_df1.merge(events_df, on='display_id', how='left')
    users_to_profile = (np.unique(test_set_df['uuid_event'])).to_frame()
    test_users_docs_timestamp_to_ignore = test_set_df[['uuid_event', 'document_id_promo', 'timestamp_event']].drop_duplicates().reset_index(drop=True)

In [60]:
users_to_profile

Unnamed: 0,uuid_event
0,
1,10000b6df44162
2,1001b1687f2e73
3,10020e2d1802e9
4,100275aabf71b3
...,...
199912,fffc132da6de56
199913,fffc99bdd2f476
199914,fffcc2b5da86ee
199915,fffe56ceff4ae7


In [29]:
events_df.head()

Unnamed: 0,display_id,uuid_event,document_id_event,timestamp_event,platform_event,geo_location_event,event_country,day_event
0,2,79a85fa78311b9,1794259,81,2,US>CA>807,US,0
1,3,822932ce3d8757,1179111,182,2,US>MI>505,US,0
2,4,85281d0a49f7ac,1777797,234,2,US>WV>564,US,0
3,5,8d0daef4bf5b56,252458,338,2,SG>00,SG,0
4,6,7765b4faae4ad4,1773517,395,3,US>OH>510,US,0


In [30]:
page_views_df = cudf.read_csv(DATA_BUCKET_FOLDER + "page_views.csv",
    names=['uuid_pv', 'document_id_pv', 'timestamp_pv', 'platform_pv', 'geo_location_pv', 'traffic_source_pv'],
    dtype={'uuid_pv': 'str', 'document_id_pv': 'int64', 'timestamp_pv': 'int64',
           'platform_pv': 'int64', 'geo_location_pv': 'str', 'traffic_source_pv': 'int64'},
    skiprows=1,
    keep_default_na=False)

In [31]:
page_views_df.head(2)

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,1fd5f051fba643,120,31905835,1,RS,2
1,8557aa9004be3b,120,32053104,1,VN>44,2


In [32]:
# first apply (SELECT uuid_event FROM users_to_profile WHERE uuid_event = p.uuid_pv)
page_views_train_df1 = page_views_df.loc[page_views_df['uuid_pv'].isin(users_to_profile['uuid_event'])]
page_views_train_df1.shape

(43810, 6)

In [33]:
page_views_train_df1['id']=range(1,page_views_train_df1.shape[0]+1)
page_views_train_df1 = page_views_train_df1.set_index('id').reset_index()
page_views_train_df1.head()

Unnamed: 0,id,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,1,7af903047336f7,5410,63553585,1,US>MI>505,2
1,2,5fe4adf60dce62,7012,42055742,1,CA>ON,1
2,3,eeb8c46c2eb234,7012,23428200,1,GB>H9,2
3,4,1df91eb130564c,8226,24067562,3,GB>L2,2
4,5,a0d65bd8d74d7e,10653,68132570,3,US>MN>613,2


In [34]:
test_users_docs_timestamp_to_ignore.head(2)

Unnamed: 0,uuid_event,document_id_promo,timestamp_event
0,,326418,
1,,1578222,


In [35]:
# apply SELECT uuid_event FROM test_users_docs_timestamp_to_ignore  WHERE uuid_event = p.uuid_pv 
# AND document_id_promo = p.document_id_pv
page_views_joined_df1 = page_views_train_df1.merge(test_users_docs_timestamp_to_ignore, how='inner', left_on=['uuid_pv','document_id_pv'], right_on = ['uuid_event','document_id_promo'])
print(page_views_joined_df1.shape)

(724, 10)


In [33]:
page_views_joined_df1.head(2)

Unnamed: 0,id,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,uuid_event,document_id_promo,timestamp_event
0,41013,133f2268c25151,1444718,38578356,2,US>TX>623,1,133f2268c25151,1444718,38366163
1,26936,1ce293a13af0c1,1649400,53000625,1,US>VA>573,1,1ce293a13af0c1,1649400,52915025


In [36]:
page_views_joined_df1= page_views_joined_df1[page_views_joined_df1.timestamp_pv >= page_views_joined_df1.timestamp_event]
print(page_views_joined_df1.shape)
page_views_joined_df1.head()

(723, 10)


Unnamed: 0,id,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,uuid_event,document_id_promo,timestamp_event
0,2211,19dc1285ba1a57,1552983,16537403,2,US>CA>825,1,19dc1285ba1a57,1552983,16375941
1,41013,133f2268c25151,1444718,38578356,2,US>TX>623,1,133f2268c25151,1444718,38366163
2,2214,18cd9d691ea549,1552983,21150693,2,US>MO>609,1,18cd9d691ea549,1552983,21033228
3,30544,1262fe328e55e0,1362397,52046684,1,US>NY>501,1,1262fe328e55e0,1362397,51948398
4,26936,1ce293a13af0c1,1649400,53000625,1,US>VA>573,1,1ce293a13af0c1,1649400,52915025


In [38]:
page_views_train_df2 = page_views_train_df1.loc[~page_views_train_df1['id'].isin(page_views_joined_df1['id'])]
print(page_views_train_df2.shape)
page_views_train_df2.head()

(43087, 7)


Unnamed: 0,id,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,1,7af903047336f7,5410,63553585,1,US>MI>505,2
1,2,5fe4adf60dce62,7012,42055742,1,CA>ON,1
2,3,eeb8c46c2eb234,7012,23428200,1,GB>H9,2
3,4,1df91eb130564c,8226,24067562,3,GB>L2,2
4,5,a0d65bd8d74d7e,10653,68132570,3,US>MN>613,2


In [39]:
#convert to pandas
page_views_train_df2 = page_views_train_df2[page_views_df.columns]
page_views_train_df2 = page_views_train_df2.to_pandas()
page_views_train_df2.head(2)

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,7af903047336f7,5410,63553585,1,US>MI>505,2
1,5fe4adf60dce62,7012,42055742,1,CA>ON,1


In [40]:
page_views_train_df = page_views_train_df2.merge(documents_df,left_on='document_id_pv', right_on='document_id_doc', how='left') 
page_views_train_df.shape

(43087, 19)

In [41]:
page_views_train_df.columns

Index(['uuid_pv', 'document_id_pv', 'timestamp_pv', 'platform_pv',
       'geo_location_pv', 'traffic_source_pv', 'document_id_doc', 'source_id',
       'publish_time', 'publisher_id', 'document_id_cat', 'category_id_list',
       'confidence_level_cat_list', 'document_id_top', 'topic_id_list',
       'confidence_level_top_list', 'document_id_ent', 'entity_id_list',
       'confidence_level_ent_list'],
      dtype='object')

In [39]:
page_views_train_df.head(2)

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,document_id_doc,source_id,publish_time,publisher_id,document_id_cat,category_id_list,confidence_level_cat_list,document_id_top,topic_id_list,confidence_level_top_list,document_id_ent,entity_id_list,confidence_level_ent_list
0,7af903047336f7,5410,63553585,1,US>MI>505,2,5410,1627,2010-05-09 10:00:00,714,5410.0,"[1909, 1903]","[0.9199999570846558, 0.07000000029802322]",5410.0,"[184, 129, 176, 194, 140, 153, 270, 41, 20, 26...","[0.08178263902664185, 0.07670240849256516, 0.0...",,,
1,5fe4adf60dce62,7012,42055742,1,CA>ON,1,7012,438,2010-08-17 17:00:00,46,7012.0,"[1505, 1405]","[0.9199999570846558, 0.07000000029802322]",7012.0,"[143, 131, 138, 20, 75, 1]","[0.09146464616060257, 0.08204874396324158, 0.0...",7012.0,[e1658ab792d23efb4f73ef23a879849b],[0.2498592883348465]


In [None]:
# import pickle
# with open(OUTPUT_BUCKET_FOLDER+'entities_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
#     entities_docs_counts = pickle.load(input_file)
# entities_docs_counts

In [43]:
print('Processing document frequencies...')

#both dfs here are cudf dataframes
documents_total = documents_meta_df.shape[0]

categories_docs_counts = documents_categories_df.groupby('category_id').size().rename('count').reset_index()
categories_docs_counts = dict(categories_docs_counts.values.tolist())
len(categories_docs_counts)

df_filenames_suffix = ''
if evaluation:
    df_filenames_suffix = '_eval'

# with open(OUTPUT_BUCKET_FOLDER + 'categories_docs_counts' + df_filenames_suffix + '.pickle', 'wb') as output:
#     pickle.dump(categories_docs_counts, output)

topics_docs_counts = documents_topics_df.groupby('topic_id').size().rename('count').reset_index()
topics_docs_counts = dict(topics_docs_counts.values.tolist())
len(topics_docs_counts)

# with open(OUTPUT_BUCKET_FOLDER + 'topics_docs_counts' + df_filenames_suffix + '.pickle', 'wb') as output:
#     pickle.dump(topics_docs_counts, output)

Processing document frequencies...


300

# NOTE TO DAWID

In [46]:
entities_docs_counts = documents_entities_df.groupby('entity_id').size().rename('count').reset_index()

# #the line below gives error.
# entities_docs_counts = dict(entities_docs_counts.values.tolist())
# len(entities_docs_counts)

# with open(OUTPUT_BUCKET_FOLDER + 'entities_docs_counts' + df_filenames_suffix + '.pickle', 'wb') as output:
#     pickle.dump(entities_docs_counts, output)

In [48]:
print('Processing user profiles...')

null_to_empty_list = lambda x: x if isinstance(x, list) else []
null_to_empty_dict = lambda x: x if isinstance(x, dict) else {}
null_to_empty_string = lambda x: x if isinstance(x, str) else ''
null_to_minus_one = lambda x: -1 if np.isnan(x) else x
null_to_zero = lambda x: 0 if np.isnan(x) else x

Processing user profiles...


In [49]:
page_views_train_df.fillna({'timestamp_pv': -1}, inplace=True)

page_views_train_df.loc[:, 'category_id_list']=  page_views_train_df.loc[:, 'category_id_list'].apply(null_to_empty_list) 


page_views_train_df.loc[:, 'confidence_level_cat_list'] = page_views_train_df.loc[:, 
    'confidence_level_cat_list'].apply(null_to_empty_list)

page_views_train_df.loc[:, 'topic_id_list'] = page_views_train_df.loc[:, 'topic_id_list'].apply(null_to_empty_list)

page_views_train_df.loc[:, 'confidence_level_top_list'] = page_views_train_df.loc[:, 
    'confidence_level_top_list'].apply(null_to_empty_list)

page_views_train_df.loc[:, 'entity_id_list'] = page_views_train_df.loc[:, 'entity_id_list'].apply(null_to_empty_string)
page_views_train_df.loc[:, 'confidence_level_ent_list'] = page_views_train_df.loc[:, 
    'confidence_level_ent_list'].apply(null_to_empty_list)

page_views_by_user_df = page_views_train_df.groupby('uuid_pv', as_index=False)[
    'document_id_pv', 'timestamp_pv', 'category_id_list', 'confidence_level_cat_list', 'topic_id_list',
    'confidence_level_top_list', 'entity_id_list', 'confidence_level_ent_list'].agg(lambda x: list(x)).rename(
    columns={'document_id_pv': 'document_id_pv_list', 'timestamp_pv': 'timestamp_pv_list', 'category_id_list': 
    'category_id_lists', 'confidence_level_cat_list': 'cat_confidence_level_lists', 'topic_id_list': 
    'topic_id_lists', 'confidence_level_top_list': 'top_confidence_level_lists', 'entity_id_list': 
    'entity_id_lists', 'confidence_level_ent_list': 'ent_confidence_level_lists'})



In [188]:
page_views_train_df.head()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,document_id_doc,source_id,publish_time,publisher_id,document_id_cat,category_id_list,confidence_level_cat_list,document_id_top,topic_id_list,confidence_level_top_list,document_id_ent,entity_id_list,confidence_level_ent_list
0,8b44cec833b8dd,1858440,62205443,2,US>AZ>753,1,1858440,3471,2016-06-14 16:00:00,111,1858440.0,"[1807, 2004]","[0.8337446451187134, 0.06343710422515869]",1858440.0,"[85, 43, 121, 295]","[0.12629671394824982, 0.0830133929848671, 0.01...",,,[]
1,342737978fbe4f,1858440,64265466,1,US>MD>511,1,1858440,3471,2016-06-14 16:00:00,111,1858440.0,"[1807, 2004]","[0.8337446451187134, 0.06343710422515869]",1858440.0,"[85, 43, 121, 295]","[0.12629671394824982, 0.0830133929848671, 0.01...",,,[]
2,8007cf507f879f,1858440,65881386,1,US>AL>630,1,1858440,3471,2016-06-14 16:00:00,111,1858440.0,"[1807, 2004]","[0.8337446451187134, 0.06343710422515869]",1858440.0,"[85, 43, 121, 295]","[0.12629671394824982, 0.0830133929848671, 0.01...",,,[]
3,c323a4acee49ea,1859539,81808273,2,US,1,1859539,6698,2016-06-14 16:00:00,784,1859539.0,"[1607, 1610]","[0.4920482039451599, 0.037438444793224335]",1859539.0,[43],[0.010142072103917599],,,[]
4,868e63ea5adfc9,1859942,58142438,2,US>IN>527,2,1859942,12706,2016-06-14 16:00:00,1044,1859942.0,"[1306, 1210]","[0.6837242245674133, 0.05202249437570572]",1859942.0,"[196, 10]","[0.17931514978408813, 0.00934587512165308]",,,[]


In [194]:
page_views_by_user_df.columns

Index(['uuid_pv', 'document_id_pv_list', 'timestamp_pv_list',
       'category_id_lists', 'cat_confidence_level_lists', 'topic_id_lists',
       'top_confidence_level_lists', 'entity_id_lists',
       'ent_confidence_level_lists'],
      dtype='object')

In [56]:
def get_user_aspects(docs_aspects, aspect_docs_counts):
    docs_aspects_merged_lists = defaultdict(list)
  
    for doc_aspects in docs_aspects:
        for key in doc_aspects.keys():
            docs_aspects_merged_lists[key].append(doc_aspects[key])
      
    docs_aspects_stats = {}
    
    for key in docs_aspects_merged_lists.keys():
        print("key:", key)
        aspect_list = docs_aspects_merged_lists[key]
        print("aspect_list:", aspect_list)
        tf = len(aspect_list)
        idf = math.log(documents_total / float(aspect_docs_counts[key]))
    
        confid_mean = sum(aspect_list) / float(len(aspect_list))
        print([tf*idf, confid_mean])
        docs_aspects_stats[key] = [tf*idf, confid_mean]
      
    return docs_aspects_stats


def generate_user_profile(docs_aspects_list, docs_aspects_confidence_list, aspect_docs_counts = categories_docs_counts):
    docs_aspects = []
    for doc_aspects_list, doc_aspects_confidence_list in zip(docs_aspects_list, docs_aspects_confidence_list):
        doc_aspects = dict(zip(doc_aspects_list, doc_aspects_confidence_list))
        docs_aspects.append(doc_aspects)
    
    user_aspects = get_user_aspects(docs_aspects, aspect_docs_counts)
    print("user_aspects", user_aspects)
    return user_aspects

In [32]:
get_list_len_udf = lambda docs_list: len(docs_list)

generate_categories_user_profile_map_udf = lambda row: \
    generate_user_profile(row['category_id_lists'], row['cat_confidence_level_lists'], categories_docs_counts)


generate_topics_user_profile_map_udf = lambda row: \
    generate_user_profile(row['topic_id_lists'], row['top_confidence_level_lists'], topics_docs_counts)


generate_entities_user_profile_map_udf = lambda row: \
    generate_user_profile(row['entity_id_lists'], row['ent_confidence_level_lists'], entities_docs_counts)

In [52]:
page_views_by_user_df.head()

Unnamed: 0,uuid_pv,document_id_pv_list,timestamp_pv_list,category_id_lists,cat_confidence_level_lists,topic_id_lists,top_confidence_level_lists,entity_id_lists,ent_confidence_level_lists
0,100614d664a6c3,[1832823],[49445656],"[[1403, 1407]]","[[0.7033445835113525, 0.053515348583459854]]","[[16, 10, 214, 241, 145, 49]]","[[0.055320367217063904, 0.02467547170817852, 0...",[],[[]]
1,100ce200bea586,[1584920],[37854038],"[[1403, 1702]]","[[0.43590372800827026, 0.033166587352752686]]","[[16, 156, 181, 268, 120]]","[[0.035918738692998886, 0.011966102756559849, ...",[],[[]]
2,100d0ddfadd845,[1811567],[38184529],[[]],[[]],[[62]],[[0.015740877017378807]],[],[[]]
3,100fd99bcbcb85,[874215],[64232539],"[[1708, 1702]]","[[0.7546594738960266, 0.05741973966360092]]","[[36, 13, 26, 214, 241]]","[[0.0373980849981308, 0.028806278482079506, 0....",[],[[]]
4,100fdd39838d25,[343574],[62641621],"[[1513, 1808]]","[[0.9199999570846558, 0.07000000029802322]]","[[173, 207, 25, 181]]","[[0.13058622181415558, 0.0698360949754715, 0.0...",[],[[]]


In [53]:
page_views_by_user_df=page_views_by_user_df.iloc[:100, :]

In [54]:
page_views_by_user_df.head()

Unnamed: 0,uuid_pv,document_id_pv_list,timestamp_pv_list,category_id_lists,cat_confidence_level_lists,topic_id_lists,top_confidence_level_lists,entity_id_lists,ent_confidence_level_lists
0,100614d664a6c3,[1832823],[49445656],"[[1403, 1407]]","[[0.7033445835113525, 0.053515348583459854]]","[[16, 10, 214, 241, 145, 49]]","[[0.055320367217063904, 0.02467547170817852, 0...",[],[[]]
1,100ce200bea586,[1584920],[37854038],"[[1403, 1702]]","[[0.43590372800827026, 0.033166587352752686]]","[[16, 156, 181, 268, 120]]","[[0.035918738692998886, 0.011966102756559849, ...",[],[[]]
2,100d0ddfadd845,[1811567],[38184529],[[]],[[]],[[62]],[[0.015740877017378807]],[],[[]]
3,100fd99bcbcb85,[874215],[64232539],"[[1708, 1702]]","[[0.7546594738960266, 0.05741973966360092]]","[[36, 13, 26, 214, 241]]","[[0.0373980849981308, 0.028806278482079506, 0....",[],[[]]
4,100fdd39838d25,[343574],[62641621],"[[1513, 1808]]","[[0.9199999570846558, 0.07000000029802322]]","[[173, 207, 25, 181]]","[[0.13058622181415558, 0.0698360949754715, 0.0...",[],[[]]


In [55]:
page_views_by_user_df.shape

(100, 9)

In [57]:
#user_profiles_pdf['views'] = page_views_by_user_pdf.loc[:, 'document_id_pv_list'].apply(list_len_udf)
from collections import defaultdict

users_profile_df = pd.DataFrame()
users_profile_df.loc[:, 'uuid'] = page_views_by_user_df.loc[:, 'uuid_pv']
users_profile_df.loc[:, 'doc_ids'] = page_views_by_user_df.loc[:, 'document_id_pv_list']
users_profile_df.loc[:, 'views'] = page_views_by_user_df.loc[:, 'document_id_pv_list'].apply(get_list_len_udf)
users_profile_df.loc[:, 'categories'] = page_views_by_user_df.apply(
    generate_categories_user_profile_map_udf, axis=1)

key: 1403
aspect_list: [0.7033445835113525]
[1.6559796338810973, 0.7033445835113525]
key: 1407
aspect_list: [0.053515348583459854]
[3.1787294419957663, 0.053515348583459854]
user_aspects {1403: [1.6559796338810973, 0.7033445835113525], 1407: [3.1787294419957663, 0.053515348583459854]}
key: 1403
aspect_list: [0.43590372800827026]
[1.6559796338810973, 0.43590372800827026]
key: 1702
aspect_list: [0.033166587352752686]
[1.9928162043872233, 0.033166587352752686]
user_aspects {1403: [1.6559796338810973, 0.43590372800827026], 1702: [1.9928162043872233, 0.033166587352752686]}
user_aspects {}
key: 1708
aspect_list: [0.7546594738960266]
[3.043104604130009, 0.7546594738960266]
key: 1702
aspect_list: [0.05741973966360092]
[1.9928162043872233, 0.05741973966360092]
user_aspects {1708: [3.043104604130009, 0.7546594738960266], 1702: [1.9928162043872233, 0.05741973966360092]}
key: 1513
aspect_list: [0.9199999570846558]
[2.38416956809324, 0.9199999570846558]
key: 1808
aspect_list: [0.07000000029802322]


In [None]:
users_profile_df.loc[:, 'topics'] = page_views_by_user_df.apply(
    generate_topics_user_profile_map_udf, axis=1)
users_profile_df.loc[:, 'entities'] = page_views_by_user_df.apply(
    generate_entities_user_profile_map_udf, axis=1)

users_profile_df.shape

In [334]:
users_profile_df.head()

Unnamed: 0,uuid,doc_ids,views,categories,topics,entities
0,100614d664a6c3,[1832823],1,"{1403: [1.6559796338810973, 0.7033445835113525...","{16: [4.737792074229773, 0.055320367217063904]...",{}
1,100ce200bea586,[1584920],1,"{1403: [1.6559796338810973, 0.4359037280082702...","{16: [4.737792074229773, 0.035918738692998886]...",{}
2,100d0ddfadd845,[1811567],1,{},"{62: [6.114908231578814, 0.015740877017378807]}",{}
3,100fd99bcbcb85,[874215],1,"{1708: [3.043104604130009, 0.7546594738960266]...","{36: [6.007345370891915, 0.0373980849981308], ...",{}
4,100fdd39838d25,[343574],1,"{1513: [2.38416956809324, 0.9199999570846558],...","{173: [7.196600149509651, 0.13058622181415558]...",{}


In [336]:
# ## Write user profiles to pickle

if evaluation:
    table_name = 'user_profiles_eval.pickle'
else:
    table_name = 'user_profiles.pickle'

users_profile_df.to_pickle(OUTPUT_BUCKET_FOLDER+table_name)

In [53]:
user_profiles=pd.read_pickle('./preprocessed/user_profiles.pickle')

In [55]:
user_profiles

Unnamed: 0,uuid,doc_ids,views,categories,topics,entities
0,100614d664a6c3,[1832823],1,"{1403: [1.6559796338810973, 0.7033445835113525...","{16: [4.737792074229773, 0.055320367217063904]...",{}
1,100ce200bea586,[1584920],1,"{1403: [1.6559796338810973, 0.4359037280082702...","{16: [4.737792074229773, 0.035918738692998886]...",{}
2,100d0ddfadd845,[1811567],1,{},"{62: [6.114908231578814, 0.015740877017378807]}",{}
3,100fd99bcbcb85,[874215],1,"{1708: [3.043104604130009, 0.7546594738960266]...","{36: [6.007345370891915, 0.0373980849981308], ...",{}
4,100fdd39838d25,[343574],1,"{1513: [2.38416956809324, 0.9199999570846558],...","{173: [7.196600149509651, 0.13058622181415558]...",{}
...,...,...,...,...,...,...
35210,fff546b574c326,[1560922],1,"{1513: [2.38416956809324, 0.8295187950134277],...","{173: [7.196600149509651, 0.16267725825309753]...",{}
35211,fff85847e13ad7,"[1423145, 1588612, 1489758]",3,"{1610: [8.2191493241054, 0.25325982645154], 14...","{16: [9.475584148459546, 0.047547511756420135]...",{}
35212,fffaec7faa854b,[905581],1,"{1805: [4.308954201751231, 0.43258336186408997...","{160: [5.427288227686959, 0.13740678131580353]}",{}
35213,fffbef22e3c3c2,"[364356, 1472119]",2,"{1514: [3.9087801551570514, 0.6610000133514404...","{62: [6.114908231578814, 0.053441524505615234]...",{}


In [None]:
# we can merge doc_category_idf, doc_topics_idf and doc_entities_idf to the users_profile.
users_profile1 = users_profile.merge(doc_category_idf, how='inner', left_on='doc_ids', right_on='document_id_cat').drop(columns=['document_id_cat', 'category_id', 'count'])
users_profile2 = users_profile1.merge(doc_topics_idf, how='inner', left_on='doc_ids', right_on='document_id_top').drop(columns=['document_id_top', 'topic_id', 'count'])
users_profile = users_profile2.merge(doc_entities_idf, how='inner', left_on='doc_ids', right_on='document_id_ent').drop(columns=['document_id_ent', 'entity_id', 'count'])