<b>Author: Ronay Ak</b>

In [2]:
import cudf
import pandas as pd
import cupy 
import math
import time
from datetime import datetime
import numpy as np
import pickle

In [3]:
import os
# Restrict this process to GPUs 2 and 3, with devices ordered by PCI bus id.
# NOTE(review): cudf and cupy are imported in the cell above; CUDA environment
# variables normally have to be set before the CUDA runtime is initialised —
# confirm that this selection actually takes effect when run in this order.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

In [4]:
OUTPUT_BUCKET_FOLDER = "./preprocessed/"
DATA_BUCKET_FOLDER = "./dataset/"
#SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"

Some of the CSV files used here are subsets of the originals, because the full dataset is very large.

In [5]:
evaluation = False

Start loading csv files..

In [33]:
# Load the document metadata; the CSV header row is replaced by explicit column names.
meta_column_names = ['document_id_doc', 'source_id', 'publisher_id', 'publish_time']
meta_column_dtypes = ['int32', 'int32', 'int32', 'str']
documents_meta_df = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_meta.csv',
    names=meta_column_names,
    dtype=meta_column_dtypes,
    header=0,
)
print(documents_meta_df.shape)
documents_meta_df.dtypes

(2999334, 4)


document_id_doc     int32
source_id           int32
publisher_id        int32
publish_time       object
dtype: object

In [41]:
# Discard documents that have no source_id.
print('Drop rows with empty "source_id"...')
documents_meta_df = documents_meta_df.dropna(subset=['source_id'])
print(documents_meta_df.shape)

# Distinct (source_id, publisher_id) pairs found in the remaining documents.
source_publisher_columns = ["source_id", "publisher_id"]
source_publishers_df = documents_meta_df[source_publisher_columns].drop_duplicates()
source_publishers_df.shape

Drop rows with empty "source_id"...
(2996816, 4)


(14394, 2)

In [42]:
# the index numbers do not match with spark.
source_publishers_df.head()

Unnamed: 0,source_id,publisher_id
0,1,603
12867,2,85
12874,3,740
29583,4,213
29597,5,184


In [47]:
source_publishers_df = source_publishers_df.reset_index(drop=True)

In [54]:
documents_meta_pdf=documents_meta_df.to_pandas()

In [61]:
documents_meta_df.groupby(['source_id', 'publisher_id'])['document_id_doc'].sum()

source_id  publisher_id
1          603              6120790718
2          85                  9255235
3          740             14600478744
4          213                21133506
5          184               280689476
                              ...     
14400      716               893539902
14401      716                61574768
14402      716               300266786
14403      699              1488433179
14404      716                 7544114
Name: document_id_doc, Length: 9336, dtype: int64

In [62]:
#documents_meta_df_grouped.equals(source_publishers_df)

In [20]:
# Sources whose publisher_id is null: these will receive newly generated publisher ids.
publisher_is_missing = source_publishers_df.loc[:, 'publisher_id'].isnull()
source_ids_without_publisher_df = source_publishers_df.loc[publisher_is_missing]
# Keep only the source_id column as the starting point for the new assignments.
new_publishers_df = source_ids_without_publisher_df.loc[:, ['source_id']]
new_publishers_df.shape

(5058, 1)

In [21]:
new_publishers_df.head()

Unnamed: 0,source_id
29846,7
29853,8
30995,16
33712,20
36964,26


In [14]:
source_ids_without_publisher_df.head()

Unnamed: 0,source_id,publisher_id
29846,7,
29853,8,
30995,16,
33712,20,
36964,26,


In [24]:
print('Maximum value of publisher_id used so far...')
max_pub =source_publishers_df.dropna(subset='publisher_id')['publisher_id'].max()
max_pub

Maximum value of publisher_id used so far...


1263

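NOTE: In the Spark version, the "source_id" column of new_publishers_df is not sorted; with cudf it is sorted. Because new publisher ids are assigned in row order, the generated publisher_id values may therefore differ between the two implementations.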

In [25]:
# number of sources that currently have no publisher_id
new_publisher_cnt = source_ids_without_publisher_df.shape[0]
# give each of them a fresh consecutive id, starting right after max_pub
# (the largest publisher_id already in use)
new_publishers_df['publisher_id'] = np.arange(max_pub + 1, max_pub + 1 + new_publisher_cnt)
new_publishers_df.shape

(5058, 2)

In [26]:
new_publishers_df.head()

Unnamed: 0,source_id,publisher_id
29846,7,1264
29853,8,1265
30995,16,1266
33712,20,1267
36964,26,1268


In [27]:
new_publishers_df.source_id.max(), new_publishers_df.source_id.min()

(14366, 7)

In [28]:
new_publishers_df.dtypes

source_id       int32
publisher_id    int64
dtype: object

In [31]:
# old and new publishers merged
fixed_source_publishers_df = source_publishers_df.dropna(subset='publisher_id')
fixed_source_publishers_df = cudf.concat([fixed_source_publishers_df, new_publishers_df])

In [32]:
fixed_source_publishers_df.shape, documents_meta_df.shape

((14394, 2), (2996816, 4))

In [33]:
fixed_source_publishers_df.tail()

Unnamed: 0,source_id,publisher_id
2990886,14354,6317
2990902,14355,6318
2990906,14356,6319
2991236,14365,6320
2991239,14366,6321


In [34]:
print('Update documents_meta with new publishers...')

documents_meta_df = documents_meta_df.drop(columns='publisher_id').merge(fixed_source_publishers_df, on='source_id')
documents_meta_df.head()

Update documents_meta with new publishers...


Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id
0,178806,11,2013-08-21 10:00:00,3
1,88615,11,2012-07-27 13:00:00,3
2,140917,11,2012-12-19 09:00:00,3
3,157526,11,2013-05-06 16:00:00,3
4,174117,11,2013-08-01 15:00:00,3


In [35]:
#publisher id is not same in spark dataframe.
documents_meta_df[documents_meta_df['source_id'] == 26]

Unnamed: 0,document_id_doc,source_id,publish_time,publisher_id
79972,2287601,26,2015-10-16 04:00:00,1268


In [36]:
documents_categories_df =  cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_categories.csv',
    names=['document_id_cat', 'category_id', 'confidence_level_cat'],
    dtype=['int64', 'int64','float32'], header=0)

We need to convert the cudf DataFrame to a pandas DataFrame to be able to perform these list aggregations. The conversion takes a long time.

In [51]:
# Group category ids and confidence levels into one list per document.
# The list aggregation runs in pandas, so the frame is moved off the GPU first;
# the guard keeps this cell re-runnable once the frame is already a pandas one.
if hasattr(documents_categories_df, 'to_pandas'):
    documents_categories_df = documents_categories_df.to_pandas()

# The value columns are selected with a list: indexing a groupby with a bare
# tuple of column names is deprecated in pandas and emits a FutureWarning.
documents_categories_grouped_df = (
    documents_categories_df
    .groupby('document_id_cat', as_index=False)[['category_id', 'confidence_level_cat']]
    .agg(lambda values: list(values))
    .rename(columns={'category_id': 'category_id_list',
                     'confidence_level_cat': 'confidence_level_cat_list'})
)

  after removing the cwd from sys.path.


In [29]:
documents_categories_grouped_df.head()

Unnamed: 0,document_id_cat,category_id_list,confidence_level_cat_list
0,1,"[1706, 1603]","[0.47122037410736084, 0.03585372492671013]"
1,2,"[1705, 1708]","[0.6000000238418579, 0.4000000059604645]"
2,3,"[2003, 1403]","[0.5228370428085327, 0.039781082421541214]"
3,4,"[2003, 2006]","[0.9199999570846558, 0.07000000029802322]"
4,5,"[2006, 2003]","[0.9199999570846558, 0.07000000029802322]"


In [284]:
#"Reading documents_topics...
documents_topics_df = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_topics.csv',
    names=['document_id_top', 'topic_id', 'confidence_level_top'],
    dtype={'document_id_top': 'int64','topic_id': 'int64','confidence_level_top': 'float32'},
    skiprows=1)

In [285]:
#"Grouping topic_ids and confidence_levels..."
# The list aggregation runs in pandas, so the frame is moved off the GPU first;
# the guard keeps this cell re-runnable once the frame is already a pandas one.
if hasattr(documents_topics_df, 'to_pandas'):
    documents_topics_df = documents_topics_df.to_pandas()

# One row per document, with its topic ids and confidence levels as lists.
# The value columns are selected with a list: indexing a groupby with a bare
# tuple of column names is deprecated in pandas and emits a FutureWarning.
documents_topics_grouped_df = (
    documents_topics_df
    .groupby('document_id_top', as_index=False)[['topic_id', 'confidence_level_top']]
    .agg(lambda values: list(values))
    .rename(columns={'topic_id': 'topic_id_list',
                     'confidence_level_top': 'confidence_level_top_list'})
)

  after removing the cwd from sys.path.


In [31]:
documents_topics_grouped_df.head()

Unnamed: 0,document_id_top,topic_id_list,confidence_level_top_list
0,1,"[252, 35, 150, 276, 137]","[0.020726216956973076, 0.012773293070495129, 0..."
1,2,[216],[0.013028857298195362]
2,3,"[276, 107, 176, 160, 260, 283, 202, 156, 104, ...","[0.029913833364844322, 0.027589114382863045, 0..."
3,4,"[102, 75, 16, 244, 97, 239]","[0.08417084068059921, 0.08329997211694717, 0.0..."
4,5,"[75, 102, 244, 143, 130, 202, 16, 44, 64]","[0.10655508935451508, 0.08646664023399353, 0.0..."


In [288]:
#Reading documents_entities..."
documents_entities_df = cudf.read_csv(
    DATA_BUCKET_FOLDER + 'documents_entities.csv',
    names=['document_id_ent', 'entity_id', 'confidence_level_ent'],
    dtype={'document_id_ent': 'int64','entity_id': 'str','confidence_level_ent': 'float32'},
    skiprows=1)

In [289]:
# Group entity ids and confidence levels into one list per document.
# The list aggregation runs in pandas, so the frame is moved off the GPU first;
# the guard keeps this cell re-runnable once the frame is already a pandas one.
if hasattr(documents_entities_df, 'to_pandas'):
    documents_entities_df = documents_entities_df.to_pandas()

# The value columns are selected with a list: indexing a groupby with a bare
# tuple of column names is deprecated in pandas and emits a FutureWarning.
documents_entities_grouped_df = (
    documents_entities_df
    .groupby('document_id_ent', as_index=False)[['entity_id', 'confidence_level_ent']]
    .agg(lambda values: list(values))
    .rename(columns={'entity_id': 'entity_id_list',
                     'confidence_level_ent': 'confidence_level_ent_list'})
)

  after removing the cwd from sys.path.


In [33]:
documents_entities_grouped_df.head()

Unnamed: 0,document_id_ent,entity_id_list,confidence_level_ent_list
0,2,"[e88e4cde6c6482ff01d4ad424904ffd1, a10e6138ae7...","[0.6539117097854614, 0.5391356348991394]"
1,3,[7f77b6d24f62a1ad1b469fdf4bc0a7f4],[0.3209395706653595]
2,12,[f5a390c1e5d342825b82bcc637a72e7a],[0.7968870401382446]
3,23,[d89b0bbdcbdb66475028b32eb8098dd4],[0.25853851437568665]
4,43,"[015f6154b8850c35a261314b4eb88846, 94101adfc2f...","[0.8686161041259766, 0.36814022064208984, 0.26..."


In [34]:
documents_categories_grouped_df.head()

Unnamed: 0,document_id_cat,category_id_list,confidence_level_cat_list
0,1,"[1706, 1603]","[0.47122037410736084, 0.03585372492671013]"
1,2,"[1705, 1708]","[0.6000000238418579, 0.4000000059604645]"
2,3,"[2003, 1403]","[0.5228370428085327, 0.039781082421541214]"
3,4,"[2003, 2006]","[0.9199999570846558, 0.07000000029802322]"
4,5,"[2006, 2003]","[0.9199999570846558, 0.07000000029802322]"


In [27]:
documents_meta_df=documents_meta_df.to_pandas()

In [35]:
# Reload the grouped category and topic frames from pickle files.
# NOTE(review): these paths are relative to the working directory, whereas another
# cell reads files with the same names from './preprocessed/' — confirm which
# location is intended. No visible cell writes these files (the only to_pickle
# call below is commented out), so they presumably come from an earlier session.
documents_categories_grouped_df = pd.read_pickle('documents_categories_grouped_df.pkl')
documents_topics_grouped_df = pd.read_pickle('documents_topics_grouped_df.pkl')
# documents_entities_grouped_df.to_pickle('documents_entities_grouped_df.pkl')

In [30]:
documents_categories_grouped_df.dtypes

document_id_cat               int64
category_id_list             object
confidence_level_cat_list    object
dtype: object

In [32]:
documents_categories_grouped_df.head()

Unnamed: 0,document_id_cat,category_id_list,confidence_level_cat_list
0,1,"[1706, 1603]","[0.47122037410736084, 0.03585372492671013]"
1,2,"[1705, 1708]","[0.6000000238418579, 0.4000000059604645]"
2,3,"[2003, 1403]","[0.5228370428085327, 0.039781082421541214]"
3,4,"[2003, 2006]","[0.9199999570846558, 0.07000000029802322]"
4,5,"[2006, 2003]","[0.9199999570846558, 0.07000000029802322]"


In [36]:
documents_topics_grouped_df.head()

Unnamed: 0,document_id_top,topic_id_list,confidence_level_top_list
0,1,"[252, 35, 150, 276, 137]","[0.020726216956973076, 0.012773293070495129, 0..."
1,2,[216],[0.013028857298195362]
2,3,"[276, 107, 176, 160, 260, 283, 202, 156, 104, ...","[0.029913833364844322, 0.027589114382863045, 0..."
3,4,"[102, 75, 16, 244, 97, 239]","[0.08417084068059921, 0.08329997211694717, 0.0..."
4,5,"[75, 102, 244, 143, 130, 202, 16, 44, 64]","[0.10655508935451508, 0.08646664023399353, 0.0..."


In [37]:
documents_topics_grouped_df.dtypes

document_id_top               int64
topic_id_list                object
confidence_level_top_list    object
dtype: object

In [150]:
documents_categories_grouped_df = pd.read_pickle('./preprocessed/documents_categories_grouped_df.pkl')
documents_topics_grouped_df = pd.read_pickle('./preprocessed/documents_topics_grouped_df.pkl')
documents_entities_grouped_df = pd.read_pickle('./preprocessed/documents_entities_grouped_df.pkl')

In [153]:
documents_meta_pdf = documents_meta_df.to_pandas()

In [154]:
# ### Left join: each document has documents_meta, but some have no categories, topic or entity

# Step 1: attach the per-document category lists.
documents_df1 = pd.merge(
    documents_meta_pdf, documents_categories_grouped_df,
    how='left', left_on='document_id_doc', right_on='document_id_cat')

# Step 2: attach the per-document topic lists.
documents_df2 = pd.merge(
    documents_df1, documents_topics_grouped_df,
    how='left', left_on='document_id_doc', right_on='document_id_top')

# Step 3: attach the per-document entity lists.
documents_df = pd.merge(
    documents_df2, documents_entities_grouped_df,
    how='left', left_on='document_id_doc', right_on='document_id_ent')

In [155]:
documents_df.to_pickle('./preprocessed/documents_df.pkl')

In [37]:
# Decide which users get a profile and which (user, promoted document) pairs
# must later be excluded from their page-view history.
evaluation = False
if evaluation:
    # TODO: no read need if whole script run 
    # The validation set is read from the preprocessing output folder; this line
    # previously used `args.output_directory`, but no `args` object exists in
    # this notebook, so the branch could not run.
    validation_set_df = pd.read_parquet(OUTPUT_BUCKET_FOLDER + "validation_set.parquet")

    users_to_profile = validation_set_df.loc[:, ['uuid_event']].drop_duplicates()

    validation_user_docs_to_ignore = (
        validation_set_df.loc[:, ['uuid_event', 'document_id_promo']]
        .drop_duplicates()
        .rename(columns={'uuid_event': 'uuid_pv', 'document_id_promo': 'document_id_pv'})
    )
    # NOTE(review): this branch does not define `test_users_docs_timestamp_to_ignore`,
    # which the page-view filtering cells further down read — those cells need
    # adapting before evaluation mode can be used.
else:
    events_df = cudf.read_csv(
        DATA_BUCKET_FOLDER + 'events.csv',
        names=['display_id', 'uuid_event', 'document_id_event', 'timestamp_event',
               'platform_event', 'geo_location_event'],
        dtype=['int32', 'str', 'int32', 'int32', 'int32', 'str'],
        # header=0 already consumes the CSV header line (as in the other reads of
        # this notebook); the extra skiprows=1 that used to be passed here could
        # additionally discard the first data row, so it is no longer given.
        header=0,
        na_values=['\\N', ''],
        keep_default_na=False,
    )

    # Country code = first two characters of the geo location string.
    events_df['event_country'] = events_df['geo_location_event'].str.slice(0, 2)
    # timestamp_event is divided down from milliseconds to whole days.
    events_df['day_event'] = (events_df['timestamp_event'] / 1000 / 60 / 60 / 24).astype(int)
    # Drop rows with empty "geo_location"
    events_df = events_df.dropna(subset="geo_location_event")

    # Drop rows with empty "platform"
    events_df = events_df.dropna(subset="platform_event")

    promoted_content_df = cudf.read_csv(
        DATA_BUCKET_FOLDER + 'promoted_content.csv',
        names=['ad_id', 'document_id_promo', 'campaign_id', 'advertiser_id'],
        dtype=['int32', 'int32', 'int32', 'int32'],
        header=0,
    )

    clicks_test_df = cudf.read_csv(
        DATA_BUCKET_FOLDER + 'clicks_test.csv',
        names=['display_id', 'ad_id'],
        dtype=['int32', 'int32'],
        header=0,
    )

    # Test set = test clicks enriched with the promoted content and the display event.
    test_set_df1 = clicks_test_df.merge(promoted_content_df, on='ad_id', how='left')
    test_set_df = test_set_df1.merge(events_df, on='display_id', how='left')
    users_to_profile = (np.unique(test_set_df['uuid_event'])).to_frame()
    test_users_docs_timestamp_to_ignore = (
        test_set_df[['uuid_event', 'document_id_promo', 'timestamp_event']]
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [38]:
page_views_df = cudf.read_csv(DATA_BUCKET_FOLDER + "page_views.csv",
    names=['uuid_pv', 'document_id_pv', 'timestamp_pv', 'platform_pv', 'geo_location_pv', 'traffic_source_pv'],
    dtype={'uuid_pv': 'str', 'document_id_pv': 'int64', 'timestamp_pv': 'int64',
           'platform_pv': 'int64', 'geo_location_pv': 'str', 'traffic_source_pv': 'int64'},
    skiprows=1,
    keep_default_na=False)

In [95]:
page_views_df.head()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,1fd5f051fba643,120,31905835,1,RS,2
1,8557aa9004be3b,120,32053104,1,VN>44,2
2,c351b277a358f0,120,54013023,1,KR>12,1
3,8205775c5387f9,120,44196592,1,IN>16,2
4,9cb0ccd8458371,120,65817371,1,US>CA>807,2


In [65]:
# first apply (SELECT uuid_event FROM users_to_profile WHERE uuid_event = p.uuid_pv
page_views_train_df1 = page_views_df.loc[page_views_df['uuid_pv'].isin(users_to_profile['uuid_event'])]

In [66]:
page_views_train_df1.head()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
273,7af903047336f7,5410,63553585,1,US>MI>505,2
290,5fe4adf60dce62,7012,42055742,1,CA>ON,1
440,eeb8c46c2eb234,7012,23428200,1,GB>H9,2
1241,1df91eb130564c,8226,24067562,3,GB>L2,2
1523,a0d65bd8d74d7e,10653,68132570,3,US>MN>613,2


In [87]:
test_users_docs_timestamp_to_ignore.head(2)

Unnamed: 0,uuid_event,document_id_promo,timestamp_event
0,,326418,
1,,1578222,


In [70]:
# apply SELECT uuid_event FROM test_users_docs_timestamp_to_ignore  WHERE uuid_event = p.uuid_pv 
# AND document_id_promo = p.document_id_pv
page_views_joined_df1 = page_views_train_df1.merge(test_users_docs_timestamp_to_ignore, how='left', left_on=['uuid_pv','document_id_pv'], right_on = ['uuid_event','document_id_promo'])
print(page_views_joined_df1.shape)

(43810, 9)


In [71]:
page_views_joined_df1.head()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,uuid_event,document_id_promo,timestamp_event
0,8b44cec833b8dd,1858440,62205443,2,US>AZ>753,1,,,
1,342737978fbe4f,1858440,64265466,1,US>MD>511,1,,,
2,8007cf507f879f,1858440,65881386,1,US>AL>630,1,,,
3,c323a4acee49ea,1859539,81808273,2,US,1,,,
4,868e63ea5adfc9,1859942,58142438,2,US>IN>527,2,,,


In [128]:
page_views_joined_df1['time_condition']= page_views_joined_df1.timestamp_pv >= page_views_joined_df1.timestamp_event

In [129]:
page_views_joined_df1.head()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,uuid_event,document_id_promo,timestamp_event,time_condition
0,8b44cec833b8dd,1858440,62205443,2,US>AZ>753,1,,,,False
1,342737978fbe4f,1858440,64265466,1,US>MD>511,1,,,,False
2,8007cf507f879f,1858440,65881386,1,US>AL>630,1,,,,False
3,c323a4acee49ea,1859539,81808273,2,US,1,,,,False
4,868e63ea5adfc9,1859942,58142438,2,US>IN>527,2,,,,False


In [148]:
page_views_joined_df1.shape

(43810, 10)

In [179]:
# Emulate the SQL `NOT EXISTS` condition on the left-joined frame: keep a page view
# when it found no matching ignore row (the joined columns are null) or when the
# matching row does not satisfy the time condition.
# NOTE(review): if one page view can match several ignore rows, the left join
# duplicates it and this filter keeps every copy whose condition is False — confirm
# that this is equivalent to the intended NOT EXISTS semantics.
page_views_joined_df2= page_views_joined_df1.loc[(page_views_joined_df1.loc[:, 'document_id_promo'].isnull()) | (page_views_joined_df1.loc[:, 'uuid_event'].isnull()) | 
                                                       (page_views_joined_df1.loc[:, 'time_condition']==False)]
print(page_views_joined_df2.shape)

(43087, 10)


In [180]:
page_views_joined_df2.head()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,uuid_event,document_id_promo,timestamp_event,time_condition
0,8b44cec833b8dd,1858440,62205443,2,US>AZ>753,1,,,,False
1,342737978fbe4f,1858440,64265466,1,US>MD>511,1,,,,False
2,8007cf507f879f,1858440,65881386,1,US>AL>630,1,,,,False
3,c323a4acee49ea,1859539,81808273,2,US,1,,,,False
4,868e63ea5adfc9,1859942,58142438,2,US>IN>527,2,,,,False


In [182]:
#convert to pandas
page_views_joined_df2 = page_views_joined_df2[page_views_df.columns]
page_views_joined_df2 = page_views_joined_df2.to_pandas()
page_views_joined_df2.head(2)

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv
0,8b44cec833b8dd,1858440,62205443,2,US>AZ>753,1
1,342737978fbe4f,1858440,64265466,1,US>MD>511,1


In [183]:
page_views_train_df = page_views_joined_df2.merge(documents_df,left_on='document_id_pv', right_on='document_id_doc', how='left') 

In [192]:
page_views_train_df.columns

Index(['uuid_pv', 'document_id_pv', 'timestamp_pv', 'platform_pv',
       'geo_location_pv', 'traffic_source_pv', 'document_id_doc', 'source_id',
       'publish_time', 'publisher_id', 'document_id_cat', 'category_id_list',
       'confidence_level_cat_list', 'document_id_top', 'topic_id_list',
       'confidence_level_top_list', 'document_id_ent', 'entity_id_list',
       'confidence_level_ent_list'],
      dtype='object')

In [319]:
print('Processing document frequencies...')

# Total number of documents; used as the corpus size when computing idf.
# NOTE(review): the documents_*_df frames are reassigned to pandas frames in the
# grouping cells, so they may no longer be cudf DataFrames at this point — confirm.
documents_total = documents_meta_df.shape[0]

df_filenames_suffix = ''
if evaluation:
    df_filenames_suffix = '_eval'


def _count_rows_per_value(aspect_df, aspect_column):
    """Return a dict mapping each distinct value of `aspect_column` to its row count in `aspect_df`."""
    counts_df = aspect_df.groupby(aspect_column).size().reset_index(name='count')
    return dict(counts_df.values.tolist())


def _dump_counts(counts, base_name):
    """Pickle `counts` to OUTPUT_BUCKET_FOLDER + base_name + df_filenames_suffix + '.pickle'."""
    with open(OUTPUT_BUCKET_FOLDER + base_name + df_filenames_suffix + '.pickle', 'wb') as output:
        pickle.dump(counts, output)


categories_docs_counts = _count_rows_per_value(documents_categories_df, 'category_id')
_dump_counts(categories_docs_counts, 'categories_docs_counts')

topics_docs_counts = _count_rows_per_value(documents_topics_df, 'topic_id')
_dump_counts(topics_docs_counts, 'topics_docs_counts')

entities_docs_counts = _count_rows_per_value(documents_entities_df, 'entity_id')
_dump_counts(entities_docs_counts, 'entities_docs_counts')

Processing document frequencies...


In [185]:
print('Processing user profiles...')


def null_to_empty_list(x):
    """Return `x` unchanged when it is a list, otherwise a new empty list."""
    if isinstance(x, list):
        return x
    return []


def null_to_empty_dict(x):
    """Return `x` unchanged when it is a dict, otherwise a new empty dict."""
    if isinstance(x, dict):
        return x
    return {}


def null_to_empty_string(x):
    """Return `x` unchanged when it is a str, otherwise the empty string."""
    if isinstance(x, str):
        return x
    return ''


def null_to_minus_one(x):
    """Return -1 when `x` is NaN, otherwise `x` itself."""
    if np.isnan(x):
        return -1
    return x


def null_to_zero(x):
    """Return 0 when `x` is NaN, otherwise `x` itself."""
    if np.isnan(x):
        return 0
    return x

Processing user profiles...


In [193]:
# Page views without a timestamp get the sentinel value -1.
page_views_train_df['timestamp_pv'] = page_views_train_df['timestamp_pv'].fillna(-1)

# Documents that have no categories/topics/entities come out of the left join as
# NaN; turn those into empty lists so every cell of these columns holds a list.
# entity_id_list is handled like the other list columns: passing it through
# null_to_empty_string replaced every real entity list with '' (a list is not a
# str), which left every user's entity profile empty.
doc_list_columns = [
    'category_id_list', 'confidence_level_cat_list',
    'topic_id_list', 'confidence_level_top_list',
    'entity_id_list', 'confidence_level_ent_list',
]
for doc_list_column in doc_list_columns:
    page_views_train_df[doc_list_column] = (
        page_views_train_df[doc_list_column].apply(null_to_empty_list))

# Collect, per user, the viewed documents together with their per-document lists.
# The columns are selected with a list because indexing a groupby with a bare
# tuple of column names is deprecated in pandas.
page_views_by_user_df = (
    page_views_train_df
    .groupby('uuid_pv', as_index=False)[['document_id_pv', 'timestamp_pv'] + doc_list_columns]
    .agg(lambda values: list(values))
    .rename(columns={
        'document_id_pv': 'document_id_pv_list',
        'timestamp_pv': 'timestamp_pv_list',
        'category_id_list': 'category_id_lists',
        'confidence_level_cat_list': 'cat_confidence_level_lists',
        'topic_id_list': 'topic_id_lists',
        'confidence_level_top_list': 'top_confidence_level_lists',
        'entity_id_list': 'entity_id_lists',
        'confidence_level_ent_list': 'ent_confidence_level_lists',
    })
)



In [188]:
page_views_train_df.head()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,document_id_doc,source_id,publish_time,publisher_id,document_id_cat,category_id_list,confidence_level_cat_list,document_id_top,topic_id_list,confidence_level_top_list,document_id_ent,entity_id_list,confidence_level_ent_list
0,8b44cec833b8dd,1858440,62205443,2,US>AZ>753,1,1858440,3471,2016-06-14 16:00:00,111,1858440.0,"[1807, 2004]","[0.8337446451187134, 0.06343710422515869]",1858440.0,"[85, 43, 121, 295]","[0.12629671394824982, 0.0830133929848671, 0.01...",,,[]
1,342737978fbe4f,1858440,64265466,1,US>MD>511,1,1858440,3471,2016-06-14 16:00:00,111,1858440.0,"[1807, 2004]","[0.8337446451187134, 0.06343710422515869]",1858440.0,"[85, 43, 121, 295]","[0.12629671394824982, 0.0830133929848671, 0.01...",,,[]
2,8007cf507f879f,1858440,65881386,1,US>AL>630,1,1858440,3471,2016-06-14 16:00:00,111,1858440.0,"[1807, 2004]","[0.8337446451187134, 0.06343710422515869]",1858440.0,"[85, 43, 121, 295]","[0.12629671394824982, 0.0830133929848671, 0.01...",,,[]
3,c323a4acee49ea,1859539,81808273,2,US,1,1859539,6698,2016-06-14 16:00:00,784,1859539.0,"[1607, 1610]","[0.4920482039451599, 0.037438444793224335]",1859539.0,[43],[0.010142072103917599],,,[]
4,868e63ea5adfc9,1859942,58142438,2,US>IN>527,2,1859942,12706,2016-06-14 16:00:00,1044,1859942.0,"[1306, 1210]","[0.6837242245674133, 0.05202249437570572]",1859942.0,"[196, 10]","[0.17931514978408813, 0.00934587512165308]",,,[]


In [194]:
page_views_by_user_df.columns

Index(['uuid_pv', 'document_id_pv_list', 'timestamp_pv_list',
       'category_id_lists', 'cat_confidence_level_lists', 'topic_id_lists',
       'top_confidence_level_lists', 'entity_id_lists',
       'ent_confidence_level_lists'],
      dtype='object')

In [332]:
import math
from collections import defaultdict


def get_user_aspects(docs_aspects, aspect_docs_counts, total_docs=None):
    """Aggregate per-document aspect confidences into per-user aspect statistics.

    Args:
        docs_aspects: iterable of dicts, one per viewed document, mapping an
            aspect id to that document's confidence for the aspect.
        aspect_docs_counts: dict mapping an aspect id to the number of documents
            that carry it; used as the document frequency for idf.
        total_docs: corpus size used for idf. When omitted, the notebook-level
            `documents_total` is used, as before.

    Returns:
        dict mapping each aspect id seen in `docs_aspects` to
        `[tf * idf, mean confidence]`, where tf is the number of viewed
        documents carrying the aspect and idf is
        `log(total_docs / aspect_docs_counts[aspect])`.

    Raises:
        KeyError: if an aspect id is missing from `aspect_docs_counts`.
    """
    if total_docs is None:
        total_docs = documents_total

    # Gather every confidence value observed for each aspect across the documents.
    confidences_by_aspect = defaultdict(list)
    for doc_aspects in docs_aspects:
        for aspect, confidence in doc_aspects.items():
            confidences_by_aspect[aspect].append(confidence)

    docs_aspects_stats = {}
    for aspect, confidences in confidences_by_aspect.items():
        tf = len(confidences)
        idf = math.log(total_docs / float(aspect_docs_counts[aspect]))
        confid_mean = sum(confidences) / float(tf)
        docs_aspects_stats[aspect] = [tf * idf, confid_mean]

    return docs_aspects_stats


def generate_user_profile(docs_aspects_list, docs_aspects_confidence_list, aspect_docs_counts = categories_docs_counts):
    """Build a user's aspect profile from the documents the user viewed.

    `docs_aspects_list` and `docs_aspects_confidence_list` are parallel
    sequences with one entry per viewed document: the document's aspect ids and
    the matching confidence values. Each pair is zipped into an
    {aspect_id: confidence} dict and the resulting list is handed to
    `get_user_aspects` together with `aspect_docs_counts`.
    """
    per_document_aspects = [
        dict(zip(aspect_ids, confidences))
        for aspect_ids, confidences in zip(docs_aspects_list, docs_aspects_confidence_list)
    ]
    return get_user_aspects(per_document_aspects, aspect_docs_counts)

def get_list_len_udf(docs_list):
    """Return the number of items in `docs_list`."""
    return len(docs_list)


def generate_categories_user_profile_map_udf(row):
    """Build the category profile from one row of the per-user page-view frame."""
    return generate_user_profile(
        row['category_id_lists'], row['cat_confidence_level_lists'], categories_docs_counts)


def generate_topics_user_profile_map_udf(row):
    """Build the topic profile from one row of the per-user page-view frame."""
    return generate_user_profile(
        row['topic_id_lists'], row['top_confidence_level_lists'], topics_docs_counts)


def generate_entities_user_profile_map_udf(row):
    """Build the entity profile from one row of the per-user page-view frame."""
    return generate_user_profile(
        row['entity_id_lists'], row['ent_confidence_level_lists'], entities_docs_counts)


In [318]:
#users_profile_df['categories'] = users_profile_df.apply(lambda x: generate_user_profile(x.category_id_lists, x.cat_confidence_level_lists), axis=1)

In [None]:
# Assemble one profile row per user from page_views_by_user_df.
# defaultdict is used inside get_user_aspects; importing it here makes the name
# available by the time the profile functions are called below.
from collections import defaultdict

# NOTE(review): the columns are added through .loc on an initially empty frame,
# which relies on index alignment with page_views_by_user_df — confirm this
# behaves as expected on the pandas version in use.
users_profile_df = pd.DataFrame()
users_profile_df.loc[:, 'uuid'] = page_views_by_user_df.loc[:, 'uuid_pv']
users_profile_df.loc[:, 'doc_ids'] = page_views_by_user_df.loc[:, 'document_id_pv_list']
# views = number of entries in the user's document list
users_profile_df.loc[:, 'views'] = page_views_by_user_df.loc[:, 'document_id_pv_list'].apply(get_list_len_udf)
# Each profile column holds, per user, the dict returned by generate_user_profile
# ({aspect_id: [tf*idf, mean confidence]}) for categories, topics and entities.
users_profile_df.loc[:, 'categories'] = page_views_by_user_df.apply(
    generate_categories_user_profile_map_udf, axis=1)

users_profile_df.loc[:, 'topics'] = page_views_by_user_df.apply(
    generate_topics_user_profile_map_udf, axis=1)
users_profile_df.loc[:, 'entities'] = page_views_by_user_df.apply(
    generate_entities_user_profile_map_udf, axis=1)

users_profile_df.shape

In [334]:
users_profile_df.head()

Unnamed: 0,uuid,doc_ids,views,categories,topics,entities
0,100614d664a6c3,[1832823],1,"{1403: [1.6559796338810973, 0.7033445835113525...","{16: [4.737792074229773, 0.055320367217063904]...",{}
1,100ce200bea586,[1584920],1,"{1403: [1.6559796338810973, 0.4359037280082702...","{16: [4.737792074229773, 0.035918738692998886]...",{}
2,100d0ddfadd845,[1811567],1,{},"{62: [6.114908231578814, 0.015740877017378807]}",{}
3,100fd99bcbcb85,[874215],1,"{1708: [3.043104604130009, 0.7546594738960266]...","{36: [6.007345370891915, 0.0373980849981308], ...",{}
4,100fdd39838d25,[343574],1,"{1513: [2.38416956809324, 0.9199999570846558],...","{173: [7.196600149509651, 0.13058622181415558]...",{}


In [336]:
# ## Write user profiles to pickle

# The output file name depends on whether this is an evaluation run.
table_name = 'user_profiles_eval.pickle' if evaluation else 'user_profiles.pickle'

users_profile_df.to_pickle(OUTPUT_BUCKET_FOLDER + table_name)