In [1]:
import os
import datetime
import argparse
import pandas as pd
import numpy as np
import glob
import pytz
from sklearn.preprocessing import StandardScaler
from gauss_rank import GaussRankScaler
from sklearn.model_selection import train_test_split

In [32]:
# Root directory of the raw dataset: articles_metadata.csv and the clicks/ files
# are read from here further down in this notebook.
DATA_PATH = '/home/gmoreira/dataset/gcom_news'

In [36]:
# Directory that receives the per-hour train/valid/test parquet files.
OUTPUT_PATH = '/home/gmoreira/dataset/gcom_news/clicks_preprocessed'
# Created from Python (same effect as `mkdir -p`) so the cell does not depend
# on an IPython shell escape or on a POSIX shell being available.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [3]:
def extract_local_hour_weekday(timestamp_in_utc, local_tz):
    """Return the local fractional hour and the weekday of a UTC epoch timestamp.

    Args:
        timestamp_in_utc: Unix timestamp in seconds, interpreted as UTC.
        local_tz: Time zone name accepted by ``pytz.timezone``
            (e.g. ``'America/Sao_Paulo'``).

    Returns:
        A tuple ``(hour, weekday)`` where ``hour`` is the local hour of day plus
        the minutes expressed as a fraction of an hour (seconds are ignored),
        and ``weekday`` is ``datetime.weekday()`` of the local time
        (Monday is 0).
    """
    # Build a timezone-aware UTC datetime directly. datetime.utcfromtimestamp()
    # returns a naive object and is deprecated since Python 3.12.
    utc_dt = datetime.datetime.fromtimestamp(timestamp_in_utc, tz=datetime.timezone.utc)
    local_dt = utc_dt.astimezone(pytz.timezone(local_tz))
    return local_dt.hour + (local_dt.minute/60.0), local_dt.weekday() #First day is Monday

In [4]:
def get_cicled_feature_value(value, max_value):
    """Encode a periodic quantity as a (sine, cosine) pair.

    Args:
        value: Position within the cycle (scalar or numpy array).
        max_value: Length of one full cycle (e.g. 24 for hours).

    Returns:
        Tuple ``(sin, cos)`` of the angle ``2*pi*(value + 1e-6)/max_value``.
    """
    # A tiny offset is added to the value before it is scaled to the cycle.
    angle = 2*np.pi*((value + 0.000001) / max_value)
    return np.sin(angle), np.cos(angle)

def get_time_features(timestamp, default_timezone='America/Sao_Paulo'):
    """Build cyclic hour-of-day and day-of-week features for a click timestamp.

    Args:
        timestamp: Epoch timestamp in milliseconds (it is integer-divided by
            1000 before being treated as seconds).
        default_timezone: Time zone name used to localise the timestamp.

    Returns:
        Tuple ``(hour_sin, hour_cos, weekday_sin, weekday_cos)``.
    """
    # Milliseconds -> whole seconds, then localise to the default timezone.
    epoch_seconds = int(timestamp)//1000
    hour_of_day, day_of_week = extract_local_hour_weekday(epoch_seconds, default_timezone)
    # Hours cycle over 24; weekdays are shifted to 1..7 and cycle over 7.
    hour_pair = get_cicled_feature_value(hour_of_day, 24)
    weekday_pair = get_cicled_feature_value(day_of_week+1, 7)
    return (*hour_pair, *weekday_pair)

In [19]:
def group_sessions(clicks_hour_df):
    """Collapse click-level rows into one row per session.

    Clicks are ordered by ``session_start`` and then ``click_timestamp`` before
    grouping, so every per-session list is in chronological order. The input
    frame is left untouched (sorting produces a new frame).

    Args:
        clicks_hour_df: DataFrame with one row per click. It must contain
            ``session_id`` and every column named in the two lists below.

    Returns:
        DataFrame with one row per ``session_id``: ``user_id``,
        ``session_start`` and ``session_size`` reduced with ``min``, and every
        click-level column collected into a Python list.
    """
    # Reduced with "min"; presumably constant within a session (verify against the data).
    session_level_columns = ['user_id', 'session_start', 'session_size']
    # Collected, in click order, into one list per session.
    click_level_columns = ['click_article_id',
                           'click_timestamp',
                           'click_environment',
                           'click_deviceGroup',
                           'click_os',
                           'click_country',
                           'click_region',
                           'click_referrer_type',
                           'hour_sin',
                           'hour_cos',
                           'weekday_sin',
                           'weekday_cos',
                           'item_age_hours',
                           'item_age_hours_norm']

    def to_list(series):
        return list(series)

    # The string 'min' selects the groupby min directly; passing the builtin
    # `min` triggers a FutureWarning on recent pandas versions.
    aggregations = {column: 'min' for column in session_level_columns}
    aggregations.update({column: to_list for column in click_level_columns})

    #Ensuring that sessions are chronologically ordered (without mutating the caller's frame)
    ordered_clicks_df = clicks_hour_df.sort_values(['session_start', 'click_timestamp'])
    sessions_by_hour_df = ordered_clicks_df.groupby('session_id').agg(aggregations).reset_index()
    return sessions_by_hour_df

In [59]:
def prepare_data(clicks_hour_df):
    """Turn one hour of raw clicks into a session-level DataFrame.

    Steps: join clicks with the module-level ``articles_metadata_df``, add
    cyclic time features, compute each clicked article's age in hours and a
    GaussRank-scaled version of it, then group clicks into sessions.

    Args:
        clicks_hour_df: Click-level DataFrame for a single hour file.

    Returns:
        The result of ``group_sessions`` on the enriched click frame.
    """
    # Join on the article id (default merge, so clicks without metadata are dropped).
    enriched_df = clicks_hour_df.merge(articles_metadata_df,
                                       left_on='click_article_id', right_on='article_id')

    # One (hour_sin, hour_cos, weekday_sin, weekday_cos) tuple per click.
    time_feature_rows = list(enriched_df['click_timestamp'].apply(get_time_features).values)
    enriched_df[['hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos']] = time_feature_rows

    # Both timestamps are divided by 1000*60*60, i.e. milliseconds per hour.
    elapsed = enriched_df['click_timestamp'] - enriched_df['created_at_ts']
    enriched_df['item_age_hours'] = elapsed / (1000*60*60)

    # The scaler is fitted on the frame passed in, i.e. separately per call.
    enriched_df['item_age_hours_norm'] = GaussRankScaler().fit_transform(enriched_df[['item_age_hours']])
    return group_sessions(enriched_df)

In [25]:
# Reference snapshot of the item_age_hours distribution, pasted as a bare string
# (the cell only displays it; nothing is recomputed). It is the output of the
# commented-out call below, presumably for a single hour file -- TODO confirm which.
# Note the negative minimum: item_age_hours is click_timestamp - created_at_ts,
# so some clicks are stamped earlier than the article's created_at_ts.
#clicks_hour_df['item_age_hours'].describe(percentiles=np.arange(0.0,1.0,0.1))
'''
count     1883.000000
mean       313.920239
std       2500.257279
min        -20.069912
0%         -20.069912
10%          5.777035
20%          6.705644
30%          7.739401
40%          9.653872
50%         11.343312
60%         12.060842
70%         12.728053
80%         17.131041
90%         19.860367
max      33897.512268
Name: item_age_hours, dtype: float64
'''

'\ncount     1883.000000\nmean       313.920239\nstd       2500.257279\nmin        -20.069912\n0%         -20.069912\n10%          5.777035\n20%          6.705644\n30%          7.739401\n40%          9.653872\n50%         11.343312\n60%         12.060842\n70%         12.728053\n80%         17.131041\n90%         19.860367\nmax      33897.512268\nName: item_age_hours, dtype: float64\n'

In [26]:
# Article metadata table. prepare_data() reads this module-level name when it
# merges clicks on article_id and uses created_at_ts, so this cell must run
# before prepare_data() is called.
articles_metadata_df = pd.read_csv(os.path.join(DATA_PATH, 'articles_metadata.csv'))

In [61]:
# Hour files with fewer distinct sessions than this are skipped.
MIN_SESSIONS_PER_HOUR = 100
# Fixed seed so the valid/test split is identical on every re-run.
SPLIT_SEED = 42

# `counter` numbers the output folders consecutively (0001, 0002, ...), so
# skipped hour files leave no gaps; `idx` is the position of the input file.
counter = 1
for idx, clicks_file_path in enumerate(sorted(glob.glob(f"{DATA_PATH}/clicks/*"))):
    clicks_hour_df = pd.read_csv(clicks_file_path)

    n_sessions = clicks_hour_df['session_id'].nunique()
    if n_sessions < MIN_SESSIONS_PER_HOUR:
        print("Ignoring this hour file, because has only {} sessions".format(n_sessions))
        continue

    sessions_df = prepare_data(clicks_hour_df)
    print(f"{idx}->{counter}", "# sessions: {}".format(len(sessions_df)))

    hour_files_path = os.path.join(OUTPUT_PATH, f"{counter:04}")
    os.makedirs(hour_files_path, exist_ok=True)

    #As the number of sessions by hour is small, uses all available sessions for training,
    #and splits the same data as valid and test, as the evaluation is always performed in the next hour
    train_df = sessions_df
    train_df.to_parquet(os.path.join(hour_files_path, "train.parquet"))
    # Seeded shuffle: half of the sessions go to valid, the other half to test.
    valid_df, test_df = train_test_split(train_df, test_size=0.5, shuffle=True,
                                         random_state=SPLIT_SEED)
    valid_df.sort_values(['session_start']).to_parquet(os.path.join(hour_files_path, "valid.parquet"))
    test_df.sort_values(['session_start']).to_parquet(os.path.join(hour_files_path, "test.parquet"))

    counter += 1

0->1 # sessions: 707
1->2 # sessions: 531
2->3 # sessions: 342
3->4 # sessions: 244
4->5 # sessions: 208
5->6 # sessions: 284
6->7 # sessions: 710
7->8 # sessions: 1250
8->9 # sessions: 1808
9->10 # sessions: 2071
10->11 # sessions: 2046
11->12 # sessions: 1831
12->13 # sessions: 2208
13->14 # sessions: 2007
14->15 # sessions: 2171
15->16 # sessions: 2407
16->17 # sessions: 2198
17->18 # sessions: 2176
18->19 # sessions: 2452
19->20 # sessions: 3504
20->21 # sessions: 4374
21->22 # sessions: 3865
22->23 # sessions: 3602
23->24 # sessions: 2559
24->25 # sessions: 1680
25->26 # sessions: 941
26->27 # sessions: 531
27->28 # sessions: 422
28->29 # sessions: 510
29->30 # sessions: 812
30->31 # sessions: 1813
31->32 # sessions: 2992
32->33 # sessions: 5797
33->34 # sessions: 7025
34->35 # sessions: 5890
35->36 # sessions: 6240
36->37 # sessions: 7320
37->38 # sessions: 7722
38->39 # sessions: 6623
39->40 # sessions: 6112
40->41 # sessions: 6545
41->42 # sessions: 8344
42->43 # sessions: 7199

322->319 # sessions: 1893
323->320 # sessions: 1853
324->321 # sessions: 1785
325->322 # sessions: 1946
326->323 # sessions: 1812
327->324 # sessions: 1792
328->325 # sessions: 1684
329->326 # sessions: 1781
330->327 # sessions: 1535
331->328 # sessions: 1181
332->329 # sessions: 1093
333->330 # sessions: 1400
334->331 # sessions: 1559
335->332 # sessions: 1208
336->333 # sessions: 851
337->334 # sessions: 505
338->335 # sessions: 282
339->336 # sessions: 196
340->337 # sessions: 180
341->338 # sessions: 329
342->339 # sessions: 626
343->340 # sessions: 1050
344->341 # sessions: 1555
345->342 # sessions: 1253
346->343 # sessions: 1218
347->344 # sessions: 1198
348->345 # sessions: 1126
349->346 # sessions: 1009
350->347 # sessions: 1345
351->348 # sessions: 2000
352->349 # sessions: 2199
353->350 # sessions: 3052
354->351 # sessions: 3731
355->352 # sessions: 3303
356->353 # sessions: 2692
357->354 # sessions: 2533
358->355 # sessions: 2007
359->356 # sessions: 1448
360->357 # sessions