In [156]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from gauss_rank import GaussRankScaler
import json
import pytz
import datetime
from sklearn.model_selection import train_test_split

In [109]:
INPUT_CLICKS_PATH = '/home/gmoreira/dataset/adressa/preprocessed/sessions_processed_by_spark/'
INPUT_ARTICLES_PATH = '/home/gmoreira/dataset/adressa/preprocessed/adressa_articles.csv'

In [150]:
OUTPUT_PATH = '/home/gmoreira/dataset/adressa/preprocessed/sessions_by_hour_parquet/'
!mkdir -p $OUTPUT_PATH

In [3]:
def load_sessions_json_file(json_path):
    with open(json_path, 'r') as fi:
        for line in fi:
            yield json.loads(line)

def load_sessions_hour(session_hour_path):
    sessions = []
    for session_file in os.listdir(session_hour_path):
        session_file_path = os.path.join(session_hour_path, session_file)
        sessions_hour = load_sessions_json_file(session_file_path)
        for session in sessions_hour:
            sessions.append(session)        
    return sessions

def load_sessions_hours(folder_path):
    #Sorting hours directories (treating cases where number of digits is lower. E.x. "session_hour=3" < "session_hour=20")
    hour_folders = sorted([path for path in os.listdir(folder_path) \
                             if os.path.isdir(os.path.join(folder_path,path))], 
                          key=lambda x: "{:0>5}".format(x.split('=')[1]))
    
    for hour_folder in hour_folders:
        hour_index = int(hour_folder.split('=')[1])
        hour_folder_path = os.path.join(folder_path, hour_folder)
        sessions_hour = load_sessions_hour(hour_folder_path)
        yield (hour_index, sessions_hour)

In [16]:
def gini_index(array):
    """Calculate the Gini coefficient of a numpy array."""
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d:
    array = array.flatten()
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array += 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1,array.shape[0]+1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array)))        

In [162]:
numeric_scalers = {
    '_elapsed_ms_since_last_click': {
                 #Set Maximum of 60 min, just to separate returning users, whose elapsed time since last click will be greater than the max 30-min limit for sessions
                 'valid_max': 60 * 60 * 1000.0, 
                 'avg':    789935.7,
                 'stddev': 1371436.0},
    'active_time_secs': {
                 'valid_max': 900.0,
                 'avg':    65.0,
                 'stddev': 69.37},
    'active_time_secs_by_word': {
                 'valid_max': 10.0,
                 'avg':    1.854,
                 'stddev': 1.474}
}

def standardize_num_feature(feature, values):
    scaler_config = numeric_scalers[feature]
    normalizer = lambda x: (min(int(x), scaler_config['valid_max']) - scaler_config['avg']) / scaler_config['stddev']    
    return  list([normalizer(value) for value in values])

def get_cicled_feature_value(value, max_value):
    value_scaled = (value + 0.000001) / max_value
    value_sin = np.sin(2*np.pi*value_scaled)
    value_cos = np.cos(2*np.pi*value_scaled)
    return value_sin, value_cos

def extract_local_hour_weekday(timestamp_in_utc, local_tz):
    dt = pytz.utc.localize(datetime.datetime.utcfromtimestamp(timestamp_in_utc)).astimezone(pytz.timezone(local_tz))
    return dt.hour + (dt.minute/60.0), dt.weekday() #First day is Monday


def get_time_features(timestamp, default_timezone='Europe/Oslo'):
    #Converting timestamp to the fDefault timezone, where most clicks originate
    local_hour, local_weekday = extract_local_hour_weekday(int(timestamp)//1000, default_timezone)        
    #Converting hour in two cycling features to represent the continuity of the hours
    local_hour_sin, local_hour_cos = get_cicled_feature_value(local_hour, 24)
    local_weekday_sin, local_weekday_cos = get_cicled_feature_value(local_weekday+1, 7)
    return local_hour_sin, local_hour_cos, local_weekday_sin, local_weekday_cos


article_first_time_seen = dict()

def process_session_clicks_features(sessions_hour, articles_df):
    global article_first_time_seen
    sessions = []

    session_count = 0
    clicked_articles_ids = []
    unique_clicked_articles = set()
    #Normalizing numerical features (standardization) and creating time features
    for session in sessions_hour:
        session_count += 1
        for click in session['clicks']:
            
            click['category0_encoded'], click['category1_encoded'], click['author_encoded'] = \
                    articles_df.loc[click['article_id']][['category0_encoded', 'category1_encoded', 'author_encoded']]
            
            if click['article_id'] not in article_first_time_seen:
                article_first_time_seen[click['article_id']] = click['timestamp']
                
            click['item_age_hours'] = (click['timestamp'] - article_first_time_seen[click['article_id']]) / (1000*60*60)
            
            click['hour_sin'], click['hour_cos'], click['weekday_sin'], click['weekday_cos'] = \
                    get_time_features(click['timestamp']//1000)
                
            #Applying standardization on elapsed time
            click['_elapsed_ms_since_last_click'] = standardize_num_feature('_elapsed_ms_since_last_click', [click['_elapsed_ms_since_last_click']])[0]

            #Copying click attributes as lists in the session
            for key in click:
                if key != "user_id":
                    if key not in session:
                        session[key] = [click[key]]
                    else:
                        session[key].append(click[key])

            clicked_articles_ids.append(click['article_id'])
            unique_clicked_articles.add(click['article_id'])

        #Removing clicks property, as its values were copied to individual list columns
        del session['clicks']
        sessions.append(session)

    #Ensuring sessions within the hour are sorted by session id (time)
    sessions_df = pd.DataFrame(sessions).sort_values('session_id')
    
    # Normalize the item_age_hours with GaussRank (considering only the distribution of the current hour)
    scaler = GaussRankScaler()
    item_age_hours_exploded = sessions_df.explode('item_age_hours')[['item_age_hours']]
    clicks_item_age_hours_mean = item_age_hours_exploded['item_age_hours'].mean()
    clicks_item_age_hours_median = item_age_hours_exploded['item_age_hours'].median()
    
    scaler.fit(item_age_hours_exploded)
    
    sessions_df['item_age_hours_norm'] = sessions_df['item_age_hours'].apply(lambda x: np.squeeze(scaler.transform(np.expand_dims(np.array(x), -1))))
    
    #Printing stats
    clicks_by_articles_counter = dict(Counter(clicked_articles_ids))
    clicks_by_articles = np.array(list(clicks_by_articles_counter.values()))
    total_clicks = np.sum(clicks_by_articles)
    clicks_by_articles_norm = clicks_by_articles / total_clicks
    clicks_by_articles_norm_mean = np.mean(clicks_by_articles_norm)
    clicks_by_articles_norm_median = np.median(clicks_by_articles_norm)
    

    stats = {'session_count': session_count,
             'clicks': total_clicks,
             'clicks_by_session': total_clicks / session_count,
             'unique_articles': len(unique_clicked_articles),
             'clicks_by_article':float(total_clicks)/len(unique_clicked_articles),
             'norm_pop_mean': clicks_by_articles_norm_mean,
             'norm_pop_median': clicks_by_articles_norm_median,
             'gini_index': gini_index(clicks_by_articles.astype(np.float32)),
             'clicks_item_age_hours_mean': clicks_item_age_hours_mean,
             'clicks_item_age_hours_median': clicks_item_age_hours_median,
             'clicks_item_age_hours_after_norm_mean': sessions_df.explode('item_age_hours_norm')['item_age_hours_norm'].mean(),
             'clicks_item_age_hours_after_norm_median': sessions_df.explode('item_age_hours_norm')['item_age_hours_norm'].median()
    }

    print("Stats :{}".format(stats))
    
    return sessions_df, stats, clicks_by_articles_counter

In [173]:
articles_df = pd.read_csv(INPUT_ARTICLES_PATH)
articles_df = articles_df[['id_encoded', 'category0_encoded', 'category1_encoded', 'author_encoded']].set_index('id_encoded')
articles_df

Unnamed: 0_level_0,category0_encoded,category1_encoded,author_encoded
id_encoded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,1,1
2,2,2,65
3,3,3,7
4,3,4,64
5,2,5,106
...,...,...,...
72928,3,8,47
72929,11,24,1
72930,2,13,17
72931,2,2,41


In [175]:
articles_df.agg(['min', 'max'])

Unnamed: 0,category0_encoded,category1_encoded,author_encoded
min,1,1,1
max,40,127,111


In [164]:
counter = 1

for (hour_index, sessions_hour_df) in load_sessions_hours(INPUT_CLICKS_PATH):  
    hour_index += 1
    print('Processing hour {}'.format(hour_index))
    
    sessions_df, hour_stats, hour_clicks_by_articles_counter = \
            process_session_clicks_features(sessions_hour_df, articles_df)
        
    if len(sessions_df) < 100:
        print("Ignoring this hour file, because has only {} sessions".format(len(sessions_df)))
        continue
    
    print(f"{hour_index}->{counter}", "# sessions: {}".format(len(sessions_df)))
    
    hour_files_path = os.path.join(OUTPUT_PATH, f"{counter:04}")
    os.makedirs(hour_files_path, exist_ok=True)
    
    #As the number of sessions by hour is small, uses all available sessions for training, 
    #and splits the same data as valid and test, as the evaluation is always performed in the next hour
    train_df = sessions_df
    sessions_df.to_parquet(os.path.join(hour_files_path, "train.parquet"))    
    valid_df, test_df = train_test_split(train_df, test_size=0.5, shuffle=True)
    valid_df.sort_values(['session_start']).to_parquet(os.path.join(hour_files_path, "valid.parquet"))    
    test_df.sort_values(['session_start']).to_parquet(os.path.join(hour_files_path, "test.parquet")) 
    
    counter += 1
    print()

Processing hour 1
Stats :{'session_count': 2239, 'clicks': 5297, 'clicks_by_session': 2.3657882983474767, 'unique_articles': 162, 'clicks_by_article': 32.69753086419753, 'norm_pop_mean': 0.006172839506172839, 'norm_pop_median': 0.00018878610534264677, 'gini_index': 0.8954416303904238, 'clicks_item_age_hours_mean': 0.4808019214230275, 'clicks_item_age_hours_median': 0.4872222222222222, 'clicks_item_age_hours_after_norm_mean': -0.0147975134408703, 'clicks_item_age_hours_after_norm_median': -0.009636918750654422}
1->1 # sessions: 2239

Processing hour 2
Stats :{'session_count': 1665, 'clicks': 3940, 'clicks_by_session': 2.3663663663663663, 'unique_articles': 128, 'clicks_by_article': 30.78125, 'norm_pop_mean': 0.0078125, 'norm_pop_median': 0.0005076142131979696, 'gini_index': 0.8778196373576214, 'clicks_item_age_hours_mean': 1.221225606316979, 'clicks_item_age_hours_median': 1.2709722222222224, 'clicks_item_age_hours_after_norm_mean': -0.006276648308866803, 'clicks_item_age_hours_after_no

Stats :{'session_count': 6746, 'clicks': 17007, 'clicks_by_session': 2.521049510821227, 'unique_articles': 214, 'clicks_by_article': 79.47196261682242, 'norm_pop_mean': 0.004672897196261682, 'norm_pop_median': 5.8799317927912035e-05, 'gini_index': 0.9484865766202497, 'clicks_item_age_hours_mean': 3.1350117108641755, 'clicks_item_age_hours_median': 0.5738888888888889, 'clicks_item_age_hours_after_norm_mean': -0.253820953524194, 'clicks_item_age_hours_after_norm_median': -0.34314195228376315}
17->17 # sessions: 6746

Processing hour 18
Stats :{'session_count': 5355, 'clicks': 13579, 'clicks_by_session': 2.5357609710550886, 'unique_articles': 230, 'clicks_by_article': 59.03913043478261, 'norm_pop_mean': 0.004347826086956521, 'norm_pop_median': 7.364312541424258e-05, 'gini_index': 0.9358302616977101, 'clicks_item_age_hours_mean': 4.23869230265686, 'clicks_item_age_hours_median': 1.46, 'clicks_item_age_hours_after_norm_mean': -0.11774791816647237, 'clicks_item_age_hours_after_norm_median': 

Stats :{'session_count': 2493, 'clicks': 6939, 'clicks_by_session': 2.783393501805054, 'unique_articles': 298, 'clicks_by_article': 23.28523489932886, 'norm_pop_mean': 0.003355704697986577, 'norm_pop_median': 0.0002882259691598213, 'gini_index': 0.8566187018787955, 'clicks_item_age_hours_mean': 12.250268250308231, 'clicks_item_age_hours_median': 11.205555555555556, 'clicks_item_age_hours_after_norm_mean': -0.03686771824575932, 'clicks_item_age_hours_after_norm_median': -0.03426164794555998}
33->33 # sessions: 2493

Processing hour 34
Stats :{'session_count': 2329, 'clicks': 6235, 'clicks_by_session': 2.677114641477029, 'unique_articles': 274, 'clicks_by_article': 22.755474452554743, 'norm_pop_mean': 0.00364963503649635, 'norm_pop_median': 0.00032076984763432237, 'gini_index': 0.8512769319502466, 'clicks_item_age_hours_mean': 12.704123139980394, 'clicks_item_age_hours_median': 11.9475, 'clicks_item_age_hours_after_norm_mean': -0.020598973019932153, 'clicks_item_age_hours_after_norm_medi


Processing hour 49
Stats :{'session_count': 2740, 'clicks': 8153, 'clicks_by_session': 2.9755474452554744, 'unique_articles': 281, 'clicks_by_article': 29.01423487544484, 'norm_pop_mean': 0.0035587188612099642, 'norm_pop_median': 0.00012265423770391266, 'gini_index': 0.8890904501446487, 'clicks_item_age_hours_mean': 7.302424840208749, 'clicks_item_age_hours_median': 5.595, 'clicks_item_age_hours_after_norm_mean': -0.04424007562182279, 'clicks_item_age_hours_after_norm_median': -0.05280294943288544}
49->49 # sessions: 2740

Processing hour 50
Stats :{'session_count': 1248, 'clicks': 3593, 'clicks_by_session': 2.87900641025641, 'unique_articles': 201, 'clicks_by_article': 17.875621890547265, 'norm_pop_mean': 0.004975124378109452, 'norm_pop_median': 0.00027831895352073476, 'gini_index': 0.8482524737791507, 'clicks_item_age_hours_mean': 8.67941885456288, 'clicks_item_age_hours_median': 6.479166666666667, 'clicks_item_age_hours_after_norm_mean': -0.03200820713688207, 'clicks_item_age_hours


Processing hour 65
Stats :{'session_count': 5659, 'clicks': 14994, 'clicks_by_session': 2.649584732284856, 'unique_articles': 304, 'clicks_by_article': 49.32236842105263, 'norm_pop_mean': 0.003289473684210526, 'norm_pop_median': 0.00013338668800853674, 'gini_index': 0.9342223725557902, 'clicks_item_age_hours_mean': 5.991299593911651, 'clicks_item_age_hours_median': 1.4422222222222223, 'clicks_item_age_hours_after_norm_mean': -0.1941793636793079, 'clicks_item_age_hours_after_norm_median': -0.2962552284912074}
65->65 # sessions: 5659

Processing hour 66
Stats :{'session_count': 4866, 'clicks': 13015, 'clicks_by_session': 2.674681463214139, 'unique_articles': 307, 'clicks_by_article': 42.39413680781759, 'norm_pop_mean': 0.0032573289902280127, 'norm_pop_median': 7.68344218209758e-05, 'gini_index': 0.9252851563758827, 'clicks_item_age_hours_mean': 7.23144604516157, 'clicks_item_age_hours_median': 2.41, 'clicks_item_age_hours_after_norm_mean': -0.12950711062990136, 'clicks_item_age_hours_af

Stats :{'session_count': 5004, 'clicks': 14311, 'clicks_by_session': 2.859912070343725, 'unique_articles': 339, 'clicks_by_article': 42.21533923303835, 'norm_pop_mean': 0.0029498525073746312, 'norm_pop_median': 6.987631891551953e-05, 'gini_index': 0.9066219451217674, 'clicks_item_age_hours_mean': 9.698901505446436, 'clicks_item_age_hours_median': 9.418611111111112, 'clicks_item_age_hours_after_norm_mean': -0.13324574214156382, 'clicks_item_age_hours_after_norm_median': -0.15368038880228116}
81->81 # sessions: 5004

Processing hour 82
Stats :{'session_count': 6186, 'clicks': 17295, 'clicks_by_session': 2.7958292919495635, 'unique_articles': 348, 'clicks_by_article': 49.69827586206897, 'norm_pop_mean': 0.0028735632183908046, 'norm_pop_median': 5.782017924255565e-05, 'gini_index': 0.9215484836156966, 'clicks_item_age_hours_mean': 8.804624233079593, 'clicks_item_age_hours_median': 1.6602777777777777, 'clicks_item_age_hours_after_norm_mean': -0.1382202182063233, 'clicks_item_age_hours_after


Processing hour 97
Stats :{'session_count': 2275, 'clicks': 6610, 'clicks_by_session': 2.9054945054945054, 'unique_articles': 283, 'clicks_by_article': 23.356890459363957, 'norm_pop_mean': 0.0035335689045936395, 'norm_pop_median': 0.000151285930408472, 'gini_index': 0.8745887736440674, 'clicks_item_age_hours_mean': 12.143461674230915, 'clicks_item_age_hours_median': 5.2775, 'clicks_item_age_hours_after_norm_mean': -0.02759095003342698, 'clicks_item_age_hours_after_norm_median': -0.03100013307320274}
97->97 # sessions: 2275

Processing hour 98
Stats :{'session_count': 958, 'clicks': 2807, 'clicks_by_session': 2.930062630480167, 'unique_articles': 203, 'clicks_by_article': 13.827586206896552, 'norm_pop_mean': 0.0049261083743842365, 'norm_pop_median': 0.0007125044531528322, 'gini_index': 0.8136169056109626, 'clicks_item_age_hours_mean': 13.90417982820727, 'clicks_item_age_hours_median': 6.621111111111111, 'clicks_item_age_hours_after_norm_mean': -0.025089674802060594, 'clicks_item_age_ho

Processing hour 113
Stats :{'session_count': 4131, 'clicks': 11040, 'clicks_by_session': 2.672476397966594, 'unique_articles': 375, 'clicks_by_article': 29.44, 'norm_pop_mean': 0.0026666666666666666, 'norm_pop_median': 9.057971014492754e-05, 'gini_index': 0.9118942019382537, 'clicks_item_age_hours_mean': 10.60512706320452, 'clicks_item_age_hours_median': 2.6009722222222225, 'clicks_item_age_hours_after_norm_mean': -0.0646741046736481, 'clicks_item_age_hours_after_norm_median': -0.08893905082899742}
113->113 # sessions: 4131

Processing hour 114
Stats :{'session_count': 3583, 'clicks': 9570, 'clicks_by_session': 2.670946134524142, 'unique_articles': 391, 'clicks_by_article': 24.475703324808183, 'norm_pop_mean': 0.0025575447570332474, 'norm_pop_median': 0.0001044932079414838, 'gini_index': 0.8975159467068982, 'clicks_item_age_hours_mean': 12.025529141994648, 'clicks_item_age_hours_median': 3.444166666666667, 'clicks_item_age_hours_after_norm_mean': -0.04999434756792415, 'clicks_item_age_

Processing hour 129
Stats :{'session_count': 5443, 'clicks': 15834, 'clicks_by_session': 2.909057505052361, 'unique_articles': 373, 'clicks_by_article': 42.45040214477212, 'norm_pop_mean': 0.0026809651474530827, 'norm_pop_median': 6.315523556902867e-05, 'gini_index': 0.9138562579819206, 'clicks_item_age_hours_mean': 13.369563123657992, 'clicks_item_age_hours_median': 11.160833333333333, 'clicks_item_age_hours_after_norm_mean': -0.08275867592306639, 'clicks_item_age_hours_after_norm_median': -0.09494495950659422}
129->129 # sessions: 5443

Processing hour 130
Stats :{'session_count': 4594, 'clicks': 13101, 'clicks_by_session': 2.8517631693513277, 'unique_articles': 391, 'clicks_by_article': 33.50639386189258, 'norm_pop_mean': 0.0025575447570332483, 'norm_pop_median': 7.633005114113426e-05, 'gini_index': 0.9058389747213917, 'clicks_item_age_hours_mean': 14.279857326412714, 'clicks_item_age_hours_median': 12.000555555555556, 'clicks_item_age_hours_after_norm_mean': -0.052214070174270745, 

Stats :{'session_count': 1536, 'clicks': 3991, 'clicks_by_session': 2.5983072916666665, 'unique_articles': 269, 'clicks_by_article': 14.8364312267658, 'norm_pop_mean': 0.003717472118959107, 'norm_pop_median': 0.00025056376847907793, 'gini_index': 0.8518311162046052, 'clicks_item_age_hours_mean': 11.819603830841618, 'clicks_item_age_hours_median': 0.8661111111111112, 'clicks_item_age_hours_after_norm_mean': -0.08050313888532896, 'clicks_item_age_hours_after_norm_median': -0.11648895032410767}
145->145 # sessions: 1536

Processing hour 146
Stats :{'session_count': 917, 'clicks': 2471, 'clicks_by_session': 2.6946564885496183, 'unique_articles': 241, 'clicks_by_article': 10.253112033195022, 'norm_pop_mean': 0.004149377593360995, 'norm_pop_median': 0.0004046944556859571, 'gini_index': 0.8145609372548268, 'clicks_item_age_hours_mean': 14.237588358289498, 'clicks_item_age_hours_median': 2.008888888888889, 'clicks_item_age_hours_after_norm_mean': -0.06407581385087106, 'clicks_item_age_hours_af

Stats :{'session_count': 2881, 'clicks': 7179, 'clicks_by_session': 2.491843110031239, 'unique_articles': 305, 'clicks_by_article': 23.537704918032787, 'norm_pop_mean': 0.0032786885245901635, 'norm_pop_median': 0.0001392951664577239, 'gini_index': 0.895462401642526, 'clicks_item_age_hours_mean': 10.433898446085022, 'clicks_item_age_hours_median': 2.8294444444444444, 'clicks_item_age_hours_after_norm_mean': -0.05400976653594213, 'clicks_item_age_hours_after_norm_median': -0.06768971565559626}
161->161 # sessions: 2881

Processing hour 162
Stats :{'session_count': 3421, 'clicks': 8891, 'clicks_by_session': 2.5989476761180943, 'unique_articles': 308, 'clicks_by_article': 28.866883116883116, 'norm_pop_mean': 0.0032467532467532474, 'norm_pop_median': 0.00011247328759419637, 'gini_index': 0.8985363125179362, 'clicks_item_age_hours_mean': 9.703991833189585, 'clicks_item_age_hours_median': 2.236111111111111, 'clicks_item_age_hours_after_norm_mean': -0.06637210297493203, 'clicks_item_age_hours_

Stats :{'session_count': 2128, 'clicks': 6407, 'clicks_by_session': 3.0108082706766917, 'unique_articles': 243, 'clicks_by_article': 26.366255144032923, 'norm_pop_mean': 0.004115226337448559, 'norm_pop_median': 0.0003121585765568909, 'gini_index': 0.8467628951619087, 'clicks_item_age_hours_mean': 17.523525527634607, 'clicks_item_age_hours_median': 12.70138888888889, 'clicks_item_age_hours_after_norm_mean': -0.02455674522356978, 'clicks_item_age_hours_after_norm_median': -0.03568818069220222}
177->177 # sessions: 2128

Processing hour 178
Stats :{'session_count': 4157, 'clicks': 12424, 'clicks_by_session': 2.9886937695453453, 'unique_articles': 301, 'clicks_by_article': 41.27574750830565, 'norm_pop_mean': 0.0033222591362126247, 'norm_pop_median': 0.00016097875080489375, 'gini_index': 0.8807024442319582, 'clicks_item_age_hours_mean': 16.0795254480576, 'clicks_item_age_hours_median': 12.135833333333334, 'clicks_item_age_hours_after_norm_mean': -0.06932207962218746, 'clicks_item_age_hours_

Stats :{'session_count': 966, 'clicks': 2624, 'clicks_by_session': 2.7163561076604554, 'unique_articles': 240, 'clicks_by_article': 10.933333333333334, 'norm_pop_mean': 0.004166666666666667, 'norm_pop_median': 0.00038109756097560977, 'gini_index': 0.7920731680151605, 'clicks_item_age_hours_mean': 21.61185509823847, 'clicks_item_age_hours_median': 6.225, 'clicks_item_age_hours_after_norm_mean': -0.018931747123858995, 'clicks_item_age_hours_after_norm_median': -0.02186170039386539}
193->193 # sessions: 966

Processing hour 194
Stats :{'session_count': 502, 'clicks': 1423, 'clicks_by_session': 2.8346613545816735, 'unique_articles': 210, 'clicks_by_article': 6.776190476190476, 'norm_pop_mean': 0.0047619047619047615, 'norm_pop_median': 0.0007027406886858749, 'gini_index': 0.7344476749518588, 'clicks_item_age_hours_mean': 22.146397868353244, 'clicks_item_age_hours_median': 9.178611111111111, 'clicks_item_age_hours_after_norm_mean': -0.044213354611718984, 'clicks_item_age_hours_after_norm_med

Processing hour 209
Stats :{'session_count': 4524, 'clicks': 11708, 'clicks_by_session': 2.587975243147657, 'unique_articles': 394, 'clicks_by_article': 29.715736040609137, 'norm_pop_mean': 0.0025380710659898475, 'norm_pop_median': 8.541168431841476e-05, 'gini_index': 0.9259528379211457, 'clicks_item_age_hours_mean': 14.42396753881487, 'clicks_item_age_hours_median': 3.1719444444444442, 'clicks_item_age_hours_after_norm_mean': -0.07362401921880471, 'clicks_item_age_hours_after_norm_median': -0.09057752839504007}
209->209 # sessions: 4524

Processing hour 210
Stats :{'session_count': 4480, 'clicks': 11737, 'clicks_by_session': 2.6198660714285715, 'unique_articles': 420, 'clicks_by_article': 27.945238095238096, 'norm_pop_mean': 0.0023809523809523807, 'norm_pop_median': 8.520064752492119e-05, 'gini_index': 0.9217450299590366, 'clicks_item_age_hours_mean': 15.121856238107448, 'clicks_item_age_hours_median': 4.060833333333333, 'clicks_item_age_hours_after_norm_mean': -0.0447908405403872, 'c

Stats :{'session_count': 3684, 'clicks': 9966, 'clicks_by_session': 2.705211726384365, 'unique_articles': 369, 'clicks_by_article': 27.008130081300813, 'norm_pop_mean': 0.0027100271002710027, 'norm_pop_median': 0.00010034115994380895, 'gini_index': 0.9023775677304767, 'clicks_item_age_hours_mean': 16.24111545922802, 'clicks_item_age_hours_median': 10.484166666666667, 'clicks_item_age_hours_after_norm_mean': -0.0480124184144884, 'clicks_item_age_hours_after_norm_median': -0.05524151077863753}
225->225 # sessions: 3684

Processing hour 226
Stats :{'session_count': 3536, 'clicks': 9578, 'clicks_by_session': 2.708710407239819, 'unique_articles': 403, 'clicks_by_article': 23.766749379652605, 'norm_pop_mean': 0.0024813895781637713, 'norm_pop_median': 0.00010440593025683859, 'gini_index': 0.9017755214114667, 'clicks_item_age_hours_mean': 16.86231299737833, 'clicks_item_age_hours_median': 11.049444444444445, 'clicks_item_age_hours_after_norm_mean': -0.05314032716922795, 'clicks_item_age_hours_

Stats :{'session_count': 1681, 'clicks': 4746, 'clicks_by_session': 2.823319452706722, 'unique_articles': 233, 'clicks_by_article': 20.369098712446352, 'norm_pop_mean': 0.004291845493562231, 'norm_pop_median': 0.00021070375052675939, 'gini_index': 0.8579097088224957, 'clicks_item_age_hours_mean': 12.64351576766402, 'clicks_item_age_hours_median': 4.4094444444444445, 'clicks_item_age_hours_after_norm_mean': -0.04729685882917052, 'clicks_item_age_hours_after_norm_median': -0.04543710804740583}
241->241 # sessions: 1681

Processing hour 242
Stats :{'session_count': 715, 'clicks': 2129, 'clicks_by_session': 2.9776223776223776, 'unique_articles': 184, 'clicks_by_article': 11.570652173913043, 'norm_pop_mean': 0.005434782608695651, 'norm_pop_median': 0.0004697040864255519, 'gini_index': 0.8078374186225676, 'clicks_item_age_hours_mean': 14.53434137049214, 'clicks_item_age_hours_median': 5.420833333333333, 'clicks_item_age_hours_after_norm_mean': -0.04088242564876927, 'clicks_item_age_hours_aft

Stats :{'session_count': 2364, 'clicks': 6187, 'clicks_by_session': 2.6171742808798646, 'unique_articles': 355, 'clicks_by_article': 17.428169014084506, 'norm_pop_mean': 0.0028169014084507035, 'norm_pop_median': 0.00016162922256343946, 'gini_index': 0.8722605537599634, 'clicks_item_age_hours_mean': 19.204110231129786, 'clicks_item_age_hours_median': 4.860277777777778, 'clicks_item_age_hours_after_norm_mean': -0.03990273901360985, 'clicks_item_age_hours_after_norm_median': -0.05056112548014824}
257->257 # sessions: 2364

Processing hour 258
Stats :{'session_count': 2011, 'clicks': 5217, 'clicks_by_session': 2.5942317255096965, 'unique_articles': 349, 'clicks_by_article': 14.948424068767908, 'norm_pop_mean': 0.0028653295128939827, 'norm_pop_median': 0.00019168104274487253, 'gini_index': 0.8568988404317825, 'clicks_item_age_hours_mean': 22.365286509914213, 'clicks_item_age_hours_median': 5.850833333333333, 'clicks_item_age_hours_after_norm_mean': -0.026420631456903012, 'clicks_item_age_ho

Stats :{'session_count': 3295, 'clicks': 9089, 'clicks_by_session': 2.758421851289833, 'unique_articles': 349, 'clicks_by_article': 26.04297994269341, 'norm_pop_mean': 0.0028653295128939827, 'norm_pop_median': 0.00011002310485201893, 'gini_index': 0.8943901130575285, 'clicks_item_age_hours_mean': 16.959533013043867, 'clicks_item_age_hours_median': 10.2125, 'clicks_item_age_hours_after_norm_mean': -0.043168656346001616, 'clicks_item_age_hours_after_norm_median': -0.04316655986766677}
273->273 # sessions: 3295

Processing hour 274
Stats :{'session_count': 2591, 'clicks': 7086, 'clicks_by_session': 2.734851408722501, 'unique_articles': 396, 'clicks_by_article': 17.893939393939394, 'norm_pop_mean': 0.0025252525252525255, 'norm_pop_median': 0.00014112334180073385, 'gini_index': 0.8774992357315674, 'clicks_item_age_hours_mean': 19.688155439520777, 'clicks_item_age_hours_median': 11.101388888888888, 'clicks_item_age_hours_after_norm_mean': -0.028864619585672716, 'clicks_item_age_hours_after_n


Processing hour 289
Stats :{'session_count': 2387, 'clicks': 7178, 'clicks_by_session': 3.007121910347717, 'unique_articles': 264, 'clicks_by_article': 27.189393939393938, 'norm_pop_mean': 0.003787878787878787, 'norm_pop_median': 0.00013931457230426304, 'gini_index': 0.8910000664928838, 'clicks_item_age_hours_mean': 11.165137495743185, 'clicks_item_age_hours_median': 3.3779166666666667, 'clicks_item_age_hours_after_norm_mean': -0.03266579061239604, 'clicks_item_age_hours_after_norm_median': -0.05577812742278858}
289->289 # sessions: 2387

Processing hour 290
Stats :{'session_count': 983, 'clicks': 2992, 'clicks_by_session': 3.0437436419125126, 'unique_articles': 148, 'clicks_by_article': 20.216216216216218, 'norm_pop_mean': 0.006756756756756757, 'norm_pop_median': 0.00033422459893048126, 'gini_index': 0.8330141624247607, 'clicks_item_age_hours_mean': 10.686728776737974, 'clicks_item_age_hours_median': 4.376250000000001, 'clicks_item_age_hours_after_norm_mean': -0.023471559458327282, '

Stats :{'session_count': 2780, 'clicks': 7243, 'clicks_by_session': 2.6053956834532372, 'unique_articles': 302, 'clicks_by_article': 23.983443708609272, 'norm_pop_mean': 0.003311258278145695, 'norm_pop_median': 0.00013806433798149937, 'gini_index': 0.8892106822048207, 'clicks_item_age_hours_mean': 19.736327680365687, 'clicks_item_age_hours_median': 3.917777777777778, 'clicks_item_age_hours_after_norm_mean': -0.04884225190370994, 'clicks_item_age_hours_after_norm_median': -0.06740787997841598}
305->305 # sessions: 2780

Processing hour 306
Stats :{'session_count': 3380, 'clicks': 9073, 'clicks_by_session': 2.684319526627219, 'unique_articles': 327, 'clicks_by_article': 27.74617737003058, 'norm_pop_mean': 0.0030581039755351682, 'norm_pop_median': 0.00011021712774165106, 'gini_index': 0.9031252106629587, 'clicks_item_age_hours_mean': 15.782064764808913, 'clicks_item_age_hours_median': 4.012222222222222, 'clicks_item_age_hours_after_norm_mean': -0.06974353565527709, 'clicks_item_age_hours_

Stats :{'session_count': 3553, 'clicks': 10057, 'clicks_by_session': 2.8305657191106106, 'unique_articles': 329, 'clicks_by_article': 30.56838905775076, 'norm_pop_mean': 0.00303951367781155, 'norm_pop_median': 9.943323058566172e-05, 'gini_index': 0.9079057870976107, 'clicks_item_age_hours_mean': 18.76564678554458, 'clicks_item_age_hours_median': 10.304166666666667, 'clicks_item_age_hours_after_norm_mean': -0.056511357859432905, 'clicks_item_age_hours_after_norm_median': -0.05592436534400107}
321->321 # sessions: 3553

Processing hour 322
Stats :{'session_count': 4947, 'clicks': 13740, 'clicks_by_session': 2.777440873256519, 'unique_articles': 348, 'clicks_by_article': 39.48275862068966, 'norm_pop_mean': 0.0028735632183908046, 'norm_pop_median': 0.0001455604075691412, 'gini_index': 0.9135981855963641, 'clicks_item_age_hours_mean': 19.717571344816406, 'clicks_item_age_hours_median': 10.675555555555555, 'clicks_item_age_hours_after_norm_mean': -0.06456310664978908, 'clicks_item_age_hours_

Stats :{'session_count': 1823, 'clicks': 4797, 'clicks_by_session': 2.6313768513439384, 'unique_articles': 293, 'clicks_by_article': 16.372013651877133, 'norm_pop_mean': 0.0034129692832764505, 'norm_pop_median': 0.00020846362309776944, 'gini_index': 0.859904617204587, 'clicks_item_age_hours_mean': 21.47781362193968, 'clicks_item_age_hours_median': 4.958888888888889, 'clicks_item_age_hours_after_norm_mean': -0.03131131635218886, 'clicks_item_age_hours_after_norm_median': -0.04473180265169329}
337->337 # sessions: 1823

Processing hour 338
Stats :{'session_count': 1139, 'clicks': 3046, 'clicks_by_session': 2.6742756804214225, 'unique_articles': 244, 'clicks_by_article': 12.48360655737705, 'norm_pop_mean': 0.004098360655737705, 'norm_pop_median': 0.00032829940906106366, 'gini_index': 0.8336598364006732, 'clicks_item_age_hours_mean': 23.898726927847086, 'clicks_item_age_hours_median': 6.327500000000001, 'clicks_item_age_hours_after_norm_mean': -0.026771735759081636, 'clicks_item_age_hours_

Stats :{'session_count': 1795, 'clicks': 4638, 'clicks_by_session': 2.583844011142061, 'unique_articles': 308, 'clicks_by_article': 15.058441558441558, 'norm_pop_mean': 0.003246753246753247, 'norm_pop_median': 0.00021561017680034498, 'gini_index': 0.8612100471793528, 'clicks_item_age_hours_mean': 28.480881725839645, 'clicks_item_age_hours_median': 5.546805555555556, 'clicks_item_age_hours_after_norm_mean': -0.04432683789863099, 'clicks_item_age_hours_after_norm_median': -0.0494787243615987}
353->353 # sessions: 1795

Processing hour 354
Stats :{'session_count': 2032, 'clicks': 5384, 'clicks_by_session': 2.6496062992125986, 'unique_articles': 378, 'clicks_by_article': 14.243386243386244, 'norm_pop_mean': 0.0026455026455026454, 'norm_pop_median': 0.00018573551263001485, 'gini_index': 0.8610600073032009, 'clicks_item_age_hours_mean': 26.838101628281276, 'clicks_item_age_hours_median': 2.946388888888889, 'clicks_item_age_hours_after_norm_mean': -0.04513228113218045, 'clicks_item_age_hours_

Stats :{'session_count': 2896, 'clicks': 8212, 'clicks_by_session': 2.835635359116022, 'unique_articles': 356, 'clicks_by_article': 23.06741573033708, 'norm_pop_mean': 0.0028089887640449437, 'norm_pop_median': 0.00012177301509985388, 'gini_index': 0.8784062225585003, 'clicks_item_age_hours_mean': 23.397954145694694, 'clicks_item_age_hours_median': 11.850000000000001, 'clicks_item_age_hours_after_norm_mean': -0.0275407535096081, 'clicks_item_age_hours_after_norm_median': -0.029496794728110398}
369->369 # sessions: 2896

Processing hour 370
Stats :{'session_count': 2779, 'clicks': 7839, 'clicks_by_session': 2.820798848506657, 'unique_articles': 362, 'clicks_by_article': 21.654696132596683, 'norm_pop_mean': 0.0027624309392265192, 'norm_pop_median': 0.00012756729174639623, 'gini_index': 0.8719411147031553, 'clicks_item_age_hours_mean': 24.45213193292793, 'clicks_item_age_hours_median': 12.453888888888889, 'clicks_item_age_hours_after_norm_mean': -0.0267548721224145, 'clicks_item_age_hours_




### Checking outputs

In [165]:
train_df = pd.read_parquet(os.path.join(OUTPUT_PATH, '0001/train.parquet'))
len(train_df)

2239

In [166]:
train_df.columns

Index(['session_id', 'session_size', 'session_start', 'user_id',
       '_elapsed_ms_since_last_click', 'active_time_secs', 'article_id',
       'city', 'country', 'device', 'os', 'referrer_class', 'region',
       'timestamp', 'url', 'category0_encoded', 'category1_encoded',
       'author_encoded', 'item_age_hours', 'hour_sin', 'hour_cos',
       'weekday_sin', 'weekday_cos', 'item_age_hours_norm'],
      dtype='object')

In [171]:
train_df.head()

Unnamed: 0,session_id,session_size,session_start,user_id,_elapsed_ms_since_last_click,active_time_secs,article_id,city,country,device,...,url,category0_encoded,category1_encoded,author_encoded,item_age_hours,hour_sin,hour_cos,weekday_sin,weekday_cos,item_age_hours_norm
0,148322520900340,2,1483225209000,cx:221xclq7aa1yx2slhf0xq7svbf:l1m6d375szc,"[-0.575991661295168, -0.5052628777427455]",[92],"[45691, 11117]","[3, 3]","[2, 2]","[4, 4]",...,[http://www.adressa.no/nyheter/2016/12/31/Se-l...,"[2, 2]","[1, 2]","[4, 47]","[0.0, 0.0]","[0.9659258940477027, 0.9659258940477027]","[0.25881879222372195, 0.25881879222372195]","[8.975979006501141e-07, 8.975979006501141e-07]","[0.9999999999995972, 0.9999999999995972]","[-1.083718732685902, -1.083718732685902]"
1,148322521000195,2,1483225210000,cx:ikr4x9pbm4bgz3kh:tw6xssyxkj15,"[-0.575991661295168, -0.5358877118582274]",,"[45691, 1763]","[3, 3]","[2, 2]","[4, 4]",...,[http://www.adressa.no/nyheter/2016/12/31/Se-l...,"[2, 5]","[1, 53]","[4, 60]","[0.0002777777777777778, 0.0]","[0.9659258940477027, 0.9659258940477027]","[0.25881879222372195, 0.25881879222372195]","[8.975979006501141e-07, 8.975979006501141e-07]","[0.9999999999995972, 0.9999999999995972]","[-1.0817379171796213, -1.083718732685902]"
2,148322523500284,2,1483225235000,cx:insyehp3a0zmucrw:3iuy03ssah3gs,"[-0.575991661295168, -0.3915134938852414]",[47],"[11117, 45691]","[2, 2]","[2, 2]","[2, 2]",...,[http://www.adressa.no/nyheter/trondheim/2016/...,"[2, 2]","[2, 1]","[47, 4]","[-0.01972222222222222, 0.0775]","[0.9659258940477027, 0.9659258940477027]","[0.25881879222372195, 0.25881879222372195]","[8.975979006501141e-07, 8.975979006501141e-07]","[0.9999999999995972, 0.9999999999995972]","[-1.1209269983636971, -0.8643317194776033]"
3,148322523600331,2,1483225236000,cx:101hv49lafias30hxmt0tt8am0:3qw2ysixhcbwy,"[-0.575991661295168, -0.34411791727794805]","[17, 2]","[7090, 44743]","[644, 644]","[3, 3]","[3, 3]",...,[http://www.adressa.no/kamera/article7186931.e...,"[27, 27]","[1, 1]","[100, 100]","[0.0, 0.0]","[0.9659258940477027, 0.9659258940477027]","[0.25881879222372195, 0.25881879222372195]","[8.975979006501141e-07, 8.975979006501141e-07]","[0.9999999999995972, 0.9999999999995972]","[-1.083718732685902, -1.083718732685902]"
4,148322523900427,2,1483225239000,cx:i946i2jhoixihtyu:1ad4eauxl3fq7,"[-0.575991661295168, -0.4425548840777112]","[35, 139]","[45691, 30346]","[3, 3]","[2, 2]","[2, 2]",...,[http://www.adressa.no/nyheter/2016/12/31/Se-l...,"[2, 5]","[1, 6]","[4, 31]","[0.008333333333333333, 0.0]","[0.9659258940477027, 0.9659258940477027]","[0.25881879222372195, 0.25881879222372195]","[8.975979006501141e-07, 8.975979006501141e-07]","[0.9999999999995972, 0.9999999999995972]","[-1.0529935942684951, -1.083718732685902]"


In [167]:
valid_df = pd.read_parquet(os.path.join(OUTPUT_PATH, '0001/valid.parquet'))
len(valid_df)

1119

In [168]:
test_df = pd.read_parquet(os.path.join(OUTPUT_PATH, '0001/test.parquet'))
len(test_df)

1120

In [169]:
assert not (valid_df['session_id'].isin(test_df['session_id'])).any()