In [1]:
import os
import json
import pandas as pd
import scipy as sp
from sklearn.preprocessing import LabelEncoder 
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import numpy as np
import datetime
import pickle

In [2]:
def temp():
    paths = []
    dirs = os.listdir('train_data')
    for dir in dirs:
        filenames = os.listdir(f'train_data/{dir}')
        paths += [f'train_data/{dir}/{filename}' for filename in filenames]
    paths = sorted(paths)
    return paths
data_filepaths = temp()

# String data encoding

In [None]:
if not os.path.exists('./normalized_data'):
    os.mkdir('./normalized_data')
if not os.path.exists('./label_encoders'):
    os.mkdir('./label_encoders')

In [46]:
def encode_categoric_string_feature(feature_name):
    unique_ids = []
    for filepath in tqdm(data_filepaths, desc=feature_name.replace('_', ' ').capitalize()):
        df = pd.read_csv(filepath, usecols=[feature_name])
        df_unique_ids = df[feature_name].unique()
        del df
        unique_ids = np.unique(np.concatenate([unique_ids, df_unique_ids]))
        del df_unique_ids
    unique_ids_df = pd.DataFrame({
        feature_name: unique_ids
    })
    unique_ids_df.to_csv(f'./normalized_data/{feature_name}.csv', index=False)
    label_encoder = LabelEncoder()
    label_encoder.fit(unique_ids)
    with open(f'./label_encoders/{feature_name}.pickle', 'wb') as handle:
        pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

def encode_categoric_string_feature_pandas(feature_name):
    unique_ids = None
    for filepath in tqdm(data_filepaths, desc=feature_name.replace('_', ' ').capitalize()):
        df = pd.read_csv(filepath, usecols=[feature_name])
        df_unique_ids = pd.Series(df[feature_name].unique())
        del df
        if unique_ids is None:
            unique_ids = df_unique_ids
        else:
            unique_ids = pd.Series(pd.concat([unique_ids, df_unique_ids]).unique())
        del df_unique_ids
    unique_ids_df = pd.DataFrame({
        feature_name: unique_ids
    })
    unique_ids_df.to_csv(f'./normalized_data/{feature_name}.csv', index=False)
    label_encoder = LabelEncoder()
    label_encoder.fit(unique_ids)
    with open(f'./label_encoders/{feature_name}.pickle', 'wb') as handle:
        pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

features_string_categories = [
    'user_id_hash',
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
]
for feature in features_string_categories:
    encode_categoric_string_feature(feature)

encode_categoric_string_feature_pandas('country_code')
encode_categoric_string_feature_pandas('region')

Country code:   0%|          | 0/69 [00:00<?, ?it/s]

Region:   0%|          | 0/69 [00:00<?, ?it/s]

In [63]:
def normalize_file(encoders, filepath):
    main_dir = os.path.join('.', 'normalized_train_data')
    if not os.path.exists(main_dir):
        os.mkdir(main_dir)
    
    df = pd.read_csv(filepath)

    for column, encoder in tqdm(encoders, desc=filepath):
        df[column] = encoder.transform(df[column].to_list())
    
    directory_path, filename = os.path.split(filepath)
    _, parent_dir = os.path.split(directory_path)
    parent_dir = os.path.join('.', 'normalized_train_data', parent_dir)
    if not os.path.exists(parent_dir):
        os.mkdir(parent_dir)
    df.to_csv(os.path.join(parent_dir, filename))

def load_encoders(features):
    column_encoders = []
    for feature in features:
        with open(f'./label_encoders/{feature}.pickle', 'rb') as handle:
            encoder = pickle.load(handle)
        column_encoders.append((feature, encoder))
    return column_encoders

features_string_categories = [
    'user_id_hash',
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'country_code',
    'region',
]
column_encoders = load_encoders(features_string_categories)
for filepaths in data_filepaths[10:]:
    normalize_file(column_encoders, filepaths)

train_data/train1/part-00010.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00011.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00012.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00013.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00014.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00015.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00016.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00017.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00018.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00019.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00020.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00021.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00022.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00023.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00024.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00025.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00026.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00027.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00028.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00029.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00030.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00031.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00032.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00033.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00034.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00035.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00036.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00037.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00038.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00039.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00040.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00041.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00042.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00043.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00044.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00045.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00046.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00047.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00048.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00049.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00050.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00051.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00052.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00053.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00054.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00055.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00056.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00057.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00058.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00059.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00060.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00061.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00062.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00063.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00064.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00065.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00066.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00067.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00068.csv:   0%|          | 0/12 [00:00<?, ?it/s]

In [22]:
df_submission = pd.read_csv('./submission_example.csv')
df_test = pd.read_csv('./test_file.csv')

# Test set deviations from training data

In [24]:

def report_cold():
    category_features = [
        'target_id_hash',
        'syndicator_id_hash',
        'campaign_id_hash',
        'target_item_taxonomy',
        'placement_id_hash',
        'publisher_id_hash',
        'source_id_hash',
        'source_item_type',
        'browser_platform',
        'country_code',
        'region',
    ]
    for feature in category_features:
        train_data_unique_vals = pd.read_csv(f'./normalized_data/{feature}.csv')[feature]
        f = pd.Series(df_test[feature].unique())
        cold_count = (~f.isin(train_data_unique_vals)).sum()
        print(f'new {feature}', f'{cold_count} / {len(f.index)}')
report_cold()

new target_id_hash 1941 / 15303
new syndicator_id_hash 37 / 1604
new campaign_id_hash 659 / 7517
new target_item_taxonomy 0 / 61
new placement_id_hash 1 / 1028
new publisher_id_hash 0 / 3
new source_id_hash 7963 / 16956
new source_item_type 0 / 5
new browser_platform 0 / 4
new country_code 0 / 213
new region 2 / 942


# EDA

In [2]:
from pprint import pprint
def read_all_data():
    paths = []
    dirs = os.listdir('normalized_train_data')
    for dir in dirs:
        filenames = os.listdir(f'normalized_train_data/{dir}')
        paths += [f'normalized_train_data/{dir}/{filename}' for filename in filenames]
    paths = sorted(paths)
    dfs = []
    for filepath in tqdm(paths, desc='Loading data'):
        df = pd.read_csv(filepath)
        df.drop(columns=['Unnamed: 0'], inplace=True)
        dfs.append(df)
    dfs = pd.concat(dfs)
    return dfs
data = read_all_data()

  0%|          | 0/69 [00:00<?, ?it/s]

In [8]:
max_timestamp = data['page_view_start_time'].max()
three_days_ago = (datetime.datetime.fromtimestamp(max_timestamp/1000) - datetime.timedelta(days=3)).replace(hour=0, minute=0, second=0, microsecond=0)
three_days_ago = int(datetime.datetime.timestamp(three_days_ago) * 1000)

In [10]:
last_three_days = data[data['page_view_start_time'] >= three_days_ago]
train_set = data[data['page_view_start_time'] < three_days_ago]

In [11]:
not_cold_users_mask = last_three_days['user_id_hash'].isin(train_set['user_id_hash'])

In [14]:
last_three_days_hot_users = last_three_days[not_cold_users_mask]
last_three_days_cold_users = last_three_days[~not_cold_users_mask]

In [19]:
features = [
    'page_view_start_time', 'user_id_hash', 'target_id_hash',
    'syndicator_id_hash', 'campaign_id_hash', 'empiric_calibrated_recs',
    'empiric_clicks', 'target_item_taxonomy', 'placement_id_hash',
    'user_recs', 'user_clicks', 'user_target_recs', 'publisher_id_hash',
    'source_id_hash', 'source_item_type', 'browser_platform', 'os_family',
    'country_code', 'region', 'day_of_week', 'time_of_day', 'gmt_offset'
]

In [20]:
last_three_days_hot_users_features = last_three_days_hot_users[features]
last_three_days_hot_users_ground_truth = last_three_days_hot_users['is_click']
last_three_days_cold_users_features = last_three_days_cold_users[features]
last_three_days_cold_users_ground_truth = last_three_days_cold_users['is_click']

In [21]:
last_three_days_hot_users_features.head()

Unnamed: 0,page_view_start_time,user_id_hash,target_id_hash,syndicator_id_hash,campaign_id_hash,empiric_calibrated_recs,empiric_clicks,target_item_taxonomy,placement_id_hash,user_recs,...,publisher_id_hash,source_id_hash,source_item_type,browser_platform,os_family,country_code,region,day_of_week,time_of_day,gmt_offset
4,1587072791703,11463395,79556,527,14201,4976.148,1974.0,1,616,825.0,...,0,97436,3,0,6,388,1008,4,15,-600
15,1587152706971,7290697,77904,694,3413,346.61606,1761.0,8,360,417.0,...,2,206755,1,0,6,388,1531,5,15,-400
19,1587136150512,176706,94933,81,15872,11604.184,8411.0,29,387,161.0,...,0,158778,3,0,6,388,1553,5,11,-400
23,1587149454878,13089853,9127,2291,12917,5518.3125,7329.0,1,122,516.0,...,2,206755,1,0,6,388,1177,5,8,-1000
31,1587257696247,9961237,4971,1131,22051,974.8269,3868.0,12,814,549.0,...,1,78840,1,0,6,388,1229,6,19,-500


In [30]:
datetime.datetime.fromtimestamp(1587072791703/1000)

datetime.datetime(2020, 4, 17, 0, 33, 11, 703000)

In [37]:
sorted(gmt / 100)

[-11.0,
 -10.0,
 -9.0,
 -8.0,
 -7.0,
 -6.0,
 -5.0,
 -4.0,
 -3.0,
 -2.3,
 -2.0,
 -1.0,
 -0.01,
 0.0,
 1.0,
 2.0,
 3.0,
 3.3,
 4.0,
 4.3,
 5.0,
 5.3,
 5.45,
 6.0,
 6.3,
 7.0,
 8.0,
 9.0,
 9.3,
 10.0,
 10.3,
 11.0,
 12.0,
 13.0,
 14.0,
 99.99]

In [48]:
df_test['page_view_start_time'].min(), df_test['page_view_start_time'].max()

(1587340800179, 1587443159283)

In [49]:
datetime.datetime.fromtimestamp(1587340800179/1000).strftime('%Y-%m-%d %H:%M:%S'), datetime.datetime.fromtimestamp(1587443159283/1000).strftime('%Y-%m-%d %H:%M:%S')

('2020-04-20 03:00:00', '2020-04-21 07:25:59')

In [50]:
data['page_view_start_time'].min(), data['page_view_start_time'].max()

(1586131200051, 1587350802882)

In [51]:
datetime.datetime.fromtimestamp(1586131200051/1000).strftime('%Y-%m-%d %H:%M:%S'), datetime.datetime.fromtimestamp(1587350802882/1000).strftime('%Y-%m-%d %H:%M:%S')

('2020-04-06 03:00:00', '2020-04-20 05:46:42')