In [1]:
import os
import json
import pandas as pd
import scipy as sp
from sklearn.preprocessing import LabelEncoder 
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import numpy as np
import datetime
import pickle

In [2]:
def temp():
    paths = []
    dirs = os.listdir('train_data')
    for dir in dirs:
        filenames = os.listdir(f'train_data/{dir}')
        paths += [f'train_data/{dir}/{filename}' for filename in filenames]
    paths = sorted(paths)
    return paths
data_filepaths = temp()

# String data encoding

In [None]:
if not os.path.exists('./normalized_data'):
    os.mkdir('./normalized_data')
if not os.path.exists('./label_encoders'):
    os.mkdir('./label_encoders')

In [46]:
def encode_categoric_string_feature(feature_name):
    unique_ids = []
    for filepath in tqdm(data_filepaths, desc=feature_name.replace('_', ' ').capitalize()):
        df = pd.read_csv(filepath, usecols=[feature_name])
        df_unique_ids = df[feature_name].unique()
        del df
        unique_ids = np.unique(np.concatenate([unique_ids, df_unique_ids]))
        del df_unique_ids
    unique_ids_df = pd.DataFrame({
        feature_name: unique_ids
    })
    unique_ids_df.to_csv(f'./normalized_data/{feature_name}.csv', index=False)
    label_encoder = LabelEncoder()
    label_encoder.fit(unique_ids)
    with open(f'./label_encoders/{feature_name}.pickle', 'wb') as handle:
        pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

def encode_categoric_string_feature_pandas(feature_name):
    unique_ids = None
    for filepath in tqdm(data_filepaths, desc=feature_name.replace('_', ' ').capitalize()):
        df = pd.read_csv(filepath, usecols=[feature_name])
        df_unique_ids = pd.Series(df[feature_name].unique())
        del df
        if unique_ids is None:
            unique_ids = df_unique_ids
        else:
            unique_ids = pd.Series(pd.concat([unique_ids, df_unique_ids]).unique())
        del df_unique_ids
    unique_ids_df = pd.DataFrame({
        feature_name: unique_ids
    })
    unique_ids_df.to_csv(f'./normalized_data/{feature_name}.csv', index=False)
    label_encoder = LabelEncoder()
    label_encoder.fit(unique_ids)
    with open(f'./label_encoders/{feature_name}.pickle', 'wb') as handle:
        pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

features_string_categories = [
    'user_id_hash',
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
]
for feature in features_string_categories:
    encode_categoric_string_feature(feature)

encode_categoric_string_feature_pandas('country_code')
encode_categoric_string_feature_pandas('region')

Country code:   0%|          | 0/69 [00:00<?, ?it/s]

Region:   0%|          | 0/69 [00:00<?, ?it/s]

In [63]:
def normalize_file(encoders, filepath):
    main_dir = os.path.join('.', 'normalized_train_data')
    if not os.path.exists(main_dir):
        os.mkdir(main_dir)
    
    df = pd.read_csv(filepath)

    for column, encoder in tqdm(encoders, desc=filepath):
        df[column] = encoder.transform(df[column].to_list())
    
    directory_path, filename = os.path.split(filepath)
    _, parent_dir = os.path.split(directory_path)
    parent_dir = os.path.join('.', 'normalized_train_data', parent_dir)
    if not os.path.exists(parent_dir):
        os.mkdir(parent_dir)
    df.to_csv(os.path.join(parent_dir, filename))

def load_encoders(features):
    column_encoders = []
    for feature in features:
        with open(f'./label_encoders/{feature}.pickle', 'rb') as handle:
            encoder = pickle.load(handle)
        column_encoders.append((feature, encoder))
    return column_encoders

features_string_categories = [
    'user_id_hash',
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'country_code',
    'region',
]
column_encoders = load_encoders(features_string_categories)
for filepaths in data_filepaths[10:]:
    normalize_file(column_encoders, filepaths)

train_data/train1/part-00010.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00011.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00012.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00013.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00014.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00015.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00016.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00017.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00018.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00019.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00020.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00021.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00022.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00023.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00024.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00025.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00026.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00027.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00028.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00029.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00030.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00031.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00032.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00033.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00034.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00035.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00036.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00037.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00038.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00039.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00040.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00041.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00042.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00043.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00044.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00045.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00046.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00047.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00048.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00049.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00050.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00051.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00052.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00053.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00054.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00055.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00056.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00057.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00058.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00059.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00060.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00061.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00062.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00063.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00064.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00065.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00066.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00067.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00068.csv:   0%|          | 0/12 [00:00<?, ?it/s]

In [21]:
df_submission = pd.read_csv('./submission_example.csv')
df_test = pd.read_csv('./test_file.csv')

# Test set deviations from training data

In [24]:

def report_cold():
    category_features = [
        'target_id_hash',
        'syndicator_id_hash',
        'campaign_id_hash',
        'target_item_taxonomy',
        'placement_id_hash',
        'publisher_id_hash',
        'source_id_hash',
        'source_item_type',
        'browser_platform',
        'country_code',
        'region',
    ]
    for feature in category_features:
        train_data_unique_vals = pd.read_csv(f'./normalized_data/{feature}.csv')[feature]
        f = pd.Series(df_test[feature].unique())
        cold_count = (~f.isin(train_data_unique_vals)).sum()
        print(f'new {feature}', f'{cold_count} / {len(f.index)}')
report_cold()

new target_id_hash 1941 / 15303
new syndicator_id_hash 37 / 1604
new campaign_id_hash 659 / 7517
new target_item_taxonomy 0 / 61
new placement_id_hash 1 / 1028
new publisher_id_hash 0 / 3
new source_id_hash 7963 / 16956
new source_item_type 0 / 5
new browser_platform 0 / 4
new country_code 0 / 213
new region 2 / 942


# EDA

In [2]:
from pprint import pprint
def read_all_data():
    paths = []
    dirs = os.listdir('normalized_train_data')
    for dir in dirs:
        filenames = os.listdir(f'normalized_train_data/{dir}')
        paths += [f'normalized_train_data/{dir}/{filename}' for filename in filenames]
    paths = sorted(paths)
    dfs = []
    for filepath in tqdm(paths, desc='Loading data'):
        df = pd.read_csv(filepath)
        df.drop(columns=['Unnamed: 0'], inplace=True)
        dfs.append(df)
    dfs = pd.concat(dfs)
    return dfs
data = read_all_data()

Loading data:   0%|          | 0/69 [00:00<?, ?it/s]

In [9]:
mem = data.memory_usage(index=False, deep=True)

In [11]:
mem.sum()

5566893872

In [15]:
len([n for n in os.listdir('./normalized_data') if 'hash' in n])

7

In [7]:
print(pd.read_csv('./train_data/train1/part-00000.csv').head(10).to_string())

   page_view_start_time                                                                                                                      user_id_hash                                                                                                                    target_id_hash                                                                                                                syndicator_id_hash                                                                                                                  campaign_id_hash  empiric_calibrated_recs  empiric_clicks target_item_taxonomy                                                                                                                 placement_id_hash  user_recs  user_clicks  user_target_recs                                                                                                                 publisher_id_hash                                                                                             

In [8]:
len('939e2ccfb21e0ec9b0787c394dea1ae935369da0bcc5978b705f2969b02f42e1dd017c17ff4affd6c8f26e5e4e87c4c943bbbb72dc58b6c69f82cfde529bf3f9')

128

In [4]:
len(data.index)

30254858

In [3]:
datetime.datetime.fromtimestamp(data['page_view_start_time'].max()/1000), datetime.datetime.fromtimestamp(data['page_view_start_time'].min()/1000)

(datetime.datetime(2020, 4, 20, 5, 46, 42, 882000),
 datetime.datetime(2020, 4, 6, 3, 0, 0, 51000))

In [16]:
max_timestamp = data['page_view_start_time'].max()
three_days_ago = (datetime.datetime.fromtimestamp(max_timestamp/1000) - datetime.timedelta(days=3)).replace(hour=0, minute=0, second=0, microsecond=0)
three_days_ago = int(datetime.datetime.timestamp(three_days_ago) * 1000)

In [17]:
last_three_days = data[data['page_view_start_time'] >= three_days_ago]
train_set = data[data['page_view_start_time'] < three_days_ago]

In [6]:
not_cold_users_mask = last_three_days['user_id_hash'].isin(train_set['user_id_hash'])

In [7]:
last_three_days_hot_users = last_three_days[not_cold_users_mask]
last_three_days_cold_users = last_three_days[~not_cold_users_mask]

In [18]:
features = [
    'page_view_start_time', 'user_id_hash', 'target_id_hash',
    'syndicator_id_hash', 'campaign_id_hash', 'empiric_calibrated_recs',
    'empiric_clicks', 'target_item_taxonomy', 'placement_id_hash',
    'user_recs', 'user_clicks', 'user_target_recs', 'publisher_id_hash',
    'source_id_hash', 'source_item_type', 'browser_platform', 'os_family',
    'country_code', 'region', 'day_of_week', 'time_of_day', 'gmt_offset'
]

In [9]:
last_three_days_hot_users_features = last_three_days_hot_users[features]
last_three_days_hot_users_ground_truth = last_three_days_hot_users['is_click']
last_three_days_cold_users_features = last_three_days_cold_users[features]
last_three_days_cold_users_ground_truth = last_three_days_cold_users['is_click']

In [10]:
last_three_days_hot_users_features.head()

Unnamed: 0,page_view_start_time,user_id_hash,target_id_hash,syndicator_id_hash,campaign_id_hash,empiric_calibrated_recs,empiric_clicks,target_item_taxonomy,placement_id_hash,user_recs,...,publisher_id_hash,source_id_hash,source_item_type,browser_platform,os_family,country_code,region,day_of_week,time_of_day,gmt_offset
4,1587072791703,11463395,79556,527,14201,4976.148,1974.0,1,616,825.0,...,0,97436,3,0,6,388,1008,4,15,-600
15,1587152706971,7290697,77904,694,3413,346.61606,1761.0,8,360,417.0,...,2,206755,1,0,6,388,1531,5,15,-400
19,1587136150512,176706,94933,81,15872,11604.184,8411.0,29,387,161.0,...,0,158778,3,0,6,388,1553,5,11,-400
23,1587149454878,13089853,9127,2291,12917,5518.3125,7329.0,1,122,516.0,...,2,206755,1,0,6,388,1177,5,8,-1000
31,1587257696247,9961237,4971,1131,22051,974.8269,3868.0,12,814,549.0,...,1,78840,1,0,6,388,1229,6,19,-500


In [11]:
datetime.datetime.fromtimestamp(1587340800179/1000).strftime('%Y-%m-%d %H:%M:%S'), datetime.datetime.fromtimestamp(1587443159283/1000).strftime('%Y-%m-%d %H:%M:%S')

('2020-04-20 03:00:00', '2020-04-21 07:25:59')

In [12]:
datetime.datetime.fromtimestamp(1586131200051/1000).strftime('%Y-%m-%d %H:%M:%S'), datetime.datetime.fromtimestamp(1587350802882/1000).strftime('%Y-%m-%d %H:%M:%S')

('2020-04-06 03:00:00', '2020-04-20 05:46:42')

In [19]:
def load_encoders(features):
    column_encoders = []
    for feature in features:
        with open(f'./label_encoders/{feature}.pickle', 'rb') as handle:
            encoder = pickle.load(handle)
        column_encoders.append((feature, encoder))
    return column_encoders

features_string_categories = [
    'user_id_hash',
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'country_code',
    'region',
]
column_encoders = load_encoders(features_string_categories)

In [20]:
def reconstruct():
    df = last_three_days.copy(deep=True)
    for column, encoder in tqdm(column_encoders):
        df[column] = encoder.inverse_transform(df[column].to_list())
    return df
last_three_days_recon = reconstruct()
last_three_days_recon['country_code'].fillna('Null', inplace=True)
last_three_days_recon['region'].fillna('Null', inplace=True)

  0%|          | 0/12 [00:00<?, ?it/s]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  last_three_days_recon['country_code'].fillna('Null', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  last_three_days_recon['region'].fillna('Null', inplace=True)


In [23]:
[c for c in last_three_days_recon['region'].unique() if c.startswith('U')]

['UT',
 'UNKNOWN',
 'U',
 'UP',
 'UN',
 'US',
 'UD',
 'UCA',
 'UR',
 'UQ',
 'UIG',
 'UE',
 'ULY']

In [235]:
from itertools import combinations
def categorize_time(dt):
    hour = dt.hour
    if 5 <= hour < 11:
        return "morning"
    elif 11 <= hour < 13:
        return "midday"
    elif 13 <= hour < 18:
        return "afternoon"
    elif 18 <= hour < 24:
        return "evening"
    else:  # Hour is between 00:00 and 05:00
        return "night"
    
def format_time_offset(t):
    if len(t) == 1:
        return '+00:00:00'
    elif len(t) == 2: 
        return f'{t}:00:00'
    else:
        return f'{t[:-2]}:{t[-2:]}:00'

def apply_gmt_offset():
    gmt_offsets = last_three_days_recon['gmt_offset'].replace(9999, 0).astype(str).apply(format_time_offset)
    dt_series = pd.to_datetime(last_three_days_recon['page_view_start_time'], unit='us')
    gmt_offsets = pd.to_timedelta(gmt_offsets)
    dt_with_offset = dt_series + gmt_offsets
    last_three_days_recon['local_day_part'] = dt_with_offset.apply(categorize_time)

def add_feature_ctr(feature_1):
    """
    reference:
    https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8
    """
    latest_views = last_three_days.iloc[last_three_days.groupby('target_id_hash')['page_view_start_time'].idxmax()]
    latest_views = last_three_days.drop_duplicates(subset='target_id_hash', keep='first')
    stats = latest_views.groupby(f'{feature_1}_id_hash').agg({'empiric_calibrated_recs': 'sum', 'target_id_hash': 'count'})
    average_views_of_distinct_ad = np.log2(stats['empiric_calibrated_recs'].sum() / stats['target_id_hash'].sum())
    average_views_of_distict_ad_in_feature = np.log2(1 + (stats['empiric_calibrated_recs'] / stats['target_id_hash']))
    stats[f'{feature_1}_ctr'] = np.minimum((average_views_of_distict_ad_in_feature / average_views_of_distinct_ad), 1)
    stats = stats.reset_index()[[f'{feature_1}_id_hash', f'{feature_1}_ctr']]
    temp_df = last_three_days[[f'{feature_1}_id_hash']].merge(stats, on=f'{feature_1}_id_hash', how='left')
    last_three_days_recon[f'{feature_1}_ctr'] = temp_df[f'{feature_1}_ctr']

def add_pair_feature_ctr(feature_1, feature_2):
    """
    reference:
    https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8
    """
    latest_views = last_three_days.iloc[last_three_days.groupby('target_id_hash')['page_view_start_time'].idxmax()]
    latest_views = last_three_days.drop_duplicates(subset='target_id_hash', keep='first')
    stats = latest_views.groupby([f'{feature_1}_id_hash', f'{feature_2}_id_hash']).agg({'empiric_calibrated_recs': 'sum', 'target_id_hash': 'count'})
    average_views_of_distinct_ad = np.log2(stats['empiric_calibrated_recs'].sum() / stats['target_id_hash'].sum())
    average_views_of_distict_ad_in_feature = np.log2(1 + (stats['empiric_calibrated_recs'] / stats['target_id_hash']))
    stats[f'{feature_1}_{feature_2}_ctr'] = np.minimum((average_views_of_distict_ad_in_feature / average_views_of_distinct_ad), 1)
    stats = stats.reset_index()[[f'{feature_1}_id_hash', f'{feature_2}_id_hash', f'{feature_1}_{feature_2}_ctr']]
    temp_df = last_three_days[[f'{feature_1}_id_hash', f'{feature_2}_id_hash']].merge(stats, on=[f'{feature_1}_id_hash', f'{feature_2}_id_hash'], how='left')
    last_three_days_recon[f'{feature_1}_{feature_2}_ctr'] = temp_df[f'{feature_1}_{feature_2}_ctr']

def generate_ctrs():
    # generate ctr for estimate for features_ctr
    features_ctr = ['campaign', 'syndicator', 'placement', 'publisher', 'source']
    for feature in features_ctr:
        add_feature_ctr(feature)
    # generate ctr for pair wise estimate
    for feature_1, feature_2 in list(combinations(features_ctr, 2)):
        add_pair_feature_ctr(feature_1, feature_2)

# generate general target item taxonomy
last_three_days_recon['target_item_taxonomy_upper'] = last_three_days_recon['target_item_taxonomy'].str.split('~').str[0]
# target item ctr
last_three_days_recon['target_item_ctr'] = last_three_days_recon['empiric_clicks'].divide(last_three_days_recon['empiric_calibrated_recs'], fill_value=0) 
# user ctr
last_three_days_recon['user_ctr'] = last_three_days_recon['user_clicks'].divide(last_three_days_recon['user_recs'], fill_value=0)
# target item frequency
last_three_days_recon['item_presented_to_user_frequency'] = last_three_days_recon['user_target_recs'].divide(last_three_days_recon['user_recs'], fill_value=0)
# aggregation
last_three_days_recon['item_be_clicked_by_user'] = last_three_days_recon['user_ctr'] * last_three_days_recon['target_item_ctr'] * last_three_days_recon['item_presented_to_user_frequency']
# take the timestamp, shift to the local time by gmt offset, look at the hour of the day and generate categorical feature "local part of the day"
apply_gmt_offset()
# binary feature for cold and warm user
last_three_days_recon['user_type'] = (last_three_days_recon['user_recs'] > 0).apply(lambda x: 'warm' if x else 'cold')
# generate "category ctr"
generate_ctrs()

In [236]:
last_three_days_recon.columns

Index(['page_view_start_time', 'user_id_hash', 'target_id_hash',
       'syndicator_id_hash', 'campaign_id_hash', 'empiric_calibrated_recs',
       'empiric_clicks', 'target_item_taxonomy', 'placement_id_hash',
       'user_recs', 'user_clicks', 'user_target_recs', 'publisher_id_hash',
       'source_id_hash', 'source_item_type', 'browser_platform', 'os_family',
       'country_code', 'region', 'day_of_week', 'time_of_day', 'gmt_offset',
       'is_click', 'target_item_taxonomy_upper', 'target_item_ctr', 'user_ctr',
       'item_presented_to_user_frequency', 'item_be_clicked_by_user',
       'local_day_part', 'user_type', 'campaign_ctr', 'syndicator_ctr',
       'placement_ctr', 'publisher_ctr', 'source_ctr',
       'campaign_syndicator_ctr', 'campaign_placement_ctr',
       'campaign_publisher_ctr', 'campaign_source_ctr',
       'syndicator_placement_ctr', 'syndicator_publisher_ctr',
       'syndicator_source_ctr', 'placement_publisher_ctr',
       'placement_source_ctr', 'publisher_s

In [237]:
last_three_days_features = last_three_days_recon[[
    # 'user_id_hash',
    # 'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'empiric_calibrated_recs',
    'empiric_clicks',
    'target_item_taxonomy',
    'placement_id_hash',
    'user_recs',
    'user_clicks',
    'user_target_recs',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'os_family',
    'country_code',
    'region',
    'day_of_week',
    'time_of_day',
    'gmt_offset',
    'target_item_taxonomy_upper',
    'target_item_ctr',
    'user_ctr',
    'item_presented_to_user_frequency',
    'item_be_clicked_by_user',
    'local_day_part',
    'user_type',
    'campaign_ctr',
    'syndicator_ctr',
    'placement_ctr',
    'publisher_ctr',
    'source_ctr',
    'campaign_syndicator_ctr',
    'campaign_placement_ctr',
    'campaign_publisher_ctr',
    'campaign_source_ctr',
    'syndicator_placement_ctr',
    'syndicator_publisher_ctr',
    'syndicator_source_ctr',
    'placement_publisher_ctr',
    'placement_source_ctr',
    'publisher_source_ctr',
]].to_numpy()
last_three_days_ground_truth = last_three_days_recon['is_click'].to_numpy()

In [238]:
features_type = {
    # 'user_id_hash': 'cat',
    # 'target_id_hash': 'cat',
    'syndicator_id_hash': 'cat',
    'campaign_id_hash': 'cat',
    'empiric_calibrated_recs': '',
    'empiric_clicks': '',
    'target_item_taxonomy': 'cat',
    'placement_id_hash': 'cat',
    'user_recs': '',
    'user_clicks': '',
    'user_target_recs': '',
    'publisher_id_hash': 'cat',
    'source_id_hash': 'cat',
    'source_item_type': 'cat',
    'browser_platform': 'cat',
    'os_family': 'cat',
    'country_code': 'cat',
    'region': 'cat',
    'day_of_week': '',
    'time_of_day': '',
    'gmt_offset': 'cat',
    'target_item_taxonomy_upper': 'cat',
    'target_item_ctr': '',
    'user_ctr': '',
    'item_presented_to_user_frequency': '',
    'item_be_clicked_by_user': '',
    'local_day_part': 'cat',
    'user_type': 'cat',
    'campaign_ctr': '',
    'syndicator_ctr': '',
    'placement_ctr': '',
    'publisher_ctr': '',
    'source_ctr': '',
    'campaign_syndicator_ctr': '',
    'campaign_placement_ctr': '',
    'campaign_publisher_ctr': '',
    'campaign_source_ctr': '',
    'syndicator_placement_ctr': '',
    'syndicator_publisher_ctr': '',
    'syndicator_source_ctr': '',
    'placement_publisher_ctr': '',
    'placement_source_ctr': '',
    'publisher_source_ctr': '',
}
def temp():
    cat_ids = []
    for i, (feature, t) in enumerate(features_type.items()):
        if t == 'cat':
            print(i, feature)
            cat_ids.append(i)
    return cat_ids
cat_ids = temp()
cat_ids

0 syndicator_id_hash
1 campaign_id_hash
4 target_item_taxonomy
5 placement_id_hash
9 publisher_id_hash
10 source_id_hash
11 source_item_type
12 browser_platform
13 os_family
14 country_code
15 region
18 gmt_offset
19 target_item_taxonomy_upper
24 local_day_part
25 user_type


[0, 1, 4, 5, 9, 10, 11, 12, 13, 14, 15, 18, 19, 24, 25]

In [241]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=1000,
                           task_type="GPU",
                        #    class_weights=,
                           devices='0:1')
"""
0 user_id_hash
1 target_id_hash
2 syndicator_id_hash
3 campaign_id_hash
6 target_item_taxonomy
7 placement_id_hash
11 publisher_id_hash
12 source_id_hash
13 source_item_type
14 browser_platform
15 os_family
16 country_code
17 region
20 gmt_offset
"""
cat_features = [0, 1, 2, 3, 6, 7, 11, 12, 13, 14, 15, 16, 17, 20]
cat_features = [0, 1, 4, 5, 9, 10, 11, 12, 13, 14, 15, 18, 19, 24, 25]
model.fit(last_three_days_features, last_three_days_ground_truth, cat_features, verbose=True)
model.save_model(
    os.path.join('.', 'models', 'cboost-three-days-no-uid-tid-taxonomy-splitted-uctr-tctr-freq-many-ctr.cbm'),
    format="cbm",
    export_parameters=None,
    pool=None
)

Learning rate set to 0.020912
0:	learn: 0.6837794	total: 2.14s	remaining: 35m 42s
1:	learn: 0.6748222	total: 4.37s	remaining: 36m 18s
2:	learn: 0.6663215	total: 6.63s	remaining: 36m 44s
3:	learn: 0.6582334	total: 8.79s	remaining: 36m 29s
4:	learn: 0.6505360	total: 11s	remaining: 36m 23s
5:	learn: 0.6430874	total: 13.8s	remaining: 37m 58s
6:	learn: 0.6361225	total: 15.9s	remaining: 37m 39s
7:	learn: 0.6293766	total: 17.9s	remaining: 36m 53s
8:	learn: 0.6230195	total: 20.2s	remaining: 37m 8s
9:	learn: 0.6170151	total: 22.6s	remaining: 37m 16s
10:	learn: 0.6111772	total: 24.6s	remaining: 36m 53s
11:	learn: 0.6056289	total: 27.2s	remaining: 37m 18s
12:	learn: 0.6004158	total: 29.5s	remaining: 37m 21s
13:	learn: 0.5955082	total: 32.3s	remaining: 37m 53s
14:	learn: 0.5907473	total: 34.3s	remaining: 37m 32s
15:	learn: 0.5861295	total: 36.4s	remaining: 37m 21s
16:	learn: 0.5818630	total: 38.7s	remaining: 37m 17s
17:	learn: 0.5777413	total: 41.4s	remaining: 37m 39s
18:	learn: 0.5740846	total: 4

In [247]:
def load_encoders():
    feature = 'target_id_hash'
    with open(f'./label_encoders/{feature}.pickle', 'rb') as handle:
        encoder = pickle.load(handle)
        return encoder
target_id_encoder = load_encoders()

In [244]:
df_test_normed = pd.read_csv('./test_file_normalized.csv')
df_test_normed.drop(columns=['Unnamed: 0'], inplace=True)

In [249]:
def norm_target_id():
    unique_values = df_test['target_id_hash'].unique()
    value_to_code = dict()
    for value in tqdm(unique_values):
        try:
            mapped_value = target_id_encoder.transform([value])[0]
        except ValueError as err:
            mapped_value = target_id_encoder.classes_.shape[0]
        value_to_code[value] = mapped_value
    result = df_test['target_id_hash'].apply(lambda value: value_to_code[value])
    return result
target_id_normed = norm_target_id()

  0%|          | 0/15303 [00:00<?, ?it/s]

In [251]:
df_test_normed['target_id_hash'] = target_id_normed
df_test_normed.to_csv('./test_file_normalized_with_target_id.csv')

In [257]:
def add_feature_ctr(feature_1, df_normed, df_to_apply_on):
    """
    reference:
    https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8
    """
    used_columns = [
        f'{feature_1}_id_hash',
        'target_id_hash', 'page_view_start_time', 'empiric_calibrated_recs'
    ]
    data = pd.concat([last_three_days[used_columns], df_normed[used_columns]])
    latest_views = data.iloc[data.groupby('target_id_hash')['page_view_start_time'].idxmax()]
    latest_views = data.drop_duplicates(subset='target_id_hash', keep='first')
    stats = latest_views.groupby(f'{feature_1}_id_hash').agg({'empiric_calibrated_recs': 'sum', 'target_id_hash': 'count'})
    average_views_of_distinct_ad = np.log2(stats['empiric_calibrated_recs'].sum() / stats['target_id_hash'].sum())
    average_views_of_distict_ad_in_feature = np.log2(1 + (stats['empiric_calibrated_recs'] / stats['target_id_hash']))
    stats[f'{feature_1}_ctr'] = np.minimum((average_views_of_distict_ad_in_feature / average_views_of_distinct_ad), 1)
    stats = stats.reset_index()[[f'{feature_1}_id_hash', f'{feature_1}_ctr']]
    temp_df = data[[f'{feature_1}_id_hash']].merge(stats, on=f'{feature_1}_id_hash', how='left')
    df_to_apply_on[f'{feature_1}_ctr'] = temp_df[f'{feature_1}_ctr']

def add_pair_feature_ctr(feature_1, feature_2, df_normed, df_to_apply_on):
    """
    reference:
    https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8
    """
    used_columns = [
        f'{feature_1}_id_hash', f'{feature_2}_id_hash',
        'target_id_hash', 'page_view_start_time', 'empiric_calibrated_recs'
    ]
    data = pd.concat([last_three_days[used_columns], df_normed[used_columns]])
    latest_views = data.iloc[data.groupby('target_id_hash')['page_view_start_time'].idxmax()]
    latest_views = data.drop_duplicates(subset='target_id_hash', keep='first')
    stats = latest_views.groupby([f'{feature_1}_id_hash', f'{feature_2}_id_hash']).agg({'empiric_calibrated_recs': 'sum', 'target_id_hash': 'count'})
    average_views_of_distinct_ad = np.log2(stats['empiric_calibrated_recs'].sum() / stats['target_id_hash'].sum())
    average_views_of_distict_ad_in_feature = np.log2(1 + (stats['empiric_calibrated_recs'] / stats['target_id_hash']))
    stats[f'{feature_1}_{feature_2}_ctr'] = np.minimum((average_views_of_distict_ad_in_feature / average_views_of_distinct_ad), 1)
    stats = stats.reset_index()[[f'{feature_1}_id_hash', f'{feature_2}_id_hash', f'{feature_1}_{feature_2}_ctr']]
    temp_df = data[[f'{feature_1}_id_hash', f'{feature_2}_id_hash']].merge(stats, on=[f'{feature_1}_id_hash', f'{feature_2}_id_hash'], how='left')
    df_to_apply_on[f'{feature_1}_{feature_2}_ctr'] = temp_df[f'{feature_1}_{feature_2}_ctr']

def get_test_features():
    testset = df_test.copy(deep=True)
    testset['country_code'].fillna('Null', inplace=True)
    testset['region'].fillna('Null', inplace=True)
    test_features = testset[[
        'page_view_start_time',
        # 'user_id_hash',
        # 'target_id_hash',
        'syndicator_id_hash',
        'campaign_id_hash',
        'empiric_calibrated_recs',
        'empiric_clicks',
        'target_item_taxonomy',
        'placement_id_hash',
        'user_recs',
        'user_clicks',
        'user_target_recs',
        'publisher_id_hash',
        'source_id_hash',
        'source_item_type',
        'browser_platform',
        'os_family',
        'country_code',
        'region',
        'day_of_week',
        'time_of_day',
        'gmt_offset'
    ]]
    # generate general target item taxonomy
    test_features['target_item_taxonomy_upper'] = test_features['target_item_taxonomy'].str.split('~').str[0]
    # target item ctr
    test_features['target_item_ctr'] = test_features['empiric_clicks'].divide(test_features['empiric_calibrated_recs'], fill_value=0) 
    # user ctr
    test_features['user_ctr'] = test_features['user_clicks'].divide(test_features['user_recs'], fill_value=0)
    # target item frequency
    test_features['item_presented_to_user_frequency'] = test_features['user_target_recs'].divide(test_features['user_recs'], fill_value=0)
    # aggregation
    test_features['item_be_clicked_by_user'] = test_features['user_ctr'] * test_features['target_item_ctr'] * test_features['item_presented_to_user_frequency']
    # take the timestamp, shift to the local time by gmt offset, look at the hour of the day and generate categorical feature "local part of the day"
    # apply_gmt_offset
    gmt_offsets = test_features['gmt_offset'].replace(9999, 0).astype(str).apply(format_time_offset)
    dt_series = pd.to_datetime(test_features['page_view_start_time'], unit='us')
    gmt_offsets = pd.to_timedelta(gmt_offsets)
    dt_with_offset = dt_series + gmt_offsets
    test_features['local_day_part'] = dt_with_offset.apply(categorize_time)

    # binary feature for cold and warm user
    test_features['user_type'] = (test_features['user_recs'] > 0).apply(lambda x: 'warm' if x else 'cold')
    # generate "category ctr"
    # generate ctr for estimate for features_ctr
    features_ctr = ['campaign', 'syndicator', 'placement', 'publisher', 'source']
    for feature in features_ctr:
        add_feature_ctr(feature, df_test_normed, test_features)
    # generate ctr for pair wise estimate
    for feature_1, feature_2 in list(combinations(features_ctr, 2)):
        add_pair_feature_ctr(feature_1, feature_2, df_test_normed, test_features)
    return test_features
test_features = get_test_features()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  testset['country_code'].fillna('Null', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  testset['region'].fillna('Null', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the 

In [259]:
test_features = test_features[[
    # 'user_id_hash',
    # 'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'empiric_calibrated_recs',
    'empiric_clicks',
    'target_item_taxonomy',
    'placement_id_hash',
    'user_recs',
    'user_clicks',
    'user_target_recs',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'os_family',
    'country_code',
    'region',
    'day_of_week',
    'time_of_day',
    'gmt_offset',
    'target_item_taxonomy_upper',
    'target_item_ctr',
    'user_ctr',
    'item_presented_to_user_frequency',
    'item_be_clicked_by_user',
    'local_day_part',
    'user_type',
    'campaign_ctr',
    'syndicator_ctr',
    'placement_ctr',
    'publisher_ctr',
    'source_ctr',
    'campaign_syndicator_ctr',
    'campaign_placement_ctr',
    'campaign_publisher_ctr',
    'campaign_source_ctr',
    'syndicator_placement_ctr',
    'syndicator_publisher_ctr',
    'syndicator_source_ctr',
    'placement_publisher_ctr',
    'placement_source_ctr',
    'publisher_source_ctr',
]].to_numpy()

In [143]:
preds_proba = model.predict_proba(test_features)

In [144]:
def report_submission():
    pred = preds_proba[:,1]
    pred_df = pd.DataFrame(pred)
    pred_df.reset_index(inplace=True)
    pred_df.columns = ['Id','Predicted']
    pred_df.to_csv('my_submission.csv',index=False)
report_submission()