In [4]:
import os
import json
import pandas as pd
import scipy as sp
from sklearn.preprocessing import LabelEncoder 
from matplotlib import pyplot as plt

from tqdm.auto import tqdm
import numpy as np
import datetime
import pickle
ASSAF_STORAGE = '/sise/assafzar-group/assafzar/mark/taboola-competition/'

## Subsetting the data - taking X last days

Loading all previously normalized data, now fits into RAM

In [5]:
from pprint import pprint
assaf_path = '/sise/assafzar-group/assafzar/mark/taboola-competition'
def read_all_data():
    pre_path = f'{assaf_path}/normalized_train_data'
    paths = []
    dirs = os.listdir(pre_path)
    for dir in dirs:
        filenames = os.listdir(f'{pre_path}/{dir}')
        paths += [f'{pre_path}/{dir}/{filename}' for filename in filenames]
    paths = sorted(paths)
    dfs = []
    for filepath in tqdm(paths, desc='Loading data'):
        df = pd.read_csv(filepath)
        df.drop(columns=['Unnamed: 0'], inplace=True)
        dfs.append(df)
    dfs = pd.concat(dfs)
    return dfs
data = read_all_data()

Loading data: 100%|██████████| 69/69 [01:37<00:00,  1.42s/it]


Subsetting the data by a temporal threshold - final model was on the last 5 days of the data

In [6]:
max_timestamp = data['page_view_start_time'].max()
five_days_ago = (datetime.datetime.fromtimestamp(max_timestamp/1000) - datetime.timedelta(days=5)).replace(hour=0, minute=0, second=0, microsecond=0)
five_days_ago = int(datetime.datetime.timestamp(five_days_ago) * 1000)
last_x_days = data[data['page_view_start_time'] >= five_days_ago]
last_x_days.shape

(10926180, 23)

In [8]:
features = [
    'page_view_start_time', 'user_id_hash', 'target_id_hash',
    'syndicator_id_hash', 'campaign_id_hash', 'empiric_calibrated_recs',
    'empiric_clicks', 'target_item_taxonomy', 'placement_id_hash',
    'user_recs', 'user_clicks', 'user_target_recs', 'publisher_id_hash',
    'source_id_hash', 'source_item_type', 'browser_platform', 'os_family',
    'country_code', 'region', 'day_of_week', 'time_of_day', 'gmt_offset'
]

Loading the encoders to translate the data back from labels

In [11]:
def load_encoders(features):
    column_encoders = []
    for feature in features:
        with open(f'{assaf_path}/label_encoders/{feature}.pickle', 'rb') as handle:
            encoder = pickle.load(handle)
        column_encoders.append((feature, encoder))
    return column_encoders

features_string_categories = [
    'user_id_hash',
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'country_code',
    'region',
]
column_encoders = load_encoders(features_string_categories)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Recontructing the subset back to hashed format

In [12]:
def reconstruct():
    df = last_x_days.copy(deep=True)
    for column, encoder in tqdm(column_encoders):
        df[column] = encoder.inverse_transform(df[column].to_list())
    return df
last_x_days_recon = reconstruct()
last_x_days_recon['country_code'].fillna('Null', inplace=True)
last_x_days_recon['region'].fillna('Null', inplace=True)

100%|██████████| 12/12 [00:26<00:00,  2.19s/it]


# Contextual featuer extraction

In [13]:
from itertools import combinations

def categorize_time(dt):
    hour = dt.hour
    if 5 <= hour < 11:
        return "morning"
    elif 11 <= hour < 13:
        return "midday"
    elif 13 <= hour < 18:
        return "afternoon"
    elif 18 <= hour < 24:
        return "evening"
    else:  # Hour is between 00:00 and 05:00
        return "night"
    
def format_time_offset(t):
    if len(t) == 1:
        return '+00:00:00'
    elif len(t) == 2: 
        return f'{t}:00:00'
    else:
        return f'{t[:-2]}:{t[-2:]}:00'

def apply_gmt_offset():
    gmt_offsets = last_x_days_recon['gmt_offset'].replace(9999, 0).astype(str).apply(format_time_offset)
    dt_series = pd.to_datetime(last_x_days_recon['page_view_start_time'], unit='us')
    gmt_offsets = pd.to_timedelta(gmt_offsets)
    dt_with_offset = dt_series + gmt_offsets
    last_x_days_recon['local_day_part'] = dt_with_offset.apply(categorize_time)

def add_feature_ctr(feature_1):
    """
    reference:
    https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8
    """
    latest_views = last_x_days.iloc[last_x_days.groupby('target_id_hash')['page_view_start_time'].idxmax()]
    latest_views = last_x_days.drop_duplicates(subset='target_id_hash', keep='first')
    stats = latest_views.groupby(f'{feature_1}_id_hash').agg({'empiric_calibrated_recs': 'sum', 'target_id_hash': 'count'})
    average_views_of_distinct_ad = np.log2(stats['empiric_calibrated_recs'].sum() / stats['target_id_hash'].sum())
    average_views_of_distict_ad_in_feature = np.log2(1 + (stats['empiric_calibrated_recs'] / stats['target_id_hash']))
    stats[f'{feature_1}_ctr'] = np.minimum((average_views_of_distict_ad_in_feature / average_views_of_distinct_ad), 1)
    stats = stats.reset_index()[[f'{feature_1}_id_hash', f'{feature_1}_ctr']]
    temp_df = last_x_days[[f'{feature_1}_id_hash']].merge(stats, on=f'{feature_1}_id_hash', how='left')
    last_x_days_recon[f'{feature_1}_ctr'] = temp_df[f'{feature_1}_ctr']

def add_pair_feature_ctr(feature_1, feature_2):
    """
    reference:
    https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8
    """
    latest_views = last_x_days.iloc[last_x_days.groupby('target_id_hash')['page_view_start_time'].idxmax()]
    latest_views = last_x_days.drop_duplicates(subset='target_id_hash', keep='first')
    stats = latest_views.groupby([f'{feature_1}_id_hash', f'{feature_2}_id_hash']).agg({'empiric_calibrated_recs': 'sum', 'target_id_hash': 'count'})
    average_views_of_distinct_ad = np.log2(stats['empiric_calibrated_recs'].sum() / stats['target_id_hash'].sum())
    average_views_of_distict_ad_in_feature = np.log2(1 + (stats['empiric_calibrated_recs'] / stats['target_id_hash']))
    stats[f'{feature_1}_{feature_2}_ctr'] = np.minimum((average_views_of_distict_ad_in_feature / average_views_of_distinct_ad), 1)
    stats = stats.reset_index()[[f'{feature_1}_id_hash', f'{feature_2}_id_hash', f'{feature_1}_{feature_2}_ctr']]
    temp_df = last_x_days[[f'{feature_1}_id_hash', f'{feature_2}_id_hash']].merge(stats, on=[f'{feature_1}_id_hash', f'{feature_2}_id_hash'], how='left')
    last_x_days_recon[f'{feature_1}_{feature_2}_ctr'] = temp_df[f'{feature_1}_{feature_2}_ctr']

def generate_ctrs():
    # generate ctr for estimate for features_ctr
    features_ctr = ['campaign', 'syndicator', 'placement', 'publisher', 'source']
    for feature in features_ctr:
        add_feature_ctr(feature)
    # generate ctr for pair wise estimate
    for feature_1, feature_2 in list(combinations(features_ctr, 2)):
        add_pair_feature_ctr(feature_1, feature_2)

# generate general target item taxonomy
last_x_days_recon['target_item_taxonomy_upper'] = last_x_days_recon['target_item_taxonomy'].str.split('~').str[0]
# target item ctr
last_x_days_recon['target_item_ctr'] = last_x_days_recon['empiric_clicks'].divide(last_x_days_recon['empiric_calibrated_recs'], fill_value=0) 
# user ctr
last_x_days_recon['user_ctr'] = last_x_days_recon['user_clicks'].divide(last_x_days_recon['user_recs'], fill_value=0)
# target item frequency
last_x_days_recon['item_presented_to_user_frequency'] = last_x_days_recon['user_target_recs'].divide(last_x_days_recon['user_recs'], fill_value=0)
# aggregation
last_x_days_recon['item_be_clicked_by_user'] = last_x_days_recon['user_ctr'] * last_x_days_recon['target_item_ctr'] * last_x_days_recon['item_presented_to_user_frequency']
# take the timestamp, shift to the local time by gmt offset, look at the hour of the day and generate categorical feature "local part of the day"
apply_gmt_offset()
# binary feature for cold and warm user
last_x_days_recon['user_type'] = (last_x_days_recon['user_recs'] > 0).apply(lambda x: 'warm' if x else 'cold')
# generate "category ctr"
generate_ctrs()

In [14]:
last_x_days_recon.columns

Index(['page_view_start_time', 'user_id_hash', 'target_id_hash',
       'syndicator_id_hash', 'campaign_id_hash', 'empiric_calibrated_recs',
       'empiric_clicks', 'target_item_taxonomy', 'placement_id_hash',
       'user_recs', 'user_clicks', 'user_target_recs', 'publisher_id_hash',
       'source_id_hash', 'source_item_type', 'browser_platform', 'os_family',
       'country_code', 'region', 'day_of_week', 'time_of_day', 'gmt_offset',
       'is_click', 'target_item_taxonomy_upper', 'target_item_ctr', 'user_ctr',
       'item_presented_to_user_frequency', 'item_be_clicked_by_user',
       'local_day_part', 'user_type', 'campaign_ctr', 'syndicator_ctr',
       'placement_ctr', 'publisher_ctr', 'source_ctr',
       'campaign_syndicator_ctr', 'campaign_placement_ctr',
       'campaign_publisher_ctr', 'campaign_source_ctr',
       'syndicator_placement_ctr', 'syndicator_publisher_ctr',
       'syndicator_source_ctr', 'placement_publisher_ctr',
       'placement_source_ctr', 'publisher_s

In [15]:
## Transforming to numpy for faster computation

last_x_days_features = last_x_days_recon[[
    # 'user_id_hash',
    # 'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'empiric_calibrated_recs',
    'empiric_clicks',
    'target_item_taxonomy',
    'placement_id_hash',
    'user_recs',
    'user_clicks',
    'user_target_recs',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'os_family',
    'country_code',
    'region',
    'day_of_week',
    'time_of_day',
    'gmt_offset',
    'target_item_taxonomy_upper',
    'target_item_ctr',
    'user_ctr',
    'item_presented_to_user_frequency',
    'item_be_clicked_by_user',
    'local_day_part',
    'user_type',
    'campaign_ctr',
    'syndicator_ctr',
    'placement_ctr',
    'publisher_ctr',
    'source_ctr',
    'campaign_syndicator_ctr',
    'campaign_placement_ctr',
    'campaign_publisher_ctr',
    'campaign_source_ctr',
    'syndicator_placement_ctr',
    'syndicator_publisher_ctr',
    'syndicator_source_ctr',
    'placement_publisher_ctr',
    'placement_source_ctr',
    'publisher_source_ctr',
]].to_numpy()
last_x_days_ground_truth = last_x_days_recon['is_click'].to_numpy()

In [16]:
# create a vector of indices for categorial features, so that catboost wont treat the labeled columns as numeric
features_type = {
    # 'user_id_hash': 'cat',
    # 'target_id_hash': 'cat',
    'syndicator_id_hash': 'cat',
    'campaign_id_hash': 'cat',
    'empiric_calibrated_recs': '',
    'empiric_clicks': '',
    'target_item_taxonomy': 'cat',
    'placement_id_hash': 'cat',
    'user_recs': '',
    'user_clicks': '',
    'user_target_recs': '',
    'publisher_id_hash': 'cat',
    'source_id_hash': 'cat',
    'source_item_type': 'cat',
    'browser_platform': 'cat',
    'os_family': 'cat',
    'country_code': 'cat',
    'region': 'cat',
    'day_of_week': '',
    'time_of_day': '',
    'gmt_offset': 'cat',
    'target_item_taxonomy_upper': 'cat',
    'target_item_ctr': '',
    'user_ctr': '',
    'item_presented_to_user_frequency': '',
    'item_be_clicked_by_user': '',
    'local_day_part': 'cat',
    'user_type': 'cat',
    'campaign_ctr': '',
    'syndicator_ctr': '',
    'placement_ctr': '',
    'publisher_ctr': '',
    'source_ctr': '',
    'campaign_syndicator_ctr': '',
    'campaign_placement_ctr': '',
    'campaign_publisher_ctr': '',
    'campaign_source_ctr': '',
    'syndicator_placement_ctr': '',
    'syndicator_publisher_ctr': '',
    'syndicator_source_ctr': '',
    'placement_publisher_ctr': '',
    'placement_source_ctr': '',
    'publisher_source_ctr': '',
}
def temp():
    cat_ids = []
    for i, (feature, t) in enumerate(features_type.items()):
        if t == 'cat':
            print(i, feature)
            cat_ids.append(i)
    return cat_ids
cat_ids = temp()
cat_ids

0 syndicator_id_hash
1 campaign_id_hash
4 target_item_taxonomy
5 placement_id_hash
9 publisher_id_hash
10 source_id_hash
11 source_item_type
12 browser_platform
13 os_family
14 country_code
15 region
18 gmt_offset
19 target_item_taxonomy_upper
24 local_day_part
25 user_type


[0, 1, 4, 5, 9, 10, 11, 12, 13, 14, 15, 18, 19, 24, 25]

## Running Vanilla CatBoost to check perfomance of new features

In [17]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=1000,
                           task_type="GPU",
                           devices='0:1')
"""
0 user_id_hash
1 target_id_hash
2 syndicator_id_hash
3 campaign_id_hash
6 target_item_taxonomy
7 placement_id_hash
11 publisher_id_hash
12 source_id_hash
13 source_item_type
14 browser_platform
15 os_family
16 country_code
17 region
20 gmt_offset
"""

cat_features = [0, 1, 4, 5, 9, 10, 11, 12, 13, 14, 15, 18, 19, 24, 25]
model.fit(last_x_days_features, last_x_days_ground_truth, cat_features, verbose=True)
model.save_model(
    os.path.join('.', 'models', 'cboost-five-days-no-uid-tid-taxonomy-splitted-uctr-tctr-freq-many-ctr.cbm'),
    format="cbm",
    export_parameters=None,
    pool=None
)

Learning rate set to 0.020214
0:	learn: 0.6838871	total: 1.98s	remaining: 32m 55s
1:	learn: 0.6750777	total: 3.95s	remaining: 32m 50s
2:	learn: 0.6667699	total: 6.21s	remaining: 34m 22s
3:	learn: 0.6588244	total: 8.48s	remaining: 35m 11s
4:	learn: 0.6511539	total: 10.5s	remaining: 34m 39s
5:	learn: 0.6439338	total: 12.6s	remaining: 34m 42s
6:	learn: 0.6370113	total: 14.5s	remaining: 34m 22s
7:	learn: 0.6304445	total: 16.8s	remaining: 34m 45s
8:	learn: 0.6242112	total: 18.8s	remaining: 34m 28s
9:	learn: 0.6183053	total: 20.8s	remaining: 34m 20s
10:	learn: 0.6125305	total: 22.8s	remaining: 34m 8s
11:	learn: 0.6071363	total: 25.1s	remaining: 34m 23s
12:	learn: 0.6019534	total: 27s	remaining: 34m 12s
13:	learn: 0.5970472	total: 29.6s	remaining: 34m 43s
14:	learn: 0.5923089	total: 32.1s	remaining: 35m 10s
15:	learn: 0.5877969	total: 34.9s	remaining: 35m 43s
16:	learn: 0.5835776	total: 37s	remaining: 35m 39s
17:	learn: 0.5795404	total: 39.4s	remaining: 35m 51s
18:	learn: 0.5756660	total: 41.

Repeating the pipeline for the test set - normalization and context features addition

In [18]:
def load_encoders():
    feature = 'target_id_hash'
    with open(f'{assaf_path}/label_encoders/{feature}.pickle', 'rb') as handle:
        encoder = pickle.load(handle)
        return encoder
target_id_encoder = load_encoders()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [22]:
df_submission = pd.read_csv('./submission_example.csv')
df_test = pd.read_csv(f'{assaf_path}/test_file.csv')
df_test_normed = pd.read_csv(f'{assaf_path}/test_file_normalized_with_target_id.csv')
df_test_normed.drop(columns=['Unnamed: 0'], inplace=True)

In [23]:
def norm_target_id():
    unique_values = df_test['target_id_hash'].unique()
    value_to_code = dict()
    for value in tqdm(unique_values):
        try:
            mapped_value = target_id_encoder.transform([value])[0]
        except ValueError as err:
            mapped_value = target_id_encoder.classes_.shape[0]
        value_to_code[value] = mapped_value
    result = df_test['target_id_hash'].apply(lambda value: value_to_code[value])
    return result


# target_id_normed = norm_target_id()
# df_test_normed['target_id_hash'] = target_id_normed
# df_test_normed.to_csv('./test_file_normalized_with_target_id.csv')

We concatted the train data (last_x_days) and the test data, and than re-run the context feature extraction. for new categorial values it created a new value, but for previously seen categories it 'retained' the value of the context (last_x_days)

In [24]:
def add_feature_ctr(feature_1, df_normed, df_to_apply_on):
    """
    reference:
    https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8
    """
    used_columns = [
        f'{feature_1}_id_hash',
        'target_id_hash', 'page_view_start_time', 'empiric_calibrated_recs'
    ]
    data = pd.concat([last_x_days[used_columns], df_normed[used_columns]])
    latest_views = data.iloc[data.groupby('target_id_hash')['page_view_start_time'].idxmax()]
    latest_views = data.drop_duplicates(subset='target_id_hash', keep='first')
    stats = latest_views.groupby(f'{feature_1}_id_hash').agg({'empiric_calibrated_recs': 'sum', 'target_id_hash': 'count'})
    average_views_of_distinct_ad = np.log2(stats['empiric_calibrated_recs'].sum() / stats['target_id_hash'].sum())
    average_views_of_distict_ad_in_feature = np.log2(1 + (stats['empiric_calibrated_recs'] / stats['target_id_hash']))
    stats[f'{feature_1}_ctr'] = np.minimum((average_views_of_distict_ad_in_feature / average_views_of_distinct_ad), 1)
    stats = stats.reset_index()[[f'{feature_1}_id_hash', f'{feature_1}_ctr']]
    temp_df = data[[f'{feature_1}_id_hash']].merge(stats, on=f'{feature_1}_id_hash', how='left')
    df_to_apply_on[f'{feature_1}_ctr'] = temp_df[f'{feature_1}_ctr']

def add_pair_feature_ctr(feature_1, feature_2, df_normed, df_to_apply_on):
    """
    reference:
    https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8
    """
    used_columns = [
        f'{feature_1}_id_hash', f'{feature_2}_id_hash',
        'target_id_hash', 'page_view_start_time', 'empiric_calibrated_recs'
    ]
    data = pd.concat([last_x_days[used_columns], df_normed[used_columns]])
    latest_views = data.iloc[data.groupby('target_id_hash')['page_view_start_time'].idxmax()]
    latest_views = data.drop_duplicates(subset='target_id_hash', keep='first')
    stats = latest_views.groupby([f'{feature_1}_id_hash', f'{feature_2}_id_hash']).agg({'empiric_calibrated_recs': 'sum', 'target_id_hash': 'count'})
    average_views_of_distinct_ad = np.log2(stats['empiric_calibrated_recs'].sum() / stats['target_id_hash'].sum())
    average_views_of_distict_ad_in_feature = np.log2(1 + (stats['empiric_calibrated_recs'] / stats['target_id_hash']))
    stats[f'{feature_1}_{feature_2}_ctr'] = np.minimum((average_views_of_distict_ad_in_feature / average_views_of_distinct_ad), 1)
    stats = stats.reset_index()[[f'{feature_1}_id_hash', f'{feature_2}_id_hash', f'{feature_1}_{feature_2}_ctr']]
    temp_df = data[[f'{feature_1}_id_hash', f'{feature_2}_id_hash']].merge(stats, on=[f'{feature_1}_id_hash', f'{feature_2}_id_hash'], how='left')
    df_to_apply_on[f'{feature_1}_{feature_2}_ctr'] = temp_df[f'{feature_1}_{feature_2}_ctr']

def get_test_features():
    testset = df_test.copy(deep=True)
    testset['country_code'].fillna('Null', inplace=True)
    testset['region'].fillna('Null', inplace=True)
    test_features = testset[[
        'page_view_start_time',
        # 'user_id_hash',
        # 'target_id_hash',
        'syndicator_id_hash',
        'campaign_id_hash',
        'empiric_calibrated_recs',
        'empiric_clicks',
        'target_item_taxonomy',
        'placement_id_hash',
        'user_recs',
        'user_clicks',
        'user_target_recs',
        'publisher_id_hash',
        'source_id_hash',
        'source_item_type',
        'browser_platform',
        'os_family',
        'country_code',
        'region',
        'day_of_week',
        'time_of_day',
        'gmt_offset'
    ]]
    # generate general target item taxonomy
    test_features['target_item_taxonomy_upper'] = test_features['target_item_taxonomy'].str.split('~').str[0]
    # target item ctr
    test_features['target_item_ctr'] = test_features['empiric_clicks'].divide(test_features['empiric_calibrated_recs'], fill_value=0) 
    # user ctr
    test_features['user_ctr'] = test_features['user_clicks'].divide(test_features['user_recs'], fill_value=0)
    # target item frequency
    test_features['item_presented_to_user_frequency'] = test_features['user_target_recs'].divide(test_features['user_recs'], fill_value=0)
    # aggregation
    test_features['item_be_clicked_by_user'] = test_features['user_ctr'] * test_features['target_item_ctr'] * test_features['item_presented_to_user_frequency']
    # take the timestamp, shift to the local time by gmt offset, look at the hour of the day and generate categorical feature "local part of the day"
    # apply_gmt_offset
    gmt_offsets = test_features['gmt_offset'].replace(9999, 0).astype(str).apply(format_time_offset)
    dt_series = pd.to_datetime(test_features['page_view_start_time'], unit='us')
    gmt_offsets = pd.to_timedelta(gmt_offsets)
    dt_with_offset = dt_series + gmt_offsets
    test_features['local_day_part'] = dt_with_offset.apply(categorize_time)

    # binary feature for cold and warm user
    test_features['user_type'] = (test_features['user_recs'] > 0).apply(lambda x: 'warm' if x else 'cold')
    # generate "category ctr"
    # generate ctr for estimate for features_ctr
    features_ctr = ['campaign', 'syndicator', 'placement', 'publisher', 'source']
    for feature in features_ctr:
        add_feature_ctr(feature, df_test_normed, test_features)
    # generate ctr for pair wise estimate
    for feature_1, feature_2 in list(combinations(features_ctr, 2)):
        add_pair_feature_ctr(feature_1, feature_2, df_test_normed, test_features)
    return test_features
test_features = get_test_features()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features['target_item_taxonomy_upper'] = test_features['target_item_taxonomy'].str.split('~').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features['target_item_ctr'] = test_features['empiric_clicks'].divide(test_features['empiric_calibrated_recs'], fill_value=0)


In [25]:
test_features = test_features[[
    # 'user_id_hash',
    # 'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'empiric_calibrated_recs',
    'empiric_clicks',
    'target_item_taxonomy',
    'placement_id_hash',
    'user_recs',
    'user_clicks',
    'user_target_recs',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'os_family',
    'country_code',
    'region',
    'day_of_week',
    'time_of_day',
    'gmt_offset',
    'target_item_taxonomy_upper',
    'target_item_ctr',
    'user_ctr',
    'item_presented_to_user_frequency',
    'item_be_clicked_by_user',
    'local_day_part',
    'user_type',
    'campaign_ctr',
    'syndicator_ctr',
    'placement_ctr',
    'publisher_ctr',
    'source_ctr',
    'campaign_syndicator_ctr',
    'campaign_placement_ctr',
    'campaign_publisher_ctr',
    'campaign_source_ctr',
    'syndicator_placement_ctr',
    'syndicator_publisher_ctr',
    'syndicator_source_ctr',
    'placement_publisher_ctr',
    'placement_source_ctr',
    'publisher_source_ctr',
]].to_numpy()

 Running the catboost model on the newly extracted test set features

In [26]:
preds_proba = model.predict_proba(test_features)

In [27]:
def report_submission():
    pred = preds_proba[:,1]
    pred_df = pd.DataFrame(pred)
    pred_df.reset_index(inplace=True)
    pred_df.columns = ['Id','Predicted']
    pred_df.to_csv('5_days_freq_norm.csv',index=False)
report_submission()

In [5]:
# Saving processed data for easy retrival
data = {'X_train' : last_x_days_features,
        'y_train' : last_x_days_ground_truth,
        'X_test' : test_features }
with open('optuna_5_days_X-train_Y-train_X-Test_dict.pickle', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)