In [1]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import operator
import pickle
import tqdm
import time
import os
import matplotlib.pyplot as plt
import sys
sys.path.append("../code/utils")
sys.path.append("../code/feature")
import data_utils as du
import perf_utils as pu
from clickrate import BayesianSmoothedClickrate

In [2]:
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
df_ad = du.load_raw_data("ad")
train_size = df_train.shape[0]
test_size = df_test.shape[0]

In [3]:
# test to all: 0.204793
# so we will use 5 fold splitting to calculate corrupted clickrate
print("Test/(Train + Test) Ratio: {:.6f}".format(test_size / (train_size + test_size)))

Test/(Train + Test) Ratio: 0.204793


In [4]:
y = df_train['label'].values
# y = (y + 1) / 2

In [5]:
pairs = [('aid', 'age'), ('aid', 'education'), ('aid', 'consumptionAbility'), ('aid', 'LBS')]
ufeats_to_join = set([ufeat for afeat, ufeat in pairs])
afeats_to_join = set([afeat for afeat, ufeat in pairs])

In [6]:
avals_dict = {afeat: set(df_ad[afeat].unique()) for afeat in afeats_to_join}

In [7]:
uvals_dict = {}
ad_user = pd.merge(df_train, df_ad, on='aid', how='left')
for ufeat in tqdm.tqdm(ufeats_to_join):
    df_feat = du.load_user_feature(ufeat).fillna('[nan]')
    ad_user = pd.merge(ad_user, df_feat, on="uid", how="left")
    uvals_dict[ufeat] = set(df_feat[ufeat].unique())

100%|██████████| 4/4 [00:43<00:00, 10.76s/it]


In [44]:
def select_df(df, selector):
    for col, val in selector.items():
        df = df[df[col]==val]
    return df.copy()


def get_statistics(ad_user, ad_feat_name, user_feat_name):
    """
    Arguments
    ---------
    ad_user: pd.DataFrame
        Joined dataframe containing both the target ad feature and user feature.

    ad_feat_name: string
        Name of ad feature to consider.
        
    user_feat_name: string
        Name of user feature to consider.
    """
    # do grouping
    df_group = ad_user.groupby([ad_feat_name, user_feat_name, "label"]).size()
    df_group = df_group.reset_index()
    df_group = df_group.pivot_table(index=[ad_feat_name, user_feat_name], columns="label", values=0).reset_index()

    # renaming and resetting
    df_group = df_group.rename(columns={0: "count",
                                        -1: "negative",
                                        1: "positive",
                                        ad_feat_name: "ad_val",
                                        user_feat_name: "user_val"})  # rename columns for consistency
    df_group.fillna(0, inplace=True)
    df_group[["positive", "negative"]] = df_group[["positive", "negative"]].astype(int)  # reset type
    df_group = df_group.rename_axis(None, axis=1)  # remove index name, which is very annoying
    return df_group


def select_subgroup(df_group, aval, uvals=None, ad_feat_name="ad_val", user_feat_name="user_val"):
    """
    Arguments
    ---------
    df_group: pd.DataFrame
        A pivoted table storing statistics for all ad feature values.

    aval: int | float | string
        Ad feature value whose statistics you want to select from `df_group`.

    uvals: set
        All user feature values presented in the whole dataset.

    ad_feat_name: string
        Name of current ad feature. If not given, the program will assume you
        have renamed the column to 'ad_val'.

    user_feat_name: string
        Name of current user feature. If not given, the program will assume you
        have renamed the column to 'user_val'.
    """
    # select sub-dataframe with given ad feature value
    df_selected = select_df(df_group, {ad_feat_name: aval})  # select statistics for given aid

    # rename and reset columns
    df_selected = df_selected.rename(columns={user_feat_name: "value"})  # rename columns
    df_selected = df_selected[["value", "positive", "negative"]]  # selected wanted columns
    df_selected = df_selected.reset_index(drop=True)

    # handle missing user feature values: append rows with 0 positive and 0 negative
    if uvals is not None:
        uvals_present = set(df_selected["value"].unique())
        uvals_absent = set(uvals).difference(uvals_present)
        for uval in uvals_absent:
            df_selected.loc[df_selected.shape[0]] = {"value": uval, "positive": 0, "negative": 0}

    # handle missing positive and negative: fill with 0
    df_selected = df_selected.fillna(0)
    df_selected[["positive", "negative"]] = df_selected[["positive", "negative"]].astype(int)  # reset type
    return df_selected

In [15]:
clickhist_folder = "../data/click_history/simple_cross/byUserFeatureName"


def click_history_fold_dir(mode, num_folds, create=True):
    folder = "{}[{}_{}]".format(clickhist_folder, mode, num_folds)
    if create:
        os.makedirs(folder, exist_ok=True)
    return folder


def click_history_path(mode, num_folds, fold_index, ad_feat_name, user_feat_name, aval, create=True):
    folder = click_history_fold_dir(mode, num_folds)
    folder = os.path.join(folder, str(fold_index),  "[featureName='{}']".format(user_feat_name))
    filename = "[{}='{}'].csv".format(ad_feat_name, aval)
    filepath = os.path.join(folder, filename)
    if create:
        os.makedirs(folder, exist_ok=True)
    return filepath

In [16]:
# split k-folds and keep the indices
mode = "StratifiedKFold"
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, random_state=20180502)  # fix random_state for reproducibility
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

# save train/valid indices for each fold
fold_dir = click_history_fold_dir(mode, num_folds=n_splits)
index_file = "indices.pkl"
index_path = os.path.join(fold_dir, index_file)
du.save_pickle(split_indices, index_path)

In [46]:
for ad_feat_name, user_feat_name in pairs:
    avals = avals_dict[ad_feat_name]
    uvals = uvals_dict[user_feat_name]
    for i, (train_index, valid_index) in enumerate(split_indices):
        print("[{}] counting '{}' x '{}' fold {}/{} ...".format(pu.get_time_str(), 
                                                                ad_feat_name, 
                                                                user_feat_name,
                                                                i + 1, 
                                                                n_splits))
        df_group = get_statistics(ad_user.iloc[train_index], ad_feat_name, user_feat_name)
        for aval in avals:
            out_path = click_history_path(mode=mode, num_folds=n_splits, fold_index=i, 
                                          user_feat_name=user_feat_name, ad_feat_name=ad_feat_name, aval=aval)
            df_subgroup = select_subgroup(df_group, aval, uvals)
            assert df_subgroup.shape[0] == len(uvals)
            df_subgroup.to_csv(out_path, index=False)

[11:00:10] counting 'aid' x 'age' fold 1/5 ...
[11:00:17] counting 'aid' x 'age' fold 2/5 ...
[11:00:25] counting 'aid' x 'age' fold 3/5 ...
[11:00:31] counting 'aid' x 'age' fold 4/5 ...
[11:00:39] counting 'aid' x 'age' fold 5/5 ...
[11:00:45] counting 'aid' x 'education' fold 1/5 ...
[11:00:52] counting 'aid' x 'education' fold 2/5 ...
[11:00:59] counting 'aid' x 'education' fold 3/5 ...
[11:01:06] counting 'aid' x 'education' fold 4/5 ...
[11:01:14] counting 'aid' x 'education' fold 5/5 ...
[11:01:21] counting 'aid' x 'consumptionAbility' fold 1/5 ...
[11:01:28] counting 'aid' x 'consumptionAbility' fold 2/5 ...
[11:01:37] counting 'aid' x 'consumptionAbility' fold 3/5 ...
[11:01:42] counting 'aid' x 'consumptionAbility' fold 4/5 ...
[11:01:49] counting 'aid' x 'consumptionAbility' fold 5/5 ...
[11:01:55] counting 'aid' x 'LBS' fold 1/5 ...
[11:06:04] counting 'aid' x 'LBS' fold 2/5 ...
[11:10:03] counting 'aid' x 'LBS' fold 3/5 ...
[11:14:02] counting 'aid' x 'LBS' fold 4/5 ...
[1

In [49]:
clickrate_folder = "../data/clickrate_bs/simple_cross/byUserFeatureName"


def click_rate_fold_dir(mode, num_folds, create=True):
    folder = "{}[{}_{}]".format(clickrate_folder, mode, num_folds)
    if create:
        os.makedirs(folder, exist_ok=True)
    return folder


def click_rate_paths(mode, num_folds, fold_index, ad_feat_name, user_feat_name, create=True):
    folder = click_rate_fold_dir(mode, num_folds, create)
    folder = os.path.join(folder, str(fold_index),  "[featureName='{}']".format(user_feat_name))
    clickrate_file = "[adFeatureName='{}'].csv".format(ad_feat_name)
    clickrate_filepath = os.path.join(folder, clickrate_file)
    meta_file = "params[adFeatureName='{}'].csv".format(ad_feat_name)
    meta_filepath = os.path.join(folder, meta_file)
    if create:
        os.makedirs(folder, exist_ok=True)
    return clickrate_filepath, meta_filepath

In [50]:
def clean_feat_stats(feat_stats):
    # feat_stats = feat_stats[feat_stats["value"] != "all"]
    feat_stats["impression"] = feat_stats["positive"] + feat_stats["negative"]
    feat_stats = feat_stats.rename(columns={"positive": "click", "value":"user_val"})
    return feat_stats


def clean_meta(df_meta):
    df_meta["ad_val"] = df_meta["ad_val"].astype(int)  # seems that this line of code is redundant
    df_meta = df_meta.sort_values(["clickrate_expectation", "alpha"], ascending=False)
    return df_meta[["ad_val", "alpha", "beta", "clickrate_expectation"]]


def clean_clickrate(df_clickrate):
    df_clickrate[["click", "impression"]] = df_clickrate[["click", "impression"]].astype(int)
    df_clickrate = df_clickrate.sort_values(["bs_clickrate", "click"], ascending=False)
    return df_clickrate[["ad_val", "user_val", "bs_clickrate", "click", "impression"]]

In [None]:
for ad_feat_name, user_feat_name in pairs:
    avals = avals_dict[ad_feat_name]
    for split_i in range(n_splits):
        df_meta = pd.DataFrame(columns=["ad_val", "alpha", "beta", "clickrate_expectation"])
        df_clickrate = pd.DataFrame(columns=["ad_val", "user_val", "bs_clickrate", "click", "impression"])
        
        desc = "'{}' x '{}' fold {}/{}".format(ad_feat_name, user_feat_name, split_i + 1, n_splits)
        for aval in tqdm.tqdm(list(avals), desc=desc):
            in_path = click_history_path(mode=mode, num_folds=n_splits, fold_index=split_i,
                                         user_feat_name=user_feat_name, ad_feat_name=ad_feat_name, aval=aval)
            feat_stats = pd.read_csv(in_path)
            feat_stats = clean_feat_stats(feat_stats)
            
            imps = feat_stats["impression"].values
            clks = feat_stats["click"].values
            bs = BayesianSmoothedClickrate(max_iter=10000)
            bs.fit(imps, clks, verbose=False)
            
            feat_stats["bs_clickrate"] = bs.transform(imps, clks)
            feat_stats["ad_val"] = aval
            df_meta.loc[df_meta.shape[0]] = {"ad_val": aval, "alpha": bs.alpha, "beta": bs.beta, 
                                             "clickrate_expectation": bs.clickrate_expectation}
            df_clickrate = df_clickrate.append(feat_stats[["ad_val", "user_val", "bs_clickrate", "click", "impression"]])
        
        clickrate_path, meta_path = click_rate_paths(mode, n_splits, split_i, ad_feat_name, user_feat_name)
        df_meta = clean_meta(df_meta)
        df_meta.to_csv(meta_path, index=False)
        df_clickrate = clean_clickrate(df_clickrate)
        df_clickrate.to_csv(clickrate_path, index=False)

'aid' x 'age' fold 1/5: 100%|██████████| 173/173 [01:17<00:00,  2.23it/s]
'aid' x 'age' fold 2/5: 100%|██████████| 173/173 [01:19<00:00,  2.18it/s]
'aid' x 'age' fold 3/5: 100%|██████████| 173/173 [01:21<00:00,  2.12it/s]
'aid' x 'age' fold 4/5: 100%|██████████| 173/173 [01:20<00:00,  2.14it/s]
'aid' x 'age' fold 5/5: 100%|██████████| 173/173 [01:23<00:00,  2.08it/s]
'aid' x 'education' fold 1/5: 100%|██████████| 173/173 [01:18<00:00,  2.21it/s]
'aid' x 'education' fold 2/5: 100%|██████████| 173/173 [01:20<00:00,  2.14it/s]
'aid' x 'education' fold 3/5: 100%|██████████| 173/173 [01:18<00:00,  2.21it/s]
'aid' x 'education' fold 4/5: 100%|██████████| 173/173 [01:16<00:00,  2.27it/s]
'aid' x 'education' fold 5/5: 100%|██████████| 173/173 [01:15<00:00,  2.28it/s]
'aid' x 'consumptionAbility' fold 1/5: 100%|██████████| 173/173 [01:14<00:00,  2.31it/s]
'aid' x 'consumptionAbility' fold 2/5: 100%|██████████| 173/173 [01:10<00:00,  2.46it/s]
'aid' x 'consumptionAbility' fold 3/5: 100%|████████