In [1]:
from sklearn.model_selection import StratifiedKFold
from functools import partial
import pandas as pd
import numpy as np
import operator
import pickle
import copy
import tqdm
import time
import os
import gc
import sys
sys.path.append("../code/utils")
sys.path.append("../code/feature")
sys.path.append("../code/pipeline")
sys.path.append("../code")
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config
from clickrate import BayesianSmoothedClickrate

In [2]:
clickhist_folder = os.path.join(config.DATA_DIR, "click_history/single_feature/byFeatureName")


def click_history_fold_dir(num_folds, create=True):
    folder = "{}[StratifiedKFold_{}]".format(clickhist_folder, num_folds)
    if create:
        os.makedirs(folder, exist_ok=True)
    return folder


def click_history_path(num_folds, fold_index, feat_name, create=True):
    folder = click_history_fold_dir(num_folds, create)
    folder = os.path.join(folder, str(fold_index))
    filename = "[featureName='{}'].csv".format(feat_name)
    filepath = os.path.join(folder, filename)
    if create:
        os.makedirs(folder, exist_ok=True)
    return filepath

In [3]:
def get_statistics(df, feat_names):
    """Get grouped clicks statistics for given feature.
    
    Parameters
    ----------
    df: pd.DataFrame
        DataFrame you want to get clicked/unclicked statistics from.

    feat_names: list
        List containing names of features to consider.
        
    Returns
    -------
    df_group: pd.DataFrame
        feat_0 | feat_1 | ... | positive | negative 
        ———————+————————+—————+——————————+——————————
        val_0_0| val_1_0| ... | x        | x        
        val_0_0| val_1_0| ... | x        | x        
        ...    |        |     |          |          
        val_0_i| val_1_j| ... | x        | x        
    """
    # do grouping
    if isinstance(feat_names, str):
        feat_names = [feat_names]
    
    group_cols = feat_names + ["label"]
    df_group = df.groupby(group_cols).size()
    df_group = df_group.reset_index()
    df_group = df_group.pivot_table(index=feat_names, columns="label", values=0).reset_index()

    # renaming and resetting
    df_group = df_group.rename(columns={-1: "negative",
                                        1: "positive"})  # rename columns for consistency
    df_group.fillna(0, inplace=True)
    df_group[["positive", "negative"]] = df_group[["positive", "negative"]].astype(int)  # reset type
    df_group = df_group.rename_axis(None, axis=1)  # remove index name, which is very annoying
    return df_group

In [4]:
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
df_ad = du.load_raw_data("ad")
train_size = df_train.shape[0]
test_size = df_test.shape[0]

In [5]:
y = df_train['label'].values
# y = (y + 1) / 2

In [6]:
user_feat_names = config.USER_FEAT_NAMES
user_one_feat_names = config.USER_SINGLE_FEAT_NAMES
user_multi_feat_names = config.USER_MULTI_FEAT_NAMES
ad_feat_names = config.AD_FEAT_NAMES.copy()
ad_feat_names.remove("creativeId")

In [7]:
train = pd.merge(df_train, df_ad, on='aid', how='left')
feat_unique_vals = {feat_name: df_ad[feat_name].unique() for feat_name in ad_feat_names}

# not applicable for multi-value cases
for feat_name in tqdm.tqdm(user_one_feat_names):
    df_feat = du.load_user_feature(feat_name).fillna('[nan]')
    train = pd.merge(train, df_feat, on="uid", how="left")
    feat_unique_vals[feat_name] = df_feat[feat_name].unique()

100%|██████████| 7/7 [01:09<00:00,  9.86s/it]


In [8]:
assert train.shape[0] == df_train.shape[0]
assert train.isnull().sum().sum() == 0

In [9]:
# test to (test+train) ratio: 0.204793
# so we will use 5 fold splitting to calculate corrupted clickrate
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, random_state=20180502)  # fix random_state for reproducibility
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

# save train/valid indices for each fold
fold_dir = click_history_fold_dir(num_folds=n_splits)
index_file = "indices.pkl"
index_path = os.path.join(fold_dir, index_file)
du.save_pickle(split_indices, index_path)

In [10]:
for feat_name in ad_feat_names + user_one_feat_names:
    unique_vals = feat_unique_vals[feat_name]
    for i, (train_index, valid_index) in enumerate(split_indices):
        ### given a fold ###
        print("[{}] counting '{}' fold {}/{} ...".format(pu.get_time_str(), 
                                                         feat_name, 
                                                         i + 1,
                                                         n_splits))
        df_group = get_statistics(train.iloc[train_index], feat_name)
        
        # fill 0 to missed options
        vals_present = set(df_group[feat_name].unique())
        vals_absent = set(unique_vals).difference(vals_present)
        for val in vals_absent:
            df_group.loc[df_group.shape[0]] = {feat_name: val, "positive": 0, "negative": 0}
            
        # reset column name
        df_group.rename(columns={feat_name: "value"}, inplace=True)

        # save to hard disk
        assert df_group.shape[0] == len(unique_vals)
        out_path = click_history_path(num_folds=n_splits, fold_index=i, feat_name=feat_name)
        df_group.to_csv(out_path, index=False)

[08:55:56] counting 'aid' fold 1/5 ...
[08:55:58] counting 'aid' fold 2/5 ...
[08:56:00] counting 'aid' fold 3/5 ...
[08:56:02] counting 'aid' fold 4/5 ...
[08:56:03] counting 'aid' fold 5/5 ...
[08:56:05] counting 'advertiserId' fold 1/5 ...
[08:56:07] counting 'advertiserId' fold 2/5 ...
[08:56:09] counting 'advertiserId' fold 3/5 ...
[08:56:10] counting 'advertiserId' fold 4/5 ...
[08:56:12] counting 'advertiserId' fold 5/5 ...
[08:56:14] counting 'campaignId' fold 1/5 ...
[08:56:16] counting 'campaignId' fold 2/5 ...
[08:56:18] counting 'campaignId' fold 3/5 ...
[08:56:20] counting 'campaignId' fold 4/5 ...
[08:56:21] counting 'campaignId' fold 5/5 ...
[08:56:23] counting 'creativeSize' fold 1/5 ...
[08:56:25] counting 'creativeSize' fold 2/5 ...
[08:56:27] counting 'creativeSize' fold 3/5 ...
[08:56:28] counting 'creativeSize' fold 4/5 ...
[08:56:30] counting 'creativeSize' fold 5/5 ...
[08:56:32] counting 'adCategoryId' fold 1/5 ...
[08:56:34] counting 'adCategoryId' fold 2/5 ...

In [14]:
clickrate_folder = os.path.join(config.DATA_DIR, "clickrate_bs/single_feature/byFeatureName")


def click_rate_fold_dir(num_folds, create=True):
    folder = "{}[StratifiedKFold_{}]".format(clickrate_folder, num_folds)
    if create:
        os.makedirs(folder, exist_ok=True)
    return folder


def click_rate_path(num_folds, fold_index, feat_name, create=True):
    folder = click_rate_fold_dir(num_folds, create)
    folder = os.path.join(folder, str(fold_index))
   
    ckr_filename = "[featureName='{}'].csv".format(feat_name)
    ckr_filepath = os.path.join(folder, ckr_filename)
    if create:
        os.makedirs(folder, exist_ok=True)
    return ckr_filepath


def click_rate_meta_path(num_folds, fold_index, create=True):
    folder = click_rate_fold_dir(num_folds, create)
    folder = os.path.join(folder, str(fold_index))
    meta_file = "params.csv"
    meta_filepath = os.path.join(folder, meta_file)
    if create:
        os.makedirs(folder, exist_ok=True)
    return meta_filepath

In [17]:
def clean_feat_stats(feat_stats):
    """Preprocess clicks history data just loaded from hard disk.
    
    Parameters
    ----------
    feat_stats: pd.DataFrame
        DataFrame containing grouped clicks history data, with columns:
        'value', 'negative', 'positive'.
        
    Returns
    -------
    feat_stats: pd.DataFrame
        DataFrame containing grouped clicks history data, with columns:
        'value', 'negative', 'click', 'impression'.
    """
    feat_stats["impression"] = feat_stats["positive"] + feat_stats["negative"]
    feat_stats = feat_stats.rename(columns={"positive": "click"})
    return feat_stats


def clean_meta(df_meta):
    df_meta = df_meta.sort_values(["clickrate_expectation", "alpha"], ascending=False)
    return df_meta[["featureName", "alpha", "beta", "clickrate_expectation"]]


def clean_clickrate(df_clickrate):
    df_clickrate[["click", "impression"]] = df_clickrate[["click", "impression"]].astype(int)
    df_clickrate = df_clickrate.sort_values(["bs_clickrate", "click"], ascending=False)
    return df_clickrate[["value", "bs_clickrate", "click", "impression"]]

In [18]:
for split_i in range(n_splits):
    ### given a fold ###
    df_meta = pd.DataFrame(columns=["featureName", "alpha", "beta", "clickrate_expectation"])
    
    for feat_name in ad_feat_names + user_one_feat_names:
        ### given a feature ###
        # preparation
        print("[{}] processing '{}' fold {}/{}...".format(pu.get_time_str(), feat_name, split_i + 1, n_splits))
        df_clickrate = pd.DataFrame(columns=["value", "bs_clickrate", "click", "impression"])
        in_path = click_history_path(num_folds=n_splits, fold_index=split_i, feat_name=feat_name)
        feat_stats = pd.read_csv(in_path)
        feat_stats = clean_feat_stats(feat_stats)

        # solve bayesian smoothed click rate
        imps = feat_stats["impression"].values
        clks = feat_stats["click"].values
        bs = BayesianSmoothedClickrate(max_iter=10000)
        bs.fit(imps, clks, verbose=False)

        # update meta and click rate DataFrame
        feat_stats["bs_clickrate"] = bs.transform(imps, clks)
        df_meta.loc[df_meta.shape[0]] = {"featureName": feat_name, "alpha": bs.alpha, "beta": bs.beta, 
                                         "clickrate_expectation": bs.clickrate_expectation}
        df_ckr= feat_stats[["value", "bs_clickrate", "click", "impression"]]
        
        # save click rates to hard disk
        ckr_path = click_rate_path(n_splits, split_i, feat_name)
        df_ckr = clean_clickrate(df_ckr)
        df_ckr.to_csv(ckr_path, index=False)
    
    # save meta info to hard disk
    meta_path = click_rate_meta_path(n_splits, split_i)
    df_meta = clean_meta(df_meta)
    df_meta.to_csv(meta_path, index=False)

[09:01:10] processing 'aid' fold 1/5...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


[09:01:10] processing 'advertiserId' fold 1/5...
[09:01:11] processing 'campaignId' fold 1/5...
[09:01:11] processing 'creativeSize' fold 1/5...
[09:01:12] processing 'adCategoryId' fold 1/5...
[09:01:12] processing 'productId' fold 1/5...
[09:01:12] processing 'productType' fold 1/5...
[09:01:13] processing 'age' fold 1/5...
[09:01:13] processing 'gender' fold 1/5...
[09:01:14] processing 'education' fold 1/5...
[09:01:14] processing 'consumptionAbility' fold 1/5...
[09:01:14] processing 'LBS' fold 1/5...
[09:01:17] processing 'carrier' fold 1/5...
[09:01:17] processing 'house' fold 1/5...
[09:01:18] processing 'aid' fold 2/5...
[09:01:18] processing 'advertiserId' fold 2/5...
[09:01:19] processing 'campaignId' fold 2/5...
[09:01:19] processing 'creativeSize' fold 2/5...
[09:01:20] processing 'adCategoryId' fold 2/5...
[09:01:20] processing 'productId' fold 2/5...
[09:01:21] processing 'productType' fold 2/5...
[09:01:21] processing 'age' fold 2/5...
[09:01:21] processing 'gender' fol

In [24]:
def load_split_indices(num_folds):
    fold_dir = click_history_fold_dir(num_folds=n_splits)
    index_file = "indices.pkl"
    index_path = os.path.join(fold_dir, index_file)
    split_indices = du.load_pickle(index_path)
    return split_indices


def load_clickrate(num_folds, fold_index, feat_name):
    in_path = click_rate_path(num_folds, fold_index, feat_name)
    df_ckr = pd.read_csv(in_path)
    return df_ckr


def batch_load_clickrate(num_folds, feat_name):
    quick_load = partial(load_clickrate, num_folds=num_folds, feat_name=feat_name)
    df_ckr = None
    
    for i in range(n_splits):
        df_new = quick_load(fold_index=i)
        df_new["fold"] = i
        if df_ckr is None:
            df_ckr = df_new
        else:
            df_ckr = pd.concat([df_ckr, df_new], ignore_index=True)
        del df_new
        gc.collect()
        
    df_ckr["fold"] = df_ckr["fold"].astype(int)
    return df_ckr

In [20]:
def insert_valid_fold_index(df, fold_indices):
    df = df.copy()
    df["fold"] = -1
    for i, (train_index, valid_index) in enumerate(fold_indices):
        df.loc[valid_index, "fold"] = i
    return df

In [21]:
# n_splits = 5
# split_indices = load_split_indices(mode, n_splits)

In [37]:
use_feat_names = ["aid", "campaignId", "age", "consumptionAbility", "education", "LBS"]
col_names = []
train2 = train.copy()
train2 = insert_valid_fold_index(train2, split_indices)

for feat_name in use_feat_names:
    with pu.profiler("joining click data for '{}' by fold".format(feat_name)):
        # preparation
        ckr_name = "bsClickrate@{}".format(feat_name)
        col_names.append(ckr_name)
        
        # load data and rename columns
        df_ckr = batch_load_clickrate(n_splits, feat_name)
        df_ckr = df_ckr.rename(columns={"value": feat_name, "bs_clickrate": ckr_name})
        
        # unify dtype for joining
        df_ckr[feat_name] = df_ckr[feat_name].astype(str)
        train2[feat_name] = train2[feat_name].apply(str)
    
        # join and drop redundant data
        train2 = dj.PandasPandasJointer.quick_join(train2, df_ckr, on=["fold", feat_name])
        train2.drop(["impression", "click"], axis=1, inplace=True)
        assert train2.isnull().sum().sum() == 0
        
        # release memory and collect garbage
        del df_ckr
        gc.collect()

[10:50:35] Finish joining click data for 'aid' by fold. △M: -2.2GB. △T: 16.6 seconds.
[10:50:53] Finish joining click data for 'campaignId' by fold. △M: +546.48MB. △T: 17.9 seconds.
[10:51:09] Finish joining click data for 'age' by fold. △M: +67.13MB. △T: 15.9 seconds.
[10:51:25] Finish joining click data for 'consumptionAbility' by fold. △M: +67.13MB. △T: 16.4 seconds.
[10:51:42] Finish joining click data for 'education' by fold. △M: +67.13MB. △T: 16.5 seconds.
[10:51:59] Finish joining click data for 'LBS' by fold. △M: +67.16MB. △T: 17.6 seconds.


In [43]:
output_folder = config.INPUT_DIR
output_file = "train.raw.clickStats_v1.pkl"
output_path = os.path.join(output_folder, output_file)
os.makedirs(output_folder, exist_ok=True)

with pu.profiler("getting matrix represenation"):
    X_train = train2[col_names].values.astype(np.float32)
    assert X_train.shape[0] == df_train.shape[0]
    assert X_train.shape[1] == len(use_feat_names)

with pu.profiler("saving matrix to hard disk"):
    du.save_pickle((col_names, X_train), output_path)
    col_names_train = copy.copy(col_names)  # for subsequent checking
    del X_train
    del train2
    gc.collect()

[10:54:46] Finish getting matrix represenation. △M: +201.39MB. △T: 0.5 seconds.
[10:54:46] Finish saving matrix to hard disk. △M: -201.39MB. △T: 0.6 seconds.


In [48]:
test = pd.merge(df_test, df_ad, on='aid', how='left')

# not applicable for multi-value cases
for feat_name in tqdm.tqdm(user_one_feat_names):
    df_feat = du.load_user_feature(feat_name).fillna('[nan]')
    test = pd.merge(test, df_feat, on="uid", how="left")

100%|██████████| 7/7 [00:37<00:00,  5.31s/it]


In [49]:
col_names = []

for feat_name in use_feat_names:
    with pu.profiler("joining click data for '{}' by fold".format(feat_name)):
        # preparation
        ckr_name = "bsClickrate@{}".format(feat_name)
        col_names.append(ckr_name)
        ckrs = np.zeros((test.shape[0], n_splits))
        quick_load = partial(load_clickrate, num_folds=n_splits, feat_name=feat_name)
        test[feat_name] = test[feat_name].apply(str)  # unify dtype for joining
        
        for split_index in range(n_splits):
            # load click stats computed from current split
            df_ckr = quick_load(fold_index=split_index)
            df_ckr = df_ckr.rename(columns={"value": feat_name, "bs_clickrate": ckr_name})
            
            # unify dtype for joining
            df_ckr[feat_name] = df_ckr[feat_name].astype(str)
            
            # join data
            test = dj.PandasPandasJointer.quick_join(test, df_ckr, on=feat_name)
            ckrs[:, split_index] = test[ckr_name]
            
            # clean up
            test.drop([ckr_name, "impression", "click"], axis=1, inplace=True)
            del df_ckr
            gc.collect()
        
        # use average as the final feature
        test[ckr_name] = ckrs.mean(axis=1)
        assert test.isnull().sum().sum() == 0
        
        # clean up
        del ckrs
        gc.collect()

[11:07:02] Finish joining click data for 'aid' by fold. △M: -257.22MB. △T: 8.9 seconds.
[11:07:14] Finish joining click data for 'campaignId' by fold. △M: +140.3MB. △T: 11.7 seconds.
[11:07:25] Finish joining click data for 'age' by fold. △M: -121.02MB. △T: 11.2 seconds.
[11:07:37] Finish joining click data for 'consumptionAbility' by fold. △M: +69.16MB. △T: 11.6 seconds.
[11:07:48] Finish joining click data for 'education' by fold. △M: +34.57MB. △T: 11.8 seconds.
[11:08:01] Finish joining click data for 'LBS' by fold. △M: +17.32MB. △T: 12.7 seconds.


In [50]:
output_file = "test1.raw.clickStats_v1.pkl"
output_path = os.path.join(output_folder, output_file)

with pu.profiler("getting matrix represenation"):
    X_test = test[col_names].values.astype(np.float32)
    assert X_test.shape[0] == df_test.shape[0]
    assert X_test.shape[1] == len(use_feat_names)

with pu.profiler("saving matrix to hard disk"):
    du.save_pickle((col_names, X_test), output_path)
    del X_test
    del test
    gc.collect()

[11:08:24] Finish getting matrix represenation. △M: +0B. △T: 0.0 seconds.
[11:08:24] Finish saving matrix to hard disk. △M: -505.82MB. △T: 0.2 seconds.
