In [1]:
from functools import partial
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import copy
import gc
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/feature')
sys.path.append('../code/pipeline')
sys.path.append('../code')
from clickrate import BayesianSmoothedClickrate
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
# Root folders under config.DATA_DIR: `clickhist_folder` holds the per-fold click-history
# CSVs that are read later in this notebook, `clickrate_folder` receives the smoothed
# click-rate CSVs that are written. The "[StratifiedKFold_<n>]" suffix is appended by the
# *_fold_dir helpers.
clickhist_folder = os.path.join(config.DATA_DIR, "click_history/simple_cross/byUserFeatureName")
clickrate_folder = os.path.join(config.DATA_DIR, "clickrate_bs/simple_cross/byUserFeatureName")


def click_history_fold_dir(num_folds, create=True):
    """Return the click-history directory used for a `num_folds`-fold split.

    The result is `clickhist_folder` with "[StratifiedKFold_<num_folds>]" appended.

    Args:
        num_folds: number of folds, formatted into the directory name.
        create: when true, make sure the directory exists before returning.

    Returns:
        The directory path as a string.
    """
    fold_dir = "{base}[StratifiedKFold_{k}]".format(base=clickhist_folder, k=num_folds)
    if not create:
        return fold_dir
    os.makedirs(fold_dir, exist_ok=True)
    return fold_dir


def click_history_path(num_folds, fold_index, ad_feat_name, user_feat_name, ad_val, create=True):
    """Build the path of the click-history CSV for one ad value within one fold.

    Layout: <fold dir>/<fold_index>/[featureName='<user_feat_name>']/[<ad_feat_name>='<ad_val>'].csv

    Args:
        num_folds: number of folds; selects the fold directory.
        fold_index: index of the fold, used as a sub-directory name.
        ad_feat_name: ad-side feature name, part of the file name.
        user_feat_name: user-side feature name, part of the directory name.
        ad_val: value of the ad feature, part of the file name.
        create: when true, create the containing directories if they are missing.

    Returns:
        The CSV file path as a string (the file itself is never created here).
    """
    fold_root = click_history_fold_dir(num_folds, create)
    feat_dir = os.path.join(fold_root, str(fold_index),
                            "[featureName='{}']".format(user_feat_name))
    if create:
        os.makedirs(feat_dir, exist_ok=True)
    return os.path.join(feat_dir, "[{}='{}'].csv".format(ad_feat_name, ad_val))


def click_rate_fold_dir(num_folds, create=True):
    """Return the click-rate directory used for a `num_folds`-fold split.

    The result is `clickrate_folder` with "[StratifiedKFold_<num_folds>]" appended.

    Args:
        num_folds: number of folds, formatted into the directory name.
        create: when true, make sure the directory exists before returning.

    Returns:
        The directory path as a string.
    """
    fold_dir = "{base}[StratifiedKFold_{k}]".format(base=clickrate_folder, k=num_folds)
    if not create:
        return fold_dir
    os.makedirs(fold_dir, exist_ok=True)
    return fold_dir


def click_rate_paths(num_folds, fold_index, ad_feat_name, user_feat_name, create=True):
    """Build the two output paths for one (ad feature, user feature) pair within one fold.

    Both files live in <fold dir>/<fold_index>/[featureName='<user_feat_name>']/.

    Args:
        num_folds: number of folds; selects the fold directory.
        fold_index: index of the fold, used as a sub-directory name.
        ad_feat_name: ad-side feature name, part of both file names.
        user_feat_name: user-side feature name, part of the directory name.
        create: when true, create the containing directories if they are missing.

    Returns:
        A tuple (clickrate_filepath, meta_filepath): the click-rate CSV path and the
        "params[...]" CSV path.
    """
    feat_dir = os.path.join(click_rate_fold_dir(num_folds, create), str(fold_index),
                            "[featureName='{}']".format(user_feat_name))
    if create:
        os.makedirs(feat_dir, exist_ok=True)
    clickrate_filepath, meta_filepath = (
        os.path.join(feat_dir, template.format(ad_feat_name))
        for template in ("[adFeatureName='{}'].csv", "params[adFeatureName='{}'].csv")
    )
    return clickrate_filepath, meta_filepath


def load_split_indices(num_folds):
    """Load the pickled split indices saved for a `num_folds`-fold split.

    Reads "indices.pkl" from the click-history fold directory. The directory is looked
    up with create=False, so nothing is created on disk by this call.

    Args:
        num_folds: number of folds; selects the fold directory.

    Returns:
        Whatever `du.load_pickle` returns for that file.
    """
    index_path = os.path.join(
        click_history_fold_dir(num_folds=num_folds, create=False),
        "indices.pkl",
    )
    return du.load_pickle(index_path)

In [3]:
# Load the raw "ad" table through the project's data utilities; its columns are used
# further down to enumerate the distinct values of each ad-side feature.
df_ad = du.load_raw_data("ad")

In [4]:
# Number of folds. The matching "[StratifiedKFold_<n_splits>]" click-history folder must
# already exist, because the split indices are read from it (nothing is created here).
n_splits = 5
split_indices = load_split_indices(n_splits)

In [5]:
# (ad feature, user feature) pairs to cross. Each pair is processed once per fold;
# commented-out entries are candidates that are currently disabled.
pairs = [('productId', 'LBS'),
         ('advertiserId', 'interest1'),
         ('aid', 'interest2'),
         ('creativeSize', 'interest2'),
         ('campaignId', 'interest4'),  # TODO: decide whether to keep this pair
         ('aid', 'interest5'),
         ('productType', 'kw1'),  # 'kw1' looks very prone to overfitting; to be decided whether to keep it
         ('productType', 'kw2'),
         ('productType', 'kw3'),
         ('productType', 'topic1'),
         ('aid', 'topic2'),
         ('productType', 'topic2'),
         # ('productType', 'topic3'),  # might help in predicting negative samples
         # ('productType', 'appIdInstall'),  # might help in predicting negative samples
         # ('productType', 'appIdAction'),  # might help in predicting negative samples
         ('aid', 'ct'),
         ('aid', 'os')]

In [6]:
# Distinct user-side and ad-side feature names that occur in `pairs`.
ufeats_to_join = {user_feat for _, user_feat in pairs}
afeats_to_join = {ad_feat for ad_feat, _ in pairs}
# For every ad-side feature: the set of distinct values it takes in the ad table.
avals_dict = {ad_feat: set(df_ad[ad_feat].unique()) for ad_feat in afeats_to_join}

In [7]:
def clean_feat_stats(feat_stats):
    """Normalise one click-history table for click-rate estimation.

    Adds an "impression" column (positive + negative) and renames "positive" to "click"
    and "value" to "user_val". The "negative" column is kept as is.

    The input frame is left untouched: the new column is added on a copy, so the caller's
    frame does not silently gain an "impression" column.

    Args:
        feat_stats: DataFrame with at least the columns "positive" and "negative"
            (and "value", if it is to be renamed).

    Returns:
        A new DataFrame with the added and renamed columns.

    Raises:
        KeyError: if "positive" or "negative" is missing.
    """
    impressions = feat_stats["positive"] + feat_stats["negative"]
    return (feat_stats
            .assign(impression=impressions)
            .rename(columns={"positive": "click", "value": "user_val"}))


def clean_meta(df_meta):
    """Return the smoothing-parameter table in its on-disk layout.

    Casts "ad_val" to int, sorts by "clickrate_expectation" and then "alpha" (both
    descending) and keeps only the four output columns, in a fixed order.

    The cast is applied to a copy, so the caller's frame keeps its original dtypes.

    Args:
        df_meta: DataFrame with the columns "ad_val", "alpha", "beta" and
            "clickrate_expectation"; "ad_val" must be convertible to int.

    Returns:
        A new DataFrame with columns ["ad_val", "alpha", "beta", "clickrate_expectation"].
    """
    out_columns = ["ad_val", "alpha", "beta", "clickrate_expectation"]
    # Defensive cast: the ad values may not arrive as an integer column, depending on
    # how the caller assembled the frame.
    cleaned = df_meta.assign(ad_val=df_meta["ad_val"].astype(int))
    cleaned = cleaned.sort_values(["clickrate_expectation", "alpha"], ascending=False)
    return cleaned[out_columns]


def clean_clickrate(df_clickrate):
    """Return the smoothed click-rate table in its on-disk layout.

    Casts "click" and "impression" to int, sorts by "bs_clickrate" and then "click"
    (both descending) and keeps only the five output columns, in a fixed order.

    The cast is applied to a copy, so the caller's frame keeps its original dtypes.

    Args:
        df_clickrate: DataFrame with the columns "ad_val", "user_val", "bs_clickrate",
            "click" and "impression"; the two count columns must be convertible to int.

    Returns:
        A new DataFrame with columns
        ["ad_val", "user_val", "bs_clickrate", "click", "impression"].
    """
    out_columns = ["ad_val", "user_val", "bs_clickrate", "click", "impression"]
    cleaned = df_clickrate.astype({"click": int, "impression": int})
    cleaned = cleaned.sort_values(["bs_clickrate", "click"], ascending=False)
    return cleaned[out_columns]

In [11]:
# For every (ad feature, user feature) pair and every fold: read the click history of
# each ad value, fit a Bayesian-smoothed click rate on it, and write two CSV files --
# the smoothed click rates and the fitted smoothing parameters.
meta_columns = ["ad_val", "alpha", "beta", "clickrate_expectation"]
clickrate_columns = ["ad_val", "user_val", "bs_clickrate", "click", "impression"]

for ad_feat_name, user_feat_name in pairs:
    avals = avals_dict[ad_feat_name]
    for split_i in range(n_splits):
        # Collect per-ad-value results in plain lists and build the frames once at the
        # end: DataFrame.append was removed in pandas 2.0, and growing a frame inside
        # the loop copies all accumulated rows on every iteration.
        meta_records = []
        clickrate_frames = []

        desc = "'{}' x '{}' fold {}/{}".format(ad_feat_name, user_feat_name, split_i + 1, n_splits)
        for aval in tqdm.tqdm(list(avals), desc=desc):
            # create=False: this is an input file, so resolving its path must not
            # create directories as a side effect.
            in_path = click_history_path(num_folds=n_splits, fold_index=split_i,
                                         user_feat_name=user_feat_name, ad_feat_name=ad_feat_name,
                                         ad_val=aval, create=False)
            feat_stats = clean_feat_stats(pd.read_csv(in_path))

            imps = feat_stats["impression"].values
            clks = feat_stats["click"].values
            # A separate smoother is fitted for each ad value.
            bs = BayesianSmoothedClickrate(use_moment=True, use_fixed_point=False)
            bs.fit(imps, clks, verbose=False)

            feat_stats["bs_clickrate"] = bs.transform(imps, clks)
            feat_stats["ad_val"] = aval
            meta_records.append({"ad_val": aval, "alpha": bs.alpha, "beta": bs.beta,
                                 "clickrate_expectation": bs.clickrate_expectation})
            clickrate_frames.append(feat_stats[clickrate_columns])

        df_meta = pd.DataFrame(meta_records, columns=meta_columns)
        if clickrate_frames:
            df_clickrate = pd.concat(clickrate_frames, ignore_index=True)
        else:
            # pd.concat raises on an empty list; fall back to an empty, correctly shaped frame.
            df_clickrate = pd.DataFrame(columns=clickrate_columns)

        clickrate_path, meta_path = click_rate_paths(n_splits, split_i, ad_feat_name, user_feat_name)
        df_meta = clean_meta(df_meta)
        df_meta.to_csv(meta_path, index=False)
        df_clickrate = clean_clickrate(df_clickrate)
        df_clickrate.to_csv(clickrate_path, index=False)

'productId' x 'LBS' fold 1/5: 100%|██████████| 33/33 [00:00<00:00, 49.01it/s]
'productId' x 'LBS' fold 2/5: 100%|██████████| 33/33 [00:00<00:00, 55.69it/s]
'productId' x 'LBS' fold 3/5: 100%|██████████| 33/33 [00:00<00:00, 57.11it/s]
'productId' x 'LBS' fold 4/5: 100%|██████████| 33/33 [00:00<00:00, 52.07it/s]
'productId' x 'LBS' fold 5/5: 100%|██████████| 33/33 [00:00<00:00, 83.31it/s]
'advertiserId' x 'interest1' fold 1/5: 100%|██████████| 79/79 [00:00<00:00, 81.71it/s]
'advertiserId' x 'interest1' fold 2/5: 100%|██████████| 79/79 [00:00<00:00, 97.87it/s]
'advertiserId' x 'interest1' fold 3/5: 100%|██████████| 79/79 [00:00<00:00, 108.62it/s]
'advertiserId' x 'interest1' fold 4/5: 100%|██████████| 79/79 [00:00<00:00, 112.37it/s]
'advertiserId' x 'interest1' fold 5/5: 100%|██████████| 79/79 [00:00<00:00, 100.81it/s]
'aid' x 'interest2' fold 1/5: 100%|██████████| 173/173 [00:02<00:00, 78.32it/s]
'aid' x 'interest2' fold 2/5: 100%|██████████| 173/173 [00:01<00:00, 86.72it/s]
'aid' x 'int