In [1]:
from functools import partial
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import copy
import gc
import os
import sys
sys.path.append('../../../code/utils')
sys.path.append('../../../code/feature')
sys.path.append('../../../code/pipeline')
sys.path.append('../../../code')
from clickrate import BayesianSmoothedClickrate
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
# Base path of the per-fold click-history CSVs for user x user feature crosses
# (the fold-specific suffix is appended by click_history_fold_dir).
clickhist_folder = os.path.join(config.DATA_DIR, "click_history/user_cross")
# Base path under which the Bayesian-smoothed click-rate CSVs and their
# parameter files are stored (suffix appended by click_rate_fold_dir).
clickrate_folder = os.path.join(config.DATA_DIR, "clickrate_bs/user_cross")


def click_history_fold_dir(num_folds, create=True):
    """Return the click-history root directory for a given fold count.

    The directory name is ``clickhist_folder`` with the suffix
    ``[StratifiedKFold_<num_folds>]`` appended.

    Parameters
    ----------
    num_folds : number of folds, embedded in the directory name.
    create : when true, the directory is created if it does not exist yet.

    Returns
    -------
    str : the directory path.
    """
    fold_root = "{root}[StratifiedKFold_{k}]".format(root=clickhist_folder, k=num_folds)
    if not create:
        return fold_root
    os.makedirs(fold_root, exist_ok=True)
    return fold_root


def click_history_path(num_folds, fold_index, user_feat_name1, user_feat_name2, user1_val, create=True):
    """Build the path of one click-history CSV.

    Layout: ``<fold root>/<fold_index>/[featureName='<user_feat_name2>']/
    [<user_feat_name1>='<user1_val>'].csv``.

    Parameters
    ----------
    num_folds : fold count selecting the fold root directory.
    fold_index : index of the fold; used as a sub-directory name.
    user_feat_name1 : feature whose value is encoded in the file name.
    user_feat_name2 : feature encoded in the directory name.
    user1_val : value of ``user_feat_name1`` this file belongs to.
    create : when true, missing directories are created.

    Returns
    -------
    str : path of the CSV file (the file itself is never created here).
    """
    feature_dir = os.path.join(
        click_history_fold_dir(num_folds, create),
        str(fold_index),
        "[featureName='{}']".format(user_feat_name2),
    )
    if create:
        os.makedirs(feature_dir, exist_ok=True)
    return os.path.join(feature_dir, "[{}='{}'].csv".format(user_feat_name1, user1_val))


def click_rate_fold_dir(num_folds, create=True):
    """Return the click-rate root directory for a given fold count.

    The directory name is ``clickrate_folder`` with the suffix
    ``[StratifiedKFold_<num_folds>]`` appended.

    Parameters
    ----------
    num_folds : number of folds, embedded in the directory name.
    create : when true, the directory is created if it does not exist yet.

    Returns
    -------
    str : the directory path.
    """
    fold_root = "{root}[StratifiedKFold_{k}]".format(root=clickrate_folder, k=num_folds)
    if not create:
        return fold_root
    os.makedirs(fold_root, exist_ok=True)
    return fold_root


def click_rate_paths(num_folds, fold_index, user_feat_name1, user_feat_name2, create=True):
    """Build the output paths for one feature pair and fold.

    Both files live in
    ``<fold root>/<fold_index>/[featureName='<user_feat_name2>']/``.

    Parameters
    ----------
    num_folds : fold count selecting the fold root directory.
    fold_index : index of the fold; used as a sub-directory name.
    user_feat_name1 : feature encoded in both file names.
    user_feat_name2 : feature encoded in the directory name.
    create : when true, missing directories are created.

    Returns
    -------
    (str, str) : path of the click-rate CSV and path of the parameter CSV
        (``params[...]``), in that order.
    """
    pair_dir = os.path.join(
        click_rate_fold_dir(num_folds, create),
        str(fold_index),
        "[featureName='{}']".format(user_feat_name2),
    )
    if create:
        os.makedirs(pair_dir, exist_ok=True)

    rate_path = os.path.join(pair_dir, "[featureName='{}'].csv".format(user_feat_name1))
    params_path = os.path.join(pair_dir, "params[featureName='{}'].csv".format(user_feat_name1))
    return rate_path, params_path


def load_split_indices(num_folds):
    """Load the stored fold split indices for ``num_folds`` folds.

    The indices are read from ``indices.pkl`` inside the
    ``click_history/simple_cross/byUserFeatureName[StratifiedKFold_<num_folds>]``
    directory under ``config.DATA_DIR`` (note: the ``simple_cross`` tree, not
    the ``user_cross`` tree used elsewhere in this notebook).

    Parameters
    ----------
    num_folds : fold count, embedded in the directory name.

    Returns
    -------
    Whatever ``du.load_pickle`` returns for that file.
    """
    base_dir = os.path.join(config.DATA_DIR, "click_history/simple_cross/byUserFeatureName")
    fold_dir = "{}[StratifiedKFold_{}]".format(base_dir, num_folds)
    return du.load_pickle(os.path.join(fold_dir, "indices.pkl"))

In [6]:
def clean_feat_stats(feat_stats):
    """Derive impression counts and normalise column names of a click-history frame.

    Adds an ``impression`` column (``positive + negative``) and renames
    ``positive`` -> ``click`` and ``value`` -> ``user_val``.

    The input frame is left untouched; a new DataFrame is returned. (The
    previous implementation added ``impression`` to the caller's frame as a
    side effect.)

    Parameters
    ----------
    feat_stats : DataFrame with at least ``positive`` and ``negative`` columns.

    Returns
    -------
    DataFrame : copy of the input with the extra column and renamed columns.
    """
    impressions = feat_stats["positive"] + feat_stats["negative"]
    return (
        feat_stats
        .assign(impression=impressions)
        .rename(columns={"positive": "click", "value": "user_val"})
    )


def clean_meta(df_meta):
    """Prepare the smoothing-parameter table for export.

    ``ad_val`` is cast to ``str``, rows are ordered by
    ``clickrate_expectation`` and then ``alpha`` (both descending), and only
    the four exported columns are kept.

    The input frame is left untouched; the previous implementation overwrote
    ``ad_val`` in the caller's frame.

    Parameters
    ----------
    df_meta : DataFrame with ``ad_val``, ``alpha``, ``beta`` and
        ``clickrate_expectation`` columns.

    Returns
    -------
    DataFrame : new, sorted frame with columns
        ``["ad_val", "alpha", "beta", "clickrate_expectation"]``.
    """
    # assign() builds a new frame, so the caller's ad_val keeps its dtype.
    cleaned = df_meta.assign(ad_val=df_meta["ad_val"].astype(str))
    cleaned = cleaned.sort_values(["clickrate_expectation", "alpha"], ascending=False)
    return cleaned[["ad_val", "alpha", "beta", "clickrate_expectation"]]


def clean_clickrate(df_clickrate):
    """Prepare the smoothed click-rate table for export.

    Rows are ordered by ``bs_clickrate`` and then ``click`` (both descending),
    ``click`` and ``impression`` are cast to ``str``, and only the five
    exported columns are kept.

    Sorting happens *before* the string cast. The previous implementation
    cast first, so ties on ``bs_clickrate`` were broken by lexicographic
    string order (``"9"`` ranked above ``"10"``). It also overwrote the two
    count columns in the caller's frame; this version leaves the input
    untouched.

    Parameters
    ----------
    df_clickrate : DataFrame with ``ad_val``, ``user_val``, ``bs_clickrate``,
        ``click`` and ``impression`` columns.

    Returns
    -------
    DataFrame : new, sorted frame with columns
        ``["ad_val", "user_val", "bs_clickrate", "click", "impression"]``.
    """
    # Sort on the original count values so the tie-break is numeric.
    ordered = df_clickrate.sort_values(["bs_clickrate", "click"], ascending=False)
    ordered = ordered.assign(
        click=ordered["click"].astype(str),
        impression=ordered["impression"].astype(str),
    )
    return ordered[["ad_val", "user_val", "bs_clickrate", "click", "impression"]]

In [4]:
# Fold count; selects the "[StratifiedKFold_<n>]" directories used below.
n_splits = 5
split_indices = load_split_indices(n_splits)

# Map every user feature (except the last) to the features listed after it,
# so each unordered pair of distinct features appears exactly once.
pair_dict = {
    leading_feat: config.USER_SINGLE_FEAT_NAMES[pos + 1:]
    for pos, leading_feat in enumerate(config.USER_SINGLE_FEAT_NAMES[:-1])
}

# Flatten the mapping into (later feature, earlier feature) tuples.
pairs = [
    (trailing_feat, leading_feat)
    for leading_feat, trailing_feats in pair_dict.items()
    for trailing_feat in trailing_feats
]

df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
train_size, test_size = df_train.shape[0], df_test.shape[0]

# Join each user feature onto a copy of the training rows
# (presumably keyed on "uid" -- confirm against PandasPandasJointer).
train = df_train.copy()
user_jointer = dj.PandasPandasJointer("uid")
for ufeat in config.USER_SINGLE_FEAT_NAMES:
    with pu.profiler("loading and joining '{}'".format(ufeat)):
        # Missing feature values are replaced by the "[nan]" placeholder.
        df_feat = du.load_user_feature(ufeat).fillna("[nan]")
        train = user_jointer.join(df1=train, df2=df_feat)
# Every training row must have a value for every joined feature.
assert train.isnull().sum().sum() == 0

# Distinct values observed in the training data, per user feature.
avals_dict = {}
for feat_name in config.USER_SINGLE_FEAT_NAMES:
    avals_dict[feat_name] = set(train[feat_name].unique())

[16:51:41] Finish loading and joining 'age'. △M: +330.46MB. △T: 11.7 seconds.
[16:51:53] Finish loading and joining 'gender'. △M: +67.16MB. △T: 12.1 seconds.
[16:52:06] Finish loading and joining 'education'. △M: +67.14MB. △T: 12.9 seconds.
[16:52:23] Finish loading and joining 'consumptionAbility'. △M: +67.13MB. △T: 17.0 seconds.
[16:52:39] Finish loading and joining 'LBS'. △M: +68.37MB. △T: 16.5 seconds.
[16:52:55] Finish loading and joining 'carrier'. △M: +67.14MB. △T: 15.8 seconds.
[16:53:11] Finish loading and joining 'house'. △M: +67.13MB. △T: 16.2 seconds.


In [8]:
# Compute Bayesian-smoothed click rates for every feature pair and fold.
#
# NOTE: the "ad_*" names are legacy naming. Every pair is built from
# config.USER_SINGLE_FEAT_NAMES, so both sides are user features. The
# "ad_val" / "user_val" column names are kept because they are written to
# the output CSVs.
meta_columns = ["ad_val", "alpha", "beta", "clickrate_expectation"]
clickrate_columns = ["ad_val", "user_val", "bs_clickrate", "click", "impression"]

for ad_feat_name, user_feat_name in pairs:
    avals = avals_dict[ad_feat_name]
    # Invariant for the whole pair: fewer iterations whenever "LBS" is
    # involved (presumably to bound runtime -- confirm).
    max_iter = 1000 if "LBS" in (ad_feat_name, user_feat_name) else 10000

    for split_i in range(n_splits):
        # Accumulate plain Python containers and build the DataFrames once
        # per fold. Growing a DataFrame row by row / via DataFrame.append is
        # quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
        meta_records = []
        clickrate_frames = []

        desc = "'{}' x '{}' fold {}/{}".format(ad_feat_name, user_feat_name, split_i + 1, n_splits)
        for aval in tqdm.tqdm(list(avals), desc=desc):
            in_path = click_history_path(num_folds=n_splits, fold_index=split_i,
                                         user_feat_name2=user_feat_name, user_feat_name1=ad_feat_name,
                                         user1_val=aval)
            feat_stats = pd.read_csv(in_path)
            feat_stats = clean_feat_stats(feat_stats)

            imps = feat_stats["impression"].values
            clks = feat_stats["click"].values

            # One smoother is fitted per value of the first feature.
            bs = BayesianSmoothedClickrate(use_moment=False, use_fixed_point=True, max_iter=max_iter)
            bs.fit(imps, clks, verbose=False)

            feat_stats["bs_clickrate"] = bs.transform(imps, clks)
            feat_stats["ad_val"] = aval
            meta_records.append({"ad_val": aval, "alpha": bs.alpha, "beta": bs.beta,
                                 "clickrate_expectation": bs.clickrate_expectation})
            clickrate_frames.append(feat_stats[clickrate_columns])

        df_meta = pd.DataFrame(meta_records, columns=meta_columns)
        if clickrate_frames:
            df_clickrate = pd.concat(clickrate_frames)
        else:
            # pd.concat raises on an empty list; keep the empty-but-typed frame.
            df_clickrate = pd.DataFrame(columns=clickrate_columns)

        clickrate_path, meta_path = click_rate_paths(n_splits, split_i, ad_feat_name, user_feat_name)
        df_meta = clean_meta(df_meta)
        df_meta.to_csv(meta_path, index=False)
        df_clickrate = clean_clickrate(df_clickrate)
        df_clickrate.to_csv(clickrate_path, index=False)

'carrier' x 'LBS' fold 1/5: 100%|██████████| 4/4 [00:03<00:00,  1.18it/s]
'carrier' x 'LBS' fold 2/5: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]
'carrier' x 'LBS' fold 3/5: 100%|██████████| 4/4 [00:03<00:00,  1.18it/s]
'carrier' x 'LBS' fold 4/5: 100%|██████████| 4/4 [00:03<00:00,  1.15it/s]
'carrier' x 'LBS' fold 5/5: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]
'house' x 'LBS' fold 1/5: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
'house' x 'LBS' fold 2/5: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
'house' x 'LBS' fold 3/5: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
'house' x 'LBS' fold 4/5: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
'house' x 'LBS' fold 5/5: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
'education' x 'gender' fold 1/5: 100%|██████████| 8/8 [00:03<00:00,  2.39it/s]
'education' x 'gender' fold 2/5: 100%|██████████| 8/8 [00:03<00:00,  2.19it/s]
'education' x 'gender' fold 3/5: 100%|██████████| 8/8 [00:03<00:00,  2.26it/s]
'education' x 'gender' fold 4/5: 

'LBS' x 'age' fold 2/5: 100%|██████████| 840/840 [02:14<00:00,  6.23it/s]
'LBS' x 'age' fold 3/5: 100%|██████████| 840/840 [02:07<00:00,  6.60it/s]
'LBS' x 'age' fold 4/5: 100%|██████████| 840/840 [02:10<00:00,  6.42it/s]
'LBS' x 'age' fold 5/5: 100%|██████████| 840/840 [01:49<00:00,  7.67it/s]
'carrier' x 'age' fold 1/5: 100%|██████████| 4/4 [00:03<00:00,  1.08it/s]
'carrier' x 'age' fold 2/5: 100%|██████████| 4/4 [00:03<00:00,  1.13it/s]
'carrier' x 'age' fold 3/5: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]
'carrier' x 'age' fold 4/5: 100%|██████████| 4/4 [00:04<00:00,  1.03s/it]
'carrier' x 'age' fold 5/5: 100%|██████████| 4/4 [00:03<00:00,  1.13it/s]
'house' x 'age' fold 1/5: 100%|██████████| 2/2 [00:01<00:00,  1.16it/s]
'house' x 'age' fold 2/5: 100%|██████████| 2/2 [00:01<00:00,  1.14it/s]
'house' x 'age' fold 3/5: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
'house' x 'age' fold 4/5: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
'house' x 'age' fold 5/5: 100%|██████████| 2/2