In [1]:
from functools import partial
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import copy
import gc
import os
import sys
sys.path.append('../../../code/utils')
sys.path.append('../../../code/pipeline')
sys.path.append('../../../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
# Base paths (under config.DATA_DIR) for the cross-feature click history and the
# click-rate statistics. They are never used as-is: the *_fold_dir helpers below
# append a "[<mode>_<num_folds>]" suffix to obtain the actual directory.
clickhist_folder = os.path.join(config.DATA_DIR, "click_history/simple_cross/byUserFeatureName")
clickrate_folder = os.path.join(config.DATA_DIR, "clickrate_bs/simple_cross/byUserFeatureName")


def click_history_fold_dir(mode, num_folds):
    """Return the click-history directory for one split configuration.

    The result is the module-level ``clickhist_folder`` with the suffix
    ``[<mode>_<num_folds>]`` appended directly (no path separator).
    """
    return "{}[{}_{}]".format(clickhist_folder, mode, num_folds)


def click_rate_fold_dir(mode, num_folds):
    """Return the click-rate directory for one split configuration.

    The result is the module-level ``clickrate_folder`` with the suffix
    ``[<mode>_<num_folds>]`` appended directly (no path separator).
    """
    return "{}[{}_{}]".format(clickrate_folder, mode, num_folds)


def click_rate_path(mode, num_folds, fold_index, ad_feat_name, user_feat_name):
    """Build the path of the click-rate CSV for one fold and one feature pair.

    Layout::

        <click_rate_fold_dir(mode, num_folds)>/<fold_index>/
            [featureName='<user_feat_name>']/[adFeatureName='<ad_feat_name>'].csv
    """
    return os.path.join(
        click_rate_fold_dir(mode, num_folds),
        str(fold_index),
        "[featureName='{}']".format(user_feat_name),
        "[adFeatureName='{}'].csv".format(ad_feat_name),
    )


def click_rate_param_path(mode, num_folds, fold_index, ad_feat_name, user_feat_name):
    """Build the path of the parameter CSV for one fold and one feature pair.

    Same directory as the matching click-rate CSV; only the file name differs::

        <click_rate_fold_dir(mode, num_folds)>/<fold_index>/
            [featureName='<user_feat_name>']/params[adFeatureName='<ad_feat_name>'].csv
    """
    return os.path.join(
        click_rate_fold_dir(mode, num_folds),
        str(fold_index),
        "[featureName='{}']".format(user_feat_name),
        "params[adFeatureName='{}'].csv".format(ad_feat_name),
    )


def load_split_indices(mode, num_folds):
    """Load the pickled split indices for one split configuration.

    Parameters
    ----------
    mode : str
        Split mode used in the directory suffix (e.g. "StratifiedKFold").
    num_folds : int
        Number of folds used in the directory suffix.

    Returns
    -------
    Whatever ``du.load_pickle`` returns for
    ``<click_history_fold_dir(mode, num_folds)>/indices.pkl``. The notebook
    iterates over it as ``(train_index, valid_index)`` pairs.
    """
    # Use the argument, not the notebook-global `n_splits`: the global only
    # exists after a later cell has run and may disagree with `num_folds`.
    fold_dir = click_history_fold_dir(mode, num_folds=num_folds)
    index_path = os.path.join(fold_dir, "indices.pkl")
    return du.load_pickle(index_path)


def load_clickrate(mode, num_folds, fold_index, ad_feat_name, user_feat_name):
    """Read the click-rate CSV of one fold / feature pair into a DataFrame.

    The file location is resolved by ``click_rate_path`` from the same
    arguments; the CSV is parsed with pandas' default ``read_csv`` settings.
    """
    return pd.read_csv(
        click_rate_path(mode, num_folds, fold_index, ad_feat_name, user_feat_name)
    )


def batch_load_clickrate(num_folds, ad_feat_name, user_feat_name):
    """Load the click-rate tables of every fold and stack them into one frame.

    Parameters
    ----------
    num_folds : int
        Number of folds; fold indices ``0 .. num_folds - 1`` are loaded.
    ad_feat_name, user_feat_name : str
        Feature pair whose click-rate CSVs should be read.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (fresh 0..n-1 index) of the per-fold tables,
        with an extra integer column ``"fold"`` holding the fold index each
        row was loaded from.

    Raises
    ------
    ValueError
        If ``num_folds`` is smaller than 1 (there would be nothing to load).

    Notes
    -----
    The split mode is fixed to "StratifiedKFold".
    """
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1, got {}".format(num_folds))

    quick_load = partial(load_clickrate, mode="StratifiedKFold", num_folds=num_folds,
                         ad_feat_name=ad_feat_name, user_feat_name=user_feat_name)

    # Iterate over the `num_folds` argument (not the notebook-global
    # `n_splits`) and collect the frames so they are concatenated only once
    # instead of re-copying the accumulated frame on every iteration.
    fold_frames = []
    for i in range(num_folds):
        df_fold = quick_load(fold_index=i)
        df_fold["fold"] = i
        fold_frames.append(df_fold)

    df_clickrate = pd.concat(fold_frames, ignore_index=True)

    # Release the per-fold frames before handing back the combined table.
    del fold_frames, df_fold
    gc.collect()

    df_clickrate["fold"] = df_clickrate["fold"].astype(int)
    return df_clickrate


def insert_valid_fold_index(df, fold_indices):
    """Return a copy of ``df`` with a ``"fold"`` column marking validation folds.

    ``fold_indices`` is iterated as ``(train_index, valid_index)`` pairs. Rows
    selected by the i-th ``valid_index`` get ``fold == i``; rows that appear in
    no ``valid_index`` keep the initial value ``-1``. The input frame is left
    untouched.

    NOTE(review): rows are selected with ``.loc`` (label-based), so this
    presumably relies on ``df`` having the default 0..n-1 index when the
    indices are positional - confirm against the caller.
    """
    tagged = df.assign(fold=-1)
    for fold_no, (_, valid_rows) in enumerate(fold_indices):
        tagged.loc[valid_rows, "fold"] = fold_no
    return tagged

In [3]:
# Split configuration; it selects which "[<mode>_<n_splits>]" directories are read.
mode = "StratifiedKFold"
n_splits = 5
split_indices = load_split_indices(mode, n_splits)

# (ad feature, user feature) pairs for which click statistics are joined.
pairs = [
    ("aid", "age"),
    ("aid", "education"),
    ("aid", "consumptionAbility"),
    ("aid", "LBS"),
    ("productId", "LBS"),
]

# Distinct feature names appearing on either side of the pairs.
required_user_features = {user_feat for _, user_feat in pairs}
required_ad_features = {ad_feat for ad_feat, _ in pairs}

In [4]:
# Load the raw training rows. `train` itself is left untouched; its row count
# is compared with the final feature matrix further down in this cell.
train = du.load_raw_data("train")
df_train = train.copy()
# Add a "fold" column: the index of the split in which each row is a
# validation row (-1 if it is in none).
df_train = insert_valid_fold_index(df_train, split_indices)

# Attach the ad table (jointer constructed with "aid" - presumably the join key).
aj = dj.PandasPandasJointer("aid")
df_ad = du.load_raw_data("ad")
df_train = aj.join(df_train, df_ad)

# Attach every user feature that appears in `pairs` (jointer constructed with
# "uid" - presumably the join key).
uj = dj.PandasPandasJointer("uid")
for user_feat_name in required_user_features:
    with pu.profiler("loading and joining '{}'".format(user_feat_name)):
        df_feat = du.load_user_feature(user_feat_name)
        # Missing values become the literal "[nan]" and the column is cast to
        # str; the click-rate tables' user column is cast to str as well before
        # the two are joined below.
        df_feat[user_feat_name] = df_feat[user_feat_name].fillna("[nan]").astype(str)
        df_train = uj.join(df_train, df_feat)
        # Free the per-feature table before loading the next one.
        del df_feat
        gc.collect()
        
# Names of the generated feature columns, in the order they are added
# (click rate, then impression, for each pair).
col_names = []
for ad_feat_name, user_feat_name in pairs:
    with pu.profiler("joining click data for '{}' x '{}' by fold".format(ad_feat_name, user_feat_name)):
        # preparation: output column names for this feature pair
        ckr_name = "bsClickrate@{}_x_{}".format(ad_feat_name, user_feat_name)
        imp_name = "impression@{}_x_{}".format(ad_feat_name, user_feat_name)
        col_names += [ckr_name, imp_name]
        
        # load the click-rate tables of all folds (tagged with a "fold" column)
        # and rename their generic columns to the names used in df_train
        df_clickrate = batch_load_clickrate(n_splits, ad_feat_name, user_feat_name)
        df_clickrate = df_clickrate.rename(columns={"ad_val": ad_feat_name, 
                                                    "user_val": user_feat_name, 
                                                    "bs_clickrate": ckr_name, 
                                                    "impression": imp_name})
        # df_train holds the user feature as str, so the join key must match.
        df_clickrate[user_feat_name] = df_clickrate[user_feat_name].astype(str)
    
        # Join on "fold" as well, so each row only receives the statistics
        # stored for the fold in which it is a validation row.
        df_train = dj.PandasPandasJointer.quick_join(df_train, df_clickrate, 
                                                     on=["fold", ad_feat_name, user_feat_name])

        # "click" is not one of the exported columns - presumably it comes
        # from the click-rate table; TODO confirm. Dropped so the next pair's
        # join does not collide with it.
        df_train.drop("click", axis=1, inplace=True)
        del df_clickrate
        
        # Every training row must have found statistics for this pair.
        assert df_train.isnull().sum().sum() == 0
        gc.collect()
        
# Destination of the pickled training feature matrix.
output_folder = config.INPUT_DIR
output_file = "train.cross.clickStats_v1.pkl"
output_path = os.path.join(output_folder, output_file)
os.makedirs(output_folder, exist_ok=True)

with pu.profiler("getting train matrix"):
    # Keep only the generated columns, as a float32 array.
    X_train = df_train[col_names].values.astype(np.float32)
    # The joins must not have added or removed rows, and each pair must have
    # contributed exactly two columns.
    assert X_train.shape[0] == train.shape[0]
    assert X_train.shape[1] == 2 * len(pairs)

with pu.profiler("saving train matrix"):
    # Saved as a (column names, matrix) tuple.
    du.save_pickle((col_names, X_train), output_path)
    # Remember the training column order; the test cell asserts that it
    # produces the same names in the same order.
    col_names_train = copy.copy(col_names)  # for subsequent checking
    del X_train
    del df_train
    gc.collect()

[05:08:19] Finish loading and joining 'education'. △M: -514.13MB. △T: 12.7 seconds.
[05:08:32] Finish loading and joining 'LBS'. △M: +68.3MB. △T: 13.3 seconds.
[05:08:45] Finish loading and joining 'consumptionAbility'. △M: +67.14MB. △T: 13.1 seconds.
[05:08:59] Finish loading and joining 'age'. △M: +67.13MB. △T: 13.5 seconds.
[05:09:11] Finish joining click data for 'aid' x 'age' by fold. △M: +134.63MB. △T: 11.7 seconds.
[05:09:24] Finish joining click data for 'aid' x 'education' by fold. △M: +134.37MB. △T: 12.9 seconds.
[05:09:37] Finish joining click data for 'aid' x 'consumptionAbility' by fold. △M: +134.27MB. △T: 13.7 seconds.
[05:09:53] Finish joining click data for 'aid' x 'LBS' by fold. △M: +127.01MB. △T: 16.2 seconds.
[05:10:09] Finish joining click data for 'productId' x 'LBS' by fold. △M: +147.11MB. △T: 16.0 seconds.
[05:10:11] Finish getting train matrix. △M: +335.66MB. △T: 1.6 seconds.
[05:10:11] Finish saving train matrix. △M: -2.03GB. △T: 0.5 seconds.


In [None]:
# load data; `test` is kept untouched for the row-count check at the end
test = du.load_raw_data("test2")
df_test = test.copy()

# join ad features (reuses the jointer and ad table created in the training cell)
df_test = aj.join(df_test, df_ad)

# join the user features required by `pairs`
for user_feat_name in required_user_features:
    with pu.profiler("loading and joining '{}'".format(user_feat_name)):
        df_feat = du.load_user_feature(user_feat_name)
        # Same normalisation as for the training data: missing -> "[nan]", cast to str.
        df_feat[user_feat_name] = df_feat[user_feat_name].fillna("[nan]").astype(str)
        df_test = uj.join(df_test, df_feat)
        del df_feat
        gc.collect()

# Names of the generated feature columns, in the order they are added
# (click rate, then impression, for each pair).
col_names = []
for ad_feat_name, user_feat_name in pairs:
    with pu.profiler("joining click data for '{}' x '{}' by fold".format(ad_feat_name, user_feat_name)):
        # preparation: output column names and one result column per split
        ckr_name = "bsClickrate@{}_x_{}".format(ad_feat_name, user_feat_name)
        imp_name = "impression@{}_x_{}".format(ad_feat_name, user_feat_name)
        col_names += [ckr_name, imp_name]
        ckrs = np.zeros((df_test.shape[0], n_splits))
        imps = np.zeros((df_test.shape[0], n_splits))
        quick_load = partial(load_clickrate, mode="StratifiedKFold", num_folds=n_splits,
                             ad_feat_name=ad_feat_name, user_feat_name=user_feat_name)

        for split_index in range(n_splits):
            # load click stats computed from current split
            df_clickrate = quick_load(fold_index=split_index)
            df_clickrate = df_clickrate.rename(columns={"ad_val": ad_feat_name,
                                                        "user_val": user_feat_name,
                                                        "bs_clickrate": ckr_name,
                                                        "impression": imp_name})
            # df_test holds the user feature as str, so the join key must match.
            df_clickrate[user_feat_name] = df_clickrate[user_feat_name].astype(str)

            # join data and keep this split's values (positionally) aside
            df_test = dj.PandasPandasJointer.quick_join(df_test, df_clickrate,
                                                         on=[ad_feat_name, user_feat_name])
            ckrs[:, split_index] = df_test[ckr_name]
            imps[:, split_index] = df_test[imp_name]

            # clean up so the next split's join starts from the same columns
            df_test.drop([ckr_name, imp_name, "click"], axis=1, inplace=True)
            del df_clickrate
            gc.collect()

        # use average as the final feature; a row that is NaN in any split
        # stays NaN after the mean and is handled by the fallback below
        df_test[ckr_name] = ckrs.mean(axis=1)
        df_test[imp_name] = imps.mean(axis=1)

        # Fallback for rows containing NaN: impression is set to 0 and the
        # click rate to the average, over all splits, of the
        # `clickrate_expectation` stored for the row's ad value in the
        # per-split params CSV.
        # Each params file is read once per split (not once per NaN row) and
        # applied to all affected rows at once. A positional boolean mask is
        # used so the fill does not depend on df_test's index labels.
        nan_mask = df_test.isnull().any(axis=1).values
        if nan_mask.any():
            nan_ad_vals = df_test.loc[nan_mask, ad_feat_name]
            ckr_fallback = np.zeros(nan_ad_vals.shape[0])
            for split_index in range(n_splits):
                df_params = pd.read_csv(click_rate_param_path(mode, n_splits, split_index,
                                                              ad_feat_name, user_feat_name))
                # First row per ad value wins, as with `.values[0]` on a filter.
                expectation = (df_params.drop_duplicates("ad_val")
                                        .set_index("ad_val")["clickrate_expectation"])
                unknown = ~nan_ad_vals.isin(expectation.index)
                if unknown.any():
                    raise ValueError(
                        "no clickrate_expectation for '{}' value(s) {} in split {}".format(
                            ad_feat_name, nan_ad_vals[unknown].unique()[:5].tolist(), split_index))
                ckr_fallback += nan_ad_vals.map(expectation).values
            df_test.loc[nan_mask, imp_name] = 0
            df_test.loc[nan_mask, ckr_name] = ckr_fallback / n_splits
            del nan_ad_vals, ckr_fallback, df_params, expectation
        assert df_test.isnull().sum().sum() == 0

        # clean up
        del ckrs
        del imps
        del nan_mask
        gc.collect()
        
# The test features must have the same names, in the same order, as the
# training features saved earlier (col_names_train is set in the training cell).
assert len(col_names) == len(col_names_train)
for i in range(len(col_names)):
    assert col_names[i] == col_names_train[i]
    
# Written next to the training matrix (output_folder is set in the training cell).
output_file = "test2.cross.clickStats_v1.pkl"
output_path = os.path.join(output_folder, output_file)

with pu.profiler("getting test matrix"):
    # Keep only the generated columns, as a float32 array.
    X_test = df_test[col_names].values.astype(np.float32)
    # The joins must not have added or removed rows, and each pair must have
    # contributed exactly two columns.
    assert X_test.shape[0] == test.shape[0]
    assert X_test.shape[1] == 2 * len(pairs)

with pu.profiler("saving test matrix"):
    # Saved as a (column names, matrix) tuple, like the training file.
    du.save_pickle((col_names, X_test), output_path)
    del X_test
    del df_test
    gc.collect()

[05:29:08] Finish loading and joining 'education'. △M: +61.42MB. △T: 8.4 seconds.
[05:29:17] Finish loading and joining 'LBS'. △M: -34.58MB. △T: 8.5 seconds.
[05:29:25] Finish loading and joining 'consumptionAbility'. △M: +34.58MB. △T: 8.2 seconds.
[05:29:33] Finish loading and joining 'age'. △M: -9.55MB. △T: 8.1 seconds.
[05:29:41] Finish joining click data for 'aid' x 'age' by fold. △M: -103.6MB. △T: 8.2 seconds.
[05:29:50] Finish joining click data for 'aid' x 'education' by fold. △M: +51.74MB. △T: 8.8 seconds.
