In [1]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import operator
import pickle
import tqdm
import time
import os
import gc
import matplotlib.pyplot as plt
import sys
sys.path.append("../../../code/utils")
sys.path.append("../../../code/feature")
sys.path.append("../../../code/pipeline")
sys.path.append("../../../code")
import data_jointer as dj
import data_utils as du
import perf_utils as pu
from clickrate import BayesianSmoothedClickrate
import history
import config

In [2]:
# Base path for the user-feature x user-feature click-history output. The
# helpers in this cell append a "[StratifiedKFold_<n>]" suffix directly to it.
clickhist_folder = os.path.join(config.DATA_DIR, "click_history/user_cross")


def click_history_fold_dir(num_folds, create=True):
    """Return the click-history directory used for a given number of folds.

    The path is `clickhist_folder` with "[StratifiedKFold_<num_folds>]"
    appended directly (no path separator in between).

    Parameters
    ----------
    num_folds : value formatted into the directory suffix.
    create : when truthy, the directory (including missing parents) is
        created if it does not already exist.

    Returns
    -------
    str
        The directory path.
    """
    fold_dir = "{base}[StratifiedKFold_{k}]".format(base=clickhist_folder, k=num_folds)
    if create:
        os.makedirs(fold_dir, exist_ok=True)
    return fold_dir


def click_history_path(num_folds, fold_index, user_feat_name1, user_feat_name2, user1_val, create=True):
    """Build the CSV path for one (fold, feature pair, feature-1 value) combination.

    Layout:
        <fold dir>/<fold_index>/[featureName='<user_feat_name2>']/[<user_feat_name1>='<user1_val>'].csv

    Parameters
    ----------
    num_folds : forwarded to `click_history_fold_dir` to pick the fold directory.
    fold_index : converted with `str()` and used as a sub-directory name.
    user_feat_name1 : feature name that appears in the file name.
    user_feat_name2 : feature name that appears in the directory name.
    user1_val : value of `user_feat_name1` that appears in the file name.
    create : when truthy, the containing directories are created if missing.
        The file itself is never created here.

    Returns
    -------
    str
        Full path of the CSV file.
    """
    feature_dir = os.path.join(
        click_history_fold_dir(num_folds, create),
        str(fold_index),
        "[featureName='{}']".format(user_feat_name2),
    )
    csv_name = "[{}='{}'].csv".format(user_feat_name1, user1_val)
    target = os.path.join(feature_dir, csv_name)
    if not create:
        return target
    # Only the directory is materialised; the caller writes the file.
    os.makedirs(feature_dir, exist_ok=True)
    return target


def load_split_indices(num_folds):
    """Load the pickled fold split indices for `num_folds` folds.

    The file read is
    <config.DATA_DIR>/click_history/simple_cross/byUserFeatureName[StratifiedKFold_<num_folds>]/indices.pkl
    and it is loaded with `du.load_pickle`. Note that this is a different
    base directory from `clickhist_folder`.

    Returns
    -------
    Whatever object the pickle holds. The calling cell iterates it as
    (train_index, valid_index) pairs, so presumably it is a sequence of
    such pairs; verify against the code that wrote the file.
    """
    base = os.path.join(config.DATA_DIR, "click_history/simple_cross/byUserFeatureName")
    pickle_path = os.path.join(
        "{}[StratifiedKFold_{}]".format(base, num_folds),
        "indices.pkl",
    )
    return du.load_pickle(pickle_path)

In [3]:
# Number of folds; selects which pre-computed "[StratifiedKFold_<n>]" index
# file is loaded by load_split_indices.
n_splits = 5
split_indices = load_split_indices(n_splits)

# Map each user feature (except the last) to the list of features that come
# after it in config.USER_SINGLE_FEAT_NAMES, so every unordered feature pair
# appears exactly once.
pair_dict = {}
for i, user_feat1 in enumerate(config.USER_SINGLE_FEAT_NAMES[:-1]):
    pair_dict[user_feat1] = config.USER_SINGLE_FEAT_NAMES[i + 1:]

# Raw train / test tables. In this cell the test table is only used for its
# row count.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
train_size = df_train.shape[0]
test_size = df_test.shape[0]

# Work on a copy so df_train itself is left as loaded.
train = df_train.copy()
# NOTE(review): "uid" is presumably the join key used by the jointer - confirm
# against data_jointer.PandasPandasJointer.
user_jointer = dj.PandasPandasJointer("uid")
for ufeat in config.USER_SINGLE_FEAT_NAMES:
    with pu.profiler("loading and joining '{}'".format(ufeat)):
        # Missing feature values are replaced by the literal string "[nan]"
        # before joining.
        df_feat = du.load_user_feature(ufeat).fillna("[nan]")
        train = user_jointer.join(df1=train, df2=df_feat)
# Sanity check: the joined frame must contain no null cells at all.
assert train.isnull().sum().sum() == 0

[14:53:26] Finish loading and joining 'age'. △M: +330.41MB. △T: 11.0 seconds.
[14:53:37] Finish loading and joining 'gender'. △M: +67.16MB. △T: 10.5 seconds.
[14:53:47] Finish loading and joining 'education'. △M: +67.14MB. △T: 10.7 seconds.
[14:53:58] Finish loading and joining 'consumptionAbility'. △M: +67.13MB. △T: 11.0 seconds.
[14:54:10] Finish loading and joining 'LBS'. △M: +68.38MB. △T: 11.7 seconds.
[14:54:22] Finish loading and joining 'carrier'. △M: +67.13MB. △T: 12.1 seconds.
[14:54:35] Finish loading and joining 'house'. △M: +67.13MB. △T: 12.7 seconds.


In [4]:
# For every user-feature pair and every fold, count click outcomes and write
# one CSV per distinct value of user_feat_name1 (path built by
# click_history_path).
#
# Naming note (presumably carried over from an ad x user version of this code,
# hence the `aval` / `avals_unique` names - TODO confirm):
# ad_feat_name <=> user_feat_name1
# user_feat_name <=> user_feat_name2
for user_feat_name2, user_feat_names1 in pair_dict.items():
    # load_user_cnt yields row uids plus a (value -> index mapping, matrix) pair.
    # NOTE(review): the exact matrix layout is defined in data_utils - confirm.
    row_uids, (val_to_index, matrix) = du.load_user_cnt(user_feat_name2)
    col_names = history.dict_to_list(val_to_index)
    for user_feat_name1 in user_feat_names1:
        # Distinct values of the grouping feature found in the joined train frame.
        avals_unique = train[user_feat_name1].unique()
        kfold_manager = history.MatrixCounterManager(matrix, col_names, row_uids, train, 
                                                     groupby=user_feat_name1, gvals=avals_unique)
        for i, (train_index, valid_index) in enumerate(split_indices):
            with pu.profiler("counting '{}' x '{}' fold {}/{} ...".format(user_feat_name2, user_feat_name1, 
                                                                          i + 1, n_splits)):
                # Only train_index is passed to the counter; valid_index is
                # not used in this loop.
                mc = kfold_manager.build_matrix_counter(train_index)
                for aval in avals_unique:
                    out_path = click_history_path(n_splits, i, user_feat_name1, user_feat_name2, aval)
                    df_records = mc.group_count(aval)
                    # Columns 1 and -1 are renamed to "positive" / "negative";
                    # presumably they are the click labels - TODO confirm.
                    df_records.rename(columns={1: "positive", -1:"negative"}, inplace=True)
                    # Keep only these three columns, in this order, for the CSV.
                    df_records = df_records[["value", "positive", "negative"]]
                    df_records.to_csv(out_path, index=False)
                    del df_records
                # Drop the per-fold counter and force a collection before the
                # next fold.
                del mc
                gc.collect()

[14:55:16] Finish counting 'carrier' x 'house' fold 1/5 .... △M: -12.91MB. △T: 34.8 seconds.
[14:55:50] Finish counting 'carrier' x 'house' fold 2/5 .... △M: +2.07MB. △T: 34.3 seconds.
[14:56:21] Finish counting 'carrier' x 'house' fold 3/5 .... △M: +4.0KB. △T: 30.9 seconds.
[14:56:52] Finish counting 'carrier' x 'house' fold 4/5 .... △M: +0B. △T: 30.8 seconds.
[14:57:24] Finish counting 'carrier' x 'house' fold 5/5 .... △M: +4.0KB. △T: 32.3 seconds.
[14:57:58] Finish counting 'LBS' x 'carrier' fold 1/5 .... △M: +41.93MB. △T: 33.5 seconds.
[14:58:31] Finish counting 'LBS' x 'carrier' fold 2/5 .... △M: -256.0KB. △T: 32.6 seconds.
[14:59:03] Finish counting 'LBS' x 'carrier' fold 3/5 .... △M: -12.0KB. △T: 32.2 seconds.
[14:59:38] Finish counting 'LBS' x 'carrier' fold 4/5 .... △M: +16.0KB. △T: 35.3 seconds.
[15:00:11] Finish counting 'LBS' x 'carrier' fold 5/5 .... △M: -8.0KB. △T: 32.6 seconds.
[15:00:42] Finish counting 'LBS' x 'house' fold 1/5 .... △M: -1.56MB. △T: 31.4 seconds.
[15:01

[15:48:35] Finish counting 'gender' x 'house' fold 5/5 .... △M: +0B. △T: 33.2 seconds.
[15:49:16] Finish counting 'consumptionAbility' x 'LBS' fold 1/5 .... △M: +36.11MB. △T: 40.8 seconds.
[15:49:57] Finish counting 'consumptionAbility' x 'LBS' fold 2/5 .... △M: -4.0KB. △T: 41.0 seconds.
[15:50:41] Finish counting 'consumptionAbility' x 'LBS' fold 3/5 .... △M: -48.0KB. △T: 44.1 seconds.
[15:51:22] Finish counting 'consumptionAbility' x 'LBS' fold 4/5 .... △M: +32.0KB. △T: 40.9 seconds.
[15:52:08] Finish counting 'consumptionAbility' x 'LBS' fold 5/5 .... △M: +248.0KB. △T: 45.3 seconds.
[15:52:44] Finish counting 'consumptionAbility' x 'carrier' fold 1/5 .... △M: -19.7MB. △T: 36.3 seconds.
[15:53:21] Finish counting 'consumptionAbility' x 'carrier' fold 2/5 .... △M: -512.0KB. △T: 37.2 seconds.
[15:53:59] Finish counting 'consumptionAbility' x 'carrier' fold 3/5 .... △M: +0B. △T: 37.2 seconds.
[15:54:34] Finish counting 'consumptionAbility' x 'carrier' fold 4/5 .... △M: +0B. △T: 35.0 sec