In [1]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import operator
import pickle
import tqdm
import time
import os
import gc
import matplotlib.pyplot as plt
import sys
sys.path.append("../../../code/utils")
sys.path.append("../../../code/feature")
sys.path.append("../../../code/pipeline")
sys.path.append("../../../code")
import data_utils as du
import perf_utils as pu
from clickrate import BayesianSmoothedClickrate
import history
import config

In [2]:
# Base path (under config.DATA_DIR) that the helper functions in this cell
# extend to build the click-history output locations.
clickhist_folder = os.path.join(
    config.DATA_DIR,
    "click_history/simple_cross/byUserFeatureName",
)


def click_history_fold_dir(num_folds, create=True):
    """Return the click-history directory used for a given fold count.

    The path is the module-level ``clickhist_folder`` with the suffix
    ``[StratifiedKFold_<num_folds>]`` appended directly to it.

    Parameters
    ----------
    num_folds
        Value formatted into the directory suffix.
    create : bool, default True
        When truthy, the directory is created first; an already existing
        directory is not an error.

    Returns
    -------
    str
        The directory path.
    """
    target = "{}[StratifiedKFold_{}]".format(clickhist_folder, num_folds)
    if not create:
        return target
    os.makedirs(target, exist_ok=True)
    return target


def click_history_path(num_folds, fold_index, ad_feat_name, user_feat_name, ad_val, create=True):
    """Build the CSV path for one (fold, user feature, ad feature value) combination.

    Layout: ``<fold dir>/<fold_index>/[featureName='<user_feat_name>']/[<ad_feat_name>='<ad_val>'].csv``
    where ``<fold dir>`` comes from ``click_history_fold_dir(num_folds, create)``.

    Parameters
    ----------
    num_folds
        Forwarded to ``click_history_fold_dir``.
    fold_index
        Converted with ``str`` and used as a sub-directory name.
    ad_feat_name, ad_val
        Formatted into the file name.
    user_feat_name
        Formatted into the innermost directory name.
    create : bool, default True
        When truthy, the containing directories are created (existing ones are fine).

    Returns
    -------
    str
        Full path of the CSV file. The file itself is not created here.
    """
    feature_dir = os.path.join(
        click_history_fold_dir(num_folds, create),
        str(fold_index),
        "[featureName='{}']".format(user_feat_name),
    )
    csv_path = os.path.join(feature_dir, "[{}='{}'].csv".format(ad_feat_name, ad_val))
    if create:
        os.makedirs(feature_dir, exist_ok=True)
    return csv_path


def load_split_indices(num_folds):
    """Load the pickled split indices saved for ``num_folds`` folds.

    Reads ``indices.pkl`` from the directory returned by
    ``click_history_fold_dir`` (called with ``create=False``, so no directory
    is created by this lookup) via ``du.load_pickle``.

    Returns whatever object the pickle holds; the notebook later iterates over
    it as ``(train_index, valid_index)`` pairs.
    """
    index_path = os.path.join(
        click_history_fold_dir(num_folds=num_folds, create=False),
        "indices.pkl",
    )
    return du.load_pickle(index_path)

In [3]:
# Number of folds; selects which saved split-index file is loaded and is also
# used in the output paths and progress messages further down.
n_splits = 5
split_indices = load_split_indices(num_folds=n_splits)

In [4]:
# (ad feature, user feature) combinations for which click history is counted.
pairs = [
    ("productId", "LBS"),
    ("advertiserId", "interest1"),
    ("aid", "interest2"),
    ("creativeSize", "interest2"),
    ("campaignId", "interest4"),  # TODO: decide whether to keep this pair
    ("aid", "interest5"),
    ("productType", "kw1"),  # 'kw1' looks very prone to overfitting; to be decided whether to keep it
    ("productType", "kw2"),
    ("productType", "kw3"),
    ("productType", "topic1"),
    ("aid", "topic2"),
    ("productType", "topic2"),
    # Disabled candidates -- each might help in predicting negative samples:
    # ("productType", "topic3"),
    # ("productType", "appIdInstall"),
    # ("productType", "appIdAction"),
    ("aid", "ct"),
    ("aid", "os"),
]

In [5]:
# Invert `pairs` into {user feature: [ad features paired with it]}, keeping the
# ad features in the order they appear in `pairs`.
pair_dict = {}
for ad_feat_name, user_feat_name in pairs:
    pair_dict.setdefault(user_feat_name, []).append(ad_feat_name)

In [6]:
# Load the raw "train", "test" and "ad" tables (in that order) and record the
# row counts of the train and test tables.
df_train, df_test, df_ad = (
    du.load_raw_data(table) for table in ("train", "test", "ad")
)
train_size, test_size = df_train.shape[0], df_test.shape[0]

In [7]:
# Left-join the ad table onto the training rows by "aid", so every training row
# is kept and gains the ad columns.
train = pd.merge(left=df_train, right=df_ad, how="left", on="aid")

In [9]:
# For every configured (user feature, ad feature) pair and every fold, write one
# CSV per ad-feature value at the location given by `click_history_path`.
for user_feat_name, ad_feat_names in pair_dict.items():
    # Loaded once per user feature and reused for all ad features paired with it.
    row_uids, (val_to_index, matrix) = du.load_user_cnt(user_feat_name)
    col_names = history.dict_to_list(val_to_index)

    for ad_feat_name in ad_feat_names:
        avals_unique = df_ad[ad_feat_name].unique()
        kfold_manager = history.MatrixCounterManager(
            matrix, col_names, row_uids, train,
            groupby=ad_feat_name, gvals=avals_unique,
        )

        # Only `train_index` is handed to the counter; `valid_index` is unused here
        # (presumably so held-out rows do not contribute to the counts -- confirm).
        for i, (train_index, valid_index) in enumerate(split_indices):
            with pu.profiler(
                "counting '{}' x '{}' fold {}/{} ...".format(
                    user_feat_name, ad_feat_name, i + 1, n_splits
                )
            ):
                mc = kfold_manager.build_matrix_counter(train_index)
                for aval in avals_unique:
                    out_path = click_history_path(n_splits, i, ad_feat_name, user_feat_name, aval)
                    # Columns 1 / -1 are relabelled "positive" / "negative"; only the
                    # three listed columns are written, in this order.
                    df_records = mc.group_count(aval).rename(
                        columns={1: "positive", -1: "negative"}
                    )
                    df_records = df_records[["value", "positive", "negative"]]
                    df_records.to_csv(out_path, index=False)
                    del df_records
                # Release the per-fold counter before leaving the profiled block.
                del mc
                gc.collect()

[14:38:25] Finish counting 'ct' x 'aid' fold 1/5 .... △M: +273.85MB. △T: 27.1 seconds.
[14:38:51] Finish counting 'ct' x 'aid' fold 2/5 .... △M: +2.08MB. △T: 26.6 seconds.
[14:39:19] Finish counting 'ct' x 'aid' fold 3/5 .... △M: +0B. △T: 27.7 seconds.
[14:39:45] Finish counting 'ct' x 'aid' fold 4/5 .... △M: -47.37MB. △T: 25.5 seconds.
[14:40:11] Finish counting 'ct' x 'aid' fold 5/5 .... △M: +47.34MB. △T: 26.7 seconds.
[14:40:39] Finish counting 'topic1' x 'productType' fold 1/5 .... △M: -47.43MB. △T: 26.6 seconds.
[14:41:06] Finish counting 'topic1' x 'productType' fold 2/5 .... △M: +60.91MB. △T: 27.1 seconds.
[14:41:32] Finish counting 'topic1' x 'productType' fold 3/5 .... △M: -60.66MB. △T: 26.2 seconds.
[14:42:00] Finish counting 'topic1' x 'productType' fold 4/5 .... △M: +60.96MB. △T: 27.8 seconds.
[14:42:27] Finish counting 'topic1' x 'productType' fold 5/5 .... △M: -60.96MB. △T: 27.5 seconds.
[14:42:55] Finish counting 'interest4' x 'campaignId' fold 1/5 .... △M: +24.14MB. △T: