In [1]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import operator
import pickle
import tqdm
import time
import os
import gc
import matplotlib.pyplot as plt
import sys
sys.path.append("../../../code/utils")
sys.path.append("../../../code/feature")
sys.path.append("../../../code/pipeline")
sys.path.append("../../../code")
import data_utils as du
import perf_utils as pu
from clickrate import BayesianSmoothedClickrate
import history
import config

In [2]:
# Root folder for the click-history CSVs, located under config.DATA_DIR.
# click_history_fold_dir() appends a "[StratifiedKFold_<n>]" suffix to this path.
clickhist_folder = os.path.join(config.DATA_DIR, "click_history/simple_cross/byUserFeatureName")


def click_history_fold_dir(num_folds, create=True):
    """Return the click-history directory used for a given number of folds.

    The path is the module-level ``clickhist_folder`` with the suffix
    ``[StratifiedKFold_<num_folds>]`` appended directly to it (the suffix is
    part of the last path component, not a sub-directory).

    Parameters
    ----------
    num_folds
        Number of folds; only used to build the directory name.
    create
        When truthy, the directory is created if it does not exist yet.

    Returns
    -------
    str
        The directory path.
    """
    fold_dir = "{}[StratifiedKFold_{}]".format(clickhist_folder, num_folds)
    if not create:
        return fold_dir
    os.makedirs(fold_dir, exist_ok=True)
    return fold_dir


def click_history_path(num_folds, fold_index, ad_feat_name, user_feat_name, ad_val, create=True):
    """Build the CSV path for one (fold, user feature, ad feature value) combination.

    Layout::

        <fold dir>/<fold_index>/[featureName='<user_feat_name>']/[<ad_feat_name>='<ad_val>'].csv

    where ``<fold dir>`` comes from ``click_history_fold_dir(num_folds, create)``.

    Parameters
    ----------
    num_folds
        Number of folds; selects the fold directory.
    fold_index
        Index of the fold; converted with ``str()`` to form a sub-directory name.
    ad_feat_name
        Name of the ad feature, used in the file name.
    user_feat_name
        Name of the user feature, used in the sub-directory name.
    ad_val
        Value of the ad feature, used in the file name.
    create
        When truthy, the containing directories are created if missing.
        The CSV file itself is never created here.

    Returns
    -------
    str
        Full path of the CSV file.
    """
    feature_dir = os.path.join(
        click_history_fold_dir(num_folds, create),
        str(fold_index),
        "[featureName='{}']".format(user_feat_name),
    )
    csv_path = os.path.join(feature_dir, "[{}='{}'].csv".format(ad_feat_name, ad_val))
    if create:
        os.makedirs(feature_dir, exist_ok=True)
    return csv_path


def load_split_indices(num_folds):
    """Load the pickled fold split indices for ``num_folds`` folds.

    Reads ``indices.pkl`` from the fold directory returned by
    ``click_history_fold_dir`` (called with ``create=False``, so no directory
    is created as a side effect).

    Parameters
    ----------
    num_folds
        Number of folds; selects which fold directory to read from.

    Returns
    -------
    Whatever ``du.load_pickle`` returns for that file. Elsewhere in this
    notebook the result is iterated as ``(train_index, valid_index)`` pairs.
    """
    index_path = os.path.join(
        click_history_fold_dir(num_folds=num_folds, create=False),
        "indices.pkl",
    )
    return du.load_pickle(index_path)

In [3]:
# Number of folds; it selects which "[StratifiedKFold_<n>]" directory the
# pre-computed split indices are read from and where the CSVs are written.
n_splits = 5
split_indices = load_split_indices(num_folds=n_splits)

# (ad feature, user feature) combinations whose click history is counted.
pairs = [
    ('advertiserId', 'interest1'),
    ('aid', 'interest2'),
    ('creativeSize', 'interest2'),
    ('campaignId', 'interest4'),   # open question: keep this pair?
    ('aid', 'interest5'),
    ('productType', 'kw1'),        # 'kw1' looks very prone to overfitting; to be decided whether to keep it
    ('productType', 'kw2'),
    ('productType', 'kw3'),
    ('productType', 'topic1'),
    ('aid', 'topic2'),
    ('productType', 'topic2'),
    ('aid', 'ct'),
    ('aid', 'os'),
]

# Regroup the pairs as {user feature: [ad features crossed with it]}, keeping
# the ad features in the order they appear in `pairs`.
pair_dict = {}
for ad_feat_name, user_feat_name in pairs:
    pair_dict.setdefault(user_feat_name, []).append(ad_feat_name)
        
# Raw tables as returned by du.load_raw_data.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
df_ad = du.load_raw_data("ad")

# Row counts of the raw train / test tables.
train_size, test_size = df_train.shape[0], df_test.shape[0]

# Attach the ad attributes to every training row (left join on "aid").
train = pd.merge(left=df_train, right=df_ad, how="left", on="aid")

In [4]:
# For every user feature, load its per-user count data once; then, for each ad feature
# crossed with it, write one CSV of counts per (fold, ad-feature value).
for user_feat_name, ad_feat_names in pair_dict.items():
    # NOTE(review): presumably `matrix` has one row per uid in `row_uids` and one column per
    # key of `val_to_index` -- confirm against du.load_user_cnt.
    row_uids, (val_to_index, matrix) = du.load_user_cnt(user_feat_name)
    col_names = history.dict_to_list(val_to_index)
    for ad_feat_name in ad_feat_names:
        # Distinct values of this ad feature, taken from the ad table (not from `train`).
        avals_unique = df_ad[ad_feat_name].unique()
        # One manager per (user feature, ad feature); it is reused across all folds below.
        kfold_manager = history.MatrixCounterManager(matrix, col_names, row_uids, train, 
                                                     groupby=ad_feat_name, gvals=avals_unique)
        # `valid_index` is not used here: the counter is built from `train_index` only.
        for i, (train_index, valid_index) in enumerate(split_indices):
            # pu.profiler wraps one fold; the recorded output shows a memory delta and the
            # elapsed time for each such block.
            with pu.profiler("counting '{}' x '{}' fold {}/{} ...".format(user_feat_name, ad_feat_name, 
                                                                          i + 1, n_splits)):
                mc = kfold_manager.build_matrix_counter(train_index)
                for aval in avals_unique:
                    # create defaults to True, so the target directory is created on demand.
                    out_path = click_history_path(n_splits, i, ad_feat_name, user_feat_name, aval)
                    df_records = mc.group_count(aval)
                    # NOTE(review): columns 1 / -1 are presumably the label values -- confirm.
                    # The selection below raises KeyError if either column is absent.
                    df_records.rename(columns={1: "positive", -1:"negative"}, inplace=True)
                    df_records = df_records[["value", "positive", "negative"]]
                    df_records.to_csv(out_path, index=False)
                    del df_records
                # Release the per-fold counter inside the profiled block
                # (presumably so the reported memory delta reflects the cleanup -- confirm).
                del mc
                gc.collect()

[11:42:18] Finish counting 'interest1' x 'advertiserId' fold 1/5 .... △M: +230.5MB. △T: 52.3 seconds.
[11:43:06] Finish counting 'interest1' x 'advertiserId' fold 2/5 .... △M: +57.97MB. △T: 47.9 seconds.
[11:43:52] Finish counting 'interest1' x 'advertiserId' fold 3/5 .... △M: -856.0KB. △T: 46.5 seconds.
[11:44:37] Finish counting 'interest1' x 'advertiserId' fold 4/5 .... △M: +8.0KB. △T: 44.8 seconds.
[11:45:22] Finish counting 'interest1' x 'advertiserId' fold 5/5 .... △M: +0B. △T: 45.1 seconds.
[11:46:07] Finish counting 'topic1' x 'productType' fold 1/5 .... △M: +15.88MB. △T: 44.3 seconds.
[14:03:35] Finish counting 'topic1' x 'productType' fold 2/5 .... △M: +12.0KB. △T: 2.3 hours.
[14:04:07] Finish counting 'topic1' x 'productType' fold 3/5 .... △M: -51.41MB. △T: 31.7 seconds.
[14:04:39] Finish counting 'topic1' x 'productType' fold 4/5 .... △M: +19.42MB. △T: 32.4 seconds.
[14:05:11] Finish counting 'topic1' x 'productType' fold 5/5 .... △M: +0B. △T: 31.6 seconds.
[14:05:46] Finis