In [41]:
from functools import partial
import scipy.sparse as sparse
import multiprocessing as mp
import pandas as pd
import numpy as np
import tqdm
import copy
import gc
import os
import sys
sys.path.append('../../../code/utils')
sys.path.append('../../../code/pipeline')
sys.path.append('../../../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [42]:
# (ad feature, user feature) name pairs. The cells below iterate over this list
# and load one stacked feature file per pair via `feature_path`.
# Commented-out entries are pairs that are currently excluded.
pairs = [('advertiserId', 'interest1'),
         ('aid', 'interest2'),
         ('creativeSize', 'interest2'), 
         # ('campaignId', 'interest4'),  # TODO: decide whether to keep this pair
         ('aid', 'interest5'),  
         ('productType', 'kw1'),  # 'kw1' looks very prone to overfitting; to be decided whether to keep it
         ('productType', 'kw2'),
         # ('productType', 'kw3'),
         ('productType', 'topic1'),
         ('aid', 'topic2'),
         ('productType', 'topic2'),
         ('aid', 'ct'),
         ('aid', 'os')]

# Folder that `feature_path` resolves the per-pair pickle files against.
stack_folder = os.path.join(config.DATA_DIR, "stacking/clickrate")

def feature_path(ad_feat_name, user_feat_name, dataset="train"):
    """Return the path of the pickle file for one (ad feature, user feature) pair.

    The file name encodes the dataset and both feature names, and is resolved
    against the module-level `stack_folder`.

    Args:
        ad_feat_name: name of the ad-side feature (e.g. 'aid').
        user_feat_name: name of the user-side feature (e.g. 'interest2').
        dataset: dataset prefix of the file name; defaults to "train".

    Returns:
        The joined path as a string.
    """
    name_template = "{}.[adFeatureName='{}'][userFeatureName='{}'].pkl"
    return os.path.join(
        stack_folder,
        name_template.format(dataset, ad_feat_name, user_feat_name),
    )

In [43]:
# Whitelist of column names to keep from the loaded per-pair feature frames.
# `select_cols` keeps a column if it is listed here OR if its name contains
# "weighted_avg", so the "weighted_avg" entries below are redundant but harmless.
# NOTE(review): "impression@campaignId_x_interest4_mean" refers to the
# ('campaignId', 'interest4') pair, which is commented out in `pairs`; that
# column is presumably never loaded, so this entry has no effect -- confirm.
use_cols = ["bsClickrate@aid_x_interest2_q100",
            "bsClickrate@aid_x_interest5_q0",
            "bsClickrate@productType_x_kw2_mean",
            "bsClickrate@advertiserId_x_interest1_q0",
            "bsClickrate@aid_x_topic2_mean",
            "bsClickrate_weighted_avg@productType_x_kw2",
            "impression@campaignId_x_interest4_mean",
            "bsClickrate_weighted_avg@aid_x_topic2",
            "bsClickrate@aid_x_interest5_q100",
            "bsClickrate@advertiserId_x_interest1_q100",
            "impression@creativeSize_x_interest2_mean",
            "bsClickrate@productType_x_kw2_q100",
            "bsClickrate@aid_x_interest5_q25",
            "bsClickrate@creativeSize_x_interest2_q100",
            "bsClickrate_weighted_avg@productType_x_kw1",
            "bsClickrate@aid_x_interest5_mean",
            "bsClickrate@aid_x_interest2_q25",
            "bsClickrate@aid_x_interest2_q0",
            "bsClickrate@productType_x_kw2_q75",
            "bsClickrate@advertiserId_x_interest1_q25",
            "bsClickrate@advertiserId_x_interest1_q75",
            "bsClickrate@productType_x_kw2_q0",
            "bsClickrate@advertiserId_x_interest1_mean",
            "bsClickrate@aid_x_topic2_q50",
            "max_bsClickrate_impression@productType_x_kw2",
            "bsClickrate@aid_x_ct_q100",
            "bsClickrate@productType_x_kw2_q50",
            "min_bsClickrate_impression@aid_x_interest5",
            "bsClickrate@productType_x_kw2_std",
            "bsClickrate@advertiserId_x_interest1_std",
            "bsClickrate@aid_x_topic2_q100",
            "max_bsClickrate_impression@creativeSize_x_interest2",
            "impression@creativeSize_x_interest2_std",
            "bsClickrate@aid_x_ct_std",
            "bsClickrate@aid_x_topic2_q25",
            "min_bsClickrate_impression@productType_x_kw2",
            "bsClickrate_weighted_avg@advertiserId_x_interest1",
            "bsClickrate@productType_x_kw1_q0",
            "bsClickrate@aid_x_topic2_q0",
            "bsClickrate@aid_x_interest2_std",
            "bsClickrate@aid_x_topic2_q75",
            "max_bsClickrate_impression@aid_x_topic2",
            "impression@productType_x_topic2_mean",
            "bsClickrate@aid_x_ct_q0",
            "impression@advertiserId_x_interest1_mean",
            "min_bsClickrate_impression@advertiserId_x_interest1",
            "bsClickrate@productType_x_kw2_q25",
            "bsClickrate@creativeSize_x_interest2_q0",
            "bsClickrate@aid_x_interest5_std",
            "bsClickrate_weighted_avg@aid_x_interest5"]

In [None]:
def select_cols(cols):
    """Choose which of the given column names to keep.

    A name is kept when it appears in the module-level `use_cols` or when it
    contains the substring "weighted_avg".

    Args:
        cols: iterable of column names.

    Returns:
        A tuple `(selected_cols, mask)`: the kept names in their original
        order, and the positions of those names within `cols`.
    """
    kept = [
        (position, name)
        for position, name in enumerate(cols)
        if name in use_cols or "weighted_avg" in name
    ]
    selected_cols = [name for _, name in kept]
    mask = [position for position, _ in kept]
    return selected_cols, mask

In [None]:
out_folder = os.path.join(config.DATA_DIR, "input_final")
# Make sure the destination exists before doing the expensive work below.
os.makedirs(out_folder, exist_ok=True)

# Build the train matrix: for every (ad feature, user feature) pair, load the
# stacked feature frame, keep only the columns chosen by `select_cols`, and
# place the kept columns side by side in the order of `pairs`.
col_names = []
matrix_parts = []
for ad_feat_name, user_feat_name in pairs:
    df = du.load_pickle(feature_path(ad_feat_name, user_feat_name, "train"), use_joblib=True)
    new_col_names = df.columns.tolist()
    new_matrix = df.values.astype(np.float32)

    # `mask` holds the positions of the kept columns within the frame.
    new_col_names, mask = select_cols(new_col_names)
    new_matrix = new_matrix[:, mask]
    print("'{}' x '{}': {} columns selected".format(ad_feat_name, user_feat_name, len(new_col_names)))

    col_names += new_col_names
    matrix_parts.append(new_matrix)

# Stack once at the end: calling np.hstack inside the loop would recopy the
# whole accumulated matrix on every iteration.
matrix = np.hstack(matrix_parts)
del matrix_parts  # release the per-pair pieces; only the stacked matrix is needed

# Every column of the matrix must have exactly one name.
assert matrix.shape[1] == len(col_names), "column names do not match matrix width"
print("Final Shape: {}".format(matrix.shape))

# Persist the (column names, matrix) tuple for the train set.
with pu.profiler("saving train matrix"):
    out_file = "train.clickStats.crossWordCount_v2.pkl"
    out_path = os.path.join(out_folder, out_file)
    du.save_pickle((col_names, matrix), out_path, use_joblib=False)

'advertiserId' x 'interest1': 9 columns selected
'aid' x 'interest2': 5 columns selected
'creativeSize' x 'interest2': 6 columns selected
'aid' x 'interest5': 7 columns selected
'productType' x 'kw1': 2 columns selected
'productType' x 'kw2': 10 columns selected


In [None]:
# Build the test2 matrix the same way as the train matrix: for every
# (ad feature, user feature) pair, load the stacked feature frame, keep only
# the columns chosen by `select_cols`, and place them side by side in the
# order of `pairs`.
col_names = []
matrix_parts = []
for ad_feat_name, user_feat_name in pairs:
    df = du.load_pickle(feature_path(ad_feat_name, user_feat_name, "test2"), use_joblib=True)
    new_col_names = df.columns.tolist()
    new_matrix = df.values.astype(np.float32)

    # `mask` holds the positions of the kept columns within the frame.
    new_col_names, mask = select_cols(new_col_names)
    new_matrix = new_matrix[:, mask]
    print("'{}' x '{}': {} columns selected".format(ad_feat_name, user_feat_name, len(new_col_names)))

    col_names += new_col_names
    matrix_parts.append(new_matrix)

# Stack once at the end: calling np.hstack inside the loop would recopy the
# whole accumulated matrix on every iteration.
matrix = np.hstack(matrix_parts)
del matrix_parts  # release the per-pair pieces; only the stacked matrix is needed

# Every column of the matrix must have exactly one name.
assert matrix.shape[1] == len(col_names), "column names do not match matrix width"
print("Final Shape: {}".format(matrix.shape))

# Persist the (column names, matrix) tuple for the test2 set.
# The profiler label previously said "train" here (copy-paste slip).
with pu.profiler("saving test2 matrix"):
    out_file = "test2.clickStats.crossWordCount_v2.pkl"
    out_path = os.path.join(out_folder, out_file)
    du.save_pickle((col_names, matrix), out_path, use_joblib=False)