In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np
import time
import gc
import os
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [3]:
# ================
# Data Preparation
# ================
# (ad feature, user feature) pairs whose cross-product binary matrices are
# loaded and stacked below. Order matters: it fixes the column order of the
# stacking frames.
pairs = [
    ('advertiserId', 'interest1'),
    ('aid', 'interest2'),
    ('creativeSize', 'interest2'),
    # ('campaignId', 'interest4'),  # disabled; undecided whether to keep it
    ('aid', 'interest5'),
    ('productType', 'kw1'),  # 'kw1' looks prone to overfitting; to be decided whether to keep it
    ('productType', 'kw2'),
    # ('productType', 'kw3'),  # disabled
    ('productType', 'topic1'),
    ('aid', 'topic2'),
    ('productType', 'topic2'),
    ('aid', 'ct'),
    ('aid', 'os'),
]

In [4]:
# Raw train / test frames; only the "label" column and the row counts are used
# in this notebook.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")

# Map labels from {1, -1} to {1, 0}. The arithmetic allocates a fresh array,
# so the frame's own label column is left untouched.
y = (df_train["label"].values + 1) / 2

In [5]:
n_splits = 5  # 3? 5? Not yet clear which works better

# The original call passed random_state=2018 without shuffle=True. The seed was
# therefore never used, and scikit-learn >= 0.24 rejects that combination with
# a ValueError. Unshuffled stratified folds are already deterministic, so the
# seed is dropped and shuffle=False is spelled out; the folds are unchanged.
skf = StratifiedKFold(n_splits=n_splits, shuffle=False)

# Materialise the (train_index, valid_index) pairs once so every feature pair
# is stacked on exactly the same folds.
split_indices = list(skf.split(df_train, y))

In [None]:
# ======================================
# Out-of-fold LR stacking per cross pair
# ======================================
# For every (ad feature, user feature) pair, one LR is fitted per CV fold.
#   * df_stack_tv   : held-out (out-of-fold) probabilities for the training rows
#   * df_stack_test : test probabilities averaged over the folds
#   * df_score      : mean / std of the per-fold validation AUC for each pair
score_columns = ["featureName", "auc_mean", "auc_std"]
df_stack_tv = pd.DataFrame()
df_stack_test = pd.DataFrame()
df_score = pd.DataFrame(columns=score_columns)
score_rows = []  # one record per finished pair; df_score is rebuilt from it

for ad_feat_name, user_feat_name in pairs:
    ### given a feature pair ###
    # load matrix as input to model
    with pu.profiler("loading '{}' x '{}'".format(user_feat_name, ad_feat_name)):
        cross_bin_loader = dp.CrossBinaryDataManager.build_data(ad_feat_name, user_feat_name)
        # load() returns a pair; only the second element (the matrix) is used here
        _, X_tv = cross_bin_loader.load("train")
        _, X_test = cross_bin_loader.load("test2")

    # fail now rather than after the long fits if rows and labels are misaligned
    assert X_tv.shape[0] == y.shape[0], \
        "row count of '{}' x '{}' does not match the number of labels".format(
            ad_feat_name, user_feat_name)

    # prepare containers
    stack_tv = np.zeros(X_tv.shape[0])                  # out-of-fold probabilities
    stack_test = np.zeros((X_test.shape[0], n_splits))  # one column per fold
    scores = np.zeros(n_splits)                         # validation AUC per fold

    for i, (train_index, valid_index) in enumerate(split_indices):
        ### given a splitting ###
        # split train/valid sets
        X_train, y_train = X_tv[train_index], y[train_index]
        X_valid, y_valid = X_tv[valid_index], y[valid_index]

        # fit LR
        with pu.profiler("fitting LR (fold {}/{})".format(i + 1, n_splits)):
            # Default regularisation is kept (penalty='l2', C=1). n_jobs is not
            # passed: scikit-learn only uses it to parallelise over classes, so
            # it has no effect on this binary (0/1) target.
            lr = LogisticRegression(solver="newton-cg")
            lr.fit(X_train, y_train)

        # make prediction for validation set
        proba_valid = lr.predict_proba(X_valid)[:, 1]
        stack_tv[valid_index] = proba_valid

        # make prediction for testing set
        proba_test = lr.predict_proba(X_test)[:, 1]
        stack_test[:, i] = proba_test

        # calculate scores
        scores[i] = metrics.roc_auc_score(y_valid, proba_valid)

    # update dataframes for stacking
    cross_name = "{}_x_{}".format(ad_feat_name, user_feat_name)
    col_name = "stackProba_{}".format(cross_name)
    score_row = {"featureName": cross_name,
                 "auc_mean": scores.mean(), "auc_std": scores.std()}
    df_stack_tv[col_name] = stack_tv
    df_stack_test[col_name] = stack_test.mean(axis=1)

    # Rebuild the score table from the records instead of growing it through
    # `.loc`. Doing it after every pair keeps partial results available if the
    # cell is interrupted.
    score_rows.append(score_row)
    df_score = pd.DataFrame(score_rows, columns=score_columns)
    print("AUC: {:.6f}(+/-{:.3g})".format(score_row["auc_mean"], score_row["auc_std"]))

    # release this pair's matrices before loading the next one
    del X_tv
    del X_test
    gc.collect()

[13:02:32] Finish loading 'interest1' x 'advertiserId'. △M: +149.61MB. △T: 1.0 seconds.




[13:16:18] Finish fitting LR (fold 1/5). △M: +43.29MB. △T: 13.7 minutes.




[13:33:17] Finish fitting LR (fold 2/5). △M: +16.09MB. △T: 16.9 minutes.
[13:45:44] Finish fitting LR (fold 3/5). △M: +0B. △T: 12.4 minutes.
[13:59:12] Finish fitting LR (fold 4/5). △M: +12.0KB. △T: 13.4 minutes.
[14:14:20] Finish fitting LR (fold 5/5). △M: +15.86MB. △T: 15.1 minutes.
AUC: 0.595944(+/-0.000632)
[14:14:22] Finish loading 'interest2' x 'aid'. △M: +210.73MB. △T: 0.4 seconds.
[14:19:33] Finish fitting LR (fold 1/5). △M: +8.0KB. △T: 5.2 minutes.


In [None]:
# Sanity check: report the shape of each stacking frame.
shape_reports = (
    ("Training Set Prediction Shape: {}", df_stack_tv),
    ("Testing Set Prediction Shape: {}", df_stack_test),
)
for template, stacked in shape_reports:
    print(template.format(stacked.shape))

In [None]:
# The data root must already exist; only the stacking sub-folder is created here.
assert os.path.exists(config.DATA_DIR)
out_folder = os.path.join(config.DATA_DIR, 'stacking/lr')
os.makedirs(out_folder, exist_ok=True)

# Write the training-set predictions first, then the testing-set predictions.
# Floats are written with a fixed 6-decimal format; unformatted floats make
# these files take far more disk space.
csv_targets = (
    ('train.crossBinary_v2.csv', df_stack_tv),
    ('test2.crossBinary_v2.csv', df_stack_test),
)
for out_file, stacked in csv_targets:
    out_path = os.path.join(out_folder, out_file)
    stacked.to_csv(out_path, float_format="%.6f", index=False)

In [None]:
# Persist the per-pair AUC summary, best pair first.
# `out_folder` is the stacking folder set up in the CSV export cell.
out_file = 'score.crossBinary_v2.csv'
out_path = os.path.join(out_folder, out_file)
df_score = (
    df_score[["featureName", "auc_mean", "auc_std"]]
    .sort_values("auc_mean", ascending=False)
)
df_score.to_csv(out_path, float_format="%.6f", index=False)

In [None]:
# Pairs whose stacked LR probabilities are exported as model input.
#
# NOTE: ('productId', 'LBS') used to head this list, but it is not part of
# `pairs`. The column 'stackProba_productId_x_LBS' is therefore never computed
# by the stacking loop, and selecting it from the stacking frames raised a
# KeyError. To export it, add the pair to `pairs` and re-run the stacking cell
# first.
use_pairs = [# ('productId', 'LBS'),  # not stacked in this notebook, see note above
             ('advertiserId', 'interest1'),
             ('aid', 'interest2'),
             ('creativeSize', 'interest2'),
             # ('campaignId', 'interest4'),  # disabled; undecided whether to keep it
             ('aid', 'interest5'),
             ('productType', 'kw1'),  # 'kw1' looks prone to overfitting; to be decided whether to keep it
             ('productType', 'kw2'),
             # ('productType', 'kw3'),  # disabled
             ('productType', 'topic1'),
             ('aid', 'topic2'),
             ('productType', 'topic2'),
             ('aid', 'ct'),
             ('aid', 'os')]

# Column names as written by the stacking loop ("stackProba_<ad>_x_<user>").
use_cols = ['stackProba_{}_x_{}'.format(ad_feat_name, user_feat_name)
            for ad_feat_name, user_feat_name in use_pairs]

In [None]:
# Export the selected training-set stacking columns as (column names, float32 matrix).
out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)
out_file = "train.stacking.lrCrossBinary_v2.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_train = df_stack_tv[use_cols].values.astype(np.float32)
    # one row per training example, one column per selected pair
    assert X_train.shape[0] == df_train.shape[0]
    assert X_train.shape[1] == len(use_pairs)

with pu.profiler("saving matrix to hard disk"):
    # exported names carry an extra "LR_" tag compared with the frame's columns
    col_names = ['stackProba_LR_{}_x_{}'.format(*pair) for pair in use_pairs]
    du.save_pickle((col_names, X_train), out_path)
    del X_train
    gc.collect()

In [None]:
# Export the selected testing-set stacking columns as (column names, float32 matrix).
# The output folder and the exported column names are derived here instead of
# being taken from variables left behind by the training export cell, so this
# cell only needs `use_pairs` / `use_cols` and the stacking frames.
out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)
out_file = "test2.stacking.lrCrossBinary_v2.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_test = df_stack_test[use_cols].values.astype(np.float32)
    # one row per test example, one column per selected pair
    assert X_test.shape[0] == df_test.shape[0]
    assert X_test.shape[1] == len(use_pairs)

with pu.profiler("saving matrix to hard disk"):
    # same naming scheme as the training export: "stackProba_LR_<ad>_x_<user>"
    col_names = ['stackProba_LR_{}_x_{}'.format(ad_feat_name, user_feat_name)
                 for ad_feat_name, user_feat_name in use_pairs]
    du.save_pickle((col_names, X_test), out_path)
    del X_test
    gc.collect()