In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np
import time
import gc
import os
import sys
sys.path.append('../code/pipeline/')
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [3]:
# ------------------------------------------------------------------
# Data preparation
# ------------------------------------------------------------------
# (ad feature, user feature) pairs whose cross-product transformation is loaded
# and used as the input of one logistic regression per pair.
pairs = [
    ("advertiserId", "interest1"),
    ("aid", "interest2"),
    ("creativeSize", "interest2"),
    # ("campaignId", "interest4"),  # undecided: keep it or not?
    ("aid", "interest5"),
    ("productType", "kw1"),  # 'kw1' looks very prone to overfitting; keeping it is still to be decided
    ("productType", "kw2"),
    # ("productType", "kw3"),
    ("productType", "topic1"),
    ("aid", "topic2"),
    ("productType", "topic2"),
    ("aid", "ct"),
    ("aid", "os"),
]

In [4]:
# Raw train / test frames; the training frame carries the target in "label".
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
# Re-code the target from 1/-1 to 1/0. Working on a copy leaves the values
# stored in df_train untouched.
y = (df_train["label"].values.copy() + 1) / 2

In [5]:
# Number of CV folds used for the out-of-fold stacking predictions
# (3 vs. 5 has not been compared yet).
n_splits = 5
# The folds are taken without shuffling, so the split is already deterministic
# and reproducible. `random_state` only matters together with `shuffle=True`:
# passed on its own it is ignored by older scikit-learn releases and rejected
# with a ValueError by newer ones, so it is left out here. Do not switch to
# `shuffle=True` unless every consumer of these folds is changed consistently.
skf = StratifiedKFold(n_splits=n_splits)
# Materialise the (train_index, valid_index) pairs once so that every feature
# pair is trained and validated on exactly the same folds.
split_indices = list(skf.split(df_train, y))

In [8]:
# Out-of-fold stacking: for every (ad feature, user feature) pair, fit one
# logistic regression per CV fold on the pair's cross-product matrix and record
#   * df_stack_tv   - per training row, the probability predicted by the fold
#                     model that was validated on (i.e. not fitted to) that row,
#   * df_stack_test - per test row, the mean probability over the fold models,
#   * df_score      - mean / std of the validation AUC over the folds.
df_stack_tv = pd.DataFrame()
df_stack_test = pd.DataFrame()
df_score = pd.DataFrame(columns=["featureName", "auc_mean", "auc_std"])

for ad_feat_name, user_feat_name in pairs:
    ### given a feature pair ###
    # load matrices as input to model (the returned column names are not needed)
    with pu.profiler("loading '{}' x '{}'".format(user_feat_name, ad_feat_name)):
        cross_bin_loader = dp.CrossBinaryDataManager.build_data(ad_feat_name, user_feat_name)
        _, X_tv = cross_bin_loader.load("train")
        _, X_test = cross_bin_loader.load("test2")

    # prepare containers
    stack_tv = np.zeros(X_tv.shape[0])                  # one out-of-fold prediction per training row
    stack_test = np.zeros((X_test.shape[0], n_splits))  # one column of test predictions per fold
    scores = np.zeros(n_splits)                         # validation AUC per fold

    for i, (train_index, valid_index) in enumerate(split_indices):
        ### given a splitting ###
        # split train/valid sets
        X_train, y_train = X_tv[train_index], y[train_index]
        X_valid, y_valid = X_tv[valid_index], y[valid_index]

        # fit LR
        with pu.profiler("fitting LR (fold {}/{})".format(i + 1, n_splits)):
            lr = LogisticRegression(solver="newton-cg", n_jobs=-1)  # use default setting: penalty='l2' and C=1
            lr.fit(X_train, y_train)

        # make prediction for validation set (column 1 = probability of the positive class)
        proba_valid = lr.predict_proba(X_valid)[:, 1]
        stack_tv[valid_index] = proba_valid

        # make prediction for testing set
        proba_test = lr.predict_proba(X_test)[:, 1]
        stack_test[:, i] = proba_test

        # calculate scores
        auc = metrics.roc_auc_score(y_valid, proba_valid)
        scores[i] = auc

    # update dataframe for stacking
    cross_name = "{}_x_{}".format(ad_feat_name, user_feat_name)
    col_name = "stackProba_{}".format(cross_name)
    score_row = {"featureName": cross_name,
                 "auc_mean": scores.mean(), "auc_std": scores.std()}
    df_stack_tv[col_name] = stack_tv
    df_stack_test[col_name] = stack_test.mean(axis=1)  # average the fold models' test predictions
    # scores are appended pair by pair so partial results survive an interrupted run
    df_score.loc[df_score.shape[0]] = score_row
    print("AUC: {:.6f}(+/-{:.3g})".format(score_row["auc_mean"], score_row["auc_std"]))

    # Drop every reference to this pair's matrices before collecting, including
    # the last fold's row subsets; otherwise they stay alive while the next
    # pair's matrices are loaded.
    del X_tv, X_test, X_train, X_valid
    gc.collect()

[13:02:32] Finish loading 'interest1' x 'advertiserId'. △M: +149.61MB. △T: 1.0 seconds.




[13:16:18] Finish fitting LR (fold 1/5). △M: +43.29MB. △T: 13.7 minutes.




[13:33:17] Finish fitting LR (fold 2/5). △M: +16.09MB. △T: 16.9 minutes.
[13:45:44] Finish fitting LR (fold 3/5). △M: +0B. △T: 12.4 minutes.
[13:59:12] Finish fitting LR (fold 4/5). △M: +12.0KB. △T: 13.4 minutes.
[14:14:20] Finish fitting LR (fold 5/5). △M: +15.86MB. △T: 15.1 minutes.
AUC: 0.595944(+/-0.000632)
[14:14:22] Finish loading 'interest2' x 'aid'. △M: +210.73MB. △T: 0.4 seconds.
[14:19:33] Finish fitting LR (fold 1/5). △M: +8.0KB. △T: 5.2 minutes.
[14:24:50] Finish fitting LR (fold 2/5). △M: +0B. △T: 5.3 minutes.




[14:31:04] Finish fitting LR (fold 3/5). △M: +8.0KB. △T: 6.2 minutes.
[14:35:57] Finish fitting LR (fold 4/5). △M: +0B. △T: 4.8 minutes.
[14:40:45] Finish fitting LR (fold 5/5). △M: -59.64MB. △T: 4.8 minutes.
AUC: 0.656101(+/-0.000822)
[14:40:46] Finish loading 'interest2' x 'creativeSize'. △M: +139.82MB. △T: 0.3 seconds.
[14:46:49] Finish fitting LR (fold 1/5). △M: -76.0KB. △T: 6.0 minutes.
[14:52:30] Finish fitting LR (fold 2/5). △M: -256.0KB. △T: 5.6 minutes.
[14:58:03] Finish fitting LR (fold 3/5). △M: +0B. △T: 5.5 minutes.
[15:04:04] Finish fitting LR (fold 4/5). △M: +0B. △T: 6.0 minutes.
[15:13:43] Finish fitting LR (fold 5/5). △M: +0B. △T: 9.6 minutes.
AUC: 0.634668(+/-0.000534)
[15:13:46] Finish loading 'interest5' x 'aid'. △M: +768.37MB. △T: 1.5 seconds.
[15:41:43] Finish fitting LR (fold 1/5). △M: +8.0KB. △T: 27.8 minutes.
[15:58:43] Finish fitting LR (fold 2/5). △M: +0B. △T: 16.9 minutes.
[16:12:42] Finish fitting LR (fold 3/5). △M: +0B. △T: 13.9 minutes.
[16:26:25] Finish f



[17:10:51] Finish fitting LR (fold 4/5). △M: +20.0KB. △T: 8.5 minutes.
[17:17:44] Finish fitting LR (fold 5/5). △M: +0B. △T: 6.9 minutes.
AUC: 0.557730(+/-0.000875)
[17:17:47] Finish loading 'kw2' x 'productType'. △M: +159.62MB. △T: 1.3 seconds.
[17:23:34] Finish fitting LR (fold 1/5). △M: +8.0KB. △T: 5.8 minutes.
[17:29:41] Finish fitting LR (fold 2/5). △M: +0B. △T: 6.1 minutes.
[17:36:02] Finish fitting LR (fold 3/5). △M: +0B. △T: 6.3 minutes.
[17:42:01] Finish fitting LR (fold 4/5). △M: +0B. △T: 5.9 minutes.
[17:46:47] Finish fitting LR (fold 5/5). △M: +0B. △T: 4.7 minutes.
AUC: 0.664312(+/-0.000894)
[17:46:49] Finish loading 'topic1' x 'productType'. △M: -27.17MB. △T: 0.7 seconds.
[17:51:18] Finish fitting LR (fold 1/5). △M: +8.0KB. △T: 4.5 minutes.
[17:56:06] Finish fitting LR (fold 2/5). △M: -256.0KB. △T: 4.8 minutes.
[18:00:20] Finish fitting LR (fold 3/5). △M: +0B. △T: 4.2 minutes.
[18:04:38] Finish fitting LR (fold 4/5). △M: +0B. △T: 4.3 minutes.
[18:08:28] Finish fitting LR (



[19:13:10] Finish fitting LR (fold 1/5). △M: -36.0KB. △T: 6.9 minutes.
[19:17:49] Finish fitting LR (fold 2/5). △M: -512.0KB. △T: 4.6 minutes.
[19:22:25] Finish fitting LR (fold 3/5). △M: +0B. △T: 4.6 minutes.
[19:28:21] Finish fitting LR (fold 4/5). △M: +0B. △T: 5.9 minutes.
[19:32:54] Finish fitting LR (fold 5/5). △M: +0B. △T: 4.5 minutes.
AUC: 0.634358(+/-0.00062)
[19:32:55] Finish loading 'ct' x 'aid'. △M: -61.5MB. △T: 0.2 seconds.
[19:36:40] Finish fitting LR (fold 1/5). △M: +8.0KB. △T: 3.7 minutes.
[19:41:09] Finish fitting LR (fold 2/5). △M: -256.0KB. △T: 4.4 minutes.
[19:44:42] Finish fitting LR (fold 3/5). △M: +0B. △T: 3.5 minutes.
[19:48:15] Finish fitting LR (fold 4/5). △M: +0B. △T: 3.5 minutes.




[19:53:33] Finish fitting LR (fold 5/5). △M: +0B. △T: 5.3 minutes.
AUC: 0.557264(+/-0.000941)
[19:53:34] Finish loading 'os' x 'aid'. △M: +0B. △T: 0.2 seconds.
[19:55:26] Finish fitting LR (fold 1/5). △M: +55.72MB. △T: 1.8 minutes.
[19:57:03] Finish fitting LR (fold 2/5). △M: +0B. △T: 1.6 minutes.
[19:58:40] Finish fitting LR (fold 3/5). △M: +0B. △T: 1.6 minutes.
[20:00:20] Finish fitting LR (fold 4/5). △M: +26.86MB. △T: 1.6 minutes.
[20:02:06] Finish fitting LR (fold 5/5). △M: +0B. △T: 1.7 minutes.
AUC: 0.554914(+/-0.000776)


In [9]:
# Sanity check: report the (rows, columns) of both stacking frames, one per line.
print("Training Set Prediction Shape: {}".format(df_stack_tv.shape),
      "Testing Set Prediction Shape: {}".format(df_stack_test.shape),
      sep="\n")

Training Set Prediction Shape: (8798814, 11)
Testing Set Prediction Shape: (2265879, 11)


In [10]:
# CSV copies of the stacking predictions go to <DATA_DIR>/stacking/lr.
assert os.path.exists(config.DATA_DIR)
out_folder = os.path.join(config.DATA_DIR, 'stacking/lr')
os.makedirs(out_folder, exist_ok=True)

# Floats are written with a fixed 6-decimal format; unformatted floats make
# these files very disk-consuming.
# The training-set predictions are written first, then the testing-set ones.
for out_file, stack_frame in (('train.crossBinary_v2.csv', df_stack_tv),
                              ('test2.crossBinary_v2.csv', df_stack_test)):
    out_path = os.path.join(out_folder, out_file)
    stack_frame.to_csv(out_path, float_format="%.6f", index=False)

In [11]:
# Persist the per-pair AUC summary next to the stacking CSVs, with a fixed
# column order and the best-scoring pair first.
out_file = 'score.crossBinary_v2.csv'
out_path = os.path.join(out_folder, out_file)
df_score = (df_score[["featureName", "auc_mean", "auc_std"]]
            .sort_values("auc_mean", ascending=False))
df_score.to_csv(out_path, float_format="%.6f", index=False)

In [14]:
# (ad feature, user feature) pairs whose stacking columns are exported as
# model input.
use_pairs = [
    ("advertiserId", "interest1"),
    ("aid", "interest2"),
    ("creativeSize", "interest2"),
    # ("campaignId", "interest4"),  # undecided: keep it or not?
    ("aid", "interest5"),
    ("productType", "kw1"),  # 'kw1' looks very prone to overfitting; keeping it is still to be decided
    ("productType", "kw2"),
    # ("productType", "kw3"),
    ("productType", "topic1"),
    ("aid", "topic2"),
    ("productType", "topic2"),
    ("aid", "ct"),
    ("aid", "os"),
]
# Names of the matching columns in the stacking frames.
use_cols = ['stackProba_{}_x_{}'.format(*pair) for pair in use_pairs]

In [15]:
# Export the selected training-set stacking columns as a
# (column names, float32 matrix) pickle under config.INPUT_DIR.
out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)
out_file = "train.stacking.lrCrossBinary_v2.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_train = df_stack_tv.loc[:, use_cols].values.astype(np.float32)
    # one row per training example, one column per selected pair
    assert X_train.shape == (df_train.shape[0], len(use_pairs))

with pu.profiler("saving matrix to hard disk"):
    # Saved names carry an extra "LR_" tag compared with the stacking-frame columns.
    col_names = ['stackProba_LR_{}_x_{}'.format(*pair) for pair in use_pairs]
    du.save_pickle((col_names, X_train), out_path)
    # free the dense matrix right after it has been written
    del X_train
    gc.collect()

[20:09:38] Finish getting matrix represenation. △M: +309.57MB. △T: 3.0 seconds.
[20:09:38] Finish saving matrix to hard disk. △M: -369.21MB. △T: 0.4 seconds.


In [None]:
# Same export for the testing-set stacking columns. `out_folder` and
# `col_names` are the values set by the training-set export cell.
out_file = "test2.stacking.lrCrossBinary_v2.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_test = df_stack_test.loc[:, use_cols].values.astype(np.float32)
    # one row per testing example, one column per selected pair
    assert X_test.shape == (df_test.shape[0], len(use_pairs))

with pu.profiler("saving matrix to hard disk"):
    du.save_pickle((col_names, X_test), out_path)
    # free the dense matrix right after it has been written
    del X_test
    gc.collect()