In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np
import time
import gc
import os
import sys
sys.path.append('../code/pipeline/')
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# ================
# Data Preparation
# ================
# (ad feature, user feature) pairs whose cross product transformation is
# loaded later on; every pair crosses the ad id "aid" with one user feature.
_crossed_user_feats = ("age", "education", "consumptionAbility", "LBS")
pairs = [("aid", crossed_feat) for crossed_feat in _crossed_user_feats]

In [3]:
# Load the raw train/test tables (train first, then test).
df_train, df_test = (du.load_raw_data(split_name) for split_name in ("train", "test"))

# Binary target for the classifier: the raw "label" column is mapped with
# (label + 1) / 2, i.e. 1/-1 becomes 1/0.
y = (df_train["label"].values.copy() + 1) / 2

In [4]:
# Number of CV folds; it also fixes how many per-fold test predictions are
# averaged in the stacking loop.
n_splits = 5  # 3? 5? Don't know which one will be better

# `shuffle` is left at its default (False), so the folds are already
# deterministic and a seed would have no effect. Passing `random_state`
# without `shuffle=True` is rejected with a ValueError by recent scikit-learn
# releases, so it is not passed here. The resulting fold assignment is the
# same as before.
skf = StratifiedKFold(n_splits=n_splits)

# Materialize the (train_index, valid_index) pairs once so that every feature
# pair in the stacking loop is evaluated on exactly the same folds.
split_indices = list(skf.split(df_train, y))

In [5]:
# TODO: the following steps of the stacking loop should be customizable:
# 1. model building
# 2. model fitting
# 3. model predicting

In [6]:
# Out-of-fold stacking with logistic regression.
# For each (ad feature, user feature) pair:
#   1. load the pair's cross-product matrices for the train and test sets;
#   2. for every CV split, fit an LR on the training fold, write its
#      validation-fold probabilities into `stack_tv`, and store its test-set
#      probabilities in one column of `stack_test`;
#   3. keep the out-of-fold train predictions and the fold-averaged test
#      predictions as one column each, plus the mean/std validation AUC.
df_stack_tv = pd.DataFrame()
df_stack_test = pd.DataFrame()
score_columns = ["featureName", "auc_mean", "auc_std"]
# One record per feature pair. df_score is built from these in a single call
# after the loop instead of being enlarged row by row through `.loc`.
score_rows = []

for ad_feat_name, user_feat_name in pairs:
    ### given a user feature ###
    # load matrix as input to model
    # NOTE: this log label lists the user feature first, whereas `cross_name`
    # below lists the ad feature first.
    with pu.profiler("loading '{}' x '{}'".format(user_feat_name, ad_feat_name)):
        cross_bin_loader = dp.CrossBinaryDataManager.build_data(ad_feat_name, user_feat_name)
        # only the second element returned by load() is used in this cell
        _, X_tv = cross_bin_loader.load("train")
        _, X_test = cross_bin_loader.load("test1")

    # prepare containers
    stack_tv = np.zeros(X_tv.shape[0])                  # one out-of-fold probability per train row
    stack_test = np.zeros((X_test.shape[0], n_splits))  # one column of test probabilities per fold
    scores = np.zeros(n_splits)                         # validation AUC per fold

    for i, (train_index, valid_index) in enumerate(split_indices):
        ### given a splitting ###
        # split train/valid sets
        # assumes X_tv supports row selection by an integer index array
        # (e.g. ndarray / scipy CSR) -- TODO confirm against the loader
        X_train, y_train = X_tv[train_index], y[train_index]
        X_valid, y_valid = X_tv[valid_index], y[valid_index]

        # fit LR
        with pu.profiler("fitting LR (fold {}/{})".format(i + 1, n_splits)):
            lr = LogisticRegression(solver="newton-cg")  # use default setting: penalty='l2' and C=1
            lr.fit(X_train, y_train)

        # make prediction for validation set; each train row is predicted by
        # a model fitted on a training fold that did not contain it
        proba_valid = lr.predict_proba(X_valid)[:, 1]
        stack_tv[valid_index] = proba_valid

        # make prediction for testing set
        proba_test = lr.predict_proba(X_test)[:, 1]
        stack_test[:, i] = proba_test

        # calculate scores
        scores[i] = metrics.roc_auc_score(y_valid, proba_valid)

    # update containers for stacking
    cross_name = "{}_x_{}".format(ad_feat_name, user_feat_name)
    col_name = "stackProba_{}".format(cross_name)
    score_row = {"featureName": cross_name,
                 "auc_mean": scores.mean(), "auc_std": scores.std()}
    df_stack_tv[col_name] = stack_tv
    df_stack_test[col_name] = stack_test.mean(axis=1)  # average the per-fold test predictions
    score_rows.append(score_row)
    print("AUC: {:.6f}(+/-{:.3g})".format(score_row["auc_mean"], score_row["auc_std"]))

    # release the matrices of this pair before the next pair is loaded
    del X_tv
    del X_test
    gc.collect()

# Build the score table once; `columns=` fixes the column order and keeps the
# three expected columns even when `pairs` is empty.
df_score = pd.DataFrame(score_rows, columns=score_columns)

[09:17:26] Finish loading 'age' x 'aid'. △M: +109.95MB. △T: 0.2 seconds.
[09:19:10] Finish fitting LR (fold 1/5). △M: +296.0KB. △T: 1.7 minutes.
[09:20:50] Finish fitting LR (fold 2/5). △M: +27.01MB. △T: 1.6 minutes.




[09:24:01] Finish fitting LR (fold 3/5). △M: +224.0KB. △T: 3.2 minutes.
[09:25:36] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 1.6 minutes.
[09:27:14] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 1.6 minutes.
AUC: 0.615922(+/-0.00127)
[09:27:15] Finish loading 'education' x 'aid'. △M: +67.13MB. △T: 0.2 seconds.
[09:29:55] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 2.7 minutes.
[09:31:27] Finish fitting LR (fold 2/5). △M: -13.3MB. △T: 1.5 minutes.
[09:32:57] Finish fitting LR (fold 3/5). △M: +26.98MB. △T: 1.5 minutes.
[09:34:23] Finish fitting LR (fold 4/5). △M: +26.98MB. △T: 1.4 minutes.
[09:35:57] Finish fitting LR (fold 5/5). △M: +0B. △T: 1.6 minutes.
AUC: 0.562716(+/-0.000967)
[09:35:58] Finish loading 'consumptionAbility' x 'aid'. △M: +33.57MB. △T: 0.1 seconds.
[09:39:45] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 3.8 minutes.
[09:41:12] Finish fitting LR (fold 2/5). △M: -32.55MB. △T: 1.4 minutes.
[09:42:34] Finish fitting LR (fold 3/5). △M: +128.0KB. △T: 1.4 minut

In [7]:
# Sanity check: report the (rows, columns) shape of both stacked-prediction frames.
shape_report = (
    ("Training Set Prediction Shape: {}", df_stack_tv),
    ("Testing Set Prediction Shape: {}", df_stack_test),
)
for shape_template, stacked_frame in shape_report:
    print(shape_template.format(stacked_frame.shape))

Training Set Prediction Shape: (8798814, 4)
Testing Set Prediction Shape: (2265989, 4)


In [8]:
# Write the stacked predictions to <DATA_DIR>/stacking/lr as CSV files.
assert os.path.exists(config.DATA_DIR)
out_folder = os.path.join(config.DATA_DIR, 'stacking/lr')
os.makedirs(out_folder, exist_ok=True)

# remember to format float number or you will find these really hard-disk consumptive
# training-set predictions are written first, then testing-set predictions
csv_targets = (
    ('train.crossBinary_v1.csv', df_stack_tv),
    ('test.crossBinary_v1.csv', df_stack_test),
)
for out_file, stacked_frame in csv_targets:
    out_path = os.path.join(out_folder, out_file)
    stacked_frame.to_csv(out_path, float_format="%.6f", index=False)

In [9]:
# Save the per-feature-pair AUC summary next to the stacked predictions,
# with fixed column order and the best mean AUC in the first row.
out_file = 'score.crossBinary_v1.csv'
out_path = os.path.join(out_folder, out_file)
df_score = (
    df_score[["featureName", "auc_mean", "auc_std"]]
    .sort_values("auc_mean", ascending=False)
)
df_score.to_csv(out_path, float_format="%.6f", index=False)

In [13]:
# Feature pairs selected for the export, and the names of the matching
# "stackProba_<ad>_x_<user>" columns in the stacked-prediction frames.
use_pairs = [
    ("aid", "age"),
    ("aid", "education"),
    ("aid", "consumptionAbility"),
    ("aid", "LBS"),
]
use_cols = ['stackProba_{}_x_{}'.format(*selected_pair) for selected_pair in use_pairs]

In [18]:
# Export the out-of-fold LR predictions for the training set as a pickled
# (column names, float32 matrix) tuple inside config.INPUT_DIR.
out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)
out_file = "train.stacking.lrCrossBinary_v1.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    # NOTE: `X_train` is also the name used for the per-fold training matrix
    # in the stacking loop; it is rebound here to the stacked-feature matrix.
    X_train = df_stack_tv[use_cols].values.astype(np.float32)
    # one row per training example, one column per selected feature pair
    assert X_train.shape[0] == df_train.shape[0]
    assert X_train.shape[1] == len(use_pairs)

with pu.profiler("saving matrix to hard disk"):
    # exported names carry an extra "LR_" tag compared with the frame columns
    col_names = ['stackProba_LR_{}_x_{}'.format(*selected_pair) for selected_pair in use_pairs]
    du.save_pickle((col_names, X_train), out_path)
    # drop the matrix inside the profiled block, as before
    del X_train
    gc.collect()

[09:04:52] Finish getting matrix represenation. △M: +0B. △T: 1.1 seconds.
[09:04:54] Finish saving matrix to hard disk. △M: -402.57MB. △T: 1.3 seconds.


In [19]:
# Export the fold-averaged LR predictions for the test set as a pickled
# (column names, float32 matrix) tuple inside config.INPUT_DIR.
#
# The output folder and the exported column names are derived here instead of
# being inherited from the training-export cell. `out_folder` is bound to a
# different directory earlier in the notebook (the CSV export), and a stale
# `col_names` would silently mislabel the matrix if `use_pairs` were edited
# and only this cell re-run.
out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)
out_file = "test1.stacking.lrCrossBinary_v1.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_test = df_stack_test[use_cols].values.astype(np.float32)
    # one row per test example, one column per selected feature pair
    assert X_test.shape[0] == df_test.shape[0]
    assert X_test.shape[1] == len(use_pairs)

with pu.profiler("saving matrix to hard disk"):
    # same naming scheme as the training export: "stackProba_LR_<ad>_x_<user>"
    col_names = ['stackProba_LR_{}_x_{}'.format(ad_feat_name, user_feat_name) for
                 ad_feat_name, user_feat_name in use_pairs]
    du.save_pickle((col_names, X_test), out_path)
    del X_test
    gc.collect()

[09:04:59] Finish getting matrix represenation. △M: +164.34MB. △T: 0.5 seconds.
[09:05:00] Finish saving matrix to hard disk. △M: +0B. △T: 0.2 seconds.
