In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np
import time
import gc
import os
import sys
sys.path.append('../code/pipeline/')
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# ================
# Data Preparation
# ================
# Feature pairs whose cross-product transformations are loaded in the stacking loop.
pairs = [("LBS", "carrier"), ("LBS", "house"), ("LBS", "gender")]

df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
# Re-code the labels from 1/-1 to 1/0. The arithmetic already allocates a new
# array, so the raw "label" column is left untouched.
y = (df_train["label"].values + 1) / 2

n_splits = 5  # 3? 5? Don't know which one will be better
# `shuffle` defaults to False, so the folds are already deterministic. Passing
# `random_state` without `shuffle=True` has no effect on the split and raises a
# ValueError in scikit-learn >= 0.24, hence no seed is given here.
skf = StratifiedKFold(n_splits=n_splits)
# Materialise the (train_index, valid_index) pairs once so that every feature
# pair is trained and validated on exactly the same folds.
split_indices = list(skf.split(df_train, y))

In [3]:
# Stacking containers: one column per feature pair.
df_stack_tv = pd.DataFrame()    # out-of-fold predictions for the training rows
df_stack_test = pd.DataFrame()  # fold-averaged predictions for the testing rows
score_rows = []                 # one AUC summary dict per feature pair
cross_bin_folder = os.path.join(config.PRELIM_NLP_COUNT_DATA_DIR, "user_cross/")

for ad_feat_name, user_feat_name in pairs:
    ### given a feature pair ###
    # load matrix as input to model
    with pu.profiler("loading '{}' x '{}'".format(user_feat_name, ad_feat_name)):
        data_manager = dp.DataManager(cross_bin_folder)
        cross_bin_loader = data_manager.build_data("['{}'x'{}']".format(ad_feat_name, user_feat_name),
                                                   "binary")
        cols, X_tv = cross_bin_loader.load("train")  # column names are not used in this cell
        _, X_test = cross_bin_loader.load("test2")

    # Fail fast: the fold indices were computed on df_train / y, so a matrix with
    # a different number of rows would be silently misaligned (or only caught by
    # the export cells after several minutes of fitting).
    assert X_tv.shape[0] == y.shape[0], "train matrix rows do not match the labels"
    assert X_test.shape[0] == df_test.shape[0], "test matrix rows do not match df_test"

    # prepare containers
    stack_tv = np.zeros(X_tv.shape[0])                  # filled fold by fold with validation predictions
    stack_test = np.zeros((X_test.shape[0], n_splits))  # one column of test predictions per fold
    scores = np.zeros(n_splits)                         # validation AUC per fold

    for i, (train_index, valid_index) in enumerate(split_indices):
        ### given a splitting ###
        # split train/valid sets
        X_train, y_train = X_tv[train_index], y[train_index]
        X_valid, y_valid = X_tv[valid_index], y[valid_index]

        # fit LR
        with pu.profiler("fitting LR (fold {}/{})".format(i + 1, n_splits)):
            lr = LogisticRegression(solver="newton-cg", n_jobs=-1)  # use default setting: penalty='l2' and C=1
            lr.fit(X_train, y_train)

        # make prediction for validation set (probability of the positive class)
        proba_valid = lr.predict_proba(X_valid)[:, 1]
        stack_tv[valid_index] = proba_valid

        # make prediction for testing set
        proba_test = lr.predict_proba(X_test)[:, 1]
        stack_test[:, i] = proba_test

        # calculate scores
        auc = metrics.roc_auc_score(y_valid, proba_valid)
        scores[i] = auc

    # update dataframe for stacking
    cross_name = "{}_x_{}".format(ad_feat_name, user_feat_name)
    col_name = "stackProba_{}".format(cross_name)
    score_row = {"featureName": cross_name,
                 "auc_mean": scores.mean(), "auc_std": scores.std()}
    df_stack_tv[col_name] = stack_tv
    df_stack_test[col_name] = stack_test.mean(axis=1)  # average the per-fold test predictions
    score_rows.append(score_row)
    print("AUC: {:.6f}(+/-{:.3g})".format(score_row["auc_mean"], score_row["auc_std"]))

    # release the large matrices before loading the next pair
    del X_tv
    del X_test
    gc.collect()

# Build the score table once from the collected records instead of growing a
# DataFrame row by row; the explicit column list fixes the column order and
# keeps the frame well-formed even when `pairs` is empty.
df_score = pd.DataFrame(score_rows, columns=["featureName", "auc_mean", "auc_std"])

[12:35:45] Finish loading 'carrier' x 'LBS'. △M: +93.26MB. △T: 0.1 seconds.
[12:36:53] Finish fitting LR (fold 1/5). △M: -3.78MB. △T: 1.1 minutes.
[12:38:09] Finish fitting LR (fold 2/5). △M: +26.98MB. △T: 1.3 minutes.
[12:39:19] Finish fitting LR (fold 3/5). △M: +0B. △T: 1.1 minutes.
[12:40:36] Finish fitting LR (fold 4/5). △M: -4.09MB. △T: 1.3 minutes.
[12:41:57] Finish fitting LR (fold 5/5). △M: +15.93MB. △T: 1.3 minutes.
AUC: 0.533698(+/-0.00109)
[12:41:59] Finish loading 'house' x 'LBS'. △M: +33.57MB. △T: 0.1 seconds.
[12:43:18] Finish fitting LR (fold 1/5). △M: +26.87MB. △T: 1.3 minutes.
[12:44:57] Finish fitting LR (fold 2/5). △M: +16.0KB. △T: 1.6 minutes.
[12:46:16] Finish fitting LR (fold 3/5). △M: +0B. △T: 1.3 minutes.
[12:47:32] Finish fitting LR (fold 4/5). △M: +0B. △T: 1.2 minutes.
[12:48:47] Finish fitting LR (fold 5/5). △M: +0B. △T: 1.2 minutes.
AUC: 0.529398(+/-0.00126)
[12:48:48] Finish loading 'gender' x 'LBS'. △M: +0B. △T: 0.1 seconds.
[12:50:09] Finish fitting LR (f

In [4]:
# Report the (rows, columns) shape of both stacked-prediction frames.
train_shape, test_shape = df_stack_tv.shape, df_stack_test.shape
print("Training Set Prediction Shape: {}".format(train_shape))
print("Testing Set Prediction Shape: {}".format(test_shape))

Training Set Prediction Shape: (8798814, 3)
Testing Set Prediction Shape: (2265879, 3)


In [None]:
assert os.path.exists(config.DATA_DIR)
out_folder = os.path.join(config.DATA_DIR, 'stacking/lr')
os.makedirs(out_folder, exist_ok=True)

# Write the training-set predictions first, then the testing-set predictions.
# A fixed float format keeps the CSV files from consuming excessive disk space.
for frame, out_file in ((df_stack_tv, 'train.crossBinary_v3.csv'),
                        (df_stack_test, 'test2.crossBinary_v3.csv')):
    out_path = os.path.join(out_folder, out_file)
    frame.to_csv(out_path, float_format="%.6f", index=False)

In [None]:
# save score information
# Resolve the output folder inside this cell instead of relying on whatever
# `out_folder` currently holds: the name is re-bound to config.INPUT_DIR further
# down the notebook, so running cells out of order would otherwise write the
# score file into the wrong directory.
out_folder = os.path.join(config.DATA_DIR, 'stacking/lr')
os.makedirs(out_folder, exist_ok=True)
out_file = 'score.crossBinary_v3.csv'
out_path = os.path.join(out_folder, out_file)

# Fix the column order and list the best-scoring feature pairs first.
df_score = (df_score[["featureName", "auc_mean", "auc_std"]]
            .sort_values("auc_mean", ascending=False))
df_score.to_csv(out_path, float_format="%.6f", index=False)

In [8]:
# Feature pairs whose stacked probabilities are exported by the cells below.
use_pairs = [("LBS", "carrier"), ("LBS", "house"), ("LBS", "gender")]
# Names of the matching columns in df_stack_tv / df_stack_test.
use_cols = ['stackProba_{}_x_{}'.format(*pair) for pair in use_pairs]

In [18]:
# Export the out-of-fold LR probabilities of the training rows as a float32
# matrix, pickled together with its column names.
out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)
out_file = "train.stacking.lrCrossBinary_v3.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_train = df_stack_tv[use_cols].values.astype(np.float32)
    n_rows, n_cols = X_train.shape
    # one row per training example, one column per selected feature pair
    assert n_rows == df_train.shape[0]
    assert n_cols == len(use_pairs)

with pu.profiler("saving matrix to hard disk"):
    # The exported column names carry an extra "LR" tag compared with the
    # DataFrame column names.
    col_names = ['stackProba_LR_{}_x_{}'.format(*pair) for pair in use_pairs]
    du.save_pickle((col_names, X_train), out_path)
    # drop the matrix right away to release its memory
    del X_train
    gc.collect()

[09:04:52] Finish getting matrix represenation. △M: +0B. △T: 1.1 seconds.
[09:04:54] Finish saving matrix to hard disk. △M: -402.57MB. △T: 1.3 seconds.


In [19]:
# Export the fold-averaged LR probabilities of the testing rows, mirroring the
# training export.
out_file = "test2.stacking.lrCrossBinary_v3.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_test = df_stack_test[use_cols].values.astype(np.float32)
    n_rows, n_cols = X_test.shape
    # one row per testing example, one column per selected feature pair
    assert n_rows == df_test.shape[0]
    assert n_cols == len(use_pairs)

with pu.profiler("saving matrix to hard disk"):
    # `col_names` is not rebuilt here: the list defined for the training export
    # is reused so both pickles carry identical column names.
    du.save_pickle((col_names, X_test), out_path)
    # drop the matrix right away to release its memory
    del X_test
    gc.collect()

[09:04:59] Finish getting matrix represenation. △M: +164.34MB. △T: 0.5 seconds.
[09:05:00] Finish saving matrix to hard disk. △M: +0B. △T: 0.2 seconds.
