In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np
import time
import gc
import os
import sys
sys.path.append("../code/utils")
sys.path.append("../code")
import data_utils as du
import perf_utils as pu
import config

In [2]:
# Full list of user-feature names as declared in the project config.
user_feat_names = config.USER_FEAT_NAMES
# User features that each get their own single-feature LR stacking model.
use_feats = ['age', 'interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'topic2']

df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
y = df_train["label"].values.copy()
y = (y + 1) / 2  # 1/-1 to 1/0

n_splits = 5  # 3? 5? Don't know which one will be better
# The folds are built WITHOUT shuffling, so they are already deterministic.
# `random_state` only matters together with `shuffle=True`; passing it alone is
# a no-op on old scikit-learn and a ValueError on recent releases, so it is
# omitted here. The resulting folds are identical to the ones the unshuffled
# splitter always produced.
skf = StratifiedKFold(n_splits=n_splits)
# Materialise the (train_index, valid_index) pairs once so every feature below
# is evaluated on exactly the same folds.
split_indices = list(skf.split(df_train, y))

In [3]:
# Containers filled one column per user feature:
#   df_stack_tv   - out-of-fold LR probabilities for the training rows
#   df_stack_test - LR probabilities for the test rows, averaged over the folds
#   df_score      - mean / std of the per-fold validation AUC
df_stack_tv, df_stack_test = pd.DataFrame(), pd.DataFrame()
df_score = pd.DataFrame(columns=["featureName", "auc_mean", "auc_std"])

for user_feat_name in use_feats:
    # Build the model inputs for this single user feature.
    # NOTE(review): quick_join is assumed to return (matrix, column_names) with
    # rows aligned to the given frame - confirm in data_utils.
    with pu.profiler("loading '{}'".format(user_feat_name)):
        X_tv, _ = du.quick_join(df_train, user_feat_names=[user_feat_name], verbose=False)
        X_test, _ = du.quick_join(df_test, user_feat_names=[user_feat_name], verbose=False)

    # Per-feature buffers: one out-of-fold prediction per training row, one
    # test prediction per (row, fold), and one AUC per fold.
    oof_proba = np.zeros(X_tv.shape[0])
    test_proba_by_fold = np.zeros((X_test.shape[0], n_splits))
    fold_aucs = np.zeros(n_splits)

    for fold, (train_index, valid_index) in enumerate(split_indices):
        # Slice outside the profiler so that only the fit itself is measured.
        X_train, X_valid = X_tv[train_index], X_tv[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        with pu.profiler("fitting LR (fold {}/{})".format(fold + 1, n_splits)):
            # Default regularisation is kept: penalty='l2' and C=1.
            lr = LogisticRegression(solver="newton-cg")
            lr.fit(X_train, y_train)

        # Positive-class probability for the held-out rows of this fold.
        valid_proba = lr.predict_proba(X_valid)[:, 1]
        oof_proba[valid_index] = valid_proba

        # Every fold's model also scores the full test set.
        test_proba_by_fold[:, fold] = lr.predict_proba(X_test)[:, 1]

        fold_aucs[fold] = metrics.roc_auc_score(y_valid, valid_proba)

    # Publish this feature's results into the stacking frames.
    col_name = "stackProba_{}".format(user_feat_name)
    df_stack_tv[col_name] = oof_proba
    df_stack_test[col_name] = test_proba_by_fold.mean(axis=1)

    auc_mean, auc_std = fold_aucs.mean(), fold_aucs.std()
    df_score.loc[df_score.shape[0]] = {
        "featureName": user_feat_name,
        "auc_mean": auc_mean,
        "auc_std": auc_std,
    }
    print("AUC: {:.6f}(+/-{:.3g})".format(auc_mean, auc_std))

    # Release the joined matrices before the next feature is loaded.
    del X_tv, X_test
    gc.collect()


print("Training Set Prediction Shape: {}".format(df_stack_tv.shape))
print("Testing Set Prediction Shape: {}".format(df_stack_test.shape))

[04:17:36] Finish loading 'age'. △M: +156.61MB. △T: 56.4 seconds.
[04:18:04] Finish fitting LR (fold 1/5). △M: +272.0KB. △T: 28.0 seconds.




[04:19:45] Finish fitting LR (fold 2/5). △M: +144.0KB. △T: 1.7 minutes.
[04:20:16] Finish fitting LR (fold 3/5). △M: +128.0KB. △T: 30.0 seconds.
[04:20:49] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 32.9 seconds.
[04:22:50] Finish fitting LR (fold 5/5). △M: +26.98MB. △T: 2.0 minutes.
AUC: 0.563780(+/-0.000803)
[04:23:45] Finish loading 'interest1'. △M: +688.96MB. △T: 53.6 seconds.
[04:33:36] Finish fitting LR (fold 1/5). △M: +168.0KB. △T: 9.8 minutes.
[04:42:51] Finish fitting LR (fold 2/5). △M: +0B. △T: 9.2 minutes.
[04:51:30] Finish fitting LR (fold 3/5). △M: +128.0KB. △T: 8.6 minutes.
[05:00:44] Finish fitting LR (fold 4/5). △M: -52.32MB. △T: 9.2 minutes.
[05:09:22] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 8.6 minutes.
AUC: 0.553203(+/-0.000424)
[05:10:20] Finish loading 'interest2'. △M: +244.84MB. △T: 56.9 seconds.
[05:15:07] Finish fitting LR (fold 1/5). △M: -41.14MB. △T: 4.8 minutes.
[05:19:00] Finish fitting LR (fold 2/5). △M: +128.0KB. △T: 3.9 minutes.
[05:23:41] Fi

In [5]:
# The data root must already exist; only the stacking sub-folder is created.
assert os.path.exists(config.DATA_DIR)
out_folder = os.path.join(config.DATA_DIR, 'stacking/lr')
os.makedirs(out_folder, exist_ok=True)

# Always pass `float_format` when writing these CSVs: unformatted floats make
# the files take far more disk space than necessary.
# Only the test-set export is active; un-comment the train entry to write the
# training-set predictions as well.
csv_exports = [
    # ('train.singleUserFeature.csv', df_stack_tv),
    ('test2.singleUserFeature.csv', df_stack_test),
]
for out_file, frame in csv_exports:
    out_path = os.path.join(out_folder, out_file)
    frame.to_csv(out_path, float_format="%.6f", index=False)

# # save score information (disabled)
# # NOTE(review): `user_feat_names` is the full config list while df_score has
# # one row per entry of `use_feats` - confirm the lengths match before
# # re-enabling the 'featureName' assignment below.
# out_file = 'score.singleUserFeature.csv'
# out_path = os.path.join(out_folder, out_file)
# df_score['featureName'] = user_feat_names
# df_score = df_score[["featureName", "auc_mean", "auc_std"]]
# df_score.to_csv(out_path, float_format="%.6f", index=False)

In [None]:
use_feats = ['age', 'interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'topic2']
# Column names as they appear in the stacking frames ...
use_cols = ['stackProba_{}'.format(feat_name) for feat_name in use_feats]
# ... and as they are stored next to the matrices in the pickles (shared by
# the train and the test file).
col_names = ['stackProba_LR_{}'.format(feat_name) for feat_name in use_feats]

out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)

# ---- training set -------------------------------------------------------
out_file = "train.stacking.lrSingleFeature_v1.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting train matrix"):
    X_train = df_stack_tv[use_cols].values.astype(np.float32)
    # One row per raw training row, one column per selected feature.
    assert X_train.shape == (df_train.shape[0], len(use_feats))

with pu.profiler("saving train matrix"):
    du.save_pickle((col_names, X_train), out_path)
    del X_train
    gc.collect()

# ---- testing set --------------------------------------------------------
out_file = "test2.stacking.lrSingleFeature_v1.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting test matrix"):
    X_test = df_stack_test[use_cols].values.astype(np.float32)
    # One row per raw test row, one column per selected feature.
    assert X_test.shape == (df_test.shape[0], len(use_feats))

with pu.profiler("saving test matrix"):
    du.save_pickle((col_names, X_test), out_path)
    del X_test
    gc.collect()