In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np
import time
import gc
import os
import sys
sys.path.append("../code/utils")
sys.path.append("../code")
import data_utils as du
import perf_utils as pu
import config

In [2]:
# Names of the user features to stack on; the loop further down trains one
# logistic-regression model family per entry.
user_feat_names = config.USER_FEAT_NAMES

In [3]:
# Raw train/test tables; the training table carries the "label" column.
df_train, df_test = (du.load_raw_data(split) for split in ("train", "test"))
# Map labels from {-1, 1} to {0, 1}. The arithmetic allocates a new array,
# so the values held by df_train are left untouched.
y = (df_train["label"].values + 1) / 2

In [4]:
n_splits = 5  # number of CV folds; 3 vs. 5 has not been compared yet
# Folds are taken without shuffling, which is already deterministic.
# `random_state` only has a meaning together with shuffle=True: older
# scikit-learn releases ignore it here and newer ones raise a ValueError,
# so it is deliberately not passed.
skf = StratifiedKFold(n_splits=n_splits)
# Materialise the (train_index, valid_index) pairs once so that every feature
# is evaluated on exactly the same folds.
split_indices = list(skf.split(df_train, y))

In [5]:
# what should be customizable:
# 1. model building
# 2. model fitting
# 3. model predicting

In [6]:
# Out-of-fold LR stacking, one model family per user feature:
#   * df_stack_tv   - out-of-fold probability for every training row
#   * df_stack_test - test probability averaged over the fold models
#   * df_score      - mean/std of the per-fold validation AUC
df_stack_tv = pd.DataFrame()
df_stack_test = pd.DataFrame()
df_score = pd.DataFrame(columns=["featureName", "auc_mean", "auc_std"])

for user_feat_name in user_feat_names:
    ### given a user feature ###
    # load matrix as input to model (the returned column names are not needed here)
    with pu.profiler("loading '{}'".format(user_feat_name)):
        X_tv, _ = du.quick_join(df_train, user_feat_names=[user_feat_name])
        X_test, _ = du.quick_join(df_test, user_feat_names=[user_feat_name])

    # prepare containers
    stack_tv = np.zeros(X_tv.shape[0])                  # one out-of-fold prediction per training row
    stack_test = np.zeros((X_test.shape[0], n_splits))  # one column of test predictions per fold
    scores = np.zeros(n_splits)                         # validation AUC per fold

    for i, (train_index, valid_index) in enumerate(split_indices):
        ### given a splitting ###
        # split train/valid sets
        # NOTE(review): assumes X_tv supports row selection with an index array - confirm for du.quick_join
        X_train, y_train = X_tv[train_index], y[train_index]
        X_valid, y_valid = X_tv[valid_index], y[valid_index]

        # fit LR
        with pu.profiler("fitting LR (fold {}/{})".format(i + 1, n_splits)):
            lr = LogisticRegression(solver="newton-cg")  # use default setting: penalty='l2' and C=1
            lr.fit(X_train, y_train)

        # make prediction for validation set (column 1 = probability of the positive class)
        proba_valid = lr.predict_proba(X_valid)[:, 1]
        stack_tv[valid_index] = proba_valid

        # make prediction for testing set
        proba_test = lr.predict_proba(X_test)[:, 1]
        stack_test[:, i] = proba_test

        # calculate scores
        auc = metrics.roc_auc_score(y_valid, proba_valid)
        scores[i] = auc

        # drop the fold-level objects right away so they are not kept alive
        # while the next fold (or the next feature matrix) is being built
        del X_train, y_train, X_valid, y_valid, lr

    # update dataframe for stacking
    col_name = "stackProba_{}".format(user_feat_name)
    score_row = {"featureName": user_feat_name, "auc_mean": scores.mean(), "auc_std": scores.std()}
    df_stack_tv[col_name] = stack_tv
    df_stack_test[col_name] = stack_test.mean(axis=1)  # average the fold models' test predictions
    df_score.loc[df_score.shape[0]] = score_row        # append as the next row
    print("AUC: {:.6f}(+/-{:.3g})".format(score_row["auc_mean"], score_row["auc_std"]))

    # release the feature matrices before loading the next feature
    del X_tv
    del X_test
    gc.collect()

loading user matrices: 100%|██████████| 1/1 [00:21<00:00, 21.81s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.01s/it]


[13:11:05] Finish loading 'age'. △M: +156.56MB. △T: 38.8 seconds.
[13:11:29] Finish fitting LR (fold 1/5). △M: +268.0KB. △T: 23.7 seconds.




[13:12:55] Finish fitting LR (fold 2/5). △M: +144.0KB. △T: 1.4 minutes.
[13:13:28] Finish fitting LR (fold 3/5). △M: +26.98MB. △T: 32.8 seconds.
[13:13:55] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 26.3 seconds.
[13:15:13] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 1.3 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.563780(+/-0.000803)


loading user matrices: 100%|██████████| 1/1 [00:21<00:00, 21.50s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.73s/it]


[13:15:53] Finish loading 'gender'. △M: +34.08MB. △T: 39.2 seconds.
[13:18:40] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 2.8 minutes.
[13:19:13] Finish fitting LR (fold 2/5). △M: +26.98MB. △T: 32.9 seconds.
[13:22:01] Finish fitting LR (fold 3/5). △M: +0B. △T: 2.8 minutes.
[13:22:35] Finish fitting LR (fold 4/5). △M: +26.98MB. △T: 33.3 seconds.
[13:23:40] Finish fitting LR (fold 5/5). △M: +26.86MB. △T: 1.1 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.511453(+/-0.000546)


loading user matrices: 100%|██████████| 1/1 [00:22<00:00, 22.10s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.56s/it]


[13:24:20] Finish loading 'marriageStatus'. △M: +40.87MB. △T: 39.7 seconds.
[13:25:22] Finish fitting LR (fold 1/5). △M: +4.0KB. △T: 1.0 minutes.
[13:26:41] Finish fitting LR (fold 2/5). △M: +26.98MB. △T: 1.3 minutes.
[13:30:28] Finish fitting LR (fold 3/5). △M: +26.85MB. △T: 3.8 minutes.
[13:31:52] Finish fitting LR (fold 4/5). △M: +0B. △T: 1.4 minutes.
[13:33:02] Finish fitting LR (fold 5/5). △M: +26.98MB. △T: 1.1 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.539667(+/-0.00096)


loading user matrices: 100%|██████████| 1/1 [00:21<00:00, 21.48s/it]
loading user matrices: 100%|██████████| 1/1 [00:16<00:00, 16.97s/it]


[13:33:41] Finish loading 'education'. △M: +67.64MB. △T: 38.5 seconds.
[13:34:12] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 30.9 seconds.
[13:35:55] Finish fitting LR (fold 2/5). △M: +26.85MB. △T: 1.7 minutes.
[13:36:31] Finish fitting LR (fold 3/5). △M: +0B. △T: 36.2 seconds.
[13:38:32] Finish fitting LR (fold 4/5). △M: +26.98MB. △T: 2.0 minutes.
[13:40:00] Finish fitting LR (fold 5/5). △M: +0B. △T: 1.5 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.539272(+/-0.000975)


loading user matrices: 100%|██████████| 1/1 [00:22<00:00, 22.05s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.19s/it]


[13:40:40] Finish loading 'consumptionAbility'. △M: +67.63MB. △T: 39.2 seconds.
[13:41:08] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 27.7 seconds.
[13:41:47] Finish fitting LR (fold 2/5). △M: +26.98MB. △T: 38.4 seconds.
[13:42:17] Finish fitting LR (fold 3/5). △M: +26.85MB. △T: 29.2 seconds.
[13:42:48] Finish fitting LR (fold 4/5). △M: +0B. △T: 30.4 seconds.
[13:43:28] Finish fitting LR (fold 5/5). △M: +0B. △T: 39.6 seconds.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.531474(+/-0.000534)


loading user matrices: 100%|██████████| 1/1 [00:21<00:00, 21.67s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.28s/it]


[13:44:07] Finish loading 'LBS'. △M: +68.13MB. △T: 39.0 seconds.
[13:45:29] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 1.3 minutes.
[13:46:59] Finish fitting LR (fold 2/5). △M: +128.0KB. △T: 1.5 minutes.
[13:50:02] Finish fitting LR (fold 3/5). △M: +160.0KB. △T: 3.0 minutes.
[13:51:22] Finish fitting LR (fold 4/5). △M: +26.85MB. △T: 1.3 minutes.
[13:52:48] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 1.4 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.527491(+/-0.00148)


loading user matrices: 100%|██████████| 1/1 [00:25<00:00, 25.35s/it]
loading user matrices: 100%|██████████| 1/1 [00:19<00:00, 19.10s/it]


[13:53:33] Finish loading 'interest1'. △M: +688.74MB. △T: 44.5 seconds.
[14:01:48] Finish fitting LR (fold 1/5). △M: +402.91MB. △T: 8.2 minutes.
[14:10:34] Finish fitting LR (fold 2/5). △M: -55.3MB. △T: 8.7 minutes.
[14:18:54] Finish fitting LR (fold 3/5). △M: +128.0KB. △T: 8.3 minutes.
[14:27:59] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 9.0 minutes.
[14:36:20] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 8.3 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.553203(+/-0.000424)


loading user matrices: 100%|██████████| 1/1 [00:22<00:00, 22.14s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.89s/it]


[14:37:01] Finish loading 'interest2'. △M: +208.83MB. △T: 40.0 seconds.
[14:41:33] Finish fitting LR (fold 1/5). △M: +132.0KB. △T: 4.5 minutes.
[14:46:02] Finish fitting LR (fold 2/5). △M: -35.75MB. △T: 4.5 minutes.
[14:50:31] Finish fitting LR (fold 3/5). △M: +128.0KB. △T: 4.5 minutes.
[14:54:12] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 3.7 minutes.
[14:58:23] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 4.2 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.604180(+/-0.000825)


loading user matrices: 100%|██████████| 1/1 [00:22<00:00, 22.29s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.43s/it]


[14:59:03] Finish loading 'interest3'. △M: +73.29MB. △T: 39.7 seconds.
[14:59:42] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 38.3 seconds.
[15:00:21] Finish fitting LR (fold 2/5). △M: +26.98MB. △T: 38.4 seconds.
[15:00:56] Finish fitting LR (fold 3/5). △M: +0B. △T: 34.9 seconds.
[15:01:33] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 35.6 seconds.
[15:02:07] Finish fitting LR (fold 5/5). △M: +26.98MB. △T: 34.0 seconds.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.502203(+/-0.000303)


loading user matrices: 100%|██████████| 1/1 [00:20<00:00, 20.76s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.03s/it]


[15:02:46] Finish loading 'interest4'. △M: +69.34MB. △T: 37.8 seconds.
[15:03:18] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 32.1 seconds.
[15:03:53] Finish fitting LR (fold 2/5). △M: +456.61MB. △T: 34.3 seconds.
[15:04:26] Finish fitting LR (fold 3/5). △M: -320.86MB. △T: 32.5 seconds.
[15:04:59] Finish fitting LR (fold 4/5). △M: +55.07MB. △T: 32.5 seconds.
[15:06:02] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 1.0 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.501541(+/-0.000288)


loading user matrices: 100%|██████████| 1/1 [00:24<00:00, 24.68s/it]
loading user matrices: 100%|██████████| 1/1 [00:18<00:00, 18.69s/it]


[15:06:45] Finish loading 'interest5'. △M: +801.14MB. △T: 43.4 seconds.
[15:57:55] Finish fitting LR (fold 1/5). △M: +26.97MB. △T: 51.1 minutes.
[16:01:31] Finish fitting LR (fold 2/5). △M: -18.37MB. △T: 3.6 minutes.
[16:05:49] Finish fitting LR (fold 3/5). △M: -19.05MB. △T: 4.3 minutes.
[16:09:54] Finish fitting LR (fold 4/5). △M: -19.67MB. △T: 4.0 minutes.
[16:14:15] Finish fitting LR (fold 5/5). △M: -57.32MB. △T: 4.3 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.571726(+/-0.00057)


loading user matrices: 100%|██████████| 1/1 [00:28<00:00, 28.46s/it]
loading user matrices: 100%|██████████| 1/1 [00:19<00:00, 19.42s/it]


[16:15:05] Finish loading 'kw1'. △M: +252.14MB. △T: 47.9 seconds.
[16:23:26] Finish fitting LR (fold 1/5). △M: +53.82MB. △T: 8.3 minutes.
[16:29:22] Finish fitting LR (fold 2/5). △M: -24.09MB. △T: 5.9 minutes.
[16:35:23] Finish fitting LR (fold 3/5). △M: -44.78MB. △T: 6.0 minutes.
[16:40:58] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 5.6 minutes.
[16:47:15] Finish fitting LR (fold 5/5). △M: +41.62MB. △T: 6.2 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.547741(+/-0.00108)


loading user matrices: 100%|██████████| 1/1 [00:24<00:00, 24.24s/it]
loading user matrices: 100%|██████████| 1/1 [00:18<00:00, 18.84s/it]


[16:47:59] Finish loading 'kw2'. △M: +250.78MB. △T: 43.1 seconds.
[16:53:15] Finish fitting LR (fold 1/5). △M: -26.73MB. △T: 5.2 minutes.
[16:59:02] Finish fitting LR (fold 2/5). △M: -40.6MB. △T: 5.8 minutes.
[17:03:39] Finish fitting LR (fold 3/5). △M: +0B. △T: 4.6 minutes.
[17:08:32] Finish fitting LR (fold 4/5). △M: +0B. △T: 4.8 minutes.
[17:12:53] Finish fitting LR (fold 5/5). △M: -5.0MB. △T: 4.3 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.648461(+/-0.00088)


loading user matrices: 100%|██████████| 1/1 [00:21<00:00, 21.60s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.14s/it]


[17:13:33] Finish loading 'kw3'. △M: +69.55MB. △T: 38.7 seconds.
[17:14:56] Finish fitting LR (fold 1/5). △M: +3.54MB. △T: 1.4 minutes.
[17:16:21] Finish fitting LR (fold 2/5). △M: +128.0KB. △T: 1.4 minutes.
[17:17:45] Finish fitting LR (fold 3/5). △M: +26.85MB. △T: 1.4 minutes.
[17:19:13] Finish fitting LR (fold 4/5). △M: +0B. △T: 1.4 minutes.
[17:20:48] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 1.6 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.502396(+/-0.00038)


loading user matrices: 100%|██████████| 1/1 [00:24<00:00, 24.51s/it]
loading user matrices: 100%|██████████| 1/1 [00:19<00:00, 19.22s/it]


[17:21:33] Finish loading 'topic1'. △M: +229.91MB. △T: 43.7 seconds.
[17:25:29] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 3.9 minutes.
[17:29:16] Finish fitting LR (fold 2/5). △M: +128.0KB. △T: 3.8 minutes.
[17:33:08] Finish fitting LR (fold 3/5). △M: +8.0KB. △T: 3.8 minutes.
[17:37:10] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 4.0 minutes.
[17:40:47] Finish fitting LR (fold 5/5). △M: +0B. △T: 3.6 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.537481(+/-0.000437)


loading user matrices: 100%|██████████| 1/1 [00:24<00:00, 24.64s/it]
loading user matrices: 100%|██████████| 1/1 [00:19<00:00, 19.25s/it]


[17:41:31] Finish loading 'topic2'. △M: +297.81MB. △T: 43.9 seconds.
[17:44:38] Finish fitting LR (fold 1/5). △M: +53.83MB. △T: 3.1 minutes.
[17:47:22] Finish fitting LR (fold 2/5). △M: -21.11MB. △T: 2.7 minutes.
[17:51:02] Finish fitting LR (fold 3/5). △M: +128.0KB. △T: 3.6 minutes.
[17:53:52] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 2.8 minutes.
[17:57:05] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 3.2 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.617483(+/-0.00052)


loading user matrices: 100%|██████████| 1/1 [00:22<00:00, 22.14s/it]
loading user matrices: 100%|██████████| 1/1 [00:18<00:00, 18.61s/it]


[17:57:47] Finish loading 'topic3'. △M: +73.16MB. △T: 40.8 seconds.
[17:59:10] Finish fitting LR (fold 1/5). △M: -7.55MB. △T: 1.4 minutes.
[18:00:33] Finish fitting LR (fold 2/5). △M: +128.0KB. △T: 1.4 minutes.
[18:02:05] Finish fitting LR (fold 3/5). △M: -4.92MB. △T: 1.5 minutes.
[18:03:27] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 1.4 minutes.
[18:05:16] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 1.8 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.502620(+/-0.000354)


loading user matrices: 100%|██████████| 1/1 [00:23<00:00, 23.41s/it]
loading user matrices: 100%|██████████| 1/1 [00:19<00:00, 19.30s/it]


[18:05:59] Finish loading 'appIdInstall'. △M: +229.54MB. △T: 42.7 seconds.
[18:09:14] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 3.2 minutes.
[18:12:52] Finish fitting LR (fold 2/5). △M: +128.0KB. △T: 3.6 minutes.
[18:17:00] Finish fitting LR (fold 3/5). △M: -22.16MB. △T: 4.1 minutes.
[18:20:32] Finish fitting LR (fold 4/5). △M: -26.85MB. △T: 3.5 minutes.
[18:23:55] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 3.3 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.500987(+/-0.000377)


loading user matrices: 100%|██████████| 1/1 [00:23<00:00, 23.16s/it]
loading user matrices: 100%|██████████| 1/1 [00:18<00:00, 18.76s/it]


[18:24:37] Finish loading 'appIdAction'. △M: +31.27MB. △T: 41.9 seconds.
[18:25:55] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 1.3 minutes.
[18:27:17] Finish fitting LR (fold 2/5). △M: +30.68MB. △T: 1.4 minutes.
[18:28:55] Finish fitting LR (fold 3/5). △M: +0B. △T: 1.6 minutes.
[18:30:23] Finish fitting LR (fold 4/5). △M: +132.0KB. △T: 1.5 minutes.
[18:31:49] Finish fitting LR (fold 5/5). △M: +128.0KB. △T: 1.4 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.501321(+/-0.000159)


loading user matrices: 100%|██████████| 1/1 [00:22<00:00, 22.59s/it]
loading user matrices: 100%|██████████| 1/1 [00:18<00:00, 18.15s/it]


[18:32:30] Finish loading 'ct'. △M: +96.98MB. △T: 40.7 seconds.
[18:33:32] Finish fitting LR (fold 1/5). △M: -51.36MB. △T: 1.0 minutes.
[18:34:28] Finish fitting LR (fold 2/5). △M: +53.7MB. △T: 55.5 seconds.
[18:35:37] Finish fitting LR (fold 3/5). △M: +132.0KB. △T: 1.1 minutes.
[18:36:26] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 48.4 seconds.
[18:37:23] Finish fitting LR (fold 5/5). △M: -42.85MB. △T: 56.3 seconds.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.531697(+/-0.000743)


loading user matrices: 100%|██████████| 1/1 [00:21<00:00, 21.92s/it]
loading user matrices: 100%|██████████| 1/1 [00:17<00:00, 17.74s/it]


[18:38:03] Finish loading 'os'. △M: +70.15MB. △T: 39.7 seconds.
[18:38:36] Finish fitting LR (fold 1/5). △M: +0B. △T: 31.9 seconds.
[18:39:07] Finish fitting LR (fold 2/5). △M: +53.83MB. △T: 31.0 seconds.
[18:39:39] Finish fitting LR (fold 3/5). △M: +53.7MB. △T: 30.7 seconds.
[18:40:10] Finish fitting LR (fold 4/5). △M: +0B. △T: 30.8 seconds.
[18:40:42] Finish fitting LR (fold 5/5). △M: +53.7MB. △T: 31.4 seconds.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.531602(+/-0.000756)


loading user matrices: 100%|██████████| 1/1 [00:22<00:00, 22.30s/it]
loading user matrices: 100%|██████████| 1/1 [00:18<00:00, 18.03s/it]


[18:41:23] Finish loading 'carrier'. △M: +67.64MB. △T: 40.3 seconds.
[18:41:44] Finish fitting LR (fold 1/5). △M: +456.61MB. △T: 20.7 seconds.
[18:44:31] Finish fitting LR (fold 2/5). △M: -402.77MB. △T: 2.8 minutes.
[18:44:54] Finish fitting LR (fold 3/5). △M: +0B. △T: 22.4 seconds.
[18:45:23] Finish fitting LR (fold 4/5). △M: +128.0KB. △T: 28.6 seconds.
[18:46:54] Finish fitting LR (fold 5/5). △M: +26.98MB. △T: 1.5 minutes.


loading user matrices:   0%|          | 0/1 [00:00<?, ?it/s]

AUC: 0.519550(+/-0.000934)


loading user matrices: 100%|██████████| 1/1 [00:22<00:00, 22.12s/it]
loading user matrices: 100%|██████████| 1/1 [00:18<00:00, 18.51s/it]


[18:47:35] Finish loading 'house'. △M: +67.63MB. △T: 40.6 seconds.
[18:48:04] Finish fitting LR (fold 1/5). △M: +128.0KB. △T: 28.6 seconds.
[18:48:32] Finish fitting LR (fold 2/5). △M: +429.03MB. △T: 27.6 seconds.
[18:48:58] Finish fitting LR (fold 3/5). △M: +53.7MB. △T: 25.6 seconds.
[18:49:24] Finish fitting LR (fold 4/5). △M: -314.5MB. △T: 25.2 seconds.
[18:49:52] Finish fitting LR (fold 5/5). △M: +0B. △T: 27.2 seconds.
AUC: 0.508010(+/-0.00027)


In [7]:
# Preview the out-of-fold stacking features for the first 10 training rows.
df_stack_tv.head(10)

Unnamed: 0,stackProba_age,stackProba_gender,stackProba_marriageStatus,stackProba_education,stackProba_consumptionAbility,stackProba_LBS,stackProba_interest1,stackProba_interest2,stackProba_interest3,stackProba_interest4,...,stackProba_kw3,stackProba_topic1,stackProba_topic2,stackProba_topic3,stackProba_appIdInstall,stackProba_appIdAction,stackProba_ct,stackProba_os,stackProba_carrier,stackProba_house
0,0.043626,0.049568,0.05051,0.051529,0.046068,0.048378,0.0445,0.035982,0.047759,0.047826,...,0.047639,0.066958,0.036644,0.047639,0.047781,0.047821,0.055624,0.055887,0.053454,0.048852
1,0.048879,0.044893,0.049437,0.041832,0.042099,0.042312,0.058136,0.035982,0.047759,0.047826,...,0.047639,0.048508,0.034728,0.047639,0.047781,0.047821,0.057762,0.050362,0.045917,0.048852
2,0.039171,0.049568,0.060539,0.051529,0.042099,0.044231,0.056604,0.035982,0.047759,0.047826,...,0.047639,0.04915,0.036644,0.047639,0.047781,0.047821,0.055624,0.055887,0.053454,0.048852
3,0.068508,0.049568,0.05051,0.056781,0.046068,0.049222,0.055196,0.06918,0.047759,0.047826,...,0.047639,0.059739,0.062617,0.047639,0.047781,0.047821,0.044522,0.050362,0.045917,0.043906
4,0.043626,0.049568,0.05051,0.041832,0.046068,0.047183,0.04369,0.025122,0.047759,0.047826,...,0.047639,0.050233,0.087254,0.047639,0.047781,0.047821,0.043807,0.050362,0.045917,0.048852
5,0.039171,0.049568,0.05051,0.051529,0.042099,0.049709,0.050287,0.049975,0.047759,0.047826,...,0.047639,0.053069,0.063902,0.047639,0.047781,0.047821,0.043807,0.050362,0.047285,0.048852
6,0.039171,0.049568,0.041525,0.051529,0.046068,0.047183,0.043198,0.056689,0.047759,0.047826,...,0.047639,0.025285,0.021495,0.047639,0.047781,0.047821,0.044522,0.050362,0.045917,0.048852
7,0.043626,0.049568,0.05051,0.051529,0.046068,0.045555,0.059526,0.038269,0.047759,0.047826,...,0.047639,0.035614,0.05198,0.047639,0.047781,0.047821,0.042238,0.038571,0.043453,0.048852
8,0.038886,0.049568,0.05051,0.041832,0.060178,0.048051,0.041389,0.04037,0.047759,0.047826,...,0.047639,0.054319,0.041487,0.047639,0.047781,0.047821,0.044522,0.050362,0.047285,0.048852
9,0.039171,0.049568,0.038829,0.045751,0.046068,0.052068,0.03858,0.036263,0.047759,0.047826,...,0.047639,0.040972,0.046189,0.047639,0.047781,0.047821,0.044522,0.050362,0.045917,0.043906


In [8]:
# Preview the fold-averaged stacking features for the first 10 test rows.
df_stack_test.head(10)

Unnamed: 0,stackProba_age,stackProba_gender,stackProba_marriageStatus,stackProba_education,stackProba_consumptionAbility,stackProba_LBS,stackProba_interest1,stackProba_interest2,stackProba_interest3,stackProba_interest4,...,stackProba_kw3,stackProba_topic1,stackProba_topic2,stackProba_topic3,stackProba_appIdInstall,stackProba_appIdAction,stackProba_ct,stackProba_os,stackProba_carrier,stackProba_house
0,0.043549,0.04955,0.060526,0.045749,0.060227,0.044722,0.040039,0.053811,0.04775,0.047816,...,0.047631,0.057269,0.100503,0.047632,0.047775,0.047813,0.043568,0.050373,0.045872,0.048849
1,0.043549,0.04955,0.050499,0.056876,0.042137,0.051015,0.053069,0.036028,0.04775,0.047816,...,0.047631,0.07695,0.026675,0.047632,0.047775,0.047813,0.05552,0.055777,0.053415,0.048849
2,0.06858,0.04955,0.050181,0.04839,0.042137,0.050936,0.054178,0.036028,0.04775,0.047816,...,0.047631,0.054069,0.036771,0.047632,0.047775,0.047813,0.05552,0.055777,0.053415,0.048849
3,0.039088,0.04955,0.04154,0.041729,0.046042,0.053905,0.033694,0.065375,0.04775,0.047816,...,0.047631,0.036667,0.056069,0.047632,0.047775,0.047813,0.044512,0.038592,0.045872,0.043913
4,0.043549,0.044932,0.060526,0.056876,0.046042,0.051015,0.059178,0.152019,0.04775,0.047816,...,0.047631,0.064597,0.204996,0.047632,0.047775,0.047813,0.044512,0.050373,0.053415,0.048849
5,0.038887,0.04955,0.050499,0.041729,0.046042,0.042522,0.045434,0.036365,0.04775,0.047816,...,0.047631,0.041631,0.035532,0.047632,0.047775,0.047813,0.044512,0.050373,0.047415,0.043913
6,0.06858,0.044932,0.050499,0.045749,0.046042,0.055481,0.057569,0.036028,0.04775,0.047816,...,0.047631,0.05354,0.01183,0.047632,0.047775,0.047813,0.044512,0.038592,0.045872,0.048849
7,0.038887,0.04955,0.050499,0.041729,0.042137,0.049601,0.054178,0.036028,0.04775,0.047816,...,0.047631,0.054069,0.029502,0.047632,0.047775,0.047813,0.05552,0.055777,0.045872,0.048849
8,0.038887,0.044932,0.037101,0.041729,0.042137,0.049162,0.043642,0.036028,0.04775,0.047816,...,0.047631,0.038126,0.089958,0.047632,0.047775,0.047813,0.05552,0.055777,0.047415,0.048849
9,0.038887,0.04955,0.050499,0.056876,0.046042,0.043079,0.055701,0.052281,0.04775,0.047816,...,0.047631,0.047661,0.037075,0.047632,0.047775,0.047813,0.042416,0.050373,0.053415,0.048849


In [9]:
# Report the shape of both stacking frames (one column is added per user feature).
for shape_msg, df_stack in (
    ("Training Set Prediction Shape: {}", df_stack_tv),
    ("Testing Set Prediction Shape: {}", df_stack_test),
):
    print(shape_msg.format(df_stack.shape))

Training Set Prediction Shape: (8798814, 23)
Testing Set Prediction Shape: (2265989, 23)


In [10]:
# Peek at the per-feature AUC summary (first 5 rows).
df_score.head(5)

Unnamed: 0,auc_mean,auc_std
0,0.56378,0.000803
1,0.511453,0.000546
2,0.539667,0.00096
3,0.539272,0.000975
4,0.531474,0.000534


In [12]:
# Write the stacking predictions as CSV under <DATA_DIR>/stacking/lr.
assert os.path.exists(config.DATA_DIR)
out_folder = os.path.join(config.DATA_DIR, 'stacking/lr')
os.makedirs(out_folder, exist_ok=True)

# Always pass a float format: unformatted floats make these files needlessly
# large on disk.
for out_file, df_stack in (
    ('train.singleUserFeature.csv', df_stack_tv),   # predictions for the training set
    ('test.singleUserFeature.csv', df_stack_test),  # predictions for the testing set
):
    out_path = os.path.join(out_folder, out_file)
    df_stack.to_csv(out_path, float_format="%.6f", index=False)

In [13]:
# save score information (out_folder is the stacking/lr folder created by the
# CSV export cell, so that cell has to run first)
out_file = 'score.singleUserFeature.csv'
out_path = os.path.join(out_folder, out_file)
# Every row already carries the feature name it was computed for (written by the
# stacking loop). Verify that against the configured order instead of overwriting
# the column positionally, which could silently attach scores to the wrong feature.
assert df_score["featureName"].tolist() == list(user_feat_names), \
    "df_score rows do not match user_feat_names"
df_score = df_score[["featureName", "auc_mean", "auc_std"]]
df_score.to_csv(out_path, float_format="%.6f", index=False)

In [16]:
# Subset of single-feature LR predictions that is exported as model input.
use_feats = ['age', 'interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'topic2']
use_cols = list(map('stackProba_{}'.format, use_feats))

out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)
out_file = "train.stacking.lrSingleFeature_v1.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    # dense float32 matrix: one row per training sample, one column per selected feature
    X_train = df_stack_tv[use_cols].values.astype(np.float32)
    assert X_train.shape == (df_train.shape[0], len(use_feats))

with pu.profiler("saving matrix to hard disk"):
    # the pickle stores (column names, matrix); the names get an extra "LR" tag
    col_names = list(map('stackProba_LR_{}'.format, use_feats))
    du.save_pickle((col_names, X_train), out_path)
    del X_train
    gc.collect()

[05:01:51] Finish getting matrix represenation. △M: +234.96MB. △T: 0.6 seconds.
[05:01:52] Finish saving matrix to hard disk. △M: -234.95MB. △T: 0.7 seconds.


In [19]:
# Same export for the test set. Reuses out_folder, use_cols, use_feats and
# col_names from the training-matrix export cell, so that cell has to run first.
out_file = "test1.stacking.lrSingleFeature_v1.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    # dense float32 matrix: one row per test sample, one column per selected feature
    X_test = df_stack_test[use_cols].values.astype(np.float32)
    assert X_test.shape == (df_test.shape[0], len(use_feats))

with pu.profiler("saving matrix to hard disk"):
    # column names are shared with the training matrix
    du.save_pickle((col_names, X_test), out_path)
    del X_test
    gc.collect()

[05:03:18] Finish getting matrix represenation. △M: +397.63MB. △T: 0.7 seconds.
[05:03:18] Finish saving matrix to hard disk. △M: +0B. △T: 0.1 seconds.
