In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from glove import Glove
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append("../code/utils/")
sys.path.append("../code/analysis/")
sys.path.append('../code/pipeline/')
sys.path.append('../code')
import data_jointer as dj
import eval_utils as eu
import data_utils as du
import perf_utils as pu
import config

In [3]:
# Root folder of the pooled-embedding pickles; the *_pooling_path helpers below
# look up a "[featureName='<name>']" sub-folder inside it for each feature.
emb_folder = os.path.join(config.DATA_DIR, "embedding")

def _pooling_path(feat_name, pooling, version_no=1):
    """Build the pickle path of one pooled-embedding matrix of the training set.

    Parameters
    ----------
    feat_name : str
        Feature name; selects the "[featureName='<feat_name>']" sub-folder
        of the module-level `emb_folder`.
    pooling : str
        Pooling tag embedded in the file name ("avg", "max" or "min").
    version_no : int, optional
        Embedding version embedded in the file name (default 1).

    Returns
    -------
    str
        "<emb_folder>/[featureName='<feat_name>']/train.<pooling>_v<version_no>.pkl"
    """
    folder = os.path.join(emb_folder, "[featureName='{}']".format(feat_name))
    file_name = "train.{}_v{}.pkl".format(pooling, version_no)
    return os.path.join(folder, file_name)


def avg_pooling_path(feat_name, version_no=1):
    """Return the path of the "avg"-pooled training embedding pickle of `feat_name`."""
    return _pooling_path(feat_name, "avg", version_no)


def max_pooling_path(feat_name, version_no=1):
    """Return the path of the "max"-pooled training embedding pickle of `feat_name`."""
    return _pooling_path(feat_name, "max", version_no)


def min_pooling_path(feat_name, version_no=1):
    """Return the path of the "min"-pooled training embedding pickle of `feat_name`."""
    return _pooling_path(feat_name, "min", version_no)

In [4]:
# Shared by run_lr_on_embedding, which uses it to join a per-user matrix onto
# the rows of df_train (keyed on "uid").
user_jointer = dj.PandasMatrixJointer(on="uid")
df_train = du.load_raw_data("train")
# Re-map the raw labels with label -> (label + 1) / 2 on a private copy
# (presumably turning -1/+1 into 0/1 -- confirm against the raw data).
y = (df_train['label'].values.copy() + 1) / 2

In [5]:
# Number of stratified folds; 5 yields an 80/20 train/valid split per fold.
# Lower it (e.g. to 3) to save time when iterating over every fold.
n_splits = 5
# `random_state` only matters together with shuffle=True. Passing it with the
# default shuffle=False has no effect and raises ValueError on
# scikit-learn >= 0.24, so it is dropped here. The unshuffled split is already
# deterministic, so the folds are the same ones the original call produced.
skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
# Materialise the (train_index, valid_index) pairs once so that every
# experiment reuses exactly the same folds.
split_indices = list(skf.split(df_train, y))

In [6]:
def _fit_and_report_lr(tag, X_train, X_valid, y_train, y_valid, aids_valid):
    """Fit one logistic regression and print its validation AUCs.

    Parameters
    ----------
    tag : str
        Feature-set label inserted into the profiler message
        ("fitting LR on <tag> data").
    X_train, X_valid : matrix-like
        Training / validation feature matrices.
    y_train, y_valid : array-like
        Training / validation labels, row-aligned with the matrices.
    aids_valid : array-like
        'aid' value of every validation row, forwarded to `eu.online_auc`.
    """
    # Only the fit is timed; scoring happens outside the profiler block.
    with pu.profiler("fitting LR on {} data".format(tag)):
        model = LogisticRegression(solver='newton-cg')
        model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the second class.
    proba_valid = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(y_valid, proba_valid)
    # NOTE(review): online_auc presumably aggregates AUC per 'aid' -- confirm in eval_utils.
    auc_online = eu.online_auc(aids_valid, y_valid, proba_valid)
    print("online AUC: {:.6f} overall AUC: {:.6f}".format(auc_online, auc))


def run_lr_on_embedding(feat_name, version_no=1):
    """Compare LR baselines on binary vs. pooled-embedding features of one field.

    Loads the avg/max/min pooled embedding matrices of `feat_name`, joins the
    per-user count matrix onto the training rows, splits everything with the
    first fold of `split_indices`, and fits one logistic regression per
    feature set, printing online and overall validation AUC for each.

    Relies on notebook-level globals: `user_jointer`, `df_train`, `y` and
    `split_indices`.

    Parameters
    ----------
    feat_name : str
        Name of the user feature to evaluate (e.g. "interest1").
    version_no : int, optional
        Embedding version used to locate the pooled pickles (default 1).

    Returns
    -------
    None
        Results are only printed.
    """
    print("Start '{}'".format(feat_name))

    # =========
    # load data
    # =========
    # Each pickle unpacks to a (column names, matrix) pair; only the matrix is
    # used. Rows are assumed to be aligned with df_train -- TODO confirm.
    _, X_avg = du.load_pickle(avg_pooling_path(feat_name, version_no))
    _, X_max = du.load_pickle(max_pooling_path(feat_name, version_no))
    _, X_min = du.load_pickle(min_pooling_path(feat_name, version_no))

    # =========
    # join data
    # =========
    row_uids, (_, matrix_bin) = du.load_user_cnt(feat_name)
    X_bin = user_jointer.join(df_train, matrix_bin, row_names=row_uids)
    del matrix_bin
    gc.collect()

    # =====================
    # split train/valid set
    # =====================
    aids = df_train['aid'].values
    with pu.profiler("splitting train/valid set"):
        # Only the first fold is evaluated.
        train_index, valid_index = split_indices[0]
        X_bin_train, X_bin_valid = X_bin[train_index, :], X_bin[valid_index, :]
        X_avg_train, X_avg_valid = X_avg[train_index, :], X_avg[valid_index, :]
        X_max_train, X_max_valid = X_max[train_index, :], X_max[valid_index, :]
        X_min_train, X_min_valid = X_min[train_index, :], X_min[valid_index, :]
        y_train, y_valid = y[train_index], y[valid_index]
        aids_valid = aids[valid_index]

        # Drop the full matrices before fitting to lower peak memory.
        del X_bin, X_avg, X_max, X_min
        gc.collect()
    print("Training Set Size: {}".format(X_bin_train.shape))
    print("Validation Set Size: {}".format(X_bin_valid.shape))

    # ==============================
    # fit one LR per feature variant
    # ==============================
    experiments = (
        ("binary", X_bin_train, X_bin_valid),
        ("avg pooling", X_avg_train, X_avg_valid),
        ("max pooling", X_max_train, X_max_valid),
        ("min pooling", X_min_train, X_min_valid),
    )
    for tag, X_train, X_valid in experiments:
        _fit_and_report_lr(tag, X_train, X_valid, y_train, y_valid, aids_valid)

In [6]:
# "marriageStatus": in the run recorded below all four feature sets score about
# the same (online AUC 0.5341 binary vs 0.5334-0.5336 pooled).
run_lr_on_embedding("marriageStatus", version_no=1)

Start 'marriageStatus'
[16:28:34] Finish splitting train/valid set. △M: +143.09MB. △T: 2.1 seconds.
Training Set Size: (7039050, 13)
Validation Set Size: (1759764, 13)
[16:29:48] Finish fitting LR on binary data. △M: +27.12MB. △T: 1.2 minutes.
online AUC: 0.534100 overall AUC: 0.539769




[16:37:53] Finish fitting LR on avg pooling data. △M: -26.64MB. △T: 8.1 minutes.
online AUC: 0.533642 overall AUC: 0.538242
[16:46:32] Finish fitting LR on max pooling data. △M: -26.73MB. △T: 8.6 minutes.
online AUC: 0.533516 overall AUC: 0.539087
[16:54:56] Finish fitting LR on min pooling data. △M: -26.73MB. △T: 8.4 minutes.
online AUC: 0.533397 overall AUC: 0.539040


In [7]:
# "interest1": in the run recorded below the binary features win clearly
# (online AUC 0.5461 vs 0.5253-0.5292 for the pooled embeddings).
run_lr_on_embedding("interest1", version_no=1)

Start 'interest1'
[16:55:36] Finish splitting train/valid set. △M: +117.54MB. △T: 3.7 seconds.
Training Set Size: (7039050, 123)
Validation Set Size: (1759764, 123)
[17:07:07] Finish fitting LR on binary data. △M: +27.02MB. △T: 11.5 minutes.
online AUC: 0.546140 overall AUC: 0.553479




[17:15:46] Finish fitting LR on avg pooling data. △M: -34.61MB. △T: 8.6 minutes.
online AUC: 0.529071 overall AUC: 0.534084
[17:22:44] Finish fitting LR on max pooling data. △M: -26.73MB. △T: 6.9 minutes.
online AUC: 0.529160 overall AUC: 0.535543
[17:29:59] Finish fitting LR on min pooling data. △M: -40.15MB. △T: 7.2 minutes.
online AUC: 0.525293 overall AUC: 0.529668


In [None]:
# "interest2": in the run recorded below binary reaches online AUC 0.6285,
# the pooled embeddings 0.6039-0.6048.
run_lr_on_embedding("interest2", version_no=1)

Start 'interest2'
[17:30:34] Finish splitting train/valid set. △M: +104.27MB. △T: 2.5 seconds.
Training Set Size: (7039050, 81)
Validation Set Size: (1759764, 81)
[17:35:51] Finish fitting LR on binary data. △M: +26.98MB. △T: 5.3 minutes.
online AUC: 0.628470 overall AUC: 0.603646




[17:44:46] Finish fitting LR on avg pooling data. △M: +128.0KB. △T: 8.9 minutes.
online AUC: 0.604777 overall AUC: 0.585076
[17:53:09] Finish fitting LR on max pooling data. △M: -40.15MB. △T: 8.3 minutes.
online AUC: 0.604315 overall AUC: 0.582341




[17:57:29] Finish fitting LR on min pooling data. △M: -40.13MB. △T: 4.3 minutes.
online AUC: 0.603939 overall AUC: 0.581508


In [6]:
# "interest3": in the run recorded below every feature set stays near chance
# (online AUC about 0.502 for all four).
run_lr_on_embedding("interest3", version_no=1)

Start 'interest3'
[05:17:55] Finish splitting train/valid set. △M: +143.04MB. △T: 1.5 seconds.
Training Set Size: (7039050, 11)
Validation Set Size: (1759764, 11)
[05:18:37] Finish fitting LR on binary data. △M: +272.0KB. △T: 42.1 seconds.
online AUC: 0.502058 overall AUC: 0.502593




[05:21:09] Finish fitting LR on avg pooling data. △M: +27.09MB. △T: 2.5 minutes.
online AUC: 0.502058 overall AUC: 0.502590
[05:23:22] Finish fitting LR on max pooling data. △M: +0B. △T: 2.2 minutes.
online AUC: 0.502067 overall AUC: 0.502597
[05:29:39] Finish fitting LR on min pooling data. △M: +13.43MB. △T: 6.3 minutes.
online AUC: 0.502047 overall AUC: 0.502590


In [7]:
# "interest4": in the run recorded below every feature set stays near chance
# (online AUC about 0.502 for all four).
run_lr_on_embedding("interest4", version_no=1)

Start 'interest4'
[05:30:11] Finish splitting train/valid set. △M: +120.51MB. △T: 1.8 seconds.
Training Set Size: (7039050, 11)
Validation Set Size: (1759764, 11)
[05:30:52] Finish fitting LR on binary data. △M: +128.0KB. △T: 41.2 seconds.
online AUC: 0.501932 overall AUC: 0.501946




[05:33:05] Finish fitting LR on avg pooling data. △M: +140.0KB. △T: 2.2 minutes.
online AUC: 0.501780 overall AUC: 0.501961




[05:37:47] Finish fitting LR on max pooling data. △M: +13.55MB. △T: 4.7 minutes.
online AUC: 0.501787 overall AUC: 0.501964
[05:39:10] Finish fitting LR on min pooling data. △M: +0B. △T: 1.4 minutes.
online AUC: 0.501783 overall AUC: 0.501961


In [8]:
# "interest5": in the run recorded below this is the largest gap in favour of the binary
# features (online AUC 0.5751 vs 0.5149-0.5248 for the pooled embeddings).
run_lr_on_embedding("interest5", version_no=1)

Start 'interest5'
[05:39:47] Finish splitting train/valid set. △M: +113.83MB. △T: 3.1 seconds.
Training Set Size: (7039050, 137)
Validation Set Size: (1759764, 137)
[05:47:15] Finish fitting LR on binary data. △M: +168.0KB. △T: 7.5 minutes.
online AUC: 0.575147 overall AUC: 0.572632




[05:51:31] Finish fitting LR on avg pooling data. △M: +4.7MB. △T: 4.2 minutes.
online AUC: 0.514943 overall AUC: 0.516792




[05:58:11] Finish fitting LR on max pooling data. △M: +4.0KB. △T: 6.6 minutes.
online AUC: 0.517660 overall AUC: 0.523211
[06:04:04] Finish fitting LR on min pooling data. △M: +4.0KB. △T: 5.9 minutes.
online AUC: 0.524761 overall AUC: 0.530063


In [12]:
# "ct": in the run recorded below binary, max and min pooling are on par
# (online AUC 0.5261-0.5265); avg pooling is slightly lower (0.5233).
run_lr_on_embedding("ct", version_no=1)

Start 'ct'
[07:02:03] Finish splitting train/valid set. △M: +104.2MB. △T: 1.4 seconds.
Training Set Size: (7039050, 5)
Validation Set Size: (1759764, 5)
[07:03:15] Finish fitting LR on binary data. △M: +128.0KB. △T: 1.2 minutes.
online AUC: 0.526482 overall AUC: 0.532387




[07:08:50] Finish fitting LR on avg pooling data. △M: +26.98MB. △T: 5.5 minutes.
online AUC: 0.523301 overall AUC: 0.528339




[07:12:45] Finish fitting LR on max pooling data. △M: +0B. △T: 3.9 minutes.
online AUC: 0.526151 overall AUC: 0.532243
[07:17:12] Finish fitting LR on min pooling data. △M: +0B. △T: 4.4 minutes.
online AUC: 0.526143 overall AUC: 0.532251


In [13]:
# "kw1": in the run recorded below binary reaches online AUC 0.5501,
# the pooled embeddings 0.5350-0.5402.
run_lr_on_embedding("kw1", version_no=1)

Start 'kw1'
[08:54:25] Finish splitting train/valid set. △M: +120.39MB. △T: 7.8 seconds.
Training Set Size: (7039050, 259909)
Validation Set Size: (1759764, 259909)




[09:05:01] Finish fitting LR on binary data. △M: +128.0KB. △T: 10.6 minutes.
online AUC: 0.550057 overall AUC: 0.548704
[09:12:31] Finish fitting LR on avg pooling data. △M: -32.38MB. △T: 7.5 minutes.
online AUC: 0.540223 overall AUC: 0.547055
[09:22:01] Finish fitting LR on max pooling data. △M: +18.43MB. △T: 9.5 minutes.
online AUC: 0.536563 overall AUC: 0.541304
[09:33:44] Finish fitting LR on min pooling data. △M: +0B. △T: 11.7 minutes.
online AUC: 0.534985 overall AUC: 0.541529


In [14]:
# "kw2": strongest single field in the recorded runs (online AUC 0.6641 binary,
# 0.6443-0.6534 pooled).
run_lr_on_embedding("kw2", version_no=1)

Start 'kw2'
[09:54:04] Finish splitting train/valid set. △M: +72.23MB. △T: 7.0 seconds.
Training Set Size: (7039050, 49197)
Validation Set Size: (1759764, 49197)
[09:59:52] Finish fitting LR on binary data. △M: +26.98MB. △T: 5.8 minutes.
online AUC: 0.664064 overall AUC: 0.647717




[10:08:21] Finish fitting LR on avg pooling data. △M: -14.08MB. △T: 8.4 minutes.
online AUC: 0.653359 overall AUC: 0.634279
[10:26:54] Finish fitting LR on max pooling data. △M: +17.25MB. △T: 18.5 minutes.
online AUC: 0.644345 overall AUC: 0.622837
[10:34:01] Finish fitting LR on min pooling data. △M: +0B. △T: 7.1 minutes.
online AUC: 0.644360 overall AUC: 0.626391


In [7]:
# "kw3": in the run recorded below every feature set stays near chance
# (online AUC 0.5023-0.5034).
run_lr_on_embedding("kw3", version_no=1)

Start 'kw3'
[11:54:58] Finish splitting train/valid set. △M: +150.98MB. △T: 6.7 seconds.
Training Set Size: (7039050, 11922)
Validation Set Size: (1759764, 11922)
[11:56:57] Finish fitting LR on binary data. △M: +27.16MB. △T: 2.0 minutes.
online AUC: 0.502343 overall AUC: 0.502829




[12:06:36] Finish fitting LR on avg pooling data. △M: +27.09MB. △T: 9.6 minutes.
online AUC: 0.502562 overall AUC: 0.502128
[12:27:21] Finish fitting LR on max pooling data. △M: +0B. △T: 20.7 minutes.
online AUC: 0.503382 overall AUC: 0.503591
[12:31:21] Finish fitting LR on min pooling data. △M: +0B. △T: 4.0 minutes.
online AUC: 0.502510 overall AUC: 0.502082


In [8]:
# "topic1": in the run recorded below binary reaches online AUC 0.5375,
# the pooled embeddings 0.5318-0.5354.
run_lr_on_embedding("topic1", version_no=1)

Start 'topic1'
[13:53:17] Finish splitting train/valid set. △M: +94.91MB. △T: 7.6 seconds.
Training Set Size: (7039050, 10001)
Validation Set Size: (1759764, 10001)
[13:58:05] Finish fitting LR on binary data. △M: +33.7MB. △T: 4.8 minutes.
online AUC: 0.537458 overall AUC: 0.538209




[14:06:24] Finish fitting LR on avg pooling data. △M: -25.86MB. △T: 8.3 minutes.
online AUC: 0.535427 overall AUC: 0.540196
[14:18:37] Finish fitting LR on max pooling data. △M: -16.4MB. △T: 12.2 minutes.
online AUC: 0.534128 overall AUC: 0.538042
[14:36:39] Finish fitting LR on min pooling data. △M: -16.26MB. △T: 18.0 minutes.
online AUC: 0.531775 overall AUC: 0.535339


In [9]:
# "topic2": in the run recorded below binary reaches online AUC 0.6315,
# the pooled embeddings 0.6007-0.6100.
run_lr_on_embedding("topic2", version_no=1)

Start 'topic2'
[15:18:38] Finish splitting train/valid set. △M: +93.26MB. △T: 8.2 seconds.
Training Set Size: (7039050, 9980)
Validation Set Size: (1759764, 9980)
[15:22:58] Finish fitting LR on binary data. △M: +33.69MB. △T: 4.3 minutes.
online AUC: 0.631544 overall AUC: 0.617089




[15:34:27] Finish fitting LR on avg pooling data. △M: -14.4MB. △T: 11.4 minutes.
online AUC: 0.610037 overall AUC: 0.593369
[15:43:58] Finish fitting LR on max pooling data. △M: -13.85MB. △T: 9.5 minutes.
online AUC: 0.603722 overall AUC: 0.587165
[15:54:00] Finish fitting LR on min pooling data. △M: -13.92MB. △T: 10.0 minutes.
online AUC: 0.600704 overall AUC: 0.583861


In [10]:
# "topic3": in the run recorded below every feature set stays near chance
# (online AUC 0.5025-0.5038).
run_lr_on_embedding("topic3", version_no=1)

Start 'topic3'
[16:39:10] Finish splitting train/valid set. △M: +119.61MB. △T: 6.8 seconds.
Training Set Size: (7039050, 5873)
Validation Set Size: (1759764, 5873)
[16:40:56] Finish fitting LR on binary data. △M: +26.98MB. △T: 1.8 minutes.
online AUC: 0.503807 overall AUC: 0.503117




[16:54:50] Finish fitting LR on avg pooling data. △M: +26.98MB. △T: 13.9 minutes.
online AUC: 0.503207 overall AUC: 0.502456
[17:10:39] Finish fitting LR on max pooling data. △M: +0B. △T: 15.8 minutes.
online AUC: 0.502682 overall AUC: 0.503196
[17:24:21] Finish fitting LR on min pooling data. △M: +0B. △T: 13.7 minutes.
online AUC: 0.502532 overall AUC: 0.503124


In [11]:
# "appIdInstall": in the run recorded below every feature set stays near chance
# (online AUC 0.5012-0.5019).
run_lr_on_embedding("appIdInstall", version_no=1)

Start 'appIdInstall'
[19:22:30] Finish splitting train/valid set. △M: +105.51MB. △T: 4.1 seconds.
Training Set Size: (7039050, 64856)
Validation Set Size: (1759764, 64856)
[19:25:41] Finish fitting LR on binary data. △M: +128.0KB. △T: 3.2 minutes.
online AUC: 0.501202 overall AUC: 0.500495




[19:38:41] Finish fitting LR on avg pooling data. △M: +13.55MB. △T: 13.0 minutes.
online AUC: 0.501739 overall AUC: 0.502227
[19:53:28] Finish fitting LR on max pooling data. △M: +26.85MB. △T: 14.8 minutes.
online AUC: 0.501868 overall AUC: 0.502233
[20:03:39] Finish fitting LR on min pooling data. △M: +0B. △T: 10.2 minutes.
online AUC: 0.501573 overall AUC: 0.502056


In [12]:
# "appIdAction": in the run recorded below every feature set stays near chance
# (online AUC 0.5006-0.5022).
run_lr_on_embedding("appIdAction", version_no=1)

Start 'appIdAction'
[20:04:15] Finish splitting train/valid set. △M: +119.92MB. △T: 5.5 seconds.
Training Set Size: (7039050, 6215)
Validation Set Size: (1759764, 6215)
[20:05:31] Finish fitting LR on binary data. △M: +57.54MB. △T: 1.3 minutes.
online AUC: 0.502193 overall AUC: 0.501611




[20:18:46] Finish fitting LR on avg pooling data. △M: +26.98MB. △T: 13.2 minutes.
online AUC: 0.500572 overall AUC: 0.500448
[20:31:52] Finish fitting LR on max pooling data. △M: +0B. △T: 13.1 minutes.
online AUC: 0.501209 overall AUC: 0.501049




[20:45:16] Finish fitting LR on min pooling data. △M: +0B. △T: 13.4 minutes.
online AUC: 0.501477 overall AUC: 0.501299
