In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import time
import gc
import os
import sys
sys.path.append('../code/pipeline/')
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# Build the loader for the ("user", "tfidf") data and wrap it in a DataUnion;
# its load() call returns the column names together with the feature matrix.
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("user", "tfidf")
union_loader = dp.DataUnion(bin_loader)

# Load the training matrix and store it as CSR (rows are sliced per fold later).
with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()
print(
    "Train Data Shape: {}".format(X_tv.shape),
    "Train Column Numbers: {}".format(len(cols_train)),
    sep="\n",
)

# Labels come from the raw training data and are remapped for the classifier.
df_train = du.load_raw_data("train")
raw_labels = df_train['label'].values.copy()
y = (raw_labels + 1) / 2  # -1, 1 -> 0, 1 (true division, so the result is float)

[06:10:25] Finish loading training data. △M: +4.75GB. △T: 5.9 seconds.
Train Data Shape: (8798814, 419220)
Train Column Numbers: 419220


In [3]:
# Load the "test2" matrix through the same loader used for the training data
# and store it as CSR.
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test2")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()
print("Test Data Shape: {}".format(X_test.shape))
print("Test Column Numbers: {}".format(len(cols_test)))

# The model is fit on X_tv and then applied to X_test, so both matrices must
# expose the same features in the same order. Fail fast here instead of
# producing silently misaligned predictions after hours of training.
assert X_test.shape[1] == X_tv.shape[1], "train/test feature counts differ"
assert list(cols_test) == list(cols_train), "train/test feature columns differ"

[06:11:33] Finish loading testing data. △M: +1.24GB. △T: 2.4 seconds.
Test Data Shape: (2265879, 419220)
Test Column Numbers: 419220


In [4]:
# Load the raw "test2" data.
# NOTE(review): df_test is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping this load.
df_test = du.load_raw_data("test2")

In [9]:
# Number of stratified folds used for the out-of-fold predictions.
n_splits = 2  # 3? 5? Don't know which one will be better

# shuffle is left at its default (False), so the folds are already
# deterministic and a seed has no effect. Passing random_state without
# shuffle=True raises ValueError on scikit-learn >= 0.24, so it is omitted;
# the resulting folds are the same as before.
skf = StratifiedKFold(n_splits=n_splits)

# Materialise the (train_index, valid_index) pairs once so they can be reused.
split_indices = list(skf.split(df_train, y))

In [10]:
# Buffers filled by the cross-validation loop:
#   stack_tv   - one out-of-fold prediction per training row
#   stack_test - one column of test predictions per fold
#   scores     - one validation AUC per fold
n_tv_rows, n_test_rows = X_tv.shape[0], X_test.shape[0]
stack_tv = np.zeros(n_tv_rows)
stack_test = np.zeros((n_test_rows, n_splits))
scores = np.zeros(n_splits)

In [11]:
# For every fold: fit LR on the training part, then store the validation
# predictions (out-of-fold), the test predictions and the validation AUC.
for fold, (train_index, valid_index) in enumerate(split_indices):
    # carve the fold's train/valid rows out of the full training matrix
    X_train, X_valid = X_tv[train_index], X_tv[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    with pu.profiler("fitting LR (fold {}/{})".format(fold + 1, n_splits)):
        # regularisation is left at the defaults: penalty='l2' and C=1
        lr = LogisticRegression(solver="newton-cg", n_jobs=-1)
        lr.fit(X_train, y_train)

    # keep column 1 of predict_proba, i.e. the probability of label 1
    proba_valid = lr.predict_proba(X_valid)[:, 1]
    proba_test = lr.predict_proba(X_test)[:, 1]

    # out-of-fold rows go into the train stack, test predictions into this fold's column
    stack_tv[valid_index] = proba_valid
    stack_test[:, fold] = proba_test

    # validation AUC for this fold
    scores[fold] = metrics.roc_auc_score(y_valid, proba_valid)



[08:23:26] Finish fitting LR (fold 1/2). △M: -7.44GB. △T: 1.3 hours.




[10:02:36] Finish fitting LR (fold 2/2). △M: +8.0KB. △T: 1.6 hours.


In [13]:
# Summarise the per-fold validation AUCs as mean and standard deviation.
auc_mean, auc_std = scores.mean(), scores.std()
print("Overall AUC: {:.6f}(+/-{:.3g})".format(auc_mean, auc_std))

Overall AUC: 0.659131(+/-0.000112)


In [32]:
# Persist the stacking feature as (column names, float32 column vector) pickles,
# one file for the training rows and one for the test rows.
out_folder = os.path.join(config.DATA_DIR, "stacking/lr")
os.makedirs(out_folder, exist_ok=True)
col_names = ['stackProba_LR_userTfIdf']

# Training rows: the out-of-fold probabilities collected during cross-validation.
out_file = "train.userTfIdf.pkl"
out_path = os.path.join(out_folder, out_file)
with pu.profiler("saving train prediction"):
    data_tv = stack_tv.astype(np.float32).reshape((-1, 1))
    du.save_pickle((col_names, data_tv), out_path)
    gc.collect()
print("Saved Shape: {}".format(data_tv.shape))

# Test rows: average the per-fold predictions into a single column.
out_file = "test2.userTfIdf.pkl"
out_path = os.path.join(out_folder, out_file)
with pu.profiler("saving test prediction"):
    fold_average = np.mean(stack_test, axis=1)
    data_test = fold_average.astype(np.float32).reshape((-1, 1))
    du.save_pickle((col_names, data_test), out_path)
    gc.collect()
print("Saved Shape: {}".format(data_test.shape))

[11:00:54] Finish saving train prediction. △M: -3.73GB. △T: 1.0 seconds.
Saved Shape: (8798814, 1)
[11:00:54] Finish saving test prediction. △M: +0B. △T: 0.1 seconds.
Saved Shape: (2265879, 1)
