In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from glove import Glove
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append("../../../code/utils/")
sys.path.append("../../../code/analysis/")
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code')
import data_pipeline as dp
import data_jointer as dj
import eval_utils as eu
import data_utils as du
import perf_utils as pu
import config

In [2]:
embedding_folder = os.path.join(config.DATA_DIR, "embedding")


def embedding_path(feat_name, pooling="avg", version_no=1, dataset="train"):
    """Build the path of one pickled embedding matrix under `embedding_folder`.

    The path has the form
    ``<embedding_folder>/[featureName='<feat_name>']/<dataset>.<pooling>_v<version_no>.pkl``.

    Parameters
    ----------
    feat_name : value formatted into the per-feature sub-folder name.
    pooling : pooling tag formatted into the file name (default "avg").
    version_no : version tag formatted into the file name (default 1).
    dataset : dataset prefix of the file name (default "train").

    Returns
    -------
    str
        The joined path; nothing is checked on or read from disk.
    """
    file_name = "{}.{}_v{}.pkl".format(dataset, pooling, version_no)
    feature_dir = "[featureName='{}']".format(feat_name)
    return os.path.join(embedding_folder, feature_dir, file_name)

In [3]:
# Features whose embeddings are stacked in the cells below.
use_feats = ["interest2", "kw2", "topic2"]

df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
y = df_train["label"].values.copy()
y = (y + 1) / 2  # 1/-1 to 1/0

n_splits = 2  # 3? 5? Don't know which one will be better
# No shuffling is requested, so the fold assignment is already deterministic.
# `random_state` is ignored when shuffle=False, and newer scikit-learn releases
# raise a ValueError for that combination, so it is not passed here. The folds
# are identical to what `random_state=2018` without shuffling used to produce.
skf = StratifiedKFold(n_splits=n_splits)
# Materialise the (train_index, valid_index) pairs once so every embedding is
# evaluated on exactly the same folds.
split_indices = list(skf.split(df_train, y))

In [8]:
df_stack_tv = pd.DataFrame()
df_stack_test = pd.DataFrame()
score_rows = []  # one record per (feature, pooling) pair; assembled into df_score after the loop
version_no = 1


def _fit_lr_out_of_fold(X_tv, y, X_test, split_indices):
    """Fit one LogisticRegression per fold and collect the stacking predictions.

    Parameters
    ----------
    X_tv : matrix exposing ``.shape`` and accepting an integer index array
        along axis 0 (rows are selected with ``X_tv[index]``).
    y : 1-D label array aligned with the rows of ``X_tv``.
    X_test : matrix passed as-is to ``predict_proba`` of every fold model.
    split_indices : sequence of ``(train_index, valid_index)`` pairs.

    Returns
    -------
    stack_tv : ndarray of shape (X_tv.shape[0],)
        Column 1 of ``predict_proba`` for each row, written by the fold in
        which that row was in the validation part (rows never used for
        validation stay 0).
    stack_test : ndarray of shape (X_test.shape[0], n_folds)
        Column 1 of ``predict_proba`` on ``X_test``, one column per fold model.
    scores : ndarray of shape (n_folds,)
        Validation ROC-AUC of each fold.
    """
    n_folds = len(split_indices)
    stack_tv = np.zeros(X_tv.shape[0])
    stack_test = np.zeros((X_test.shape[0], n_folds))
    scores = np.zeros(n_folds)

    for i, (train_index, valid_index) in enumerate(split_indices):
        # split train/valid sets
        X_train, y_train = X_tv[train_index], y[train_index]
        X_valid, y_valid = X_tv[valid_index], y[valid_index]

        # fit LR
        with pu.profiler("fitting LR (fold {}/{})".format(i + 1, n_folds)):
            lr = LogisticRegression(solver="newton-cg", n_jobs=-1)  # use default setting: penalty='l2' and C=1
            lr.fit(X_train, y_train)

        # out-of-fold prediction for the validation rows
        proba_valid = lr.predict_proba(X_valid)[:, 1]
        stack_tv[valid_index] = proba_valid

        # prediction for the testing set from this fold's model
        stack_test[:, i] = lr.predict_proba(X_test)[:, 1]

        # validation score of this fold
        scores[i] = metrics.roc_auc_score(y_valid, proba_valid)

    # Fold-local slices and the fitted model go out of scope here, so they do
    # not linger in the notebook namespace between embeddings.
    return stack_tv, stack_test, scores


for feat_name in use_feats:
    folder = os.path.join(embedding_folder, "[featureName='{}']".format(feat_name))
    data_manager = dp.DataManager(folder)

    for pooling in ["avg", "min"]:
        ### given a feature name and a pooling scheme ###
        # load matrix as input to model
        with pu.profiler("loading '{}' ({} pooling)".format(feat_name, pooling)):
            emb_loader = data_manager.build_data("{}_v{}".format(pooling, version_no))
            cols, X_tv = emb_loader.load("train")
            _, X_test = emb_loader.load("test2")

        stack_tv, stack_test, scores = _fit_lr_out_of_fold(X_tv, y, X_test, split_indices)

        # update containers for stacking
        emb_name = "{}_emb_{}Pooling".format(feat_name, pooling)
        col_name = "stackProba_{}".format(emb_name)
        score_row = {"featureName": emb_name,
                     "auc_mean": scores.mean(),
                     "auc_std": scores.std()}
        df_stack_tv[col_name] = stack_tv
        # the test-set feature is the average over the per-fold models
        df_stack_test[col_name] = stack_test.mean(axis=1)
        score_rows.append(score_row)
        print("AUC: {:.6f}(+/-{:.3g})".format(score_row["auc_mean"], score_row["auc_std"]))

        # release the embedding matrices before the next one is loaded
        del X_tv
        del X_test
        gc.collect()

# Build the score table in one go instead of enlarging a DataFrame row by row.
df_score = pd.DataFrame(score_rows, columns=["featureName", "auc_mean", "auc_std"])

[18:32:15] Finish loading 'interest2' (min pooling). △M: +4.0KB. △T: 1.6 seconds.




[18:40:59] Finish fitting LR (fold 1/2). △M: +16.0MB. △T: 8.7 minutes.




[18:51:26] Finish fitting LR (fold 2/2). △M: -16.0KB. △T: 10.4 minutes.
AUC: 0.578048(+/-0.000136)
[18:51:34] Finish loading 'kw2' (avg pooling). △M: +2.06GB. △T: 5.9 seconds.




[18:58:15] Finish fitting LR (fold 1/2). △M: -8.0KB. △T: 6.7 minutes.




[19:07:40] Finish fitting LR (fold 2/2). △M: -24.0KB. △T: 9.3 minutes.
AUC: 0.634801(+/-2.35e-05)
[19:07:48] Finish loading 'kw2' (min pooling). △M: +2.06GB. △T: 5.0 seconds.




[19:22:52] Finish fitting LR (fold 1/2). △M: -12.0KB. △T: 15.0 minutes.




[19:32:47] Finish fitting LR (fold 2/2). △M: -12.0KB. △T: 9.9 minutes.
AUC: 0.619814(+/-6.63e-05)
[19:32:55] Finish loading 'topic2' (avg pooling). △M: +2.06GB. △T: 5.2 seconds.




[19:41:43] Finish fitting LR (fold 1/2). △M: -8.0KB. △T: 8.8 minutes.




[19:50:09] Finish fitting LR (fold 2/2). △M: -24.0KB. △T: 8.4 minutes.
AUC: 0.595004(+/-0.000834)
[19:50:16] Finish loading 'topic2' (min pooling). △M: +2.06GB. △T: 4.9 seconds.




[20:02:09] Finish fitting LR (fold 1/2). △M: -16.0KB. △T: 11.8 minutes.




[20:11:17] Finish fitting LR (fold 2/2). △M: -24.0KB. △T: 9.1 minutes.
AUC: 0.587005(+/-0.000801)


In [9]:
# Sanity check: one row per sample and one column per (feature, pooling) pair.
for shape_template, stacked_frame in (
    ("Training Set Prediction Shape: {}", df_stack_tv),
    ("Testing Set Prediction Shape: {}", df_stack_test),
):
    print(shape_template.format(stacked_frame.shape))

Training Set Prediction Shape: (8798814, 6)
Testing Set Prediction Shape: (2265879, 6)


In [10]:
assert os.path.exists(config.DATA_DIR)
out_folder = os.path.join(config.DATA_DIR, 'stacking/lr')
os.makedirs(out_folder, exist_ok=True)

# Floats are written with a fixed 6-decimal format; unformatted floats make
# these CSV files very large on disk.
# The training-set predictions are written first, then the testing-set ones.
for out_file, stacked_frame in (
    ('train.embedding.csv', df_stack_tv),
    ('test2.embedding.csv', df_stack_test),
):
    out_path = os.path.join(out_folder, out_file)
    stacked_frame.to_csv(out_path, float_format="%.6f", index=False)

In [11]:
# Persist the per-embedding AUC summary, best mean AUC first.
# `out_folder` is the stacking output folder set by the preceding cell.
score_columns = ["featureName", "auc_mean", "auc_std"]
df_score = df_score[score_columns].sort_values("auc_mean", ascending=False)

out_file = 'score.embedding.csv'
out_path = os.path.join(out_folder, out_file)
df_score.to_csv(out_path, float_format="%.6f", index=False)

In [13]:
# Export the training-set stacking features as a (column names, float32 matrix) pickle.
out_folder = config.INPUT_DIR
os.makedirs(out_folder, exist_ok=True)
out_file = "train.stacking.embedding.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_train = df_stack_tv.values.astype(np.float32)
    # one row per training sample, two pooling schemes per feature
    assert X_train.shape[0] == df_train.shape[0]
    assert X_train.shape[1] == 2 * len(use_feats)

with pu.profiler("saving matrix to hard disk"):
    col_names = list(df_stack_tv.columns)
    du.save_pickle((col_names, X_train), out_path)
    # drop the float32 copy once it is on disk
    del X_train
    gc.collect()

[20:15:16] Finish getting matrix represenation. △M: +0B. △T: 0.2 seconds.
[20:15:16] Finish saving matrix to hard disk. △M: -201.16MB. △T: 0.6 seconds.


In [14]:
# Export the testing-set stacking features as a (column names, float32 matrix) pickle.
# `out_folder` is the one set by the preceding export cell.
out_file = "test2.stacking.embedding.pkl"
out_path = os.path.join(out_folder, out_file)

with pu.profiler("getting matrix represenation"):
    X_test = df_stack_test.values.astype(np.float32)
    # one row per testing sample, two pooling schemes per feature
    assert X_test.shape[0] == df_test.shape[0]
    assert X_test.shape[1] == 2 * len(use_feats)

with pu.profiler("saving matrix to hard disk"):
    col_names = list(df_stack_test.columns)
    du.save_pickle((col_names, X_test), out_path)
    # drop the float32 copy once it is on disk
    del X_test
    gc.collect()

[20:15:20] Finish getting matrix represenation. △M: +103.73MB. △T: 0.2 seconds.
[20:15:20] Finish saving matrix to hard disk. △M: +0B. △T: 0.2 seconds.
