In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import io_utils as iu
import config

In [2]:
# Load several feature sets for the training split and combine them into one matrix.
# NOTE(review): DataManager.build_data and DataUnion are project helpers from
# data_pipeline that are not visible here — presumably the union stacks the
# "raw/binary" features and the "stacking/lrSingleFeature_v1" features
# column-wise; confirm against data_pipeline.
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")
lrstack_loader = dm.build_data("stacking", "lrSingleFeature_v1")
union_loader = dp.DataUnion(bin_loader, lrstack_loader)

with pu.profiler("loading training data"):
    # load() returns the column names together with the feature matrix
    cols_train, X_tv = union_loader.load("train")
    # convert to CSR; rows of this matrix are selected by index in a later cell
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()

# sanity output: the number of column names should match the matrix width
print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

[06:14:06] Finish loading training data. △M: +5.65GB. △T: 41.2 seconds.
Train Data Shape: (8798814, 419708)
Train Column Numbers: 419708


In [3]:
# Labels are stored as -1 / +1; map them onto 0 / 1.
# The arithmetic allocates a fresh array, so the raw frame's column is left untouched.
df_train = du.load_raw_data("train")
y = (df_train['label'].values + 1) / 2

In [4]:
# Stratified shuffle splits: each split holds out one third of the rows for validation.
n_splits = 3
sss = StratifiedShuffleSplit(
    n_splits=n_splits,
    test_size=1 / 3,
    random_state=20180505,  # fixed seed for reproducibility
)
# materialise the (train_index, valid_index) pairs so they can be indexed later
split_indices = list(sss.split(df_train, y))

# Disabled alternative: stratified K-fold instead of shuffle splits.
# n_splits = 3  # use 3 instead of 5 to save time
# skf = StratifiedKFold(n_splits=n_splits)
# split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [5]:
with pu.profiler("splitting train/valid set"):
    # only the first pre-computed split is used
    train_index, valid_index = split_indices[0]

    X_train = X_tv[train_index, :]
    X_valid = X_tv[valid_index, :]
    y_train = y[train_index]
    y_valid = y[valid_index]

    # the two parts together must account for every row of the full matrix
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]

    # the full matrix is no longer needed once it has been partitioned
    del X_tv
    gc.collect()

print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[06:14:40] Finish splitting train/valid set. △M: +79.92MB. △T: 25.4 seconds.
Training Set Size: (5865876, 419708)
Validation Set Size: (2932938, 419708)


In [7]:
with pu.profiler("preparing LightGBM data"):
    # both matrices are cast to float32 and share the training column names
    lgb_train = lgb.Dataset(
        X_train.astype(np.float32),
        label=y_train,
        feature_name=cols_train,
    )
    lgb_valid = lgb.Dataset(
        X_valid.astype(np.float32),
        label=y_valid,
        feature_name=cols_train,
    )
    gc.collect()

[06:15:14] Finish preparing LightGBM data. △M: +5.58GB. △T: 4.1 seconds.


In [8]:
# Destination of the training log for this run; the folder is created on demand.
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0507/')
os.makedirs(log_folder, exist_ok=True)

log_file = 'v1.log'
log_path = os.path.join(log_folder, log_file)

In [9]:
# Hyper-parameters passed to LightGBM: GBDT boosting, binary objective, AUC as the evaluation metric.
# NOTE(review): the LightGBM parameter documentation states that bagging_fraction
# only takes effect together with a non-zero bagging_freq, which is not set
# here — confirm whether row subsampling was intended.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 500  # upper bound on boosting rounds; early stopping may end training sooner

# NOTE(review): iu.DuplicatedLogger is a project helper not visible here —
# presumably it mirrors the training output into log_path; confirm.
with iu.DuplicatedLogger(log_path):
    # AUC is reported for both the training set ('train') and the hold-out set ('valid1')
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     early_stopping_rounds=50)  # stop after 50 rounds without validation improvement

[1]	train's auc: 0.678105	valid1's auc: 0.675381
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.686858	valid1's auc: 0.684212
[3]	train's auc: 0.689127	valid1's auc: 0.686424
[4]	train's auc: 0.689874	valid1's auc: 0.6872
[5]	train's auc: 0.690997	valid1's auc: 0.688306
[6]	train's auc: 0.692329	valid1's auc: 0.689584
[7]	train's auc: 0.692651	valid1's auc: 0.689843
[8]	train's auc: 0.693967	valid1's auc: 0.691168
[9]	train's auc: 0.694863	valid1's auc: 0.691929
[10]	train's auc: 0.6962	valid1's auc: 0.693234
[11]	train's auc: 0.697218	valid1's auc: 0.69427
[12]	train's auc: 0.697747	valid1's auc: 0.694756
[13]	train's auc: 0.698762	valid1's auc: 0.695729
[14]	train's auc: 0.701325	valid1's auc: 0.698184
[15]	train's auc: 0.701986	valid1's auc: 0.698832
[16]	train's auc: 0.703704	valid1's auc: 0.700516
[17]	train's auc: 0.70449	valid1's auc: 0.701207
[18]	train's auc: 0.705114	valid1's auc: 0.701784
[19]	train's auc: 0.706062	valid1's auc: 0.702574
[20

[164]	train's auc: 0.768196	valid1's auc: 0.734214
[165]	train's auc: 0.768401	valid1's auc: 0.734212
[166]	train's auc: 0.768594	valid1's auc: 0.734191
[167]	train's auc: 0.768742	valid1's auc: 0.734187
[168]	train's auc: 0.768808	valid1's auc: 0.73418
[169]	train's auc: 0.769035	valid1's auc: 0.734184
[170]	train's auc: 0.769203	valid1's auc: 0.734172
[171]	train's auc: 0.76936	valid1's auc: 0.73418
[172]	train's auc: 0.769502	valid1's auc: 0.734167
[173]	train's auc: 0.769711	valid1's auc: 0.734145
[174]	train's auc: 0.769798	valid1's auc: 0.734125
[175]	train's auc: 0.769843	valid1's auc: 0.734125
[176]	train's auc: 0.770017	valid1's auc: 0.734168
[177]	train's auc: 0.770287	valid1's auc: 0.734181
[178]	train's auc: 0.770773	valid1's auc: 0.734388
[179]	train's auc: 0.771194	valid1's auc: 0.73461
[180]	train's auc: 0.771403	valid1's auc: 0.734601
[181]	train's auc: 0.771582	valid1's auc: 0.7346
[182]	train's auc: 0.771865	valid1's auc: 0.734611
[183]	train's auc: 0.772013	valid1's 

In [11]:
# Write the per-feature importance table (most important first) next to the
# training log and preview the top rows.
log_file = 'v1.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame(
    {"feature": cols_train, "importance": lgbm.feature_importance()}
).sort_values("importance", ascending=False)

df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419706,stackProba_LR_kw2,999
419707,stackProba_LR_topic2,857
419703,stackProba_LR_interest2,688
419705,stackProba_LR_kw1,633
419702,stackProba_LR_interest1,593
419704,stackProba_LR_interest5,554
419701,stackProba_LR_age,469
419614,creativeSize_59,217
419699,productType_9,145
6,gender_2,134


In [12]:
def online_auc(selector, y_true, y_pred, ret_verbose=False):
    """Compute ROC AUC separately for every distinct value of ``selector``.

    Parameters
    ----------
    selector : array-like of shape (n_samples,)
        Group key of each sample. Must not contain missing values. Keys may be
        numeric or non-numeric (e.g. strings).
    y_true : array-like of shape (n_samples,)
        Ground-truth labels.
    y_pred : array-like of shape (n_samples,)
        Predicted scores.
    ret_verbose : bool, default False
        If True, return the per-group table instead of the averaged score.

    Returns
    -------
    float or pandas.DataFrame
        The unweighted mean of the per-group AUCs, or, when ``ret_verbose`` is
        True, a DataFrame with columns ``selector`` and ``auc``.

    Raises
    ------
    AssertionError
        If the inputs differ in length or ``selector`` has missing values.

    Notes
    -----
    NOTE(review): each group is scored with ``metrics.roc_auc_score``; AUC is
    not defined for a group containing a single class, so such groups are
    expected to make that call fail — confirm against the scikit-learn version
    in use.
    """
    # Work on plain arrays so that masking is always positional, regardless of
    # whether the caller passed pandas objects carrying a non-default index.
    selector = np.asarray(selector)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    assert selector.shape[0] == y_true.shape[0]
    assert selector.shape[0] == y_pred.shape[0]
    # pd.isnull handles object/string keys, for which np.isnan raises TypeError
    assert pd.isnull(selector).sum() == 0

    select_vals = np.unique(selector)
    aucs = np.zeros(len(select_vals))

    for i, select_val in enumerate(select_vals):
        mask = (selector == select_val)  # computed once, reused for both arrays
        aucs[i] = metrics.roc_auc_score(y_true[mask], y_pred[mask])

    if ret_verbose:
        return pd.DataFrame({"selector": select_vals, "auc": aucs})
    return aucs.mean()

In [16]:
# Both inputs are defined here so that the cell works on a fresh kernel
# (Restart & Run All) instead of relying on variables left over from cells
# that no longer exist in the notebook.
df_valid = df_train.iloc[valid_index]  # raw rows of the validation split (provides 'aid')
proba_valid = lgbm.predict(X_valid)    # model scores for the validation rows

# per-aid AUC table
df_score = online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
df_score

Unnamed: 0,auc,selector
0,0.657481,6
1,0.815082,7
2,0.856212,12
3,0.577631,18
4,0.863244,70
5,0.672458,74
6,0.614513,86
7,0.780009,98
8,0.533170,113
9,0.657214,117


In [17]:
# Average of the per-aid AUCs versus the AUC over the whole validation set.
# The result is stored under its own name: assigning it to `online_auc` would
# overwrite the `online_auc` function and break any re-run of the scoring cell.
online_auc_score = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc_score))
print("Simple AUC: {:.6f}".format(simple_auc))

Online AUC: 0.728380
Simple AUC: 0.735078


In [18]:
# Store the per-aid AUC table with the group column named 'aid' and placed first.
log_file = 'v1.online_auc.csv'
log_path = os.path.join(log_folder, log_file)

df_score = df_score.rename(columns={'selector': 'aid'})[['aid', 'auc']]
df_score.to_csv(log_path, index=False)

In [19]:
with pu.profiler("cleaning memory"):
    # drop the training/validation matrices and their LightGBM datasets
    del lgb_train, lgb_valid, X_train, X_valid
    gc.collect()

[06:50:57] Finish cleaning memory. △M: -12.17GB. △T: 1.6 seconds.


In [20]:
with pu.profiler("loading testing data"):
    # same loader union as for training, applied to the "test1" split
    cols_test, X_test = union_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()

# The model was fitted on cols_train; predictions are only meaningful when the
# test matrix exposes the same features in the same order.
assert list(cols_test) == list(cols_train), "test features are not aligned with the training features"

# labels previously said "Train" although the test matrix is being described
print("Test Data Shape: {}".format(X_test.shape))
print("Test Column Numbers: {}".format(len(cols_test)))

[06:52:33] Finish loading testing data. △M: +759.56MB. △T: 9.6 seconds.
Train Data Shape: (2265989, 419708)
Train Column Numbers: 419708


In [21]:
# NOTE(review): the features were loaded from the "test1" split while the raw
# frame is loaded as "test" — presumably the same rows; the row-count assertion
# below is the only check made here.
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    # expect one score per test row, as a flat vector
    assert proba_test.ndim == 1
    assert len(proba_test) == len(df_test)

[06:53:43] Finish making prediction on testing set. △M: +464.0KB. △T: 22.0 seconds.


In [22]:
# Write the submission: the raw test rows plus a 'score' column holding the predictions.
subm_folder = '../../../subm/lgbm/0507_v1'
os.makedirs(subm_folder, exist_ok=True)

subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)

# assign() works on a copy, so df_test itself is left unchanged
subm = df_test.assign(score=proba_test)
subm.to_csv(subm_path, index=False)