In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import eval_utils as eu
import data_utils as du
import perf_utils as pu
import io_utils as iu
import config

In [2]:
# load multiple data and stack them together
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")
lrstack_loader = dm.build_data("stacking", "lrCrossBinary_v1")
union_loader = dp.DataUnion(bin_loader, lrstack_loader)

with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()

print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

[16:22:53] Finish loading training data. △M: +5.45GB. △T: 40.6 seconds.
Train Data Shape: (8798814, 419705)
Train Column Numbers: 419705


In [3]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [4]:
n_splits = 3
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

# n_splits = 3  # use 3 instead of 5 to save time
# skf = StratifiedKFold(n_splits=n_splits)
# split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [5]:
aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    aids_train, aids_valid = aids[train_index], aids[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    
    del X_tv
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[16:23:28] Finish splitting train/valid set. △M: +124.67MB. △T: 25.7 seconds.
Training Set Size: (5865876, 419705)
Validation Set Size: (2932938, 419705)


In [6]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols_train)
    gc.collect()

[16:23:32] Finish preparing LightGBM data. △M: +5.41GB. △T: 3.9 seconds.


In [7]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0515/')
log_file = 'v1.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [8]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 500
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)

with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.66336	train's online_auc: 0.658006	valid1's auc: 0.662242	valid1's online_auc: 0.656208
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.680743	train's online_auc: 0.674956	valid1's auc: 0.679551	valid1's online_auc: 0.672782
[3]	train's auc: 0.684775	train's online_auc: 0.678691	valid1's auc: 0.683821	valid1's online_auc: 0.676296
[4]	train's auc: 0.686443	train's online_auc: 0.680956	valid1's auc: 0.685714	valid1's online_auc: 0.678673
[5]	train's auc: 0.687421	train's online_auc: 0.682572	valid1's auc: 0.686549	valid1's online_auc: 0.68016
[6]	train's auc: 0.688934	train's online_auc: 0.684494	valid1's auc: 0.687904	valid1's online_auc: 0.681663
[7]	train's auc: 0.690613	train's online_auc: 0.686775	valid1's auc: 0.689355	valid1's online_auc: 0.683724
[8]	train's auc: 0.691211	train's online_auc: 0.687568	valid1's auc: 0.69003	valid1's online_auc: 0.684441
[9]	train's auc: 0.693951	train's online_auc: 0.689677	valid1's auc: 0.692709

[76]	train's auc: 0.742494	train's online_auc: 0.741128	valid1's auc: 0.728728	valid1's online_auc: 0.723624
[77]	train's auc: 0.742853	train's online_auc: 0.741527	valid1's auc: 0.728808	valid1's online_auc: 0.723727
[78]	train's auc: 0.74319	train's online_auc: 0.741962	valid1's auc: 0.728888	valid1's online_auc: 0.723824
[79]	train's auc: 0.743504	train's online_auc: 0.742196	valid1's auc: 0.729031	valid1's online_auc: 0.723957
[80]	train's auc: 0.743871	train's online_auc: 0.742552	valid1's auc: 0.729118	valid1's online_auc: 0.724022
[81]	train's auc: 0.744124	train's online_auc: 0.742909	valid1's auc: 0.729211	valid1's online_auc: 0.724115
[82]	train's auc: 0.744512	train's online_auc: 0.743386	valid1's auc: 0.72937	valid1's online_auc: 0.724261
[83]	train's auc: 0.74499	train's online_auc: 0.74391	valid1's auc: 0.729579	valid1's online_auc: 0.724495
[84]	train's auc: 0.745299	train's online_auc: 0.744313	valid1's auc: 0.729637	valid1's online_auc: 0.72453
[85]	train's auc: 0.7457

[152]	train's auc: 0.764644	train's online_auc: 0.766553	valid1's auc: 0.734187	valid1's online_auc: 0.728727
[153]	train's auc: 0.764825	train's online_auc: 0.766774	valid1's auc: 0.734218	valid1's online_auc: 0.728744
[154]	train's auc: 0.765118	train's online_auc: 0.767014	valid1's auc: 0.734278	valid1's online_auc: 0.728808
[155]	train's auc: 0.765377	train's online_auc: 0.767387	valid1's auc: 0.73434	valid1's online_auc: 0.728885
[156]	train's auc: 0.765566	train's online_auc: 0.767626	valid1's auc: 0.734362	valid1's online_auc: 0.728916
[157]	train's auc: 0.765895	train's online_auc: 0.767977	valid1's auc: 0.734392	valid1's online_auc: 0.728917
[158]	train's auc: 0.766111	train's online_auc: 0.768223	valid1's auc: 0.734393	valid1's online_auc: 0.728899
[159]	train's auc: 0.766352	train's online_auc: 0.768559	valid1's auc: 0.734398	valid1's online_auc: 0.728888
[160]	train's auc: 0.766555	train's online_auc: 0.768763	valid1's auc: 0.734385	valid1's online_auc: 0.728921
[161]	train

[227]	train's auc: 0.779901	train's online_auc: 0.784946	valid1's auc: 0.735505	valid1's online_auc: 0.73001
[228]	train's auc: 0.780016	train's online_auc: 0.785048	valid1's auc: 0.735544	valid1's online_auc: 0.730029
[229]	train's auc: 0.78028	train's online_auc: 0.785616	valid1's auc: 0.735548	valid1's online_auc: 0.73007
[230]	train's auc: 0.780405	train's online_auc: 0.785706	valid1's auc: 0.735537	valid1's online_auc: 0.730048
[231]	train's auc: 0.780529	train's online_auc: 0.785824	valid1's auc: 0.735551	valid1's online_auc: 0.730038
[232]	train's auc: 0.780717	train's online_auc: 0.786015	valid1's auc: 0.735537	valid1's online_auc: 0.730034
[233]	train's auc: 0.780838	train's online_auc: 0.786141	valid1's auc: 0.735542	valid1's online_auc: 0.730054
[234]	train's auc: 0.781018	train's online_auc: 0.786311	valid1's auc: 0.735553	valid1's online_auc: 0.730066
[235]	train's auc: 0.781239	train's online_auc: 0.786548	valid1's auc: 0.735602	valid1's online_auc: 0.730121
[236]	train's

In [9]:
# log_file = 'v1.feature_importance.csv'
# log_path = os.path.join(log_folder, log_file)

# df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
# df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
# df_feature_importance.to_csv(log_path, index=False)
# df_feature_importance.head(30)

In [13]:
# df_valid = df_train.iloc[valid_index]
proba_valid = lgbm.predict(X_valid)
df_score = eu.online_auc(aids_valid, y_valid, proba_valid, ret_verbose=True)
df_score = df_score.sort_values("auc", ascending=False)
df_score

Unnamed: 0,aid,auc
161,2054,0.936592
126,1672,0.932723
44,613,0.930529
39,519,0.929706
54,725,0.916698
149,1957,0.914612
48,671,0.911197
27,336,0.907132
145,1930,0.906762
35,450,0.905588


In [14]:
online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

Online AUC: 0.730226
Simple AUC: 0.735670


In [15]:
log_file = 'v1.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score.to_csv(log_path, index=False)

In [16]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    del X_train
    del X_valid
    gc.collect()

[03:40:03] Finish cleaning memory. △M: -11.97GB. △T: 1.4 seconds.


In [17]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()

print("Train Data Shape: {}".format(X_test.shape))
print("Train Column Numbers: {}".format(len(cols_test)))

[03:40:26] Finish loading testing data. △M: +1.41GB. △T: 8.4 seconds.
Train Data Shape: (2265989, 419705)
Train Column Numbers: 419705


In [18]:
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[03:40:50] Finish making prediction on testing set. △M: +0B. △T: 23.3 seconds.


In [19]:
subm_folder = '../../../subm/lgbm/0515_v1'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)