In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
from itertools import compress
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# ================
# Data Preparation
# ================
# defined feature pairs to load cross product transformation
# pairs = [("aid", "age"), ("aid", "education"), ("aid", "consumptionAbility"), ("aid", "LBS")]
pairs = [('productId', 'LBS'),
         ('advertiserId', 'interest1'),
         ('aid', 'interest2'),
         ('creativeSize', 'interest2'), 
         ('campaignId', 'interest4'),  # whether to keep it? 
         ('aid', 'interest5'),  
         ('productType', 'kw1'),  # 'kw1' looks very overfitting prone, to be decide whether to keep it
         ('productType', 'kw2'),
         ('productType', 'kw3'),
         ('productType', 'topic1'),
         ('aid', 'topic2'),
         ('productType', 'topic2'),
         # ('productType', 'topic3'),  # might help in predicting negative samples
         # ('productType', 'appIdInstall'),  # might help in predicting negative samples
         # ('productType', 'appIdAction'),  # might help in predicting negative samples
         ('aid', 'ct'),
         ('aid', 'os')]

# instantiate data loader
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")
cross_bin_loaders = (dp.CrossBinaryDataManager.build_data(ad_feat_name, user_feat_name) 
                     for ad_feat_name, user_feat_name in pairs)
union_loader = dp.DataUnion(bin_loader, *cross_bin_loaders)

with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()

print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

[03:59:08] Finish loading training data. △M: +6.23GB. △T: 5.9 minutes.
Train Data Shape: (8798814, 3590073)
Train Column Numbers: 3590073


In [3]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [4]:
# n_splits = 3
# sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
# split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

n_splits = 5  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [6]:
aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    aids_train, aids_valid = aids[train_index], aids[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    
    del X_tv
    gc.collect()

print("Training Set Size (Before masking): {}".format(X_train.shape))
print("Validation Set Size (Before masking): {}".format(X_valid.shape))
print()
    
min_df = 3  # there is no such things for v2
with pu.profiler("trimming train/valid set"):    
    mask = (X_train.getnnz(axis=0) >= min_df)  # threshold to be extended
    cols_masked = list(compress(cols_train, mask))
    X_train = X_train[:, mask]
    X_valid = X_valid[:, mask]
    gc.collect()
    
print("Training Set Size (After masking): {}".format(X_train.shape))
print("Validation Set Size (After masking): {}".format(X_valid.shape))

[04:01:47] Finish splitting train/valid set. △M: +181.8MB. △T: 38.3 seconds.
Training Set Size (Before masking): (7039050, 3590073)
Validation Set Size (Before masking): (1759764, 3590073)

[04:04:19] Finish trimming train/valid set. △M: +40.53MB. △T: 2.5 minutes.
Training Set Size (After masking): (7039050, 1791142)
Validation Set Size (After masking): (1759764, 1791142)


In [8]:
with pu.profiler("preparing LightGBM data"):
    # lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols_train)  # cause bugs I don't know how to fix
    # lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols_train)
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)
    gc.collect()

[04:05:10] Finish preparing LightGBM data. △M: +9.45GB. △T: 9.3 seconds.


In [9]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0512/')
log_file = 'v1.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [11]:
# v2 parameters
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': 'auc',
#     'max_depth': 15,
#     'num_leaves': 120,
#     'learning_rate': 0.15,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'verbose': 0
# }
# v3 parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 6,
    'num_leaves': 64,
    'learning_rate': 0.1,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'verbose': 0
}
num_rounds = 5000
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)

with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.570279	train's online_auc: 0.583275	valid1's auc: 0.570255	valid1's online_auc: 0.583206
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.622847	train's online_auc: 0.622842	valid1's auc: 0.621829	valid1's online_auc: 0.621137
[3]	train's auc: 0.631631	train's online_auc: 0.632271	valid1's auc: 0.630437	valid1's online_auc: 0.631378
[4]	train's auc: 0.646026	train's online_auc: 0.640083	valid1's auc: 0.645401	valid1's online_auc: 0.640113
[5]	train's auc: 0.650872	train's online_auc: 0.64747	valid1's auc: 0.65005	valid1's online_auc: 0.64815
[6]	train's auc: 0.651037	train's online_auc: 0.647926	valid1's auc: 0.650157	valid1's online_auc: 0.648676
[7]	train's auc: 0.652834	train's online_auc: 0.651013	valid1's auc: 0.651913	valid1's online_auc: 0.652333
[8]	train's auc: 0.652544	train's online_auc: 0.651005	valid1's auc: 0.651652	valid1's online_auc: 0.652247
[9]	train's auc: 0.653752	train's online_auc: 0.650823	valid1's auc: 0.65294	

[77]	train's auc: 0.704712	train's online_auc: 0.700067	valid1's auc: 0.701894	valid1's online_auc: 0.697543
[78]	train's auc: 0.705092	train's online_auc: 0.700472	valid1's auc: 0.702225	valid1's online_auc: 0.697961
[79]	train's auc: 0.70526	train's online_auc: 0.700651	valid1's auc: 0.702388	valid1's online_auc: 0.698161
[80]	train's auc: 0.706061	train's online_auc: 0.701428	valid1's auc: 0.703141	valid1's online_auc: 0.698914
[81]	train's auc: 0.706341	train's online_auc: 0.701709	valid1's auc: 0.703376	valid1's online_auc: 0.699134
[82]	train's auc: 0.70659	train's online_auc: 0.701954	valid1's auc: 0.703501	valid1's online_auc: 0.699223
[83]	train's auc: 0.707161	train's online_auc: 0.70259	valid1's auc: 0.704001	valid1's online_auc: 0.699579
[84]	train's auc: 0.707372	train's online_auc: 0.702626	valid1's auc: 0.704181	valid1's online_auc: 0.699534
[85]	train's auc: 0.707672	train's online_auc: 0.702856	valid1's auc: 0.704418	valid1's online_auc: 0.699702
[86]	train's auc: 0.70

[152]	train's auc: 0.720169	train's online_auc: 0.714586	valid1's auc: 0.714812	valid1's online_auc: 0.708979
[153]	train's auc: 0.720265	train's online_auc: 0.714743	valid1's auc: 0.714855	valid1's online_auc: 0.708982
[154]	train's auc: 0.720364	train's online_auc: 0.714794	valid1's auc: 0.714921	valid1's online_auc: 0.709016
[155]	train's auc: 0.720456	train's online_auc: 0.714842	valid1's auc: 0.71501	valid1's online_auc: 0.70905
[156]	train's auc: 0.720508	train's online_auc: 0.714898	valid1's auc: 0.715057	valid1's online_auc: 0.709106
[157]	train's auc: 0.720556	train's online_auc: 0.714989	valid1's auc: 0.715107	valid1's online_auc: 0.709179
[158]	train's auc: 0.720694	train's online_auc: 0.715061	valid1's auc: 0.715199	valid1's online_auc: 0.709177
[159]	train's auc: 0.72078	train's online_auc: 0.715154	valid1's auc: 0.715254	valid1's online_auc: 0.709246
[160]	train's auc: 0.720926	train's online_auc: 0.715254	valid1's auc: 0.715367	valid1's online_auc: 0.709301
[161]	train's

[227]	train's auc: 0.728981	train's online_auc: 0.723913	valid1's auc: 0.721137	valid1's online_auc: 0.714081
[228]	train's auc: 0.729113	train's online_auc: 0.724283	valid1's auc: 0.721194	valid1's online_auc: 0.714215
[229]	train's auc: 0.729196	train's online_auc: 0.724322	valid1's auc: 0.721235	valid1's online_auc: 0.714225
[230]	train's auc: 0.729284	train's online_auc: 0.72465	valid1's auc: 0.721288	valid1's online_auc: 0.714368
[231]	train's auc: 0.729411	train's online_auc: 0.724824	valid1's auc: 0.72136	valid1's online_auc: 0.714501
[232]	train's auc: 0.729521	train's online_auc: 0.724921	valid1's auc: 0.721403	valid1's online_auc: 0.714529
[233]	train's auc: 0.729555	train's online_auc: 0.724965	valid1's auc: 0.721428	valid1's online_auc: 0.714551
[234]	train's auc: 0.729624	train's online_auc: 0.725038	valid1's auc: 0.721476	valid1's online_auc: 0.714564
[235]	train's auc: 0.729671	train's online_auc: 0.725097	valid1's auc: 0.721502	valid1's online_auc: 0.714611
[236]	train'

[302]	train's auc: 0.734828	train's online_auc: 0.731249	valid1's auc: 0.724703	valid1's online_auc: 0.71724
[303]	train's auc: 0.734896	train's online_auc: 0.731323	valid1's auc: 0.724753	valid1's online_auc: 0.717275
[304]	train's auc: 0.735007	train's online_auc: 0.731369	valid1's auc: 0.724814	valid1's online_auc: 0.71729
[305]	train's auc: 0.73505	train's online_auc: 0.731407	valid1's auc: 0.724842	valid1's online_auc: 0.717311
[306]	train's auc: 0.735154	train's online_auc: 0.731531	valid1's auc: 0.724903	valid1's online_auc: 0.717376
[307]	train's auc: 0.735272	train's online_auc: 0.731639	valid1's auc: 0.724967	valid1's online_auc: 0.717432
[308]	train's auc: 0.735287	train's online_auc: 0.731661	valid1's auc: 0.724978	valid1's online_auc: 0.717444
[309]	train's auc: 0.735316	train's online_auc: 0.731695	valid1's auc: 0.724996	valid1's online_auc: 0.717444
[310]	train's auc: 0.735389	train's online_auc: 0.731749	valid1's auc: 0.725038	valid1's online_auc: 0.717489
[311]	train's

[377]	train's auc: 0.739445	train's online_auc: 0.736698	valid1's auc: 0.727366	valid1's online_auc: 0.719525
[378]	train's auc: 0.739522	train's online_auc: 0.736897	valid1's auc: 0.727394	valid1's online_auc: 0.719583
[379]	train's auc: 0.739553	train's online_auc: 0.736942	valid1's auc: 0.727417	valid1's online_auc: 0.719627
[380]	train's auc: 0.739626	train's online_auc: 0.737072	valid1's auc: 0.727472	valid1's online_auc: 0.719688
[381]	train's auc: 0.739662	train's online_auc: 0.737086	valid1's auc: 0.727492	valid1's online_auc: 0.71972
[382]	train's auc: 0.739726	train's online_auc: 0.737132	valid1's auc: 0.727521	valid1's online_auc: 0.719733
[383]	train's auc: 0.739825	train's online_auc: 0.737196	valid1's auc: 0.727569	valid1's online_auc: 0.719742
[384]	train's auc: 0.739894	train's online_auc: 0.737366	valid1's auc: 0.727587	valid1's online_auc: 0.719736
[385]	train's auc: 0.739934	train's online_auc: 0.737411	valid1's auc: 0.727627	valid1's online_auc: 0.719752
[386]	train

[452]	train's auc: 0.743361	train's online_auc: 0.741205	valid1's auc: 0.729256	valid1's online_auc: 0.721121
[453]	train's auc: 0.743446	train's online_auc: 0.741305	valid1's auc: 0.729292	valid1's online_auc: 0.721155
[454]	train's auc: 0.743513	train's online_auc: 0.741367	valid1's auc: 0.729336	valid1's online_auc: 0.721191
[455]	train's auc: 0.743539	train's online_auc: 0.741389	valid1's auc: 0.729349	valid1's online_auc: 0.721204
[456]	train's auc: 0.743603	train's online_auc: 0.74151	valid1's auc: 0.729377	valid1's online_auc: 0.721239
[457]	train's auc: 0.743654	train's online_auc: 0.741559	valid1's auc: 0.729404	valid1's online_auc: 0.721272
[458]	train's auc: 0.74369	train's online_auc: 0.741598	valid1's auc: 0.729417	valid1's online_auc: 0.721293
[459]	train's auc: 0.743748	train's online_auc: 0.741715	valid1's auc: 0.729443	valid1's online_auc: 0.721353
[460]	train's auc: 0.743803	train's online_auc: 0.741768	valid1's auc: 0.729454	valid1's online_auc: 0.721363
[461]	train'

[527]	train's auc: 0.746919	train's online_auc: 0.745492	valid1's auc: 0.730846	valid1's online_auc: 0.72262
[528]	train's auc: 0.746934	train's online_auc: 0.745506	valid1's auc: 0.730848	valid1's online_auc: 0.722619
[529]	train's auc: 0.747004	train's online_auc: 0.745656	valid1's auc: 0.730863	valid1's online_auc: 0.722603
[530]	train's auc: 0.74704	train's online_auc: 0.7457	valid1's auc: 0.730878	valid1's online_auc: 0.722623
[531]	train's auc: 0.747126	train's online_auc: 0.746051	valid1's auc: 0.730921	valid1's online_auc: 0.722782
[532]	train's auc: 0.747142	train's online_auc: 0.746064	valid1's auc: 0.730928	valid1's online_auc: 0.722781
[533]	train's auc: 0.747183	train's online_auc: 0.746086	valid1's auc: 0.730947	valid1's online_auc: 0.722788
[534]	train's auc: 0.747227	train's online_auc: 0.746146	valid1's auc: 0.730961	valid1's online_auc: 0.722793
[535]	train's auc: 0.747272	train's online_auc: 0.746187	valid1's auc: 0.730997	valid1's online_auc: 0.722827
[536]	train's 

[602]	train's auc: 0.750073	train's online_auc: 0.7495	valid1's auc: 0.73215	valid1's online_auc: 0.723931
[603]	train's auc: 0.750113	train's online_auc: 0.74965	valid1's auc: 0.732159	valid1's online_auc: 0.723942
[604]	train's auc: 0.750153	train's online_auc: 0.749679	valid1's auc: 0.732177	valid1's online_auc: 0.723947
[605]	train's auc: 0.750178	train's online_auc: 0.749707	valid1's auc: 0.732191	valid1's online_auc: 0.72395
[606]	train's auc: 0.750245	train's online_auc: 0.74978	valid1's auc: 0.732209	valid1's online_auc: 0.72397
[607]	train's auc: 0.750295	train's online_auc: 0.749804	valid1's auc: 0.732233	valid1's online_auc: 0.723974
[608]	train's auc: 0.750335	train's online_auc: 0.749896	valid1's auc: 0.732243	valid1's online_auc: 0.72397
[609]	train's auc: 0.750377	train's online_auc: 0.749939	valid1's auc: 0.732262	valid1's online_auc: 0.723991
[610]	train's auc: 0.750406	train's online_auc: 0.749995	valid1's auc: 0.732288	valid1's online_auc: 0.724036
[611]	train's auc:

[677]	train's auc: 0.752679	train's online_auc: 0.752824	valid1's auc: 0.73309	valid1's online_auc: 0.724878
[678]	train's auc: 0.752728	train's online_auc: 0.752878	valid1's auc: 0.7331	valid1's online_auc: 0.724921
[679]	train's auc: 0.752745	train's online_auc: 0.752886	valid1's auc: 0.7331	valid1's online_auc: 0.724918
[680]	train's auc: 0.752762	train's online_auc: 0.752901	valid1's auc: 0.733103	valid1's online_auc: 0.72492
[681]	train's auc: 0.752804	train's online_auc: 0.752924	valid1's auc: 0.733135	valid1's online_auc: 0.724947
[682]	train's auc: 0.752822	train's online_auc: 0.752947	valid1's auc: 0.73314	valid1's online_auc: 0.72495
[683]	train's auc: 0.75284	train's online_auc: 0.752965	valid1's auc: 0.733141	valid1's online_auc: 0.724945
[684]	train's auc: 0.752905	train's online_auc: 0.752989	valid1's auc: 0.733169	valid1's online_auc: 0.724957
[685]	train's auc: 0.752947	train's online_auc: 0.753038	valid1's auc: 0.733201	valid1's online_auc: 0.724981
[686]	train's auc: 

[752]	train's auc: 0.755118	train's online_auc: 0.755813	valid1's auc: 0.733938	valid1's online_auc: 0.725753
[753]	train's auc: 0.755158	train's online_auc: 0.755858	valid1's auc: 0.733948	valid1's online_auc: 0.725784
[754]	train's auc: 0.755207	train's online_auc: 0.755923	valid1's auc: 0.733972	valid1's online_auc: 0.725811
[755]	train's auc: 0.755236	train's online_auc: 0.755952	valid1's auc: 0.733972	valid1's online_auc: 0.725811
[756]	train's auc: 0.755273	train's online_auc: 0.755978	valid1's auc: 0.733977	valid1's online_auc: 0.725815
[757]	train's auc: 0.755289	train's online_auc: 0.755987	valid1's auc: 0.733982	valid1's online_auc: 0.725815
[758]	train's auc: 0.755316	train's online_auc: 0.756021	valid1's auc: 0.733992	valid1's online_auc: 0.725819
[759]	train's auc: 0.755353	train's online_auc: 0.756105	valid1's auc: 0.734016	valid1's online_auc: 0.725832
[760]	train's auc: 0.755423	train's online_auc: 0.75629	valid1's auc: 0.734049	valid1's online_auc: 0.725905
[761]	train

[827]	train's auc: 0.75763	train's online_auc: 0.759267	valid1's auc: 0.734766	valid1's online_auc: 0.726707
[828]	train's auc: 0.757654	train's online_auc: 0.75929	valid1's auc: 0.734779	valid1's online_auc: 0.726715
[829]	train's auc: 0.757687	train's online_auc: 0.759334	valid1's auc: 0.734785	valid1's online_auc: 0.726737
[830]	train's auc: 0.757734	train's online_auc: 0.759377	valid1's auc: 0.734793	valid1's online_auc: 0.726773
[831]	train's auc: 0.75776	train's online_auc: 0.759487	valid1's auc: 0.734802	valid1's online_auc: 0.726795
[832]	train's auc: 0.757814	train's online_auc: 0.759552	valid1's auc: 0.734806	valid1's online_auc: 0.72682
[833]	train's auc: 0.757833	train's online_auc: 0.759566	valid1's auc: 0.734824	valid1's online_auc: 0.726836
[834]	train's auc: 0.757855	train's online_auc: 0.759589	valid1's auc: 0.734829	valid1's online_auc: 0.726835
[835]	train's auc: 0.75791	train's online_auc: 0.759608	valid1's auc: 0.734831	valid1's online_auc: 0.726846
[836]	train's a

[902]	train's auc: 0.759725	train's online_auc: 0.762061	valid1's auc: 0.735332	valid1's online_auc: 0.72746
[903]	train's auc: 0.759762	train's online_auc: 0.762068	valid1's auc: 0.735344	valid1's online_auc: 0.727468
[904]	train's auc: 0.759795	train's online_auc: 0.762094	valid1's auc: 0.735355	valid1's online_auc: 0.727472
[905]	train's auc: 0.759844	train's online_auc: 0.762174	valid1's auc: 0.735375	valid1's online_auc: 0.727519
[906]	train's auc: 0.759866	train's online_auc: 0.762188	valid1's auc: 0.735374	valid1's online_auc: 0.727524
[907]	train's auc: 0.75991	train's online_auc: 0.762237	valid1's auc: 0.735389	valid1's online_auc: 0.727529
[908]	train's auc: 0.759925	train's online_auc: 0.762256	valid1's auc: 0.735386	valid1's online_auc: 0.727529
[909]	train's auc: 0.759965	train's online_auc: 0.762375	valid1's auc: 0.735402	valid1's online_auc: 0.727581
[910]	train's auc: 0.759992	train's online_auc: 0.762439	valid1's auc: 0.73541	valid1's online_auc: 0.727577
[911]	train's

[977]	train's auc: 0.761926	train's online_auc: 0.764601	valid1's auc: 0.735929	valid1's online_auc: 0.728275
[978]	train's auc: 0.761954	train's online_auc: 0.764633	valid1's auc: 0.735937	valid1's online_auc: 0.728282
[979]	train's auc: 0.761985	train's online_auc: 0.764685	valid1's auc: 0.735945	valid1's online_auc: 0.728286
[980]	train's auc: 0.762008	train's online_auc: 0.764697	valid1's auc: 0.735949	valid1's online_auc: 0.728287
[981]	train's auc: 0.762029	train's online_auc: 0.764713	valid1's auc: 0.735951	valid1's online_auc: 0.728293
[982]	train's auc: 0.76206	train's online_auc: 0.764732	valid1's auc: 0.735955	valid1's online_auc: 0.728293
[983]	train's auc: 0.762079	train's online_auc: 0.764739	valid1's auc: 0.735963	valid1's online_auc: 0.728295
[984]	train's auc: 0.762112	train's online_auc: 0.764826	valid1's auc: 0.735973	valid1's online_auc: 0.72831
[985]	train's auc: 0.76212	train's online_auc: 0.764841	valid1's auc: 0.735977	valid1's online_auc: 0.728327
[986]	train's

[1052]	train's auc: 0.763783	train's online_auc: 0.766959	valid1's auc: 0.736385	valid1's online_auc: 0.728862
[1053]	train's auc: 0.763802	train's online_auc: 0.767011	valid1's auc: 0.736386	valid1's online_auc: 0.728861
[1054]	train's auc: 0.763835	train's online_auc: 0.76704	valid1's auc: 0.73641	valid1's online_auc: 0.728882
[1055]	train's auc: 0.763845	train's online_auc: 0.76705	valid1's auc: 0.73641	valid1's online_auc: 0.728881
[1056]	train's auc: 0.763854	train's online_auc: 0.767057	valid1's auc: 0.736417	valid1's online_auc: 0.728891
[1057]	train's auc: 0.763881	train's online_auc: 0.767077	valid1's auc: 0.73642	valid1's online_auc: 0.728909
[1058]	train's auc: 0.763916	train's online_auc: 0.767142	valid1's auc: 0.736424	valid1's online_auc: 0.728926
[1059]	train's auc: 0.763938	train's online_auc: 0.767146	valid1's auc: 0.736424	valid1's online_auc: 0.728925
[1060]	train's auc: 0.763966	train's online_auc: 0.767165	valid1's auc: 0.736433	valid1's online_auc: 0.728936
[1061]

[1127]	train's auc: 0.765661	train's online_auc: 0.769135	valid1's auc: 0.736841	valid1's online_auc: 0.729321
[1128]	train's auc: 0.765698	train's online_auc: 0.769169	valid1's auc: 0.736843	valid1's online_auc: 0.729315
[1129]	train's auc: 0.765726	train's online_auc: 0.769231	valid1's auc: 0.736851	valid1's online_auc: 0.729343
[1130]	train's auc: 0.765772	train's online_auc: 0.769299	valid1's auc: 0.736858	valid1's online_auc: 0.729351
[1131]	train's auc: 0.765805	train's online_auc: 0.769343	valid1's auc: 0.736865	valid1's online_auc: 0.729361
[1132]	train's auc: 0.765828	train's online_auc: 0.769385	valid1's auc: 0.736873	valid1's online_auc: 0.729384
[1133]	train's auc: 0.765843	train's online_auc: 0.769394	valid1's auc: 0.73688	valid1's online_auc: 0.729386
[1134]	train's auc: 0.765866	train's online_auc: 0.769425	valid1's auc: 0.73688	valid1's online_auc: 0.729385
[1135]	train's auc: 0.765915	train's online_auc: 0.769452	valid1's auc: 0.736883	valid1's online_auc: 0.729393
[11

[1202]	train's auc: 0.767403	train's online_auc: 0.771328	valid1's auc: 0.737186	valid1's online_auc: 0.729788
[1203]	train's auc: 0.767429	train's online_auc: 0.771365	valid1's auc: 0.737181	valid1's online_auc: 0.729791
[1204]	train's auc: 0.767451	train's online_auc: 0.771434	valid1's auc: 0.737196	valid1's online_auc: 0.729892
[1205]	train's auc: 0.767473	train's online_auc: 0.771451	valid1's auc: 0.737202	valid1's online_auc: 0.729896
[1206]	train's auc: 0.767487	train's online_auc: 0.771462	valid1's auc: 0.737204	valid1's online_auc: 0.729892
[1207]	train's auc: 0.767525	train's online_auc: 0.771484	valid1's auc: 0.737207	valid1's online_auc: 0.729885
[1208]	train's auc: 0.767537	train's online_auc: 0.7715	valid1's auc: 0.737204	valid1's online_auc: 0.729883
[1209]	train's auc: 0.767572	train's online_auc: 0.771519	valid1's auc: 0.737223	valid1's online_auc: 0.729897
[1210]	train's auc: 0.767619	train's online_auc: 0.771581	valid1's auc: 0.73724	valid1's online_auc: 0.729904
[121

[1277]	train's auc: 0.76919	train's online_auc: 0.773572	valid1's auc: 0.737586	valid1's online_auc: 0.730144
[1278]	train's auc: 0.769208	train's online_auc: 0.773585	valid1's auc: 0.737589	valid1's online_auc: 0.730147
[1279]	train's auc: 0.769236	train's online_auc: 0.773625	valid1's auc: 0.737594	valid1's online_auc: 0.730144
[1280]	train's auc: 0.769246	train's online_auc: 0.773644	valid1's auc: 0.737598	valid1's online_auc: 0.73015
[1281]	train's auc: 0.769263	train's online_auc: 0.773656	valid1's auc: 0.7376	valid1's online_auc: 0.730152
[1282]	train's auc: 0.769292	train's online_auc: 0.773676	valid1's auc: 0.73761	valid1's online_auc: 0.730169
[1283]	train's auc: 0.76931	train's online_auc: 0.773684	valid1's auc: 0.737617	valid1's online_auc: 0.730172
[1284]	train's auc: 0.769352	train's online_auc: 0.77372	valid1's auc: 0.737624	valid1's online_auc: 0.730172
[1285]	train's auc: 0.76937	train's online_auc: 0.773731	valid1's auc: 0.737619	valid1's online_auc: 0.730167
[1286]	tr

[1352]	train's auc: 0.770787	train's online_auc: 0.77531	valid1's auc: 0.737858	valid1's online_auc: 0.730376
[1353]	train's auc: 0.77081	train's online_auc: 0.775322	valid1's auc: 0.737856	valid1's online_auc: 0.730374
[1354]	train's auc: 0.770833	train's online_auc: 0.775357	valid1's auc: 0.737857	valid1's online_auc: 0.730374
[1355]	train's auc: 0.770872	train's online_auc: 0.775389	valid1's auc: 0.737869	valid1's online_auc: 0.730395
[1356]	train's auc: 0.770897	train's online_auc: 0.775473	valid1's auc: 0.737875	valid1's online_auc: 0.730382
[1357]	train's auc: 0.770959	train's online_auc: 0.775499	valid1's auc: 0.73788	valid1's online_auc: 0.730383
[1358]	train's auc: 0.770975	train's online_auc: 0.775518	valid1's auc: 0.737881	valid1's online_auc: 0.730384
[1359]	train's auc: 0.770989	train's online_auc: 0.775528	valid1's auc: 0.73789	valid1's online_auc: 0.730384
[1360]	train's auc: 0.771016	train's online_auc: 0.77558	valid1's auc: 0.737896	valid1's online_auc: 0.730391
[1361]

[1427]	train's auc: 0.77241	train's online_auc: 0.777461	valid1's auc: 0.738154	valid1's online_auc: 0.730764
[1428]	train's auc: 0.772431	train's online_auc: 0.777479	valid1's auc: 0.738156	valid1's online_auc: 0.730766
[1429]	train's auc: 0.772442	train's online_auc: 0.777482	valid1's auc: 0.738158	valid1's online_auc: 0.730766
[1430]	train's auc: 0.772457	train's online_auc: 0.777508	valid1's auc: 0.738158	valid1's online_auc: 0.730761
[1431]	train's auc: 0.772476	train's online_auc: 0.777513	valid1's auc: 0.73816	valid1's online_auc: 0.730762
[1432]	train's auc: 0.772514	train's online_auc: 0.777606	valid1's auc: 0.738165	valid1's online_auc: 0.730785
[1433]	train's auc: 0.772541	train's online_auc: 0.777621	valid1's auc: 0.738177	valid1's online_auc: 0.730792
[1434]	train's auc: 0.772557	train's online_auc: 0.777636	valid1's auc: 0.738183	valid1's online_auc: 0.730795
[1435]	train's auc: 0.772572	train's online_auc: 0.777652	valid1's auc: 0.738186	valid1's online_auc: 0.730797
[14

[1502]	train's auc: 0.773944	train's online_auc: 0.779726	valid1's auc: 0.738389	valid1's online_auc: 0.731003
[1503]	train's auc: 0.77395	train's online_auc: 0.77973	valid1's auc: 0.738393	valid1's online_auc: 0.731005
[1504]	train's auc: 0.773959	train's online_auc: 0.779735	valid1's auc: 0.73839	valid1's online_auc: 0.730998
[1505]	train's auc: 0.773994	train's online_auc: 0.779764	valid1's auc: 0.738413	valid1's online_auc: 0.731019
[1506]	train's auc: 0.774009	train's online_auc: 0.779787	valid1's auc: 0.738418	valid1's online_auc: 0.731021
[1507]	train's auc: 0.774041	train's online_auc: 0.779815	valid1's auc: 0.738415	valid1's online_auc: 0.731016
[1508]	train's auc: 0.774053	train's online_auc: 0.77982	valid1's auc: 0.73842	valid1's online_auc: 0.731019
[1509]	train's auc: 0.774062	train's online_auc: 0.779829	valid1's auc: 0.738425	valid1's online_auc: 0.73102
[1510]	train's auc: 0.774076	train's online_auc: 0.779852	valid1's auc: 0.738425	valid1's online_auc: 0.73101
[1511]	t

[1577]	train's auc: 0.775386	train's online_auc: 0.781689	valid1's auc: 0.738664	valid1's online_auc: 0.731375
[1578]	train's auc: 0.775402	train's online_auc: 0.7817	valid1's auc: 0.738655	valid1's online_auc: 0.731361
[1579]	train's auc: 0.775432	train's online_auc: 0.781717	valid1's auc: 0.738654	valid1's online_auc: 0.731361
[1580]	train's auc: 0.775458	train's online_auc: 0.781741	valid1's auc: 0.738659	valid1's online_auc: 0.731361
[1581]	train's auc: 0.775484	train's online_auc: 0.781787	valid1's auc: 0.738669	valid1's online_auc: 0.731385
[1582]	train's auc: 0.775515	train's online_auc: 0.781795	valid1's auc: 0.738668	valid1's online_auc: 0.731385
[1583]	train's auc: 0.775527	train's online_auc: 0.781814	valid1's auc: 0.73867	valid1's online_auc: 0.731381
[1584]	train's auc: 0.77555	train's online_auc: 0.78183	valid1's auc: 0.738673	valid1's online_auc: 0.731377
[1585]	train's auc: 0.775587	train's online_auc: 0.78187	valid1's auc: 0.738673	valid1's online_auc: 0.731374
[1586]	

[1652]	train's auc: 0.776849	train's online_auc: 0.783604	valid1's auc: 0.738833	valid1's online_auc: 0.731585
[1653]	train's auc: 0.776872	train's online_auc: 0.783642	valid1's auc: 0.738835	valid1's online_auc: 0.731589
[1654]	train's auc: 0.776882	train's online_auc: 0.783671	valid1's auc: 0.738835	valid1's online_auc: 0.731581
[1655]	train's auc: 0.776915	train's online_auc: 0.783698	valid1's auc: 0.738843	valid1's online_auc: 0.731575
[1656]	train's auc: 0.776928	train's online_auc: 0.783711	valid1's auc: 0.738841	valid1's online_auc: 0.73157
[1657]	train's auc: 0.776943	train's online_auc: 0.783723	valid1's auc: 0.738844	valid1's online_auc: 0.731573
[1658]	train's auc: 0.776968	train's online_auc: 0.783773	valid1's auc: 0.73884	valid1's online_auc: 0.731566
[1659]	train's auc: 0.77699	train's online_auc: 0.783791	valid1's auc: 0.738834	valid1's online_auc: 0.731568
[1660]	train's auc: 0.777007	train's online_auc: 0.783835	valid1's auc: 0.738839	valid1's online_auc: 0.731593
[166

[1727]	train's auc: 0.778328	train's online_auc: 0.785418	valid1's auc: 0.739053	valid1's online_auc: 0.731966
[1728]	train's auc: 0.778346	train's online_auc: 0.785433	valid1's auc: 0.73905	valid1's online_auc: 0.731963
[1729]	train's auc: 0.778359	train's online_auc: 0.785445	valid1's auc: 0.739051	valid1's online_auc: 0.731956
[1730]	train's auc: 0.778368	train's online_auc: 0.785458	valid1's auc: 0.739051	valid1's online_auc: 0.732009
[1731]	train's auc: 0.778383	train's online_auc: 0.785466	valid1's auc: 0.73905	valid1's online_auc: 0.732003
[1732]	train's auc: 0.778392	train's online_auc: 0.785473	valid1's auc: 0.739048	valid1's online_auc: 0.732006
[1733]	train's auc: 0.778408	train's online_auc: 0.785485	valid1's auc: 0.73905	valid1's online_auc: 0.732002
[1734]	train's auc: 0.778432	train's online_auc: 0.785506	valid1's auc: 0.739059	valid1's online_auc: 0.732002
[1735]	train's auc: 0.778439	train's online_auc: 0.785515	valid1's auc: 0.739063	valid1's online_auc: 0.732008
[173

[1802]	train's auc: 0.779473	train's online_auc: 0.786644	valid1's auc: 0.739152	valid1's online_auc: 0.732112
[1803]	train's auc: 0.779485	train's online_auc: 0.786659	valid1's auc: 0.739155	valid1's online_auc: 0.732119
[1804]	train's auc: 0.779494	train's online_auc: 0.786661	valid1's auc: 0.73915	valid1's online_auc: 0.732119
[1805]	train's auc: 0.779519	train's online_auc: 0.78669	valid1's auc: 0.739147	valid1's online_auc: 0.73211
[1806]	train's auc: 0.779555	train's online_auc: 0.786721	valid1's auc: 0.739152	valid1's online_auc: 0.732115
[1807]	train's auc: 0.779573	train's online_auc: 0.786736	valid1's auc: 0.739169	valid1's online_auc: 0.732123
[1808]	train's auc: 0.779583	train's online_auc: 0.78674	valid1's auc: 0.739171	valid1's online_auc: 0.732124
[1809]	train's auc: 0.779592	train's online_auc: 0.786756	valid1's auc: 0.739171	valid1's online_auc: 0.732124
[1810]	train's auc: 0.779614	train's online_auc: 0.786771	valid1's auc: 0.739172	valid1's online_auc: 0.732124
[1811

[1877]	train's auc: 0.780846	train's online_auc: 0.788422	valid1's auc: 0.739362	valid1's online_auc: 0.732311
[1878]	train's auc: 0.780867	train's online_auc: 0.788442	valid1's auc: 0.739362	valid1's online_auc: 0.732308
[1879]	train's auc: 0.780879	train's online_auc: 0.788457	valid1's auc: 0.739361	valid1's online_auc: 0.732312
[1880]	train's auc: 0.780909	train's online_auc: 0.788476	valid1's auc: 0.739358	valid1's online_auc: 0.7323
[1881]	train's auc: 0.780928	train's online_auc: 0.788488	valid1's auc: 0.739359	valid1's online_auc: 0.732304
[1882]	train's auc: 0.780942	train's online_auc: 0.788507	valid1's auc: 0.739363	valid1's online_auc: 0.732304
[1883]	train's auc: 0.780951	train's online_auc: 0.788517	valid1's auc: 0.739363	valid1's online_auc: 0.732304
[1884]	train's auc: 0.780963	train's online_auc: 0.788532	valid1's auc: 0.73937	valid1's online_auc: 0.732306
[1885]	train's auc: 0.780967	train's online_auc: 0.788539	valid1's auc: 0.739368	valid1's online_auc: 0.732302
[188

[1952]	train's auc: 0.782119	train's online_auc: 0.79012	valid1's auc: 0.739482	valid1's online_auc: 0.732387
[1953]	train's auc: 0.782131	train's online_auc: 0.790137	valid1's auc: 0.739481	valid1's online_auc: 0.732386
[1954]	train's auc: 0.78214	train's online_auc: 0.790143	valid1's auc: 0.739483	valid1's online_auc: 0.732387
[1955]	train's auc: 0.782152	train's online_auc: 0.790151	valid1's auc: 0.739481	valid1's online_auc: 0.732385
[1956]	train's auc: 0.782168	train's online_auc: 0.790154	valid1's auc: 0.739484	valid1's online_auc: 0.732386
[1957]	train's auc: 0.782176	train's online_auc: 0.790161	valid1's auc: 0.739484	valid1's online_auc: 0.732384
[1958]	train's auc: 0.782184	train's online_auc: 0.790164	valid1's auc: 0.739485	valid1's online_auc: 0.732391
[1959]	train's auc: 0.78221	train's online_auc: 0.790195	valid1's auc: 0.739489	valid1's online_auc: 0.732386
[1960]	train's auc: 0.782232	train's online_auc: 0.790202	valid1's auc: 0.739491	valid1's online_auc: 0.732384
[196

[2027]	train's auc: 0.783479	train's online_auc: 0.791651	valid1's auc: 0.739649	valid1's online_auc: 0.732601
[2028]	train's auc: 0.783504	train's online_auc: 0.791692	valid1's auc: 0.739654	valid1's online_auc: 0.732613
[2029]	train's auc: 0.783532	train's online_auc: 0.791704	valid1's auc: 0.739655	valid1's online_auc: 0.732609
[2030]	train's auc: 0.783539	train's online_auc: 0.791711	valid1's auc: 0.739653	valid1's online_auc: 0.732606
[2031]	train's auc: 0.783552	train's online_auc: 0.791741	valid1's auc: 0.739656	valid1's online_auc: 0.732618
[2032]	train's auc: 0.783565	train's online_auc: 0.791781	valid1's auc: 0.739656	valid1's online_auc: 0.73261
[2033]	train's auc: 0.783575	train's online_auc: 0.791797	valid1's auc: 0.739654	valid1's online_auc: 0.732606
[2034]	train's auc: 0.783589	train's online_auc: 0.791805	valid1's auc: 0.739663	valid1's online_auc: 0.732608
[2035]	train's auc: 0.783599	train's online_auc: 0.791821	valid1's auc: 0.739661	valid1's online_auc: 0.732617
[2

[2102]	train's auc: 0.784793	train's online_auc: 0.793056	valid1's auc: 0.739719	valid1's online_auc: 0.732657
[2103]	train's auc: 0.784807	train's online_auc: 0.793072	valid1's auc: 0.73972	valid1's online_auc: 0.732658
[2104]	train's auc: 0.784818	train's online_auc: 0.793078	valid1's auc: 0.739718	valid1's online_auc: 0.732658
[2105]	train's auc: 0.784826	train's online_auc: 0.793082	valid1's auc: 0.73972	valid1's online_auc: 0.732659
[2106]	train's auc: 0.784846	train's online_auc: 0.793097	valid1's auc: 0.739726	valid1's online_auc: 0.732664
[2107]	train's auc: 0.784861	train's online_auc: 0.793111	valid1's auc: 0.739729	valid1's online_auc: 0.732664
[2108]	train's auc: 0.784869	train's online_auc: 0.793118	valid1's auc: 0.73973	valid1's online_auc: 0.732662
[2109]	train's auc: 0.784885	train's online_auc: 0.793124	valid1's auc: 0.739734	valid1's online_auc: 0.732667
[2110]	train's auc: 0.784898	train's online_auc: 0.793137	valid1's auc: 0.739734	valid1's online_auc: 0.732671
[211

In [13]:
# # these will cause LightGBM bug I don't know how to fix. Just skip it
# log_file = 'v2.feature_importance.csv'
# log_path = os.path.join(log_folder, log_file)

# df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
# df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
# df_feature_importance.to_csv(log_path, index=False)
# df_feature_importance.head(30)

In [14]:
with pu.profiler("making prediction on validation set"):
    df_valid = df_train.iloc[valid_index]
    proba_valid = lgbm.predict(X_valid.astype(np.float32))

log_file = 'v1.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score = df_score.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

[15:21:58] Finish making prediction on validation set. △M: +520.45MB. △T: 1.3 minutes.


In [15]:
online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

Online AUC: 0.732732
Simple AUC: 0.739721


In [16]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    del X_train
    del X_valid
    gc.collect()

[15:22:22] Finish cleaning memory. △M: -17.62GB. △T: 8.7 seconds.


In [17]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()
    
with pu.profiler("trimming testing set"):    
    X_test = X_test[:, mask]
    gc.collect()

print("Test Data Shape (After masking): {}".format(X_test.shape))
# print("Test Column Numbers: {}".format(len(cols_test)))

[15:23:44] Finish loading testing data. △M: +1.6GB. △T: 1.2 minutes.
[15:24:12] Finish trimming testing set. △M: -1.22GB. △T: 27.6 seconds.
Test Data Shape (After masking): (2265989, 1791142)


In [18]:
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[15:25:56] Finish making prediction on testing set. △M: -5.14GB. △T: 1.5 minutes.


In [19]:
subm_folder = '../../../subm/lgbm/0512_v1'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)