In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
from itertools import compress
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# ================
# Data Preparation
# ================
# defined feature pairs to load cross product transformation
pairs = [("aid", "age"), ("aid", "education"), ("aid", "consumptionAbility"), ("aid", "LBS")]

# instantiate data loader
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")
# cross_bin_loader = dm.build_data("cross", "binary_v1")
cross_bin_loaders = (dp.CrossBinaryDataManager.build_data(ad_feat_name, user_feat_name) 
                     for ad_feat_name, user_feat_name in pairs)
# union_loader = dp.DataUnion(bin_loader, cross_bin_loader
union_loader = dp.DataUnion(bin_loader, *cross_bin_loaders)

with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()

print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

[07:25:15] Finish loading training data. △M: +3.45GB. △T: 1.2 minutes.
Train Data Shape: (8798814, 570730)
Train Column Numbers: 570730


In [3]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [4]:
# n_splits = 3
# sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
# split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

n_splits = 5  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [6]:
aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    aids_train, aids_valid = aids[train_index], aids[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    
    del X_tv
    gc.collect()

print("Training Set Size (Before masking): {}".format(X_train.shape))
print("Validation Set Size (Before masking): {}".format(X_valid.shape))
    
min_df = 3  # there is no such things for v2
with pu.profiler("trimming train/valid set"):    
    mask = (X_train.getnnz(axis=0) >= min_df)  # threshold to be extendee
    cols_masked = list(compress(cols_train, mask))
    X_train = X_train[:, mask]
    X_valid = X_valid[:, mask]
    gc.collect()
    
print("Training Set Size (After masking): {}".format(X_train.shape))
print("Validation Set Size (After masking): {}".format(X_valid.shape))

[07:26:48] Finish splitting train/valid set. △M: +140.79MB. △T: 19.3 seconds.
Training Set Size (Before masking): (7039050, 570730)
Validation Set Size (Before masking): (1759764, 570730)
[07:27:19] Finish trimming train/valid set. △M: +20.0MB. △T: 31.6 seconds.
Training Set Size (After masking): (7039050, 395570)
Validation Set Size (After masking): (1759764, 395570)


In [7]:
with pu.profiler("preparing LightGBM data"):
    # lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols_train)  # cause bugs I don't know how to fix
    # lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols_train)
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)
    gc.collect()

[07:27:40] Finish preparing LightGBM data. △M: +5.4GB. △T: 4.5 seconds.


In [9]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0509/')
log_file = 'v3.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [10]:
# v2 parameters
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': 'auc',
#     'max_depth': 15,
#     'num_leaves': 120,
#     'learning_rate': 0.15,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'verbose': 0
# }
# v3 parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 8,
    'num_leaves': 120,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'verbose': 0
}
num_rounds = 2000
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)

with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.616603	train's online_auc: 0.61631	valid1's auc: 0.616343	valid1's online_auc: 0.617588
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.624	train's online_auc: 0.625027	valid1's auc: 0.624135	valid1's online_auc: 0.62682
[3]	train's auc: 0.655136	train's online_auc: 0.653282	valid1's auc: 0.654571	valid1's online_auc: 0.654458
[4]	train's auc: 0.655819	train's online_auc: 0.655065	valid1's auc: 0.655488	valid1's online_auc: 0.655267
[5]	train's auc: 0.6613	train's online_auc: 0.656422	valid1's auc: 0.660838	valid1's online_auc: 0.656562
[6]	train's auc: 0.663675	train's online_auc: 0.659062	valid1's auc: 0.662882	valid1's online_auc: 0.659399
[7]	train's auc: 0.66405	train's online_auc: 0.659037	valid1's auc: 0.66339	valid1's online_auc: 0.659291
[8]	train's auc: 0.664463	train's online_auc: 0.660508	valid1's auc: 0.663822	valid1's online_auc: 0.660369
[9]	train's auc: 0.665678	train's online_auc: 0.662032	valid1's auc: 0.664839	valid

[77]	train's auc: 0.717307	train's online_auc: 0.712066	valid1's auc: 0.710988	valid1's online_auc: 0.70613
[78]	train's auc: 0.717498	train's online_auc: 0.712247	valid1's auc: 0.711101	valid1's online_auc: 0.706241
[79]	train's auc: 0.717784	train's online_auc: 0.712612	valid1's auc: 0.711315	valid1's online_auc: 0.706482
[80]	train's auc: 0.718282	train's online_auc: 0.713201	valid1's auc: 0.711701	valid1's online_auc: 0.706928
[81]	train's auc: 0.718541	train's online_auc: 0.71354	valid1's auc: 0.711906	valid1's online_auc: 0.707197
[82]	train's auc: 0.718738	train's online_auc: 0.713723	valid1's auc: 0.712013	valid1's online_auc: 0.707272
[83]	train's auc: 0.719168	train's online_auc: 0.714142	valid1's auc: 0.712345	valid1's online_auc: 0.707557
[84]	train's auc: 0.719406	train's online_auc: 0.714352	valid1's auc: 0.712478	valid1's online_auc: 0.70767
[85]	train's auc: 0.719691	train's online_auc: 0.714654	valid1's auc: 0.712594	valid1's online_auc: 0.707786
[86]	train's auc: 0.72

[153]	train's auc: 0.733264	train's online_auc: 0.728489	valid1's auc: 0.721546	valid1's online_auc: 0.715483
[154]	train's auc: 0.733412	train's online_auc: 0.728615	valid1's auc: 0.721658	valid1's online_auc: 0.715594
[155]	train's auc: 0.733797	train's online_auc: 0.729124	valid1's auc: 0.721997	valid1's online_auc: 0.715988
[156]	train's auc: 0.733883	train's online_auc: 0.729217	valid1's auc: 0.72205	valid1's online_auc: 0.716044
[157]	train's auc: 0.733967	train's online_auc: 0.729328	valid1's auc: 0.722097	valid1's online_auc: 0.716098
[158]	train's auc: 0.734036	train's online_auc: 0.729478	valid1's auc: 0.722135	valid1's online_auc: 0.716172
[159]	train's auc: 0.734142	train's online_auc: 0.729554	valid1's auc: 0.722201	valid1's online_auc: 0.7162
[160]	train's auc: 0.734312	train's online_auc: 0.729695	valid1's auc: 0.722308	valid1's online_auc: 0.716309
[161]	train's auc: 0.734562	train's online_auc: 0.729865	valid1's auc: 0.722483	valid1's online_auc: 0.716401
[162]	train's

[228]	train's auc: 0.743098	train's online_auc: 0.740026	valid1's auc: 0.727167	valid1's online_auc: 0.721055
[229]	train's auc: 0.743162	train's online_auc: 0.740123	valid1's auc: 0.727208	valid1's online_auc: 0.72111
[230]	train's auc: 0.743227	train's online_auc: 0.74024	valid1's auc: 0.727233	valid1's online_auc: 0.721152
[231]	train's auc: 0.743349	train's online_auc: 0.740404	valid1's auc: 0.727318	valid1's online_auc: 0.721233
[232]	train's auc: 0.743431	train's online_auc: 0.7405	valid1's auc: 0.727369	valid1's online_auc: 0.721301
[233]	train's auc: 0.743488	train's online_auc: 0.740569	valid1's auc: 0.727409	valid1's online_auc: 0.721339
[234]	train's auc: 0.74358	train's online_auc: 0.740627	valid1's auc: 0.727458	valid1's online_auc: 0.721389
[235]	train's auc: 0.743646	train's online_auc: 0.740761	valid1's auc: 0.727484	valid1's online_auc: 0.721509
[236]	train's auc: 0.743722	train's online_auc: 0.740801	valid1's auc: 0.727525	valid1's online_auc: 0.721558
[237]	train's a

[303]	train's auc: 0.749122	train's online_auc: 0.747553	valid1's auc: 0.729939	valid1's online_auc: 0.724063
[304]	train's auc: 0.749196	train's online_auc: 0.747567	valid1's auc: 0.729998	valid1's online_auc: 0.724084
[305]	train's auc: 0.749381	train's online_auc: 0.74783	valid1's auc: 0.730092	valid1's online_auc: 0.724246
[306]	train's auc: 0.749446	train's online_auc: 0.747906	valid1's auc: 0.730113	valid1's online_auc: 0.724268
[307]	train's auc: 0.749523	train's online_auc: 0.748101	valid1's auc: 0.730152	valid1's online_auc: 0.724393
[308]	train's auc: 0.749607	train's online_auc: 0.748167	valid1's auc: 0.730202	valid1's online_auc: 0.724431
[309]	train's auc: 0.749686	train's online_auc: 0.748226	valid1's auc: 0.730236	valid1's online_auc: 0.724433
[310]	train's auc: 0.749807	train's online_auc: 0.748375	valid1's auc: 0.730262	valid1's online_auc: 0.724464
[311]	train's auc: 0.749963	train's online_auc: 0.748519	valid1's auc: 0.730338	valid1's online_auc: 0.724528
[312]	train

[378]	train's auc: 0.755179	train's online_auc: 0.755361	valid1's auc: 0.732333	valid1's online_auc: 0.727102
[379]	train's auc: 0.755221	train's online_auc: 0.755398	valid1's auc: 0.732351	valid1's online_auc: 0.727102
[380]	train's auc: 0.755261	train's online_auc: 0.755443	valid1's auc: 0.732364	valid1's online_auc: 0.727099
[381]	train's auc: 0.755307	train's online_auc: 0.755472	valid1's auc: 0.732373	valid1's online_auc: 0.727099
[382]	train's auc: 0.755449	train's online_auc: 0.755728	valid1's auc: 0.732415	valid1's online_auc: 0.727195
[383]	train's auc: 0.755561	train's online_auc: 0.75584	valid1's auc: 0.732448	valid1's online_auc: 0.727208
[384]	train's auc: 0.755605	train's online_auc: 0.755896	valid1's auc: 0.732464	valid1's online_auc: 0.727221
[385]	train's auc: 0.755636	train's online_auc: 0.755946	valid1's auc: 0.73249	valid1's online_auc: 0.72724
[386]	train's auc: 0.755709	train's online_auc: 0.756029	valid1's auc: 0.732518	valid1's online_auc: 0.72728
[387]	train's 

[453]	train's auc: 0.760143	train's online_auc: 0.761336	valid1's auc: 0.734006	valid1's online_auc: 0.728833
[454]	train's auc: 0.760197	train's online_auc: 0.761381	valid1's auc: 0.734011	valid1's online_auc: 0.728838
[455]	train's auc: 0.760217	train's online_auc: 0.761393	valid1's auc: 0.734024	valid1's online_auc: 0.728842
[456]	train's auc: 0.760268	train's online_auc: 0.761435	valid1's auc: 0.73402	valid1's online_auc: 0.728864
[457]	train's auc: 0.760324	train's online_auc: 0.761667	valid1's auc: 0.734035	valid1's online_auc: 0.728877
[458]	train's auc: 0.760423	train's online_auc: 0.761808	valid1's auc: 0.734081	valid1's online_auc: 0.72892
[459]	train's auc: 0.760455	train's online_auc: 0.761828	valid1's auc: 0.734094	valid1's online_auc: 0.728929
[460]	train's auc: 0.76052	train's online_auc: 0.761883	valid1's auc: 0.734121	valid1's online_auc: 0.728947
[461]	train's auc: 0.760581	train's online_auc: 0.761957	valid1's auc: 0.734127	valid1's online_auc: 0.72895
[462]	train's 

[528]	train's auc: 0.764399	train's online_auc: 0.767388	valid1's auc: 0.735193	valid1's online_auc: 0.730134
[529]	train's auc: 0.764435	train's online_auc: 0.767433	valid1's auc: 0.735212	valid1's online_auc: 0.730148
[530]	train's auc: 0.764517	train's online_auc: 0.767597	valid1's auc: 0.735212	valid1's online_auc: 0.730158
[531]	train's auc: 0.764572	train's online_auc: 0.767872	valid1's auc: 0.735225	valid1's online_auc: 0.730234
[532]	train's auc: 0.764638	train's online_auc: 0.767926	valid1's auc: 0.735264	valid1's online_auc: 0.730273
[533]	train's auc: 0.764772	train's online_auc: 0.768025	valid1's auc: 0.735366	valid1's online_auc: 0.730425
[534]	train's auc: 0.764835	train's online_auc: 0.768081	valid1's auc: 0.735365	valid1's online_auc: 0.730418
[535]	train's auc: 0.764967	train's online_auc: 0.768176	valid1's auc: 0.735384	valid1's online_auc: 0.73043
[536]	train's auc: 0.764997	train's online_auc: 0.768223	valid1's auc: 0.735384	valid1's online_auc: 0.730433
[537]	train

[603]	train's auc: 0.768129	train's online_auc: 0.77213	valid1's auc: 0.736234	valid1's online_auc: 0.73138
[604]	train's auc: 0.768168	train's online_auc: 0.772216	valid1's auc: 0.736248	valid1's online_auc: 0.731405
[605]	train's auc: 0.768209	train's online_auc: 0.772274	valid1's auc: 0.736264	valid1's online_auc: 0.731415
[606]	train's auc: 0.768247	train's online_auc: 0.772293	valid1's auc: 0.736259	valid1's online_auc: 0.731408
[607]	train's auc: 0.768282	train's online_auc: 0.772328	valid1's auc: 0.736258	valid1's online_auc: 0.73141
[608]	train's auc: 0.76833	train's online_auc: 0.772368	valid1's auc: 0.736261	valid1's online_auc: 0.731417
[609]	train's auc: 0.768386	train's online_auc: 0.772404	valid1's auc: 0.736262	valid1's online_auc: 0.731406
[610]	train's auc: 0.768443	train's online_auc: 0.772443	valid1's auc: 0.736282	valid1's online_auc: 0.731442
[611]	train's auc: 0.768475	train's online_auc: 0.772474	valid1's auc: 0.736292	valid1's online_auc: 0.73144
[612]	train's a

[678]	train's auc: 0.771617	train's online_auc: 0.776694	valid1's auc: 0.73713	valid1's online_auc: 0.732516
[679]	train's auc: 0.771673	train's online_auc: 0.776749	valid1's auc: 0.737136	valid1's online_auc: 0.732517
[680]	train's auc: 0.771691	train's online_auc: 0.776769	valid1's auc: 0.737147	valid1's online_auc: 0.732527
[681]	train's auc: 0.77174	train's online_auc: 0.776825	valid1's auc: 0.73716	valid1's online_auc: 0.732548
[682]	train's auc: 0.77178	train's online_auc: 0.776877	valid1's auc: 0.737175	valid1's online_auc: 0.732543
[683]	train's auc: 0.771823	train's online_auc: 0.777053	valid1's auc: 0.737186	valid1's online_auc: 0.732603
[684]	train's auc: 0.771883	train's online_auc: 0.777108	valid1's auc: 0.737185	valid1's online_auc: 0.732614
[685]	train's auc: 0.7719	train's online_auc: 0.777132	valid1's auc: 0.737185	valid1's online_auc: 0.732611
[686]	train's auc: 0.771939	train's online_auc: 0.777189	valid1's auc: 0.737194	valid1's online_auc: 0.732639
[687]	train's au

[753]	train's auc: 0.77477	train's online_auc: 0.781445	valid1's auc: 0.737576	valid1's online_auc: 0.733111
[754]	train's auc: 0.774814	train's online_auc: 0.781519	valid1's auc: 0.737589	valid1's online_auc: 0.733147
[755]	train's auc: 0.774845	train's online_auc: 0.781619	valid1's auc: 0.737584	valid1's online_auc: 0.733117
[756]	train's auc: 0.774867	train's online_auc: 0.781639	valid1's auc: 0.737599	valid1's online_auc: 0.733129
[757]	train's auc: 0.774957	train's online_auc: 0.781697	valid1's auc: 0.737622	valid1's online_auc: 0.733145
[758]	train's auc: 0.774984	train's online_auc: 0.781785	valid1's auc: 0.737624	valid1's online_auc: 0.733131
[759]	train's auc: 0.775025	train's online_auc: 0.781818	valid1's auc: 0.737625	valid1's online_auc: 0.733133
[760]	train's auc: 0.775061	train's online_auc: 0.781871	valid1's auc: 0.737634	valid1's online_auc: 0.733141
[761]	train's auc: 0.775107	train's online_auc: 0.781913	valid1's auc: 0.737648	valid1's online_auc: 0.733148
[762]	train

[828]	train's auc: 0.777751	train's online_auc: 0.78539	valid1's auc: 0.738086	valid1's online_auc: 0.733744
[829]	train's auc: 0.777791	train's online_auc: 0.785417	valid1's auc: 0.738088	valid1's online_auc: 0.733739
[830]	train's auc: 0.777816	train's online_auc: 0.785441	valid1's auc: 0.738083	valid1's online_auc: 0.733715
[831]	train's auc: 0.777869	train's online_auc: 0.785496	valid1's auc: 0.738094	valid1's online_auc: 0.733721
[832]	train's auc: 0.777889	train's online_auc: 0.785516	valid1's auc: 0.738093	valid1's online_auc: 0.733722
[833]	train's auc: 0.777921	train's online_auc: 0.785551	valid1's auc: 0.738098	valid1's online_auc: 0.733725
[834]	train's auc: 0.777992	train's online_auc: 0.785687	valid1's auc: 0.738124	valid1's online_auc: 0.733771
[835]	train's auc: 0.778033	train's online_auc: 0.785723	valid1's auc: 0.738137	valid1's online_auc: 0.733788
[836]	train's auc: 0.778055	train's online_auc: 0.785766	valid1's auc: 0.73814	valid1's online_auc: 0.733787
[837]	train'

[903]	train's auc: 0.780542	train's online_auc: 0.788883	valid1's auc: 0.738533	valid1's online_auc: 0.734298
[904]	train's auc: 0.78057	train's online_auc: 0.788939	valid1's auc: 0.738536	valid1's online_auc: 0.734314
[905]	train's auc: 0.780597	train's online_auc: 0.78895	valid1's auc: 0.738565	valid1's online_auc: 0.734314
[906]	train's auc: 0.780649	train's online_auc: 0.789008	valid1's auc: 0.738567	valid1's online_auc: 0.73431
[907]	train's auc: 0.780685	train's online_auc: 0.789064	valid1's auc: 0.73857	valid1's online_auc: 0.734308
[908]	train's auc: 0.780725	train's online_auc: 0.789101	valid1's auc: 0.738581	valid1's online_auc: 0.734319
[909]	train's auc: 0.78074	train's online_auc: 0.789116	valid1's auc: 0.738574	valid1's online_auc: 0.734316
[910]	train's auc: 0.780753	train's online_auc: 0.789158	valid1's auc: 0.738573	valid1's online_auc: 0.734315
[911]	train's auc: 0.780816	train's online_auc: 0.78919	valid1's auc: 0.738587	valid1's online_auc: 0.734361
[912]	train's au

[978]	train's auc: 0.783152	train's online_auc: 0.792142	valid1's auc: 0.738914	valid1's online_auc: 0.734745
[979]	train's auc: 0.78319	train's online_auc: 0.792158	valid1's auc: 0.738909	valid1's online_auc: 0.734745
[980]	train's auc: 0.783212	train's online_auc: 0.792187	valid1's auc: 0.738911	valid1's online_auc: 0.734743
[981]	train's auc: 0.783259	train's online_auc: 0.792225	valid1's auc: 0.738916	valid1's online_auc: 0.734732
[982]	train's auc: 0.783282	train's online_auc: 0.792245	valid1's auc: 0.738929	valid1's online_auc: 0.734738
[983]	train's auc: 0.783321	train's online_auc: 0.792284	valid1's auc: 0.73893	valid1's online_auc: 0.734732
[984]	train's auc: 0.783348	train's online_auc: 0.792317	valid1's auc: 0.738925	valid1's online_auc: 0.734728
[985]	train's auc: 0.783387	train's online_auc: 0.792356	valid1's auc: 0.738919	valid1's online_auc: 0.734721
[986]	train's auc: 0.78341	train's online_auc: 0.792375	valid1's auc: 0.738919	valid1's online_auc: 0.734724
[987]	train's

[1053]	train's auc: 0.785498	train's online_auc: 0.795006	valid1's auc: 0.739149	valid1's online_auc: 0.734858
[1054]	train's auc: 0.785546	train's online_auc: 0.795034	valid1's auc: 0.739156	valid1's online_auc: 0.73485
[1055]	train's auc: 0.785586	train's online_auc: 0.795147	valid1's auc: 0.739167	valid1's online_auc: 0.734871
[1056]	train's auc: 0.785604	train's online_auc: 0.795198	valid1's auc: 0.739173	valid1's online_auc: 0.734876
[1057]	train's auc: 0.785645	train's online_auc: 0.795256	valid1's auc: 0.739176	valid1's online_auc: 0.734864
[1058]	train's auc: 0.785674	train's online_auc: 0.795288	valid1's auc: 0.739175	valid1's online_auc: 0.734865
[1059]	train's auc: 0.785715	train's online_auc: 0.795324	valid1's auc: 0.739191	valid1's online_auc: 0.734864
[1060]	train's auc: 0.785746	train's online_auc: 0.795359	valid1's auc: 0.739181	valid1's online_auc: 0.73486
[1061]	train's auc: 0.785768	train's online_auc: 0.795373	valid1's auc: 0.739181	valid1's online_auc: 0.73486
[106

[1128]	train's auc: 0.787785	train's online_auc: 0.797967	valid1's auc: 0.739529	valid1's online_auc: 0.735335
[1129]	train's auc: 0.787804	train's online_auc: 0.797987	valid1's auc: 0.739525	valid1's online_auc: 0.73533
[1130]	train's auc: 0.787834	train's online_auc: 0.798015	valid1's auc: 0.739531	valid1's online_auc: 0.735338
[1131]	train's auc: 0.787852	train's online_auc: 0.798037	valid1's auc: 0.739525	valid1's online_auc: 0.735335
[1132]	train's auc: 0.787886	train's online_auc: 0.798085	valid1's auc: 0.739521	valid1's online_auc: 0.735329
[1133]	train's auc: 0.787901	train's online_auc: 0.798099	valid1's auc: 0.739524	valid1's online_auc: 0.735342
[1134]	train's auc: 0.787923	train's online_auc: 0.798123	valid1's auc: 0.739523	valid1's online_auc: 0.735347
[1135]	train's auc: 0.787968	train's online_auc: 0.798197	valid1's auc: 0.739522	valid1's online_auc: 0.735336
[1136]	train's auc: 0.78799	train's online_auc: 0.798214	valid1's auc: 0.73952	valid1's online_auc: 0.735342
[113

[1203]	train's auc: 0.789906	train's online_auc: 0.800547	valid1's auc: 0.739757	valid1's online_auc: 0.735684
[1204]	train's auc: 0.789933	train's online_auc: 0.800571	valid1's auc: 0.739756	valid1's online_auc: 0.73569
[1205]	train's auc: 0.789951	train's online_auc: 0.800591	valid1's auc: 0.739757	valid1's online_auc: 0.735702
[1206]	train's auc: 0.789995	train's online_auc: 0.800615	valid1's auc: 0.739758	valid1's online_auc: 0.735698
[1207]	train's auc: 0.790012	train's online_auc: 0.800635	valid1's auc: 0.739764	valid1's online_auc: 0.735715
[1208]	train's auc: 0.790041	train's online_auc: 0.800668	valid1's auc: 0.739774	valid1's online_auc: 0.735724
[1209]	train's auc: 0.790062	train's online_auc: 0.800692	valid1's auc: 0.739774	valid1's online_auc: 0.735729
[1210]	train's auc: 0.790113	train's online_auc: 0.800712	valid1's auc: 0.739791	valid1's online_auc: 0.735725
[1211]	train's auc: 0.790131	train's online_auc: 0.80073	valid1's auc: 0.739795	valid1's online_auc: 0.735728
[12

[1278]	train's auc: 0.79197	train's online_auc: 0.803119	valid1's auc: 0.739894	valid1's online_auc: 0.735846
[1279]	train's auc: 0.792	train's online_auc: 0.803164	valid1's auc: 0.739889	valid1's online_auc: 0.73584
[1280]	train's auc: 0.79202	train's online_auc: 0.803173	valid1's auc: 0.7399	valid1's online_auc: 0.735833
[1281]	train's auc: 0.792055	train's online_auc: 0.803272	valid1's auc: 0.739888	valid1's online_auc: 0.735782
[1282]	train's auc: 0.792084	train's online_auc: 0.803314	valid1's auc: 0.739897	valid1's online_auc: 0.735805
[1283]	train's auc: 0.792119	train's online_auc: 0.803341	valid1's auc: 0.739906	valid1's online_auc: 0.735806
[1284]	train's auc: 0.792163	train's online_auc: 0.803382	valid1's auc: 0.739905	valid1's online_auc: 0.73581
[1285]	train's auc: 0.792187	train's online_auc: 0.803396	valid1's auc: 0.739907	valid1's online_auc: 0.735802
[1286]	train's auc: 0.792221	train's online_auc: 0.803476	valid1's auc: 0.739916	valid1's online_auc: 0.735854
[1287]	tra

[1353]	train's auc: 0.793937	train's online_auc: 0.805445	valid1's auc: 0.740161	valid1's online_auc: 0.736076
[1354]	train's auc: 0.793972	train's online_auc: 0.805534	valid1's auc: 0.740164	valid1's online_auc: 0.736083
[1355]	train's auc: 0.793992	train's online_auc: 0.805547	valid1's auc: 0.740158	valid1's online_auc: 0.736086
[1356]	train's auc: 0.794014	train's online_auc: 0.805585	valid1's auc: 0.740155	valid1's online_auc: 0.736077
[1357]	train's auc: 0.794055	train's online_auc: 0.805639	valid1's auc: 0.740152	valid1's online_auc: 0.736072
[1358]	train's auc: 0.794088	train's online_auc: 0.805678	valid1's auc: 0.740155	valid1's online_auc: 0.736081
[1359]	train's auc: 0.794098	train's online_auc: 0.805683	valid1's auc: 0.74016	valid1's online_auc: 0.736085
[1360]	train's auc: 0.794122	train's online_auc: 0.805706	valid1's auc: 0.740157	valid1's online_auc: 0.736084
[1361]	train's auc: 0.794146	train's online_auc: 0.805758	valid1's auc: 0.74015	valid1's online_auc: 0.736042
[13

[1428]	train's auc: 0.795933	train's online_auc: 0.807912	valid1's auc: 0.740214	valid1's online_auc: 0.736178
[1429]	train's auc: 0.795952	train's online_auc: 0.807926	valid1's auc: 0.740212	valid1's online_auc: 0.736184
[1430]	train's auc: 0.79597	train's online_auc: 0.807943	valid1's auc: 0.740215	valid1's online_auc: 0.736179
[1431]	train's auc: 0.795992	train's online_auc: 0.807964	valid1's auc: 0.740209	valid1's online_auc: 0.736185
[1432]	train's auc: 0.796018	train's online_auc: 0.807991	valid1's auc: 0.740205	valid1's online_auc: 0.73619
[1433]	train's auc: 0.796039	train's online_auc: 0.808002	valid1's auc: 0.740199	valid1's online_auc: 0.736188
[1434]	train's auc: 0.796053	train's online_auc: 0.808011	valid1's auc: 0.740207	valid1's online_auc: 0.736185
[1435]	train's auc: 0.796072	train's online_auc: 0.808023	valid1's auc: 0.740209	valid1's online_auc: 0.73619
[1436]	train's auc: 0.796081	train's online_auc: 0.808038	valid1's auc: 0.740212	valid1's online_auc: 0.736192
[143

In [11]:
# # these will cause LightGBM bug I don't know how to fix. Just skip it
# log_file = 'v2.feature_importance.csv'
# log_path = os.path.join(log_folder, log_file)

# df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
# df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
# df_feature_importance.to_csv(log_path, index=False)
# df_feature_importance.head(30)

In [12]:
with pu.profiler("making prediction on validation set"):
    df_valid = df_train.iloc[valid_index]
    proba_valid = lgbm.predict(X_valid.astype(np.float32))

df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
df_score

[23:40:41] Finish making prediction on validation set. △M: +840.77MB. △T: 59.4 seconds.


Unnamed: 0,aid,auc
0,6,0.645416
1,7,0.820753
2,12,0.851591
3,18,0.601341
4,70,0.865545
5,74,0.676643
6,86,0.646861
7,98,0.758323
8,113,0.542051
9,117,0.673624


In [13]:
online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

Online AUC: 0.736204
Simple AUC: 0.740210


In [14]:
log_file = 'v3.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score = df_score.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

In [15]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    del X_train
    del X_valid
    gc.collect()

[23:41:49] Finish cleaning memory. △M: -5.07GB. △T: 4.0 seconds.


In [16]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()
    
with pu.profiler("trimming testing set"):    
    X_test = X_test[:, mask]
    gc.collect()

print("Test Data Shape (After masking): {}".format(X_test.shape))
# print("Test Column Numbers: {}".format(len(cols_test)))

[23:42:13] Finish loading testing data. △M: +814.42MB. △T: 18.3 seconds.
[23:42:19] Finish trimming testing set. △M: -685.46MB. △T: 5.8 seconds.
Test Data Shape (After masking): (2265989, 395570)


In [18]:
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[23:44:22] Finish making prediction on testing set. △M: -44.59MB. △T: 1.3 minutes.


In [19]:
subm_folder = '../../../subm/lgbm/0509_v3'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)