In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
from itertools import compress
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# ----------------------------------------------------------------------
# Data preparation: raw binary features unioned with the cross-product
# features of each (ad feature, user feature) pair listed below.
# ----------------------------------------------------------------------
pairs = [
    ("aid", "age"),
    ("aid", "education"),
    ("aid", "consumptionAbility"),
    ("aid", "LBS"),
]

# One loader for the raw binary features ...
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")
# ... and one lazily-built loader per cross pair; the generator is consumed
# when it is unpacked into DataUnion on the next line.
cross_bin_loaders = (
    dp.CrossBinaryDataManager.build_data(*pair) for pair in pairs
)
union_loader = dp.DataUnion(bin_loader, *cross_bin_loaders)

with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    # Convert to CSR; later cells slice this matrix by row and by column.
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()

print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

[07:40:37] Finish loading training data. △M: +3.45GB. △T: 2.0 minutes.
Train Data Shape: (8798814, 570730)
Train Column Numbers: 570730


In [3]:
df_train = du.load_raw_data("train")
# Map labels from {-1, 1} to {0, 1}. The arithmetic builds a new array, so the
# 'label' column of df_train is left untouched; true division yields floats.
y = (df_train['label'].values + 1) / 2

In [4]:
# Stratified k-fold split of the training rows. No shuffle or seed is passed,
# so StratifiedKFold runs with its default shuffle=False.
# An earlier variant (kept for reference) used
#   StratifiedShuffleSplit(n_splits=3, test_size=1 / 3, random_state=20180505)
n_splits = 5  # currently 5 folds; lower it (e.g. to 3) to save time
skf = StratifiedKFold(n_splits=n_splits)
# Materialise every (train_index, valid_index) pair up front.
split_indices = list(skf.split(df_train, y))

In [5]:
aids = df_train['aid'].values

# Use the first fold as the validation set and the rest as the training set.
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train = X_tv[train_index, :]
    X_valid = X_tv[valid_index, :]
    y_train = y[train_index]
    y_valid = y[valid_index]
    aids_train = aids[train_index]
    aids_valid = aids[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]

    # Both slices exist now, so the full matrix can be released.
    del X_tv
    gc.collect()

print("Training Set Size (Before masking): {}".format(X_train.shape))
print("Validation Set Size (Before masking): {}".format(X_valid.shape))

min_df = 3  # minimum number of training rows a column must appear in (tunable)
with pu.profiler("trimming train/valid set"):
    # Keep a column only when it has at least `min_df` stored entries in the
    # training rows; the same mask is then applied to the validation rows.
    mask = X_train.getnnz(axis=0) >= min_df
    cols_masked = [col for col, keep in zip(cols_train, mask) if keep]
    X_train = X_train[:, mask]
    X_valid = X_valid[:, mask]
    gc.collect()

print("Training Set Size (After masking): {}".format(X_train.shape))
print("Validation Set Size (After masking): {}".format(X_valid.shape))

[07:41:11] Finish splitting train/valid set. △M: +144.35MB. △T: 28.2 seconds.
Training Set Size (Before masking): (7039050, 570730)
Validation Set Size (Before masking): (1759764, 570730)
[07:42:10] Finish trimming train/valid set. △M: +24.55MB. △T: 59.2 seconds.
Training Set Size (After masking): (7039050, 395570)
Validation Set Size (After masking): (1759764, 395570)


In [6]:
# Wrap the trimmed train/valid matrices (cast to float32) and their labels as
# LightGBM datasets.
with pu.profiler("preparing LightGBM data"):
    # Feature names are not passed: the earlier attempt below failed.
    # NOTE(review): the matrices were trimmed with `mask` in the previous cell, so
    # they have fewer columns than cols_train has entries; cols_masked looks like
    # the matching name list — confirm before re-enabling feature_name.
    # lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols_train)  # cause bugs I don't know how to fix
    # lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols_train)
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)
    gc.collect()

[07:42:16] Finish preparing LightGBM data. △M: +5.4GB. △T: 6.4 seconds.


In [7]:
# Training log destination for this run (v4); the folder is created if missing.
log_file = 'v4.log'
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0509/')
os.makedirs(log_folder, exist_ok=True)
log_path = os.path.join(log_folder, log_file)

In [8]:
# Earlier parameter sets, kept for reference. Both shared boosting_type='gbdt',
# objective='binary', metric='auc' and verbose=0 with v4:
#   v2: max_depth=15, num_leaves=120, learning_rate=0.15,
#       feature_fraction=0.9, bagging_fraction=0.8
#   v3: max_depth=8,  num_leaves=120, learning_rate=0.1,
#       feature_fraction=0.7, bagging_fraction=0.7
#
# v4 parameters: deeper and wider trees, no feature/bagging subsampling, and a
# small min_data_in_leaf.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 20,
    'num_leaves': 1024,
    'learning_rate': 0.1,
    # 'feature_fraction': 0.7,
    # 'bagging_fraction': 0.7,
    'min_data_in_leaf': 5,
    'verbose': 0
}

# Custom evaluation function built from the per-row aid arrays of both splits;
# it is reported as "online_auc" next to the built-in "auc" in the training log.
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)
num_rounds = 2000  # upper bound; early stopping normally ends training sooner

# NOTE(review): DuplicatedLogger presumably mirrors the printed training log into
# log_path — confirm against io_utils.
# NOTE(review): `early_stopping_rounds` is the keyword form of early stopping;
# newer LightGBM releases expect it via callbacks — confirm the installed version.
with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=num_rounds,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid1'],
        feval=eval_auc,
        early_stopping_rounds=50,
    )

[1]	train's auc: 0.68361	train's online_auc: 0.673688	valid1's auc: 0.682014	valid1's online_auc: 0.671819
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.68678	train's online_auc: 0.677697	valid1's auc: 0.685117	valid1's online_auc: 0.675344
[3]	train's auc: 0.693728	train's online_auc: 0.685166	valid1's auc: 0.692152	valid1's online_auc: 0.682836
[4]	train's auc: 0.697132	train's online_auc: 0.689876	valid1's auc: 0.695294	valid1's online_auc: 0.687504
[5]	train's auc: 0.698501	train's online_auc: 0.691599	valid1's auc: 0.696221	valid1's online_auc: 0.688481
[6]	train's auc: 0.700368	train's online_auc: 0.693725	valid1's auc: 0.697779	valid1's online_auc: 0.689851
[7]	train's auc: 0.700915	train's online_auc: 0.694549	valid1's auc: 0.69793	valid1's online_auc: 0.690064
[8]	train's auc: 0.702942	train's online_auc: 0.696712	valid1's auc: 0.699587	valid1's online_auc: 0.691909
[9]	train's auc: 0.704021	train's online_auc: 0.697767	valid1's auc: 0.700453

[76]	train's auc: 0.807918	train's online_auc: 0.808372	valid1's auc: 0.728262	valid1's online_auc: 0.724031
[77]	train's auc: 0.808569	train's online_auc: 0.809192	valid1's auc: 0.728297	valid1's online_auc: 0.724149
[78]	train's auc: 0.809063	train's online_auc: 0.809582	valid1's auc: 0.728391	valid1's online_auc: 0.724214
[79]	train's auc: 0.809415	train's online_auc: 0.810039	valid1's auc: 0.728471	valid1's online_auc: 0.724328
[80]	train's auc: 0.80999	train's online_auc: 0.810642	valid1's auc: 0.728513	valid1's online_auc: 0.724281
[81]	train's auc: 0.810842	train's online_auc: 0.811547	valid1's auc: 0.728634	valid1's online_auc: 0.724333
[82]	train's auc: 0.811189	train's online_auc: 0.811915	valid1's auc: 0.728728	valid1's online_auc: 0.724409
[83]	train's auc: 0.812023	train's online_auc: 0.812737	valid1's auc: 0.728797	valid1's online_auc: 0.724474
[84]	train's auc: 0.812425	train's online_auc: 0.813263	valid1's auc: 0.728839	valid1's online_auc: 0.724454
[85]	train's auc: 0.

[151]	train's auc: 0.84051	train's online_auc: 0.845185	valid1's auc: 0.732629	valid1's online_auc: 0.728071
[152]	train's auc: 0.840878	train's online_auc: 0.845546	valid1's auc: 0.732644	valid1's online_auc: 0.728094
[153]	train's auc: 0.841114	train's online_auc: 0.845806	valid1's auc: 0.732656	valid1's online_auc: 0.728104
[154]	train's auc: 0.841395	train's online_auc: 0.846153	valid1's auc: 0.732667	valid1's online_auc: 0.728093
[155]	train's auc: 0.841651	train's online_auc: 0.846402	valid1's auc: 0.732682	valid1's online_auc: 0.728088
[156]	train's auc: 0.841936	train's online_auc: 0.846735	valid1's auc: 0.732741	valid1's online_auc: 0.72815
[157]	train's auc: 0.842329	train's online_auc: 0.847078	valid1's auc: 0.73276	valid1's online_auc: 0.728208
[158]	train's auc: 0.842559	train's online_auc: 0.847289	valid1's auc: 0.732776	valid1's online_auc: 0.728217
[159]	train's auc: 0.842727	train's online_auc: 0.847472	valid1's auc: 0.732801	valid1's online_auc: 0.728231
[160]	train's

[226]	train's auc: 0.859363	train's online_auc: 0.86611	valid1's auc: 0.733956	valid1's online_auc: 0.729228
[227]	train's auc: 0.859505	train's online_auc: 0.866295	valid1's auc: 0.733958	valid1's online_auc: 0.729243
[228]	train's auc: 0.859569	train's online_auc: 0.866395	valid1's auc: 0.733995	valid1's online_auc: 0.729294
[229]	train's auc: 0.8597	train's online_auc: 0.866505	valid1's auc: 0.734011	valid1's online_auc: 0.729312
[230]	train's auc: 0.859878	train's online_auc: 0.867047	valid1's auc: 0.734006	valid1's online_auc: 0.729252
[231]	train's auc: 0.859998	train's online_auc: 0.867245	valid1's auc: 0.734028	valid1's online_auc: 0.72926
[232]	train's auc: 0.860225	train's online_auc: 0.867455	valid1's auc: 0.734053	valid1's online_auc: 0.729339
[233]	train's auc: 0.860482	train's online_auc: 0.867866	valid1's auc: 0.734065	valid1's online_auc: 0.729303
[234]	train's auc: 0.86073	train's online_auc: 0.868288	valid1's auc: 0.734101	valid1's online_auc: 0.729378
[235]	train's a

[301]	train's auc: 0.873614	train's online_auc: 0.882869	valid1's auc: 0.734988	valid1's online_auc: 0.730328
[302]	train's auc: 0.873724	train's online_auc: 0.882966	valid1's auc: 0.734994	valid1's online_auc: 0.730337
[303]	train's auc: 0.873876	train's online_auc: 0.883093	valid1's auc: 0.73501	valid1's online_auc: 0.73034
[304]	train's auc: 0.87401	train's online_auc: 0.883234	valid1's auc: 0.735009	valid1's online_auc: 0.730359
[305]	train's auc: 0.874202	train's online_auc: 0.883397	valid1's auc: 0.735031	valid1's online_auc: 0.730373
[306]	train's auc: 0.874262	train's online_auc: 0.88344	valid1's auc: 0.735025	valid1's online_auc: 0.730365
[307]	train's auc: 0.874382	train's online_auc: 0.883606	valid1's auc: 0.735029	valid1's online_auc: 0.730366
[308]	train's auc: 0.874609	train's online_auc: 0.883876	valid1's auc: 0.735053	valid1's online_auc: 0.730409
[309]	train's auc: 0.874717	train's online_auc: 0.883984	valid1's auc: 0.735049	valid1's online_auc: 0.730417
[310]	train's 

[376]	train's auc: 0.88508	train's online_auc: 0.895954	valid1's auc: 0.735555	valid1's online_auc: 0.730866
[377]	train's auc: 0.885196	train's online_auc: 0.896081	valid1's auc: 0.735554	valid1's online_auc: 0.73086
[378]	train's auc: 0.885315	train's online_auc: 0.896186	valid1's auc: 0.73556	valid1's online_auc: 0.73087
[379]	train's auc: 0.885559	train's online_auc: 0.896464	valid1's auc: 0.735573	valid1's online_auc: 0.730866
[380]	train's auc: 0.885652	train's online_auc: 0.896542	valid1's auc: 0.735578	valid1's online_auc: 0.730913
[381]	train's auc: 0.885864	train's online_auc: 0.896784	valid1's auc: 0.735604	valid1's online_auc: 0.730915
[382]	train's auc: 0.886002	train's online_auc: 0.896911	valid1's auc: 0.735595	valid1's online_auc: 0.730902
[383]	train's auc: 0.886172	train's online_auc: 0.897045	valid1's auc: 0.735596	valid1's online_auc: 0.730922
[384]	train's auc: 0.8862	train's online_auc: 0.897104	valid1's auc: 0.735597	valid1's online_auc: 0.730921
[385]	train's au

[451]	train's auc: 0.894433	train's online_auc: 0.906574	valid1's auc: 0.735952	valid1's online_auc: 0.731681
[452]	train's auc: 0.894553	train's online_auc: 0.906767	valid1's auc: 0.735989	valid1's online_auc: 0.731773
[453]	train's auc: 0.894678	train's online_auc: 0.906947	valid1's auc: 0.735994	valid1's online_auc: 0.7318
[454]	train's auc: 0.894838	train's online_auc: 0.907066	valid1's auc: 0.736005	valid1's online_auc: 0.73178
[455]	train's auc: 0.895066	train's online_auc: 0.907289	valid1's auc: 0.736032	valid1's online_auc: 0.731795
[456]	train's auc: 0.895214	train's online_auc: 0.907377	valid1's auc: 0.73603	valid1's online_auc: 0.731784
[457]	train's auc: 0.895308	train's online_auc: 0.907435	valid1's auc: 0.736018	valid1's online_auc: 0.73179
[458]	train's auc: 0.895449	train's online_auc: 0.907521	valid1's auc: 0.736022	valid1's online_auc: 0.731803
[459]	train's auc: 0.895593	train's online_auc: 0.907611	valid1's auc: 0.736019	valid1's online_auc: 0.731787
[460]	train's a

[526]	train's auc: 0.901374	train's online_auc: 0.914339	valid1's auc: 0.736132	valid1's online_auc: 0.732163
[527]	train's auc: 0.901421	train's online_auc: 0.914383	valid1's auc: 0.736122	valid1's online_auc: 0.732156
[528]	train's auc: 0.901504	train's online_auc: 0.914492	valid1's auc: 0.736119	valid1's online_auc: 0.732192
[529]	train's auc: 0.90154	train's online_auc: 0.914516	valid1's auc: 0.736117	valid1's online_auc: 0.73221
[530]	train's auc: 0.901652	train's online_auc: 0.9146	valid1's auc: 0.736114	valid1's online_auc: 0.732197
[531]	train's auc: 0.901716	train's online_auc: 0.914668	valid1's auc: 0.736115	valid1's online_auc: 0.732202
[532]	train's auc: 0.901814	train's online_auc: 0.914749	valid1's auc: 0.736103	valid1's online_auc: 0.732197
[533]	train's auc: 0.901945	train's online_auc: 0.914892	valid1's auc: 0.736098	valid1's online_auc: 0.732196
[534]	train's auc: 0.90207	train's online_auc: 0.914977	valid1's auc: 0.736096	valid1's online_auc: 0.732177
[535]	train's a

In [10]:
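# # Skipped: the code below raised an error that was never resolved.
# # NOTE(review): cols_train still lists every pre-trim column, while the model was
# # trained on the trimmed matrix, so "feature" and "importance" would differ in
# # length; cols_masked is the likely replacement — confirm before re-enabling.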
# log_file = 'v2.feature_importance.csv'
# log_path = os.path.join(log_folder, log_file)

# df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
# df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
# df_feature_importance.to_csv(log_path, index=False)
# df_feature_importance.head(30)

In [11]:
# Score the held-out fold and build the per-aid AUC table.
with pu.profiler("making prediction on validation set"):
    # Rows of the raw training frame that belong to the validation fold
    # (valid_index holds positional indices, hence .iloc).
    df_valid = df_train.iloc[valid_index]
    # Same float32 cast that was applied when the LightGBM datasets were built.
    # NOTE(review): no num_iteration is passed; whether predict() defaults to the
    # early-stopping best iteration depends on the LightGBM version — confirm.
    proba_valid = lgbm.predict(X_valid.astype(np.float32))

# NOTE(review): ret_verbose=True presumably returns the per-aid breakdown rather
# than a single score — confirm against eval_utils.
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
df_score

[23:43:22] Finish making prediction on validation set. △M: +775.95MB. △T: 1.0 minutes.


Unnamed: 0,aid,auc
0,6,0.651663
1,7,0.814581
2,12,0.853471
3,18,0.614885
4,70,0.862495
5,74,0.679849
6,86,0.639863
7,98,0.772705
8,113,0.527175
9,117,0.664712


In [12]:
# Two summaries of validation quality:
#   simple AUC - one ROC AUC over all validation rows
#   online AUC - unweighted mean of the per-aid AUC values in df_score
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
online_auc = df_score['auc'].mean()
print(
    "Online AUC: {:.6f}".format(online_auc),
    "Simple AUC: {:.6f}".format(simple_auc),
    sep="\n",
)

Online AUC: 0.732146
Simple AUC: 0.736190


In [13]:
# Persist the per-aid validation AUC table, best-scoring aid first.
log_file = 'v4.online_auc.csv'
log_path = os.path.join(log_folder, log_file)

# Build the output frame without mutating the object an earlier cell displayed
# (the original used rename(..., inplace=True)).
df_score = (
    df_score
    .rename(columns={'selector': 'aid'})  # no-op when the column is already 'aid'
    .loc[:, ['aid', 'auc']]               # fix the column order
    .sort_values("auc", ascending=False)
)
df_score.to_csv(log_path, index=False)

In [14]:
# Drop the training-time matrices and LightGBM datasets before the test set is
# loaded; the trained booster `lgbm` and the column `mask` are kept.
with pu.profiler("cleaning memory"):
    del lgb_train, lgb_valid, X_train, X_valid
    gc.collect()

[23:44:33] Finish cleaning memory. △M: -21.45GB. △T: 4.9 seconds.


In [15]:
# Load the test features through the same loader union used for training.
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()

# `mask` was computed on the training matrix and selects columns by position,
# so it is only meaningful if the test matrix has the same columns in the same
# order. Check this explicitly; otherwise the model would be fed misaligned
# features without any error.
assert X_test.shape[1] == len(mask), (
    "Test matrix has {} columns but the training mask covers {}".format(
        X_test.shape[1], len(mask)))
assert list(cols_test) == list(cols_train), (
    "Test columns do not match the training columns; "
    "the training mask cannot be applied positionally")

with pu.profiler("trimming testing set"):
    # Keep exactly the columns that survived the min_df trim on the training set.
    X_test = X_test[:, mask]
    gc.collect()

print("Test Data Shape (After masking): {}".format(X_test.shape))

[23:44:50] Finish loading testing data. △M: +68.27MB. △T: 12.9 seconds.
[23:44:55] Finish trimming testing set. △M: +26.07MB. △T: 5.6 seconds.
Test Data Shape (After masking): (2265989, 395570)


In [16]:
# NOTE(review): the raw rows are loaded as "test" while the feature matrix was
# loaded as "test1"; the row-count assertion below guards the pairing — confirm
# both names refer to the same split.
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)  # same dtype the model was trained on

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    # Expect a flat vector with exactly one score per test row.
    assert proba_test.ndim == 1
    assert len(proba_test) == len(df_test)

[23:46:35] Finish making prediction on testing set. △M: -42.82MB. △T: 1.1 minutes.


In [17]:
# Write the submission: every column of df_test plus the predicted "score".
subm_file = 'submission.csv'
subm_folder = '../../../subm/lgbm/0509_v4'
os.makedirs(subm_folder, exist_ok=True)
subm_path = os.path.join(subm_folder, subm_file)

# assign() returns a new frame, so df_test itself is not modified.
subm = df_test.assign(score=proba_test)
subm.to_csv(subm_path, index=False)