In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import json
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# load multiple data and stack them together
input_folder = os.path.join(config.DATA_DIR, "input_final")
dm = dp.DataManager(input_folder)

lc_v1 = dm.build_data('aggregate.listCount_v1')
lc_v2 = dm.build_data('aggregate.listCount_v2')
clk_v1 = dm.build_data('clickStats.crossWordCount_v1')
clk_v2 = dm.build_data('clickStats.crossWordCount_v2')
stack_cross_v1 = dm.build_data('stacking.lr.crossWordCount_v1')
stack_cross_v1_all = dm.build_data('stacking.lr.crossWordCount_v1_all')
stack_cross_v2 = dm.build_data('stacking.lr.crossWordCount_v2')
stack_cross_v3 = dm.build_data('stacking.lr.crossWordCount_v3')
stack_cross_v3_all = dm.build_data('stacking.lr.crossWordCount_v3_all')
stack_emb = dm.build_data('stacking.lr.wordEmbedding')
stack_tfidf = dm.build_data('stacking.lr.wordTfIdf')
cross = dm.build_data('cross.wordCount_v2')
raw = dm.build_data('raw.wordCount')

In [3]:
union_loader = dp.DataUnion(lc_v1, lc_v2, 
                            clk_v1, clk_v2,
                            stack_cross_v1, stack_cross_v1_all, 
                            stack_cross_v2, 
                            stack_cross_v3, stack_cross_v3_all, 
                            stack_emb, 
                            stack_tfidf, 
                            cross, 
                            raw)

In [4]:
with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()
print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

loading data: 100%|██████████| 13/13 [06:23<00:00, 29.52s/it]


[01:48:04] Finish loading training data. △M: +19.2GB. △T: 7.4 minutes.
Train Data Shape: (8798814, 499726)
Train Column Numbers: 499726


In [5]:
n_splits = 5
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=2018)  # for reproducibility
split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]
# n_splits = 5  # use 3 instead of 5 to save time
# skf = StratifiedKFold(n_splits=n_splits, random_state=2018)
# split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    aids_train, aids_valid = aids[train_index], aids[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    
    del X_tv
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[01:49:41] Finish splitting train/valid set. △M: -15.6GB. △T: 1.3 minutes.
Training Set Size: (7039051, 499726)
Validation Set Size: (1759763, 499726)


In [6]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
    del X_train  # to save memory
    gc.collect()
    
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)
    gc.collect()

[01:51:04] Finish preparing LightGBM data. △M: +12.8GB. △T: 53.3 seconds.


In [7]:
version_name = "final/v3"
log_folder = os.path.join(config.LOG_DIR, 'lgbm/{}'.format(version_name))
os.makedirs(log_folder, exist_ok=True)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 6,
    'num_leaves': 64,
    'learning_rate': 0.16,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 480

log_file = 'params.json'
log_path = os.path.join(log_folder, log_file)
with open(log_path, 'w') as f:
    json.dump(params, f, indent=4)

log_file = 'log.txt'
log_path = os.path.join(log_folder, log_file)
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)
with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.702302	train's online_auc: 0.700284	valid's auc: 0.699138	valid's online_auc: 0.698175
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.714534	train's online_auc: 0.712605	valid's auc: 0.711118	valid's online_auc: 0.710127
[3]	train's auc: 0.718374	train's online_auc: 0.71623	valid's auc: 0.714986	valid's online_auc: 0.714076
[4]	train's auc: 0.721249	train's online_auc: 0.718872	valid's auc: 0.717959	valid's online_auc: 0.71695
[5]	train's auc: 0.723473	train's online_auc: 0.720745	valid's auc: 0.720148	valid's online_auc: 0.719142
[6]	train's auc: 0.724244	train's online_auc: 0.721508	valid's auc: 0.721071	valid's online_auc: 0.720062
[7]	train's auc: 0.725026	train's online_auc: 0.722313	valid's auc: 0.721622	valid's online_auc: 0.72064
[8]	train's auc: 0.726011	train's online_auc: 0.722955	valid's auc: 0.722423	valid's online_auc: 0.720877
[9]	train's auc: 0.726713	train's online_auc: 0.723544	valid's auc: 0.723132	valid's online_a

[78]	train's auc: 0.747948	train's online_auc: 0.744378	valid's auc: 0.739146	valid's online_auc: 0.735232
[79]	train's auc: 0.748041	train's online_auc: 0.744495	valid's auc: 0.739182	valid's online_auc: 0.735232
[80]	train's auc: 0.748173	train's online_auc: 0.744644	valid's auc: 0.739249	valid's online_auc: 0.735296
[81]	train's auc: 0.748361	train's online_auc: 0.744839	valid's auc: 0.739374	valid's online_auc: 0.735349
[82]	train's auc: 0.74848	train's online_auc: 0.744965	valid's auc: 0.739403	valid's online_auc: 0.73535
[83]	train's auc: 0.748647	train's online_auc: 0.745262	valid's auc: 0.739439	valid's online_auc: 0.735356
[84]	train's auc: 0.748726	train's online_auc: 0.745341	valid's auc: 0.739491	valid's online_auc: 0.735379
[85]	train's auc: 0.748855	train's online_auc: 0.7455	valid's auc: 0.739547	valid's online_auc: 0.735419
[86]	train's auc: 0.748948	train's online_auc: 0.745539	valid's auc: 0.739577	valid's online_auc: 0.73541
[87]	train's auc: 0.749104	train's online_

[155]	train's auc: 0.755033	train's online_auc: 0.753415	valid's auc: 0.741601	valid's online_auc: 0.737379
[156]	train's auc: 0.755113	train's online_auc: 0.753519	valid's auc: 0.741617	valid's online_auc: 0.737372
[157]	train's auc: 0.755188	train's online_auc: 0.753716	valid's auc: 0.741607	valid's online_auc: 0.73732
[158]	train's auc: 0.755254	train's online_auc: 0.753769	valid's auc: 0.741613	valid's online_auc: 0.737327
[159]	train's auc: 0.755301	train's online_auc: 0.753815	valid's auc: 0.741629	valid's online_auc: 0.737343
[160]	train's auc: 0.755351	train's online_auc: 0.753853	valid's auc: 0.74162	valid's online_auc: 0.737335
[161]	train's auc: 0.7554	train's online_auc: 0.753988	valid's auc: 0.741618	valid's online_auc: 0.737333
[162]	train's auc: 0.755452	train's online_auc: 0.754043	valid's auc: 0.741613	valid's online_auc: 0.737337
[163]	train's auc: 0.755482	train's online_auc: 0.754089	valid's auc: 0.741626	valid's online_auc: 0.73734
[164]	train's auc: 0.755624	train

[232]	train's auc: 0.759465	train's online_auc: 0.759285	valid's auc: 0.742483	valid's online_auc: 0.738018
[233]	train's auc: 0.75951	train's online_auc: 0.759345	valid's auc: 0.742482	valid's online_auc: 0.738028
[234]	train's auc: 0.75956	train's online_auc: 0.759421	valid's auc: 0.742493	valid's online_auc: 0.738047
[235]	train's auc: 0.759611	train's online_auc: 0.759485	valid's auc: 0.742497	valid's online_auc: 0.738034
[236]	train's auc: 0.759661	train's online_auc: 0.759573	valid's auc: 0.742512	valid's online_auc: 0.738032
[237]	train's auc: 0.759723	train's online_auc: 0.759603	valid's auc: 0.742534	valid's online_auc: 0.738018
[238]	train's auc: 0.759796	train's online_auc: 0.759673	valid's auc: 0.742551	valid's online_auc: 0.73804
[239]	train's auc: 0.759911	train's online_auc: 0.759866	valid's auc: 0.742577	valid's online_auc: 0.738076
[240]	train's auc: 0.760063	train's online_auc: 0.760112	valid's auc: 0.742634	valid's online_auc: 0.738151
[241]	train's auc: 0.760131	tra

[309]	train's auc: 0.76333	train's online_auc: 0.764338	valid's auc: 0.742913	valid's online_auc: 0.738341
[310]	train's auc: 0.763375	train's online_auc: 0.764378	valid's auc: 0.742906	valid's online_auc: 0.738325
[311]	train's auc: 0.763422	train's online_auc: 0.76449	valid's auc: 0.742915	valid's online_auc: 0.738315
[312]	train's auc: 0.763445	train's online_auc: 0.764523	valid's auc: 0.74291	valid's online_auc: 0.738313
[313]	train's auc: 0.763551	train's online_auc: 0.764643	valid's auc: 0.742996	valid's online_auc: 0.738368
[314]	train's auc: 0.763601	train's online_auc: 0.764685	valid's auc: 0.743003	valid's online_auc: 0.738365
[315]	train's auc: 0.763665	train's online_auc: 0.764733	valid's auc: 0.743033	valid's online_auc: 0.73838
[316]	train's auc: 0.763735	train's online_auc: 0.764791	valid's auc: 0.743034	valid's online_auc: 0.738381
[317]	train's auc: 0.763779	train's online_auc: 0.764817	valid's auc: 0.743026	valid's online_auc: 0.738378
[318]	train's auc: 0.763805	trai

[386]	train's auc: 0.76634	train's online_auc: 0.768218	valid's auc: 0.743162	valid's online_auc: 0.738546
[387]	train's auc: 0.7664	train's online_auc: 0.768298	valid's auc: 0.74316	valid's online_auc: 0.738534
[388]	train's auc: 0.76643	train's online_auc: 0.768323	valid's auc: 0.743159	valid's online_auc: 0.738538
[389]	train's auc: 0.766459	train's online_auc: 0.768361	valid's auc: 0.74316	valid's online_auc: 0.738534
[390]	train's auc: 0.766553	train's online_auc: 0.768549	valid's auc: 0.743178	valid's online_auc: 0.738582
[391]	train's auc: 0.766602	train's online_auc: 0.768606	valid's auc: 0.743169	valid's online_auc: 0.738572
[392]	train's auc: 0.766643	train's online_auc: 0.768649	valid's auc: 0.743164	valid's online_auc: 0.738581
[393]	train's auc: 0.766669	train's online_auc: 0.768673	valid's auc: 0.743163	valid's online_auc: 0.738576
[394]	train's auc: 0.766689	train's online_auc: 0.768702	valid's auc: 0.743159	valid's online_auc: 0.73856
[395]	train's auc: 0.766732	train's

In [8]:
log_file = 'feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)

In [9]:
df_valid = df_train.iloc[valid_index]
proba_valid = lgbm.predict(X_valid.astype(np.float32))
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)

online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

log_file = 'online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score = df_score.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

Online AUC: 0.738582
Simple AUC: 0.743178


In [10]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    # del X_train
    del X_valid
    gc.collect()

[03:11:18] Finish cleaning memory. △M: -10.9GB. △T: 1.7 seconds.


In [11]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test2")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()
print("Test Data Shape: {}".format(X_test.shape))
print("Test Column Numbers: {}".format(len(cols_test)))

df_test = du.load_raw_data("test2")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]
    
subm_folder = '../../../subm/lgbm/{}'.format(version_name)
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)

loading data: 100%|██████████| 13/13 [01:04<00:00,  4.99s/it]


[03:12:29] Finish loading testing data. △M: +3.32GB. △T: 1.2 minutes.
Test Data Shape: (2265879, 469011)
Test Column Numbers: 469011
[03:13:05] Finish making prediction on testing set. △M: +0B. △T: 32.8 seconds.
