In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import json
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# load multiple data and stack them together
input_folder = os.path.join(config.DATA_DIR, "input_final")
dm = dp.DataManager(input_folder)

lc_v1 = dm.build_data('aggregate.listCount_v1')
lc_v2 = dm.build_data('aggregate.listCount_v2')
clk_v1 = dm.build_data('clickStats.crossWordCount_v1')
clk_v2 = dm.build_data('clickStats.crossWordCount_v2')
stack_cross_v1 = dm.build_data('stacking.lr.crossWordCount_v1')
stack_cross_v1_all = dm.build_data('stacking.lr.crossWordCount_v1_all')
stack_cross_v2 = dm.build_data('stacking.lr.crossWordCount_v2')
stack_cross_v3 = dm.build_data('stacking.lr.crossWordCount_v3')
stack_cross_v3_all = dm.build_data('stacking.lr.crossWordCount_v3_all')
stack_emb = dm.build_data('stacking.lr.wordEmbedding')
stack_tfidf = dm.build_data('stacking.lr.wordTfIdf')
raw = dm.build_data('raw.wordCount')

In [3]:
union_loader = dp.DataUnion(lc_v1, lc_v2, 
                            clk_v1, clk_v2,
                            stack_cross_v1, stack_cross_v1_all, 
                            stack_cross_v2, 
                            stack_cross_v3, stack_cross_v3_all, 
                            stack_emb, 
                            stack_tfidf, 
                            raw)

In [4]:
with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()
print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

loading data: 100%|██████████| 12/12 [05:27<00:00, 27.26s/it]


[20:47:20] Finish loading training data. △M: +18.22GB. △T: 6.2 minutes.
Train Data Shape: (8798814, 419818)
Train Column Numbers: 419818


In [5]:
n_splits = 5
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=2018)  # for reproducibility
split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]
# n_splits = 5  # use 3 instead of 5 to save time
# skf = StratifiedKFold(n_splits=n_splits, random_state=2018)
# split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    aids_train, aids_valid = aids[train_index], aids[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    
    del X_tv
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[20:49:04] Finish splitting train/valid set. △M: -8.1GB. △T: 1.2 minutes.
Training Set Size: (7039051, 419818)
Validation Set Size: (1759763, 419818)


In [6]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
    del X_train  # to save memory
    gc.collect()
    
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)
    gc.collect()

[20:49:55] Finish preparing LightGBM data. △M: +5.43GB. △T: 36.3 seconds.


In [None]:
version_name = "final/v1"
log_folder = os.path.join(config.LOG_DIR, 'lgbm/{}'.format(version_name))
os.makedirs(log_folder, exist_ok=True)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 6,
    'num_leaves': 64,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 1000

log_file = 'params.json'
log_path = os.path.join(log_folder, log_file)
with open(log_path, 'w') as f:
    json.dump(params, f, indent=4)

log_file = 'log.txt'
log_path = os.path.join(log_folder, log_file)
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)
with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.702097	train's online_auc: 0.700603	valid's auc: 0.698871	valid's online_auc: 0.698403
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.714182	train's online_auc: 0.712785	valid's auc: 0.710257	valid's online_auc: 0.70988
[3]	train's auc: 0.71869	train's online_auc: 0.716769	valid's auc: 0.715146	valid's online_auc: 0.714374
[4]	train's auc: 0.720769	train's online_auc: 0.718085	valid's auc: 0.717104	valid's online_auc: 0.715745
[5]	train's auc: 0.723049	train's online_auc: 0.720815	valid's auc: 0.719412	valid's online_auc: 0.718545
[6]	train's auc: 0.723949	train's online_auc: 0.721254	valid's auc: 0.720303	valid's online_auc: 0.719151
[7]	train's auc: 0.724359	train's online_auc: 0.721583	valid's auc: 0.720808	valid's online_auc: 0.71951
[8]	train's auc: 0.724373	train's online_auc: 0.72132	valid's auc: 0.720806	valid's online_auc: 0.719389
[9]	train's auc: 0.72488	train's online_auc: 0.72169	valid's auc: 0.721192	valid's online_auc:

[78]	train's auc: 0.743665	train's online_auc: 0.739862	valid's auc: 0.736631	valid's online_auc: 0.733173
[79]	train's auc: 0.743845	train's online_auc: 0.740106	valid's auc: 0.736766	valid's online_auc: 0.733347
[80]	train's auc: 0.74395	train's online_auc: 0.740228	valid's auc: 0.736827	valid's online_auc: 0.733417
[81]	train's auc: 0.744092	train's online_auc: 0.740414	valid's auc: 0.736876	valid's online_auc: 0.733467
[82]	train's auc: 0.744215	train's online_auc: 0.740544	valid's auc: 0.736942	valid's online_auc: 0.733462
[83]	train's auc: 0.744308	train's online_auc: 0.740611	valid's auc: 0.736997	valid's online_auc: 0.733475
[84]	train's auc: 0.74446	train's online_auc: 0.74082	valid's auc: 0.737097	valid's online_auc: 0.733614
[85]	train's auc: 0.744545	train's online_auc: 0.740964	valid's auc: 0.737137	valid's online_auc: 0.733656
[86]	train's auc: 0.744634	train's online_auc: 0.741052	valid's auc: 0.737192	valid's online_auc: 0.733696
[87]	train's auc: 0.744754	train's onlin

[155]	train's auc: 0.751026	train's online_auc: 0.748478	valid's auc: 0.740641	valid's online_auc: 0.736828
[156]	train's auc: 0.751098	train's online_auc: 0.74856	valid's auc: 0.740685	valid's online_auc: 0.736861
[157]	train's auc: 0.751164	train's online_auc: 0.748597	valid's auc: 0.740712	valid's online_auc: 0.73689
[158]	train's auc: 0.75122	train's online_auc: 0.748644	valid's auc: 0.740754	valid's online_auc: 0.73691
[159]	train's auc: 0.751262	train's online_auc: 0.748687	valid's auc: 0.740765	valid's online_auc: 0.736933
[160]	train's auc: 0.7513	train's online_auc: 0.748728	valid's auc: 0.740795	valid's online_auc: 0.736942
[161]	train's auc: 0.751361	train's online_auc: 0.748791	valid's auc: 0.740806	valid's online_auc: 0.736943
[162]	train's auc: 0.751442	train's online_auc: 0.748873	valid's auc: 0.740854	valid's online_auc: 0.736957
[163]	train's auc: 0.751503	train's online_auc: 0.748938	valid's auc: 0.740859	valid's online_auc: 0.736966
[164]	train's auc: 0.751551	train'

[232]	train's auc: 0.756031	train's online_auc: 0.755037	valid's auc: 0.742433	valid's online_auc: 0.738689
[233]	train's auc: 0.756075	train's online_auc: 0.755071	valid's auc: 0.742443	valid's online_auc: 0.738692
[234]	train's auc: 0.756128	train's online_auc: 0.755146	valid's auc: 0.742454	valid's online_auc: 0.738713
[235]	train's auc: 0.756163	train's online_auc: 0.755209	valid's auc: 0.742459	valid's online_auc: 0.738718
[236]	train's auc: 0.756199	train's online_auc: 0.755276	valid's auc: 0.742475	valid's online_auc: 0.738752
[237]	train's auc: 0.756277	train's online_auc: 0.75538	valid's auc: 0.742497	valid's online_auc: 0.73877
[238]	train's auc: 0.75632	train's online_auc: 0.755412	valid's auc: 0.742498	valid's online_auc: 0.738767
[239]	train's auc: 0.756362	train's online_auc: 0.755482	valid's auc: 0.742504	valid's online_auc: 0.738773
[240]	train's auc: 0.756407	train's online_auc: 0.755537	valid's auc: 0.742524	valid's online_auc: 0.738789
[241]	train's auc: 0.75645	trai

[309]	train's auc: 0.75924	train's online_auc: 0.759391	valid's auc: 0.743102	valid's online_auc: 0.739241
[310]	train's auc: 0.759326	train's online_auc: 0.759511	valid's auc: 0.743115	valid's online_auc: 0.739247
[311]	train's auc: 0.759347	train's online_auc: 0.759533	valid's auc: 0.743123	valid's online_auc: 0.739251
[312]	train's auc: 0.75936	train's online_auc: 0.759548	valid's auc: 0.743121	valid's online_auc: 0.73925
[313]	train's auc: 0.759415	train's online_auc: 0.759619	valid's auc: 0.743125	valid's online_auc: 0.739248
[314]	train's auc: 0.759452	train's online_auc: 0.759689	valid's auc: 0.743133	valid's online_auc: 0.739257
[315]	train's auc: 0.759526	train's online_auc: 0.759745	valid's auc: 0.743165	valid's online_auc: 0.739265
[316]	train's auc: 0.75959	train's online_auc: 0.760059	valid's auc: 0.743182	valid's online_auc: 0.739365
[317]	train's auc: 0.759629	train's online_auc: 0.760118	valid's auc: 0.74318	valid's online_auc: 0.739362
[318]	train's auc: 0.759695	train

[386]	train's auc: 0.762252	train's online_auc: 0.763349	valid's auc: 0.743562	valid's online_auc: 0.739742
[387]	train's auc: 0.76226	train's online_auc: 0.763361	valid's auc: 0.743565	valid's online_auc: 0.739751
[388]	train's auc: 0.762278	train's online_auc: 0.763391	valid's auc: 0.74357	valid's online_auc: 0.73975
[389]	train's auc: 0.762306	train's online_auc: 0.76343	valid's auc: 0.743571	valid's online_auc: 0.739738
[390]	train's auc: 0.762337	train's online_auc: 0.763464	valid's auc: 0.743582	valid's online_auc: 0.739773
[391]	train's auc: 0.762398	train's online_auc: 0.763518	valid's auc: 0.743589	valid's online_auc: 0.739775
[392]	train's auc: 0.76247	train's online_auc: 0.763662	valid's auc: 0.743626	valid's online_auc: 0.739834
[393]	train's auc: 0.762519	train's online_auc: 0.763731	valid's auc: 0.743634	valid's online_auc: 0.739845
[394]	train's auc: 0.762563	train's online_auc: 0.763763	valid's auc: 0.743645	valid's online_auc: 0.739854
[395]	train's auc: 0.762618	train

[463]	train's auc: 0.764626	train's online_auc: 0.766734	valid's auc: 0.743922	valid's online_auc: 0.740043
[464]	train's auc: 0.764649	train's online_auc: 0.766768	valid's auc: 0.743921	valid's online_auc: 0.740041
[465]	train's auc: 0.764671	train's online_auc: 0.766803	valid's auc: 0.743927	valid's online_auc: 0.740045
[466]	train's auc: 0.764698	train's online_auc: 0.766834	valid's auc: 0.743929	valid's online_auc: 0.740052
[467]	train's auc: 0.764727	train's online_auc: 0.766885	valid's auc: 0.743932	valid's online_auc: 0.74006
[468]	train's auc: 0.764744	train's online_auc: 0.766908	valid's auc: 0.743929	valid's online_auc: 0.740049
[469]	train's auc: 0.764813	train's online_auc: 0.767062	valid's auc: 0.743925	valid's online_auc: 0.740056
[470]	train's auc: 0.764823	train's online_auc: 0.767076	valid's auc: 0.743926	valid's online_auc: 0.740062
[471]	train's auc: 0.764837	train's online_auc: 0.767088	valid's auc: 0.743927	valid's online_auc: 0.74006
[472]	train's auc: 0.764858	tr

[540]	train's auc: 0.766778	train's online_auc: 0.76944	valid's auc: 0.744103	valid's online_auc: 0.740164
[541]	train's auc: 0.766799	train's online_auc: 0.769457	valid's auc: 0.744103	valid's online_auc: 0.740164
[542]	train's auc: 0.766813	train's online_auc: 0.769477	valid's auc: 0.744099	valid's online_auc: 0.740159
[543]	train's auc: 0.766866	train's online_auc: 0.769543	valid's auc: 0.744094	valid's online_auc: 0.74015
[544]	train's auc: 0.766926	train's online_auc: 0.769653	valid's auc: 0.744085	valid's online_auc: 0.740135
[545]	train's auc: 0.766959	train's online_auc: 0.769689	valid's auc: 0.744094	valid's online_auc: 0.740162
[546]	train's auc: 0.76698	train's online_auc: 0.769712	valid's auc: 0.744091	valid's online_auc: 0.740156
[547]	train's auc: 0.766989	train's online_auc: 0.769721	valid's auc: 0.744093	valid's online_auc: 0.740151
[548]	train's auc: 0.767026	train's online_auc: 0.769842	valid's auc: 0.74409	valid's online_auc: 0.740141
[549]	train's auc: 0.767037	trai

In [7]:
log_file = 'feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)

In [9]:
df_valid = df_train.iloc[valid_index]
proba_valid = lgbm.predict(X_valid.astype(np.float32))
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)

online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

log_file = 'online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score = df_score.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

Online AUC: 0.725787
Simple AUC: 0.731557


In [10]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    # del X_train
    del X_valid
    gc.collect()

[03:40:42] Finish cleaning memory. △M: -5.66GB. △T: 1.0 seconds.


In [22]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test2")
    cols_train_set = set(cols_train)
    mask = [i for i, col in enumerate(cols_test) if col in cols_train_set]
    X_test = sparse.csr_matrix(X_test[:, mask])
    gc.collect()
print("Test Data Shape: {}".format(X_test.shape))
print("Test Column Numbers: {}".format(len(cols_test)))

df_test = du.load_raw_data("test2")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]
    
subm_folder = '../../../subm/lgbm/{}'.format(version_name)
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)

[04:38:37] Finish loading testing data. △M: -661.99MB. △T: 8.3 seconds.
Test Data Shape: (2265879, 419701)
Test Column Numbers: 419702
[04:38:59] Finish making prediction on testing set. △M: +0B. △T: 19.3 seconds.


In [21]:
len(mask)

419701