In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import json
import tqdm
import os
import gc
import sys
sys.path.append('../code/pipeline/')
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# load multiple data and stack them together
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")
union_loader = dp.DataUnion(bin_loader)

with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()
print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

[02:38:34] Finish loading training data. △M: +3.28GB. △T: 5.0 seconds.
Train Data Shape: (8798814, 419701)
Train Column Numbers: 419701


In [3]:
# n_splits = 3
# sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
# split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]
n_splits = 5  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits, random_state=2018)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    aids_train, aids_valid = aids[train_index], aids[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    
    del X_tv
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[02:38:53] Finish splitting train/valid set. △M: +154.42MB. △T: 14.9 seconds.
Training Set Size: (7039050, 419701)
Validation Set Size: (1759764, 419701)


In [4]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols_train)
    
    del X_train
    
    gc.collect()

[02:39:02] Finish preparing LightGBM data. △M: +2.58GB. △T: 8.9 seconds.


In [5]:
version_name = "rawBinary/v1"
log_folder = os.path.join(config.LOG_DIR, 'lgbm/{}'.format(version_name))
os.makedirs(log_folder, exist_ok=True)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 6,
    'num_leaves': 64,
    'learning_rate': 0.3,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'verbose': 0
}
num_rounds = 1000

log_file = 'params.json'
log_path = os.path.join(log_folder, log_file)
print("saving parameteres to {}".format(log_path))
with open(log_path, 'w') as f:
    json.dump(params, f, indent=4)

saving parameteres to /mnt/c/Users/cheng/Desktop/Competitons/TencentAlgo2018/log/lgbm/rawBinary/v1/params.json


In [6]:
log_file = 'log.txt'
log_path = os.path.join(log_folder, log_file)
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)
with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.592706	train's online_auc: 0.590354	valid's auc: 0.592897	valid's online_auc: 0.589816
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.64202	train's online_auc: 0.632065	valid's auc: 0.641023	valid's online_auc: 0.631593
[3]	train's auc: 0.648424	train's online_auc: 0.641378	valid's auc: 0.646868	valid's online_auc: 0.641788
[4]	train's auc: 0.656355	train's online_auc: 0.652782	valid's auc: 0.654943	valid's online_auc: 0.651528
[5]	train's auc: 0.657541	train's online_auc: 0.653472	valid's auc: 0.656073	valid's online_auc: 0.652396
[6]	train's auc: 0.658044	train's online_auc: 0.653308	valid's auc: 0.656664	valid's online_auc: 0.652734
[7]	train's auc: 0.661244	train's online_auc: 0.656072	valid's auc: 0.66008	valid's online_auc: 0.655517
[8]	train's auc: 0.667616	train's online_auc: 0.664945	valid's auc: 0.666186	valid's online_auc: 0.664912
[9]	train's auc: 0.672865	train's online_auc: 0.668927	valid's auc: 0.672161	valid's online_

[78]	train's auc: 0.724864	train's online_auc: 0.720086	valid's auc: 0.719	valid's online_auc: 0.712475
[79]	train's auc: 0.725053	train's online_auc: 0.720294	valid's auc: 0.719158	valid's online_auc: 0.712645
[80]	train's auc: 0.725202	train's online_auc: 0.720379	valid's auc: 0.719242	valid's online_auc: 0.712732
[81]	train's auc: 0.725362	train's online_auc: 0.720505	valid's auc: 0.719354	valid's online_auc: 0.712843
[82]	train's auc: 0.725485	train's online_auc: 0.720589	valid's auc: 0.71945	valid's online_auc: 0.712906
[83]	train's auc: 0.725579	train's online_auc: 0.720744	valid's auc: 0.719499	valid's online_auc: 0.712885
[84]	train's auc: 0.72573	train's online_auc: 0.720863	valid's auc: 0.71959	valid's online_auc: 0.71294
[85]	train's auc: 0.726114	train's online_auc: 0.721289	valid's auc: 0.719817	valid's online_auc: 0.71319
[86]	train's auc: 0.726286	train's online_auc: 0.721437	valid's auc: 0.71996	valid's online_auc: 0.713272
[87]	train's auc: 0.726512	train's online_auc:

[155]	train's auc: 0.737602	train's online_auc: 0.733377	valid's auc: 0.72613	valid's online_auc: 0.71984
[156]	train's auc: 0.737762	train's online_auc: 0.733654	valid's auc: 0.726197	valid's online_auc: 0.719868
[157]	train's auc: 0.73795	train's online_auc: 0.733816	valid's auc: 0.72633	valid's online_auc: 0.719991
[158]	train's auc: 0.73801	train's online_auc: 0.733907	valid's auc: 0.726374	valid's online_auc: 0.720006
[159]	train's auc: 0.738103	train's online_auc: 0.734025	valid's auc: 0.726394	valid's online_auc: 0.719996
[160]	train's auc: 0.738194	train's online_auc: 0.734132	valid's auc: 0.726391	valid's online_auc: 0.719996
[161]	train's auc: 0.738261	train's online_auc: 0.734183	valid's auc: 0.72638	valid's online_auc: 0.719997
[162]	train's auc: 0.738338	train's online_auc: 0.734248	valid's auc: 0.726374	valid's online_auc: 0.720004
[163]	train's auc: 0.738378	train's online_auc: 0.734307	valid's auc: 0.726377	valid's online_auc: 0.719994
[164]	train's auc: 0.738499	train'

[232]	train's auc: 0.745272	train's online_auc: 0.743287	valid's auc: 0.729119	valid's online_auc: 0.723302
[233]	train's auc: 0.745368	train's online_auc: 0.743584	valid's auc: 0.729147	valid's online_auc: 0.723443
[234]	train's auc: 0.745402	train's online_auc: 0.743634	valid's auc: 0.729166	valid's online_auc: 0.723476
[235]	train's auc: 0.745467	train's online_auc: 0.743734	valid's auc: 0.72916	valid's online_auc: 0.723446
[236]	train's auc: 0.745489	train's online_auc: 0.743747	valid's auc: 0.729159	valid's online_auc: 0.723491
[237]	train's auc: 0.745532	train's online_auc: 0.74379	valid's auc: 0.729164	valid's online_auc: 0.723485
[238]	train's auc: 0.745633	train's online_auc: 0.743932	valid's auc: 0.729255	valid's online_auc: 0.723611
[239]	train's auc: 0.745702	train's online_auc: 0.744009	valid's auc: 0.729257	valid's online_auc: 0.723645
[240]	train's auc: 0.745785	train's online_auc: 0.744141	valid's auc: 0.729257	valid's online_auc: 0.723623
[241]	train's auc: 0.745866	tr

[309]	train's auc: 0.750443	train's online_auc: 0.749228	valid's auc: 0.730599	valid's online_auc: 0.724748
[310]	train's auc: 0.750525	train's online_auc: 0.749314	valid's auc: 0.730597	valid's online_auc: 0.724742
[311]	train's auc: 0.750605	train's online_auc: 0.74936	valid's auc: 0.730591	valid's online_auc: 0.724726
[312]	train's auc: 0.750746	train's online_auc: 0.749526	valid's auc: 0.730687	valid's online_auc: 0.724841
[313]	train's auc: 0.750897	train's online_auc: 0.749688	valid's auc: 0.730721	valid's online_auc: 0.724823
[314]	train's auc: 0.750942	train's online_auc: 0.749747	valid's auc: 0.730733	valid's online_auc: 0.724862
[315]	train's auc: 0.750991	train's online_auc: 0.749781	valid's auc: 0.73074	valid's online_auc: 0.724865
[316]	train's auc: 0.75104	train's online_auc: 0.749857	valid's auc: 0.730723	valid's online_auc: 0.724824
[317]	train's auc: 0.751054	train's online_auc: 0.749867	valid's auc: 0.730718	valid's online_auc: 0.724817
[318]	train's auc: 0.751107	tra

[386]	train's auc: 0.754985	train's online_auc: 0.754312	valid's auc: 0.731547	valid's online_auc: 0.725655
[387]	train's auc: 0.755009	train's online_auc: 0.754349	valid's auc: 0.731541	valid's online_auc: 0.725631
[388]	train's auc: 0.755046	train's online_auc: 0.754408	valid's auc: 0.731577	valid's online_auc: 0.725697
[389]	train's auc: 0.755093	train's online_auc: 0.754456	valid's auc: 0.731569	valid's online_auc: 0.725695
[390]	train's auc: 0.755159	train's online_auc: 0.754492	valid's auc: 0.731587	valid's online_auc: 0.725681
[391]	train's auc: 0.755199	train's online_auc: 0.754599	valid's auc: 0.731571	valid's online_auc: 0.725659
[392]	train's auc: 0.755259	train's online_auc: 0.75465	valid's auc: 0.731571	valid's online_auc: 0.72565
[393]	train's auc: 0.755292	train's online_auc: 0.754685	valid's auc: 0.731575	valid's online_auc: 0.725646
[394]	train's auc: 0.755342	train's online_auc: 0.754751	valid's auc: 0.731589	valid's online_auc: 0.725662
[395]	train's auc: 0.755417	tr

In [7]:
log_file = 'feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)

In [9]:
df_valid = df_train.iloc[valid_index]
proba_valid = lgbm.predict(X_valid.astype(np.float32))
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)

online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

log_file = 'online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score = df_score.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

Online AUC: 0.725787
Simple AUC: 0.731557


In [10]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    # del X_train
    del X_valid
    gc.collect()

[03:40:42] Finish cleaning memory. △M: -5.66GB. △T: 1.0 seconds.


In [22]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test2")
    cols_train_set = set(cols_train)
    mask = [i for i, col in enumerate(cols_test) if col in cols_train_set]
    X_test = sparse.csr_matrix(X_test[:, mask])
    gc.collect()
print("Test Data Shape: {}".format(X_test.shape))
print("Test Column Numbers: {}".format(len(cols_test)))

df_test = du.load_raw_data("test2")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]
    
subm_folder = '../subm/lgbm/{}'.format(version_name)
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)

[04:38:37] Finish loading testing data. △M: -661.99MB. △T: 8.3 seconds.
Test Data Shape: (2265879, 419701)
Test Column Numbers: 419702
[04:38:59] Finish making prediction on testing set. △M: +0B. △T: 19.3 seconds.


In [21]:
len(mask)

419701