In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../code/pipeline/')
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# load multiple data and stack them together
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")
single_ckr_loader = dm.build_data("raw", "clickStats_v1")
union_loader = dp.DataUnion(bin_loader, single_ckr_loader)

with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()

print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

[12:10:15] Finish loading training data. △M: +5.58GB. △T: 53.4 seconds.
Train Data Shape: (8798814, 419707)
Train Column Numbers: 419707


In [3]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [4]:
n_splits = 3
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

# n_splits = 3  # use 3 instead of 5 to save time
# skf = StratifiedKFold(n_splits=n_splits)
# split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [5]:
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    
    del X_tv
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[12:10:51] Finish splitting train/valid set. △M: +79.92MB. △T: 25.7 seconds.
Training Set Size: (5865876, 419707)
Validation Set Size: (2932938, 419707)


In [6]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols_train)
    gc.collect()

[12:10:55] Finish preparing LightGBM data. △M: +5.52GB. △T: 4.1 seconds.


In [7]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0507/')
log_file = 'v2.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [8]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 500

with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     early_stopping_rounds=50)

[1]	train's auc: 0.654215	valid1's auc: 0.652693
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.671872	valid1's auc: 0.670321
[3]	train's auc: 0.677278	valid1's auc: 0.676076
[4]	train's auc: 0.679101	valid1's auc: 0.677795
[5]	train's auc: 0.680855	valid1's auc: 0.679557
[6]	train's auc: 0.681079	valid1's auc: 0.679659
[7]	train's auc: 0.681271	valid1's auc: 0.679729
[8]	train's auc: 0.681211	valid1's auc: 0.679711
[9]	train's auc: 0.681966	valid1's auc: 0.680463
[10]	train's auc: 0.682398	valid1's auc: 0.680991
[11]	train's auc: 0.683326	valid1's auc: 0.68181
[12]	train's auc: 0.685141	valid1's auc: 0.683478
[13]	train's auc: 0.686632	valid1's auc: 0.684822
[14]	train's auc: 0.688212	valid1's auc: 0.686109
[15]	train's auc: 0.690094	valid1's auc: 0.688147
[16]	train's auc: 0.690947	valid1's auc: 0.688852
[17]	train's auc: 0.692854	valid1's auc: 0.690502
[18]	train's auc: 0.694678	valid1's auc: 0.692233
[19]	train's auc: 0.696398	valid1's auc: 0.69389

[164]	train's auc: 0.765729	valid1's auc: 0.732106
[165]	train's auc: 0.765992	valid1's auc: 0.732186
[166]	train's auc: 0.766163	valid1's auc: 0.732198
[167]	train's auc: 0.766486	valid1's auc: 0.732234
[168]	train's auc: 0.766752	valid1's auc: 0.732282
[169]	train's auc: 0.766972	valid1's auc: 0.732278
[170]	train's auc: 0.767086	valid1's auc: 0.73228
[171]	train's auc: 0.767293	valid1's auc: 0.732314
[172]	train's auc: 0.767575	valid1's auc: 0.732466
[173]	train's auc: 0.7678	valid1's auc: 0.732488
[174]	train's auc: 0.768264	valid1's auc: 0.732724
[175]	train's auc: 0.768483	valid1's auc: 0.732763
[176]	train's auc: 0.768661	valid1's auc: 0.732777
[177]	train's auc: 0.768847	valid1's auc: 0.732772
[178]	train's auc: 0.769047	valid1's auc: 0.732762
[179]	train's auc: 0.769329	valid1's auc: 0.732787
[180]	train's auc: 0.769635	valid1's auc: 0.732776
[181]	train's auc: 0.769999	valid1's auc: 0.732868
[182]	train's auc: 0.770232	valid1's auc: 0.732946
[183]	train's auc: 0.770447	valid1

[326]	train's auc: 0.79415	valid1's auc: 0.734168
[327]	train's auc: 0.794264	valid1's auc: 0.734157
[328]	train's auc: 0.794388	valid1's auc: 0.734143
[329]	train's auc: 0.794517	valid1's auc: 0.734137
[330]	train's auc: 0.794652	valid1's auc: 0.734141
[331]	train's auc: 0.794781	valid1's auc: 0.734164
[332]	train's auc: 0.794936	valid1's auc: 0.734157
[333]	train's auc: 0.795068	valid1's auc: 0.734158
[334]	train's auc: 0.795215	valid1's auc: 0.734169
[335]	train's auc: 0.795429	valid1's auc: 0.734222
[336]	train's auc: 0.795556	valid1's auc: 0.734227
[337]	train's auc: 0.795746	valid1's auc: 0.734207
[338]	train's auc: 0.795904	valid1's auc: 0.734186
[339]	train's auc: 0.796057	valid1's auc: 0.734204
[340]	train's auc: 0.796152	valid1's auc: 0.734196
[341]	train's auc: 0.796267	valid1's auc: 0.734201
[342]	train's auc: 0.796401	valid1's auc: 0.734215
[343]	train's auc: 0.796453	valid1's auc: 0.734205
[344]	train's auc: 0.79663	valid1's auc: 0.734197
[345]	train's auc: 0.796854	valid

In [9]:
log_file = 'v2.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419706,bsClickrate@LBS,953
419701,bsClickrate@aid,752
419703,bsClickrate@age,635
419705,bsClickrate@education,535
419702,bsClickrate@campaignId,505
419704,bsClickrate@consumptionAbility,367
419614,creativeSize_59,307
419699,productType_9,215
419610,creativeSize_22,185
419611,creativeSize_35,158


In [11]:
df_valid = df_train.iloc[valid_index]
proba_valid = lgbm.predict(X_valid)
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
df_score

Unnamed: 0,aid,auc
0,6,0.646918
1,7,0.811317
2,12,0.857545
3,18,0.579454
4,70,0.864512
5,74,0.663195
6,86,0.600669
7,98,0.786241
8,113,0.529993
9,117,0.651377


In [12]:
online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

Online AUC: 0.726907
Simple AUC: 0.734301


In [13]:
log_file = 'v2.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score.to_csv(log_path, index=False)

In [14]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    del X_train
    del X_valid
    gc.collect()

[12:57:06] Finish cleaning memory. △M: -12.04GB. △T: 1.7 seconds.


In [15]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()

print("Train Data Shape: {}".format(X_test.shape))
print("Train Column Numbers: {}".format(len(cols_test)))

[12:57:33] Finish loading testing data. △M: +754.65MB. △T: 9.1 seconds.
Train Data Shape: (2265989, 419707)
Train Column Numbers: 419707


In [16]:
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[12:58:03] Finish making prediction on testing set. △M: +0B. △T: 28.8 seconds.


In [17]:
subm_folder = '../subm/lgbm/0507_v2'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)

In [19]:
df_score.sort_values("auc", ascending=False)

Unnamed: 0,aid,auc
161,2054,0.947292
44,613,0.936784
126,1672,0.932849
39,519,0.930475
149,1957,0.922272
145,1930,0.919408
54,725,0.917422
35,450,0.914098
102,1375,0.906421
48,671,0.901852
