In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../code/pipeline/')
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# load multiple data and stack them together
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "binary")
cross_ckr_loader = dm.build_data("cross", "clickStats_v1")
union_loader = dp.DataUnion(bin_loader, cross_ckr_loader)

with pu.profiler("loading training data"):
    cols_train, X_tv = union_loader.load("train")
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()

print("Train Data Shape: {}".format(X_tv.shape))
print("Train Column Numbers: {}".format(len(cols_train)))

[01:47:26] Finish loading training data. △M: +5.71GB. △T: 1.1 minutes.
Train Data Shape: (8798814, 419709)
Train Column Numbers: 419709


In [3]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [4]:
n_splits = 3
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

# n_splits = 3  # use 3 instead of 5 to save time
# skf = StratifiedKFold(n_splits=n_splits)
# split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [5]:
aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    aids_train, aids_valid = aids[train_index], aids[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    
    del X_tv
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[01:48:10] Finish splitting train/valid set. △M: +124.67MB. △T: 32.3 seconds.
Training Set Size: (5865876, 419709)
Validation Set Size: (2932938, 419709)


In [6]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols_train)
    gc.collect()

[01:48:16] Finish preparing LightGBM data. △M: +5.67GB. △T: 6.5 seconds.


In [7]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0509/')
log_file = 'v1.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [8]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 1000
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)

with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.664869	train's online_auc: 0.659378	valid1's auc: 0.663328	valid1's online_auc: 0.656703
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.684749	train's online_auc: 0.679979	valid1's auc: 0.683192	valid1's online_auc: 0.677252
[3]	train's auc: 0.686767	train's online_auc: 0.682469	valid1's auc: 0.68536	valid1's online_auc: 0.680064
[4]	train's auc: 0.687861	train's online_auc: 0.683758	valid1's auc: 0.686481	valid1's online_auc: 0.681788
[5]	train's auc: 0.688993	train's online_auc: 0.684751	valid1's auc: 0.687545	valid1's online_auc: 0.682678
[6]	train's auc: 0.690176	train's online_auc: 0.68604	valid1's auc: 0.688859	valid1's online_auc: 0.683778
[7]	train's auc: 0.691042	train's online_auc: 0.687622	valid1's auc: 0.689639	valid1's online_auc: 0.684977
[8]	train's auc: 0.693546	train's online_auc: 0.690625	valid1's auc: 0.691963	valid1's online_auc: 0.688022
[9]	train's auc: 0.694649	train's online_auc: 0.691482	valid1's auc: 0.69309

[76]	train's auc: 0.743492	train's online_auc: 0.742279	valid1's auc: 0.728594	valid1's online_auc: 0.724043
[77]	train's auc: 0.743979	train's online_auc: 0.742699	valid1's auc: 0.728865	valid1's online_auc: 0.72416
[78]	train's auc: 0.744439	train's online_auc: 0.743284	valid1's auc: 0.729048	valid1's online_auc: 0.724315
[79]	train's auc: 0.744905	train's online_auc: 0.743929	valid1's auc: 0.729158	valid1's online_auc: 0.724412
[80]	train's auc: 0.745199	train's online_auc: 0.74436	valid1's auc: 0.729271	valid1's online_auc: 0.724531
[81]	train's auc: 0.745524	train's online_auc: 0.744857	valid1's auc: 0.729377	valid1's online_auc: 0.724638
[82]	train's auc: 0.745811	train's online_auc: 0.745214	valid1's auc: 0.729463	valid1's online_auc: 0.724779
[83]	train's auc: 0.746183	train's online_auc: 0.745595	valid1's auc: 0.729631	valid1's online_auc: 0.724967
[84]	train's auc: 0.746709	train's online_auc: 0.746275	valid1's auc: 0.729839	valid1's online_auc: 0.725188
[85]	train's auc: 0.7

[152]	train's auc: 0.765853	train's online_auc: 0.767924	valid1's auc: 0.734469	valid1's online_auc: 0.729712
[153]	train's auc: 0.766025	train's online_auc: 0.768125	valid1's auc: 0.734485	valid1's online_auc: 0.729756
[154]	train's auc: 0.766191	train's online_auc: 0.768572	valid1's auc: 0.734485	valid1's online_auc: 0.729748
[155]	train's auc: 0.766504	train's online_auc: 0.768992	valid1's auc: 0.734563	valid1's online_auc: 0.729848
[156]	train's auc: 0.766714	train's online_auc: 0.769243	valid1's auc: 0.734573	valid1's online_auc: 0.729853
[157]	train's auc: 0.766911	train's online_auc: 0.769403	valid1's auc: 0.734605	valid1's online_auc: 0.729879
[158]	train's auc: 0.767153	train's online_auc: 0.769734	valid1's auc: 0.734612	valid1's online_auc: 0.72993
[159]	train's auc: 0.767378	train's online_auc: 0.769987	valid1's auc: 0.734619	valid1's online_auc: 0.729931
[160]	train's auc: 0.767566	train's online_auc: 0.770166	valid1's auc: 0.734661	valid1's online_auc: 0.729934
[161]	train

[227]	train's auc: 0.780854	train's online_auc: 0.785501	valid1's auc: 0.735857	valid1's online_auc: 0.730559
[228]	train's auc: 0.781124	train's online_auc: 0.785812	valid1's auc: 0.735918	valid1's online_auc: 0.730633
[229]	train's auc: 0.781337	train's online_auc: 0.786019	valid1's auc: 0.735918	valid1's online_auc: 0.730662
[230]	train's auc: 0.781495	train's online_auc: 0.786252	valid1's auc: 0.735908	valid1's online_auc: 0.730635
[231]	train's auc: 0.781645	train's online_auc: 0.786449	valid1's auc: 0.735938	valid1's online_auc: 0.730667
[232]	train's auc: 0.78174	train's online_auc: 0.786595	valid1's auc: 0.735949	valid1's online_auc: 0.730672
[233]	train's auc: 0.781931	train's online_auc: 0.786779	valid1's auc: 0.735992	valid1's online_auc: 0.730702
[234]	train's auc: 0.782082	train's online_auc: 0.78692	valid1's auc: 0.735997	valid1's online_auc: 0.730685
[235]	train's auc: 0.782244	train's online_auc: 0.787175	valid1's auc: 0.736034	valid1's online_auc: 0.73073
[236]	train's

[302]	train's auc: 0.793016	train's online_auc: 0.799538	valid1's auc: 0.736102	valid1's online_auc: 0.73078
[303]	train's auc: 0.793101	train's online_auc: 0.799618	valid1's auc: 0.736112	valid1's online_auc: 0.7308
[304]	train's auc: 0.793194	train's online_auc: 0.799752	valid1's auc: 0.736117	valid1's online_auc: 0.730799
[305]	train's auc: 0.793419	train's online_auc: 0.800032	valid1's auc: 0.736102	valid1's online_auc: 0.73079
[306]	train's auc: 0.793637	train's online_auc: 0.800296	valid1's auc: 0.736096	valid1's online_auc: 0.73078
[307]	train's auc: 0.793734	train's online_auc: 0.800471	valid1's auc: 0.736088	valid1's online_auc: 0.730769
[308]	train's auc: 0.794097	train's online_auc: 0.800883	valid1's auc: 0.736165	valid1's online_auc: 0.730853
[309]	train's auc: 0.794275	train's online_auc: 0.801096	valid1's auc: 0.736164	valid1's online_auc: 0.73085
[310]	train's auc: 0.794338	train's online_auc: 0.80116	valid1's auc: 0.736168	valid1's online_auc: 0.730851
[311]	train's auc

In [9]:
log_file = 'v1.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419701,bsClickrate@aid_x_age,1185
419707,bsClickrate@aid_x_LBS,1005
419705,bsClickrate@aid_x_consumptionAbility,993
419703,bsClickrate@aid_x_education,886
419708,impression@aid_x_LBS,761
419702,impression@aid_x_age,623
419706,impression@aid_x_consumptionAbility,564
419704,impression@aid_x_education,554
419614,creativeSize_59,279
419610,creativeSize_22,187


In [10]:
df_valid = df_train.iloc[valid_index]
proba_valid = lgbm.predict(X_valid)
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
df_score

Unnamed: 0,aid,auc
0,6,0.657286
1,7,0.837813
2,12,0.854940
3,18,0.568405
4,70,0.865752
5,74,0.677688
6,86,0.610922
7,98,0.789088
8,113,0.552108
9,117,0.657114


In [11]:
online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

Online AUC: 0.730853
Simple AUC: 0.736165


In [12]:
log_file = 'v1.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score = df_score.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

In [13]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    del X_train
    del X_valid
    gc.collect()

[03:20:21] Finish cleaning memory. △M: -12.06GB. △T: 1.8 seconds.


In [14]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()

print("Train Data Shape: {}".format(X_test.shape))
print("Train Column Numbers: {}".format(len(cols_test)))

[03:20:32] Finish loading testing data. △M: +771.36MB. △T: 10.1 seconds.
Train Data Shape: (2265989, 419709)
Train Column Numbers: 419709


In [15]:
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[03:21:09] Finish making prediction on testing set. △M: +0B. △T: 35.7 seconds.


In [16]:
subm_folder = '../subm/lgbm/0509_v1'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)