In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
from itertools import compress
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../code/pipeline/')
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# Short aliases for the feature-name lists defined in the project config.
user_one_feat_names, user_multi_feat_names, ad_feat_names = (
    config.USER_SINGLE_FEAT_NAMES,
    config.USER_MULTI_FEAT_NAMES,
    config.AD_FEAT_NAMES,
)

In [3]:
# Load the raw train / test / ad tables through the project's data utilities.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
df_ad = du.load_raw_data("ad")

# Row counts of each split. `train_size` is the boundary used to cut the
# stacked feature matrix back into train and test parts.
train_size = df_train.shape[0]
test_size = df_test.shape[0]  # fixed: was shape[1], i.e. the number of columns

# Stack train on top of test (fresh 0..n-1 index), then attach the ad
# attributes to every row via a left join on "aid".
df_all = pd.concat([df_train, df_test], ignore_index=True)
df_all = pd.merge(df_all, df_ad, on="aid", how="left")

# Positional slicing by `train_size` is only valid if the join neither
# dropped nor duplicated rows.
assert df_all.shape[0] == train_size + test_size, \
    "merge with ad table changed the row count"

print("Data Shape: {}".format(df_all.shape))
print("NaN Count: {}".format(df_all.isnull().sum().sum()))

# Training target, copied so the raw frame is never modified.
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

Data Shape: (11064803, 10)
NaN Count: 2265989


In [6]:
with pu.profiler("loading training data"):
    # Joined feature matrix for all rows (train followed by test) plus its
    # column names, converted to CSR.
    X_all, cols = du.quick_join(df_all, user_feat_names=user_multi_feat_names)
    X_all = sparse.csr_matrix(X_all)

    # The first `train_size` rows are the training rows; the rest are test.
    X_tv = sparse.csr_matrix(X_all[:train_size])
    X_test = sparse.csr_matrix(X_all[train_size:])

    # The stacked matrix is not needed once both parts exist.
    del X_all
    gc.collect()

print("Train Data Shape: {}".format(X_tv.shape))
print("Test Data Shape: {}".format(X_test.shape))
print("Column Numbers: {}".format(len(cols)))

[02:57:03] Finish loading training data. △M: +3.35GB. △T: 24.8 seconds.
Train Data Shape: (8798814, 418337)
Test Data Shape: (2265989, 418337)
Column Numbers: 418337


In [7]:
# Alternative kept for reference: three seeded stratified shuffle splits.
# n_splits = 3
# sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
# split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

# Stratified K-fold over the training rows. Each entry of `split_indices`
# is a (train_index, valid_index) pair of positional row indices.
n_splits = 5  # lower this (e.g. to 3) to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = list(skf.split(df_train, y))

In [8]:
# Ad id of every training row, split alongside the features and labels.
aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    # Only the first fold is used in this run.
    train_index, valid_index = split_indices[0]

    X_train = X_tv[train_index, :]
    X_valid = X_tv[valid_index, :]
    y_train = y[train_index]
    y_valid = y[valid_index]
    aids_train = aids[train_index]
    aids_valid = aids[valid_index]

    # The two parts together must account for every training row.
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]

    # The unsplit matrix is no longer needed.
    del X_tv
    gc.collect()

print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[02:57:39] Finish splitting train/valid set. △M: +181.26MB. △T: 14.1 seconds.
Training Set Size: (7039050, 418337)
Validation Set Size: (1759764, 418337)


In [9]:
with pu.profiler("preparing LightGBM data"):
    # feature_name is deliberately not passed: the original author noted that
    # supplying it (feature_name=cols_train) caused an error that was never resolved.
    lgb_train = lgb.Dataset(X_train.astype(np.float32), label=y_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), label=y_valid)
    gc.collect()

[02:57:47] Finish preparing LightGBM data. △M: +4.21GB. △T: 7.6 seconds.


In [11]:
# Free the raw training matrix; training goes through `lgb_train` from here on.
with pu.profiler("releasing memory"):
    del X_train
    # X_valid is kept on purpose: it is reused for the validation predictions below.
    # del X_valid
    gc.collect()

[02:58:07] Finish releasing memory. △M: -2.1GB. △T: 0.3 seconds.


In [12]:
# Location of this run's log file; the folder is created if it does not exist.
log_file = 'v2.log'
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0518/')
os.makedirs(log_folder, exist_ok=True)
log_path = os.path.join(log_folder, log_file)

In [13]:
# Custom evaluation callback built from the ad id of each train / valid row;
# it is reported as `online_auc` next to the built-in `auc` in the training log.
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)

# v2 parameters
# NOTE(review): LightGBM documents bagging_fraction as taking effect only when
# bagging_freq is non-zero (it defaults to 0) -- confirm whether bagging was intended.
num_rounds = 1000
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}

# v3 parameters (disabled alternative: shallower trees, slower learning rate)
# num_rounds = 5000
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': 'auc',
#     'max_depth': 6,
#     'num_leaves': 64,
#     'learning_rate': 0.1,
#     'feature_fraction': 0.5,
#     'bagging_fraction': 0.5,
#     'verbose': 0
# }

# Train with early stopping (patience of 50 rounds), evaluating on both the
# training and the validation dataset each round.
# DuplicatedLogger presumably mirrors the console output into `log_path` --
# TODO confirm in io_utils.
with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=num_rounds,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid1'],
        feval=eval_auc,
        early_stopping_rounds=50,
    )

[1]	train's auc: 0.627667	train's online_auc: 0.649963	valid1's auc: 0.626556	valid1's online_auc: 0.647148
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.632736	train's online_auc: 0.656434	valid1's auc: 0.631918	valid1's online_auc: 0.65369
[3]	train's auc: 0.636853	train's online_auc: 0.660506	valid1's auc: 0.635662	valid1's online_auc: 0.657957
[4]	train's auc: 0.637009	train's online_auc: 0.660582	valid1's auc: 0.635594	valid1's online_auc: 0.65765
[5]	train's auc: 0.638223	train's online_auc: 0.661375	valid1's auc: 0.636831	valid1's online_auc: 0.65858
[6]	train's auc: 0.639596	train's online_auc: 0.663167	valid1's auc: 0.638121	valid1's online_auc: 0.660672
[7]	train's auc: 0.641027	train's online_auc: 0.665574	valid1's auc: 0.639721	valid1's online_auc: 0.663459
[8]	train's auc: 0.642132	train's online_auc: 0.666164	valid1's auc: 0.640929	valid1's online_auc: 0.664325
[9]	train's auc: 0.642549	train's online_auc: 0.66651	valid1's auc: 0.641308	

[77]	train's auc: 0.683883	train's online_auc: 0.704395	valid1's auc: 0.668149	valid1's online_auc: 0.687243
[78]	train's auc: 0.684261	train's online_auc: 0.7047	valid1's auc: 0.668255	valid1's online_auc: 0.687253
[79]	train's auc: 0.684538	train's online_auc: 0.704909	valid1's auc: 0.668358	valid1's online_auc: 0.687358
[80]	train's auc: 0.684867	train's online_auc: 0.705179	valid1's auc: 0.668415	valid1's online_auc: 0.687394
[81]	train's auc: 0.685345	train's online_auc: 0.705628	valid1's auc: 0.668459	valid1's online_auc: 0.68743
[82]	train's auc: 0.685843	train's online_auc: 0.70609	valid1's auc: 0.668566	valid1's online_auc: 0.687469
[83]	train's auc: 0.686076	train's online_auc: 0.706336	valid1's auc: 0.668663	valid1's online_auc: 0.687574
[84]	train's auc: 0.68641	train's online_auc: 0.706615	valid1's auc: 0.668721	valid1's online_auc: 0.687606
[85]	train's auc: 0.686762	train's online_auc: 0.706943	valid1's auc: 0.668755	valid1's online_auc: 0.687621
[86]	train's auc: 0.6870

[152]	train's auc: 0.70457	train's online_auc: 0.724203	valid1's auc: 0.671633	valid1's online_auc: 0.68994
[153]	train's auc: 0.704853	train's online_auc: 0.724501	valid1's auc: 0.671637	valid1's online_auc: 0.689899
[154]	train's auc: 0.705015	train's online_auc: 0.724703	valid1's auc: 0.671658	valid1's online_auc: 0.689895
[155]	train's auc: 0.705249	train's online_auc: 0.72492	valid1's auc: 0.671678	valid1's online_auc: 0.689988
[156]	train's auc: 0.705417	train's online_auc: 0.725051	valid1's auc: 0.671723	valid1's online_auc: 0.690007
[157]	train's auc: 0.705653	train's online_auc: 0.725309	valid1's auc: 0.671703	valid1's online_auc: 0.689991
[158]	train's auc: 0.706009	train's online_auc: 0.725679	valid1's auc: 0.671829	valid1's online_auc: 0.690204
[159]	train's auc: 0.706188	train's online_auc: 0.725842	valid1's auc: 0.671879	valid1's online_auc: 0.69029
[160]	train's auc: 0.706395	train's online_auc: 0.72601	valid1's auc: 0.671864	valid1's online_auc: 0.690292
[161]	train's a

[227]	train's auc: 0.719539	train's online_auc: 0.738851	valid1's auc: 0.672388	valid1's online_auc: 0.690602
[228]	train's auc: 0.719754	train's online_auc: 0.739078	valid1's auc: 0.672407	valid1's online_auc: 0.690604
[229]	train's auc: 0.71996	train's online_auc: 0.739315	valid1's auc: 0.67242	valid1's online_auc: 0.690609
[230]	train's auc: 0.720037	train's online_auc: 0.739386	valid1's auc: 0.672423	valid1's online_auc: 0.690616
[231]	train's auc: 0.720225	train's online_auc: 0.739557	valid1's auc: 0.672406	valid1's online_auc: 0.690595
[232]	train's auc: 0.720417	train's online_auc: 0.739735	valid1's auc: 0.672415	valid1's online_auc: 0.690635
[233]	train's auc: 0.720582	train's online_auc: 0.73988	valid1's auc: 0.67242	valid1's online_auc: 0.690634
[234]	train's auc: 0.720828	train's online_auc: 0.740074	valid1's auc: 0.672388	valid1's online_auc: 0.690594
[235]	train's auc: 0.720965	train's online_auc: 0.740212	valid1's auc: 0.672366	valid1's online_auc: 0.690587
[236]	train's 

[302]	train's auc: 0.73137	train's online_auc: 0.750778	valid1's auc: 0.672497	valid1's online_auc: 0.69118
[303]	train's auc: 0.731496	train's online_auc: 0.750915	valid1's auc: 0.672502	valid1's online_auc: 0.691165
[304]	train's auc: 0.731714	train's online_auc: 0.75113	valid1's auc: 0.672511	valid1's online_auc: 0.691142
[305]	train's auc: 0.731882	train's online_auc: 0.751304	valid1's auc: 0.672485	valid1's online_auc: 0.691121
[306]	train's auc: 0.732008	train's online_auc: 0.751426	valid1's auc: 0.672464	valid1's online_auc: 0.691121
[307]	train's auc: 0.732187	train's online_auc: 0.751651	valid1's auc: 0.672478	valid1's online_auc: 0.691154
[308]	train's auc: 0.732362	train's online_auc: 0.751804	valid1's auc: 0.672475	valid1's online_auc: 0.691137
[309]	train's auc: 0.732539	train's online_auc: 0.751969	valid1's auc: 0.672507	valid1's online_auc: 0.691191
[310]	train's auc: 0.732604	train's online_auc: 0.752033	valid1's auc: 0.672497	valid1's online_auc: 0.691158
[311]	train's

In [18]:
# Export the model's per-feature importance, most important first, and show the top 30.
# NOTE(review): the original author warned that this cell could trigger a LightGBM
# error and may be skipped; no later cell reads `df_feature_importance`.
log_file = 'v2.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = (
    pd.DataFrame({"feature": cols, "importance": lgbm.feature_importance()})
    .sort_values("importance", ascending=False)
)
df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
2,marriageStatus_10,218
418331,ct_1,199
126,interest1_76,177
181,interest2_54,162
8,marriageStatus_11,162
269,interest5_42,142
338,interest5_129,141
298,interest5_52,140
33,interest1_75,137
216,interest2_[nan],136


In [19]:
with pu.profiler("making prediction on validation set"):
    # Raw rows of the validation fold, needed for their ad ids below.
    df_valid = df_train.iloc[valid_index]
    # Predict with the early-stopping optimum explicitly. Some LightGBM versions
    # default to using every trained iteration, which would include the rounds
    # trained after the best validation score.
    proba_valid = lgbm.predict(X_valid.astype(np.float32),
                               num_iteration=lgbm.best_iteration)
    # One prediction per validation row.
    assert proba_valid.shape[0] == y_valid.shape[0]

# Per-ad AUC table for the validation fold, best ads first, saved next to the training log.
log_file = 'v2.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
# The verbose result labels the grouping key "selector"; here it is the ad id.
df_score = df_score.rename(columns={'selector': 'aid'})
df_score = df_score[['aid', 'auc']]  # fix the column order
df_score = df_score.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

[04:53:08] Finish making prediction on validation set. △M: +4.0KB. △T: 20.4 seconds.


In [20]:
# Two views of validation quality: the plain AUC over all validation rows,
# and the mean of the per-ad AUC values in `df_score`.
simple_auc = metrics.roc_auc_score(y_true=y_valid, y_score=proba_valid)
online_auc = df_score['auc'].mean()

print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

Online AUC: 0.691313
Simple AUC: 0.672657


In [21]:
with pu.profiler("cleaning memory"):
    # Validation is finished: drop both LightGBM datasets and the validation
    # matrix. X_train was already released in an earlier cell.
    del lgb_train, lgb_valid, X_valid
    gc.collect()

[04:53:11] Finish cleaning memory. △M: -6.45GB. △T: 1.0 seconds.


In [22]:
# Show the dimensions of the test matrix before predicting on it.
print("Test Data Shape: {}".format(X_test.shape))
# Disabled: `cols_test` is not defined in any cell visible in this notebook.
# print("Test Column Numbers: {}".format(len(cols_test)))

Test Data Shape: (2265989, 418337)


In [23]:
# Reload the raw test rows (used for the row-count check here and as the base
# of the submission file) and cast the test features to float32, matching the
# dtype the training data was converted to.
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    # Predict with the early-stopping optimum explicitly. Some LightGBM versions
    # default to using every trained iteration, which would include the rounds
    # trained after the best validation score.
    proba_test = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)
    # Exactly one probability per test row.
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[04:54:13] Finish making prediction on testing set. △M: +0B. △T: 23.7 seconds.


In [24]:
# Write the submission: the raw test rows plus a "score" column holding the
# predicted probability for each row.
subm_file = 'submission.csv'
subm_folder = '../subm/lgbm/0518_v2'
os.makedirs(subm_folder, exist_ok=True)
subm_path = os.path.join(subm_folder, subm_file)

subm = df_test.assign(score=proba_test)
subm.to_csv(subm_path, index=False)