In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
from itertools import compress
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
user_one_feat_names = config.USER_SINGLE_FEAT_NAMES
user_multi_feat_names = config.USER_MULTI_FEAT_NAMES
ad_feat_names = config.AD_FEAT_NAMES

In [3]:
# ================
# Data Preparation
# ================
# instantiate data loader
emb_loaders = (dp.EmbeddingDataManager.build_data(feat_name, "avg", 1) 
               for feat_name in user_multi_feat_names)
union_loader = dp.DataUnion(*emb_loaders)

with pu.profiler("loading training data"):
    cols_emb_train, X_emb_tv = union_loader.load("train")
    # X_emb_tv = sparse.csr_matrix(X_emb_tv)
    gc.collect()

print("Train Data Shape (Embedding): {}".format(X_emb_tv.shape))
print("Train Column Numbers (Embedding): {}".format(len(cols_emb_train)))

[01:26:11] Finish loading training data. △M: +15.21GB. △T: 2.3 minutes.
Train Data Shape (Embedding): (8798814, 464)
Train Column Numbers (Embedding): 464


In [4]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [5]:
# n_splits = 3
# sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
# split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

n_splits = 5  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [6]:
aids = df_train['aid'].values
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    # X_train, X_valid = X_tv[train_index, :], X_tv[valid_index, :]
    X_train, X_valid = X_emb_tv[train_index, :], X_emb_tv[valid_index, :]
    y_train, y_valid = y[train_index], y[valid_index]
    aids_train, aids_valid = aids[train_index], aids[valid_index]
    # assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]
    assert X_train.shape[0] + X_valid.shape[0] == X_emb_tv.shape[0]
    
    # del X_tv
    del X_emb_tv
    gc.collect()

print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[01:26:31] Finish splitting train/valid set. △M: +120.85MB. △T: 13.9 seconds.
Training Set Size: (7039050, 464)
Validation Set Size: (1759764, 464)


In [7]:
with pu.profiler("preparing LightGBM data"):
    # lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols_train)  # cause bugs I don't know how to fix
    # lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols_train)
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)
    gc.collect()

[01:26:50] Finish preparing LightGBM data. △M: +15.21GB. △T: 11.5 seconds.


In [8]:
with pu.profiler("releasing memory"):
    del X_train
    # del X_valid
    gc.collect()

[01:27:42] Finish releasing memory. △M: -15.21GB. △T: 1.3 seconds.


In [9]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0518/')
log_file = 'v1.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [10]:
# v2 parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 1000
# # v3 parameters
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': 'auc',
#     'max_depth': 6,
#     'num_leaves': 64,
#     'learning_rate': 0.1,
#     'feature_fraction': 0.5,
#     'bagging_fraction': 0.5,
#     'verbose': 0
# }
# num_rounds = 5000
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)

with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.622823	train's online_auc: 0.640361	valid1's auc: 0.620361	valid1's online_auc: 0.636116
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.632134	train's online_auc: 0.650915	valid1's auc: 0.629681	valid1's online_auc: 0.647251
[3]	train's auc: 0.637704	train's online_auc: 0.65893	valid1's auc: 0.63425	valid1's online_auc: 0.65438
[4]	train's auc: 0.639175	train's online_auc: 0.660513	valid1's auc: 0.635457	valid1's online_auc: 0.655854
[5]	train's auc: 0.641321	train's online_auc: 0.663465	valid1's auc: 0.637521	valid1's online_auc: 0.658102
[6]	train's auc: 0.641966	train's online_auc: 0.66411	valid1's auc: 0.638117	valid1's online_auc: 0.65865
[7]	train's auc: 0.64355	train's online_auc: 0.665303	valid1's auc: 0.639642	valid1's online_auc: 0.65979
[8]	train's auc: 0.645657	train's online_auc: 0.668227	valid1's auc: 0.641907	valid1's online_auc: 0.662213
[9]	train's auc: 0.646757	train's online_auc: 0.66959	valid1's auc: 0.643022	vali

[77]	train's auc: 0.69002	train's online_auc: 0.709138	valid1's auc: 0.666212	valid1's online_auc: 0.684588
[78]	train's auc: 0.690414	train's online_auc: 0.709549	valid1's auc: 0.666275	valid1's online_auc: 0.684619
[79]	train's auc: 0.69083	train's online_auc: 0.709988	valid1's auc: 0.666303	valid1's online_auc: 0.684628
[80]	train's auc: 0.69122	train's online_auc: 0.710305	valid1's auc: 0.66633	valid1's online_auc: 0.684661
[81]	train's auc: 0.69168	train's online_auc: 0.710703	valid1's auc: 0.666344	valid1's online_auc: 0.684643
[82]	train's auc: 0.692059	train's online_auc: 0.711091	valid1's auc: 0.666365	valid1's online_auc: 0.6847
[83]	train's auc: 0.692507	train's online_auc: 0.711567	valid1's auc: 0.666385	valid1's online_auc: 0.684735
[84]	train's auc: 0.69293	train's online_auc: 0.712028	valid1's auc: 0.666501	valid1's online_auc: 0.68499
[85]	train's auc: 0.693385	train's online_auc: 0.712521	valid1's auc: 0.666538	valid1's online_auc: 0.684969
[86]	train's auc: 0.693743	t

[153]	train's auc: 0.716285	train's online_auc: 0.735368	valid1's auc: 0.666745	valid1's online_auc: 0.684865
[154]	train's auc: 0.716591	train's online_auc: 0.735688	valid1's auc: 0.666769	valid1's online_auc: 0.684943
[155]	train's auc: 0.716878	train's online_auc: 0.735957	valid1's auc: 0.666767	valid1's online_auc: 0.684941
[156]	train's auc: 0.717022	train's online_auc: 0.736129	valid1's auc: 0.666715	valid1's online_auc: 0.684871
[157]	train's auc: 0.717296	train's online_auc: 0.736358	valid1's auc: 0.666696	valid1's online_auc: 0.684879
[158]	train's auc: 0.717648	train's online_auc: 0.736722	valid1's auc: 0.666717	valid1's online_auc: 0.684931
[159]	train's auc: 0.717997	train's online_auc: 0.737094	valid1's auc: 0.666722	valid1's online_auc: 0.684899
[160]	train's auc: 0.718347	train's online_auc: 0.737466	valid1's auc: 0.666738	valid1's online_auc: 0.684861
[161]	train's auc: 0.718774	train's online_auc: 0.737843	valid1's auc: 0.666682	valid1's online_auc: 0.684765
[162]	trai

In [12]:
# these will cause LightGBM bug I don't know how to fix. Just skip it
log_file = 'v1.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

# df_feature_importance = pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
df_feature_importance = pd.DataFrame({"feature": cols_emb_train, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
147,kw2_embedding_20,234
156,kw2_embedding_29,158
151,kw2_embedding_24,153
165,kw2_embedding_38,152
322,topic2_embedding_45,151
314,topic2_embedding_37,144
296,topic2_embedding_19,134
146,kw2_embedding_19,134
175,kw2_embedding_48,133
41,interest2_embedding_8,133


In [16]:
with pu.profiler("preparing valid set"):
    cols_emb_train, X_emb_tv = union_loader.load("train")
    X_valid = X_emb_tv[valid_index, :]
    del X_emb_tv
    gc.collect()

[02:31:13] Finish preparing valid set. △M: +3.06GB. △T: 2.4 minutes.


In [17]:
with pu.profiler("making prediction on validation set"):
    df_valid = df_train.iloc[valid_index]
    proba_valid = lgbm.predict(X_valid.astype(np.float32))

log_file = 'v1.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
df_score.rename(columns={'selector': 'aid'}, inplace=True)
df_score = df_score[['aid', 'auc']]  # sort columns
df_score = df_score.sort_values("auc", ascending=False)
df_score.to_csv(log_path, index=False)

[02:31:30] Finish making prediction on validation set. △M: +12.0KB. △T: 7.0 seconds.


In [18]:
online_auc = df_score['auc'].mean()
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
print("Online AUC: {:.6f}".format(online_auc))
print("Simple AUC: {:.6f}".format(simple_auc))

Online AUC: 0.685241
Simple AUC: 0.666967


In [19]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    # del X_train
    del X_valid
    gc.collect()

[02:35:07] Finish cleaning memory. △M: -6.19GB. △T: 0.7 seconds.


In [17]:
with pu.profiler("loading testing data"):
    cols_test, X_test = union_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()

print("Test Data Shape: {}".format(X_test.shape))
# print("Test Column Numbers: {}".format(len(cols_test)))

[15:23:44] Finish loading testing data. △M: +1.6GB. △T: 1.2 minutes.
[15:24:12] Finish trimming testing set. △M: -1.22GB. △T: 27.6 seconds.
Test Data Shape (After masking): (2265989, 1791142)


In [18]:
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X_test)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[15:25:56] Finish making prediction on testing set. △M: -5.14GB. △T: 1.5 minutes.


In [19]:
subm_folder = '../../../subm/lgbm/0518_v1'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)