In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
from contextlib import redirect_stdout
from itertools import compress
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import eval_utils as eu
import io_utils as iu
import config

In [2]:
# ----------------------------------------------------------------------
# Data preparation: load the "raw" / "tfidf" training feature matrix
# ----------------------------------------------------------------------
# The loader is kept around because the test matrix is read with it later.
dm = dp.DataManager(config.INPUT_DIR)
bin_loader = dm.build_data("raw", "tfidf")

with pu.profiler("loading training data"):
    # load() returns the column names together with the feature matrix.
    cols_train, X_tv = bin_loader.load("train")
    # Convert to CSR; the later train/valid split indexes this matrix by row.
    X_tv = sparse.csr_matrix(X_tv)
    gc.collect()

for template, value in (
    ("Train Data Shape: {}", X_tv.shape),
    ("Train Column Numbers: {}", len(cols_train)),
):
    print(template.format(value))

[09:31:29] Finish loading training data. △M: +5.2GB. △T: 7.6 seconds.
Train Data Shape: (8798814, 419701)
Train Column Numbers: 419701


In [3]:
df_train = du.load_raw_data("train")
# Map the {-1, 1} labels onto {0, 1}; true division makes the result float.
y = (df_train['label'].values.copy() + 1) / 2

In [4]:
# Alternative scheme (disabled): 3 stratified shuffle splits with a 1/3 hold-out each.
# n_splits = 3
# sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)  # for reproducibility
# split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

# Active scheme: stratified 5-fold, i.e. each fold holds out 1/5 of the rows
# for validation. Only the first fold is consumed further down.
# (The previous inline comment said "use 3 instead of 5", which contradicted
# the value actually set here; change n_splits itself if a different
# train/valid ratio is wanted.)
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits)
# split() already yields (train_index, valid_index) pairs, so materialise it directly.
split_indices = list(skf.split(df_train, y))

In [5]:
# Ad ids per row; they are sliced alongside the features and later handed to
# the custom evaluation builder.
aids = df_train['aid'].values

with pu.profiler("splitting train/valid set"):
    # Only the first fold is used as the train/validation partition.
    train_index, valid_index = split_indices[0]

    X_train = X_tv[train_index, :]
    X_valid = X_tv[valid_index, :]

    y_train = y[train_index]
    y_valid = y[valid_index]

    aids_train = aids[train_index]
    aids_valid = aids[valid_index]

    # Every row must end up in exactly one of the two partitions.
    assert X_train.shape[0] + X_valid.shape[0] == X_tv.shape[0]

    # The full matrix is not needed once it has been split; release it.
    del X_tv
    gc.collect()

for template, matrix in (
    ("Training Set Size (Before masking): {}", X_train),
    ("Validation Set Size (Before masking): {}", X_valid),
):
    print(template.format(matrix.shape))

[09:32:01] Finish splitting train/valid set. △M: +167.85MB. △T: 21.7 seconds.
Training Set Size (Before masking): (7039050, 419701)
Validation Set Size (Before masking): (1759764, 419701)


In [6]:
with pu.profiler("preparing LightGBM data"):
    # Wrap both partitions as LightGBM datasets with float32 features and the
    # training column names attached.
    # NOTE(review): the original author flagged passing feature_name as the
    # trigger of a LightGBM problem they could not resolve; the fallback was to
    # build the datasets without feature names:
    #   lgb.Dataset(X_train.astype(np.float32), y_train)
    #   lgb.Dataset(X_valid.astype(np.float32), y_valid)
    lgb_train = lgb.Dataset(
        data=X_train.astype(np.float32),
        label=y_train,
        feature_name=cols_train,
    )
    lgb_valid = lgb.Dataset(
        data=X_valid.astype(np.float32),
        label=y_valid,
        feature_name=cols_train,
    )
    gc.collect()

[09:32:09] Finish preparing LightGBM data. △M: +5.14GB. △T: 4.1 seconds.


In [7]:
# Directory that receives the training log and the evaluation CSVs of this run.
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0516/')
os.makedirs(log_folder, exist_ok=True)

# Full path of the training log.
log_file = 'v1.log'
log_path = os.path.join(log_folder, log_file)

In [8]:
# Train the LightGBM binary classifier on the first CV fold, with early
# stopping on the validation set and a custom per-ad AUC metric.
#
# v2 parameters
# NOTE(review): this set is labelled "v2", but every artefact of this run is
# written under a 'v1.*' file name — confirm which label is current.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    # NOTE(review): per the LightGBM parameter docs, bagging is only enabled
    # when bagging_freq is set to a non-zero value; it is not set here, so this
    # fraction presumably has no effect — confirm the intent before relying on it.
    'bagging_fraction': 0.8,
    'verbose': 0
}
# Upper bound on boosting rounds; early stopping below may end training sooner.
num_rounds = 1000
# v3 parameters (disabled alternative: shallower trees, lower learning rate,
# stronger feature/row subsampling, more rounds)
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': 'auc',
#     'max_depth': 6,
#     'num_leaves': 64,
#     'learning_rate': 0.1,
#     'feature_fraction': 0.5,
#     'bagging_fraction': 0.5,
#     'verbose': 0
# }
# num_rounds = 5000
# Custom evaluation function built from the per-row ad ids of both partitions;
# it shows up as "online_auc" in the training log.
eval_auc = eu.build_lightgbm_online_auc_eval(aids_train, aids_valid)

# presumably DuplicatedLogger mirrors the training output into log_path — verify in io_utils
with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     feval = eval_auc,
                     early_stopping_rounds=50)

[1]	train's auc: 0.64974	train's online_auc: 0.638729	valid1's auc: 0.650713	valid1's online_auc: 0.640616
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.670797	train's online_auc: 0.656547	valid1's auc: 0.670306	valid1's online_auc: 0.657196
[3]	train's auc: 0.675547	train's online_auc: 0.665327	valid1's auc: 0.674841	valid1's online_auc: 0.666197
[4]	train's auc: 0.678111	train's online_auc: 0.669026	valid1's auc: 0.677255	valid1's online_auc: 0.669059
[5]	train's auc: 0.680935	train's online_auc: 0.672471	valid1's auc: 0.680115	valid1's online_auc: 0.672132
[6]	train's auc: 0.681814	train's online_auc: 0.673578	valid1's auc: 0.680659	valid1's online_auc: 0.672895
[7]	train's auc: 0.682585	train's online_auc: 0.675504	valid1's auc: 0.681417	valid1's online_auc: 0.674852
[8]	train's auc: 0.683799	train's online_auc: 0.676925	valid1's auc: 0.682503	valid1's online_auc: 0.676213
[9]	train's auc: 0.685821	train's online_auc: 0.679821	valid1's auc: 0.6845

[76]	train's auc: 0.739442	train's online_auc: 0.736329	valid1's auc: 0.725884	valid1's online_auc: 0.719671
[77]	train's auc: 0.739965	train's online_auc: 0.736971	valid1's auc: 0.726183	valid1's online_auc: 0.720021
[78]	train's auc: 0.740372	train's online_auc: 0.737609	valid1's auc: 0.726229	valid1's online_auc: 0.720025
[79]	train's auc: 0.740683	train's online_auc: 0.737908	valid1's auc: 0.726328	valid1's online_auc: 0.720107
[80]	train's auc: 0.740994	train's online_auc: 0.73823	valid1's auc: 0.726422	valid1's online_auc: 0.720181
[81]	train's auc: 0.741393	train's online_auc: 0.73868	valid1's auc: 0.726513	valid1's online_auc: 0.720344
[82]	train's auc: 0.741645	train's online_auc: 0.738899	valid1's auc: 0.726604	valid1's online_auc: 0.720459
[83]	train's auc: 0.74204	train's online_auc: 0.739444	valid1's auc: 0.726667	valid1's online_auc: 0.720542
[84]	train's auc: 0.742555	train's online_auc: 0.740301	valid1's auc: 0.726848	valid1's online_auc: 0.720965
[85]	train's auc: 0.74

[152]	train's auc: 0.76427	train's online_auc: 0.765832	valid1's auc: 0.731825	valid1's online_auc: 0.726294
[153]	train's auc: 0.76449	train's online_auc: 0.766078	valid1's auc: 0.731824	valid1's online_auc: 0.7263
[154]	train's auc: 0.764698	train's online_auc: 0.766244	valid1's auc: 0.731843	valid1's online_auc: 0.726309
[155]	train's auc: 0.764968	train's online_auc: 0.76654	valid1's auc: 0.731835	valid1's online_auc: 0.726292
[156]	train's auc: 0.765229	train's online_auc: 0.766823	valid1's auc: 0.731827	valid1's online_auc: 0.726281
[157]	train's auc: 0.765468	train's online_auc: 0.767093	valid1's auc: 0.73183	valid1's online_auc: 0.726242
[158]	train's auc: 0.765785	train's online_auc: 0.767296	valid1's auc: 0.73186	valid1's online_auc: 0.726238
[159]	train's auc: 0.76608	train's online_auc: 0.767619	valid1's auc: 0.731898	valid1's online_auc: 0.726212
[160]	train's auc: 0.766249	train's online_auc: 0.76785	valid1's auc: 0.731907	valid1's online_auc: 0.72622
[161]	train's auc: 0

[227]	train's auc: 0.781707	train's online_auc: 0.785733	valid1's auc: 0.733539	valid1's online_auc: 0.728076
[228]	train's auc: 0.781896	train's online_auc: 0.785902	valid1's auc: 0.733536	valid1's online_auc: 0.728069
[229]	train's auc: 0.782082	train's online_auc: 0.786085	valid1's auc: 0.733578	valid1's online_auc: 0.728091
[230]	train's auc: 0.782236	train's online_auc: 0.786192	valid1's auc: 0.733597	valid1's online_auc: 0.728106
[231]	train's auc: 0.782381	train's online_auc: 0.786369	valid1's auc: 0.733622	valid1's online_auc: 0.728131
[232]	train's auc: 0.78253	train's online_auc: 0.78652	valid1's auc: 0.733618	valid1's online_auc: 0.728108
[233]	train's auc: 0.78265	train's online_auc: 0.786692	valid1's auc: 0.733618	valid1's online_auc: 0.728128
[234]	train's auc: 0.782796	train's online_auc: 0.786798	valid1's auc: 0.733619	valid1's online_auc: 0.728137
[235]	train's auc: 0.782975	train's online_auc: 0.786962	valid1's auc: 0.733629	valid1's online_auc: 0.728165
[236]	train's

In [9]:
# Dump the per-feature importance of the trained booster, most important first.
# NOTE(review): the original author reported that this cell can hit a LightGBM
# problem they could not resolve and suggested skipping it when that happens.
log_file = 'v1.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = (
    pd.DataFrame({"feature": cols_train, "importance": lgbm.feature_importance()})
    .sort_values("importance", ascending=False)
)
df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419614,creativeSize_59,288
915,interest1_49,220
962,interest1_36,191
902,interest1_70,182
1057,interest2_54,172
419610,creativeSize_22,167
3,age_2,157
419699,productType_9,155
1002,interest1_76,141
419207,ct_1,141


In [10]:
with pu.profiler("making prediction on validation set"):
    df_valid = df_train.iloc[valid_index]
    # Predict with the best early-stopped iteration explicitly. On LightGBM
    # releases where predict() defaults to "all trees", omitting this would
    # also use the rounds trained after the optimum while waiting for early
    # stopping to trigger. When no early stop happened best_iteration is <= 0,
    # which LightGBM treats as "use every tree", so this is safe either way.
    proba_valid = lgbm.predict(X_valid.astype(np.float32),
                               num_iteration=lgbm.best_iteration)

# Per-ad AUC table on the validation fold, best ads first.
log_file = 'v1.online_auc.csv'
log_path = os.path.join(log_folder, log_file)
df_score = eu.online_auc(df_valid['aid'], y_valid, proba_valid, ret_verbose=True)
# Build the final table without in-place mutation so the cell can be re-run:
# rename the grouping column, keep only (aid, auc), sort descending by AUC.
df_score = (
    df_score
    .rename(columns={'selector': 'aid'})[['aid', 'auc']]
    .sort_values("auc", ascending=False)
)
df_score.to_csv(log_path, index=False)

[11:11:50] Finish making prediction on validation set. △M: +20.0KB. △T: 19.7 seconds.


In [11]:
# Two views of validation quality:
#   online AUC - unweighted mean of the per-ad AUC values in df_score
#   simple AUC - a single ROC AUC over all validation rows pooled together
simple_auc = metrics.roc_auc_score(y_valid, proba_valid)
online_auc = df_score['auc'].mean()

for template, score in (
    ("Online AUC: {:.6f}", online_auc),
    ("Simple AUC: {:.6f}", simple_auc),
):
    print(template.format(score))

Online AUC: 0.728216
Simple AUC: 0.733727


In [12]:
with pu.profiler("cleaning memory"):
    # Training and validation data are no longer needed once the model is
    # fitted and scored; drop them before the test matrix is loaded.
    del lgb_train, lgb_valid, X_train, X_valid
    gc.collect()

[11:11:53] Finish cleaning memory. △M: -11.12GB. △T: 1.5 seconds.


In [13]:
with pu.profiler("loading testing data"):
    cols_test, X_test = bin_loader.load("test1")
    X_test = sparse.csr_matrix(X_test)
    gc.collect()

# The booster is applied to the test matrix by column position, so the test
# columns must match the training columns exactly (same names, same order).
# Without this check a misaligned matrix would silently produce wrong scores.
assert X_test.shape[1] == len(cols_test), "test matrix width does not match its column list"
assert list(cols_test) == list(cols_train), "test feature columns differ from training feature columns"

print("Test Data Shape: {}".format(X_test.shape))

[11:11:57] Finish loading testing data. △M: +702.55MB. △T: 2.3 seconds.
Test Data Shape: (2265989, 419701)


In [14]:
# NOTE(review): the raw frame is loaded for split "test" while the feature
# matrix comes from split "test1" — presumably the same rows under two names;
# the row-count assertion below is the only guard, confirm with the data utils.
df_test = du.load_raw_data("test")
X_test = X_test.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    # Predict with the best early-stopped iteration explicitly. On LightGBM
    # releases where predict() defaults to "all trees", omitting this would
    # also use the rounds trained after the optimum. best_iteration <= 0
    # (no early stop) is treated by LightGBM as "use every tree".
    proba_test = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)
    # One probability per test row, aligned with the raw test frame.
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[11:12:26] Finish making prediction on testing set. △M: -255.25MB. △T: 23.6 seconds.


In [15]:
# Write the submission: the raw test frame plus a "score" column holding the
# predicted probabilities.
subm_folder = '../../../subm/lgbm/0516_v1'
os.makedirs(subm_folder, exist_ok=True)

subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)

# assign() works on a copy, so df_test itself is left untouched.
subm = df_test.assign(score=proba_test)
subm.to_csv(subm_path, index=False)