In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../code/utils/')
sys.path.append('../code/')
import data_utils as du
import perf_utils as pu
import config

In [2]:
class DuplicatedLogger(object):
    """Lumberjack class - duplicates sys.stdout to a log file.
    
    Adapted from https://stackoverflow.com/q/616645
    """
    def __init__(self, filename, mode="a", buffer=10):
        self.stdout = sys.stdout
        self.file = open(filename, mode, buffer)
        sys.stdout = self

    def __del__(self):
        self.close()

    def __enter__(self):
        pass

    def __exit__(self, *args):
        self.close()

    def write(self, message):
        self.stdout.write(message)
        self.file.write(message)

    def flush(self):
        self.stdout.flush()
        self.file.flush()
        os.fsync(self.file.fileno())

    def close(self):
        if self.stdout is not None:
            sys.stdout = self.stdout
            self.stdout = None

        if self.file is not None:
            self.file.close()

In [3]:
input_folder = config.INPUT_DIR

def load_binary(mode="train"):
    if mode == "train":
        input_file = "train.raw.binary.pkl"
    elif mode == "test":
        input_file = "test1.raw.binary.pkl"
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    input_path = os.path.join(input_folder, input_file)
    return du.load_pickle(input_path)


def load_multicount(mode="train"):
    if mode == "train":
        input_file = "train.raw.rowCount.pkl"
    elif mode == "test":
        input_file = "test1.raw.rowCount.pkl"
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    input_path = os.path.join(input_folder, input_file)
    return du.load_pickle(input_path)

In [4]:
with pu.profiler("loading binary features"):
    bin_cols, X_bin = load_binary("train")

with pu.profiler("loading multi value count features"):
    multicount_cols, X_multicount = load_multicount("train")
    
assert X_bin.shape[1] == len(bin_cols)
assert X_multicount.shape[1] == len(multicount_cols)
assert X_bin.shape[0] == X_multicount.shape[0]
print("Binary Feature Shape: {}".format(X_bin.shape))
print("Multi-value Count Feature Shape: {}".format(X_multicount.shape))

[14:54:11] Finish loading binary features. △M: +3.26GB. △T: 10.8 seconds.
[14:54:13] Finish loading multi value count features. △M: +963.44MB. △T: 2.7 seconds.
Binary Feature Shape: (8798814, 419701)
Multi-value Count Feature Shape: (8798814, 16)


In [5]:
with pu.profiler("joining data"):
    cols = bin_cols + multicount_cols
    X = sparse.hstack((X_bin, X_multicount))
    X = sparse.csr_matrix(X)
    del X_bin
    del X_multicount
    gc.collect()
    
print("Joined Data Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[14:55:28] Finish joining data. △M: +4.45GB. △T: 1.2 minutes.
Joined Data Shape: (8798814, 419717)
Feature Names Count: 419717
Memory usage at this moment :8.75GB


In [6]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [7]:
# n_splits = 5
# sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=20180505)
# split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

n_splits = 3  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [8]:
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X.shape[0]
    
    del X
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[14:56:03] Finish splitting train/valid set. △M: +69.94MB. △T: 28.5 seconds.
Training Set Size: (5865875, 419717)
Validation Set Size: (2932939, 419717)


In [9]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols)
    gc.collect()

[14:56:22] Finish preparing LightGBM data. △M: +5.75GB. △T: 18.6 seconds.


In [10]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0505/')
log_file = 'v1.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [11]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 500

with DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     early_stopping_rounds=50)

[1]	train's auc: 0.644096	valid1's auc: 0.64484
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.67116	valid1's auc: 0.671451
[3]	train's auc: 0.67542	valid1's auc: 0.675298
[4]	train's auc: 0.679707	valid1's auc: 0.679858
[5]	train's auc: 0.681723	valid1's auc: 0.681894
[6]	train's auc: 0.682095	valid1's auc: 0.682318
[7]	train's auc: 0.682466	valid1's auc: 0.682524
[8]	train's auc: 0.683862	valid1's auc: 0.684055
[9]	train's auc: 0.685492	valid1's auc: 0.685772
[10]	train's auc: 0.686555	valid1's auc: 0.686728
[11]	train's auc: 0.688598	valid1's auc: 0.688655
[12]	train's auc: 0.689363	valid1's auc: 0.689375
[13]	train's auc: 0.690337	valid1's auc: 0.690238
[14]	train's auc: 0.691479	valid1's auc: 0.691443
[15]	train's auc: 0.692156	valid1's auc: 0.692085
[16]	train's auc: 0.692965	valid1's auc: 0.692781
[17]	train's auc: 0.694087	valid1's auc: 0.693848
[18]	train's auc: 0.695429	valid1's auc: 0.69512
[19]	train's auc: 0.696424	valid1's auc: 0.696009
[

[164]	train's auc: 0.765585	valid1's auc: 0.734769
[165]	train's auc: 0.765875	valid1's auc: 0.734769
[166]	train's auc: 0.766159	valid1's auc: 0.734755
[167]	train's auc: 0.766465	valid1's auc: 0.734799
[168]	train's auc: 0.766873	valid1's auc: 0.734947
[169]	train's auc: 0.767192	valid1's auc: 0.734912
[170]	train's auc: 0.767421	valid1's auc: 0.734971
[171]	train's auc: 0.767645	valid1's auc: 0.734995
[172]	train's auc: 0.767758	valid1's auc: 0.735019
[173]	train's auc: 0.768062	valid1's auc: 0.73506
[174]	train's auc: 0.768239	valid1's auc: 0.735089
[175]	train's auc: 0.768425	valid1's auc: 0.735098
[176]	train's auc: 0.768609	valid1's auc: 0.735102
[177]	train's auc: 0.768851	valid1's auc: 0.735146
[178]	train's auc: 0.769066	valid1's auc: 0.735149
[179]	train's auc: 0.769281	valid1's auc: 0.73515
[180]	train's auc: 0.769393	valid1's auc: 0.73516
[181]	train's auc: 0.769556	valid1's auc: 0.735162
[182]	train's auc: 0.769772	valid1's auc: 0.735185
[183]	train's auc: 0.770192	valid1

[326]	train's auc: 0.7935	valid1's auc: 0.736217
[327]	train's auc: 0.793682	valid1's auc: 0.736207
[328]	train's auc: 0.793751	valid1's auc: 0.736205
[329]	train's auc: 0.793943	valid1's auc: 0.736209
[330]	train's auc: 0.794081	valid1's auc: 0.736211
[331]	train's auc: 0.794266	valid1's auc: 0.736199
[332]	train's auc: 0.794432	valid1's auc: 0.736187
[333]	train's auc: 0.794533	valid1's auc: 0.736189
[334]	train's auc: 0.794645	valid1's auc: 0.736184
[335]	train's auc: 0.794783	valid1's auc: 0.736186
[336]	train's auc: 0.794921	valid1's auc: 0.736206
[337]	train's auc: 0.79498	valid1's auc: 0.736206
[338]	train's auc: 0.795041	valid1's auc: 0.736192
[339]	train's auc: 0.795147	valid1's auc: 0.736197
[340]	train's auc: 0.795276	valid1's auc: 0.736179
[341]	train's auc: 0.795393	valid1's auc: 0.736186
[342]	train's auc: 0.795511	valid1's auc: 0.736182
[343]	train's auc: 0.795643	valid1's auc: 0.736177
[344]	train's auc: 0.795739	valid1's auc: 0.736173
[345]	train's auc: 0.795829	valid1

In [12]:
log_file = 'v1.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame({"feature": cols, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419702,valueCount@interest1,719
419706,valueCount@interest5,641
419703,valueCount@interest2,523
419614,creativeSize_59,375
3,age_2,252
419699,productType_9,210
419610,creativeSize_22,207
11,marriageStatus_10,168
419611,creativeSize_35,160
419697,productType_4,156


In [14]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    del X_train
    del X_valid
    gc.collect()

[15:56:08] Finish cleaning memory. △M: -15.22GB. △T: 3.1 seconds.


In [15]:
with pu.profiler("loading binary features"):
    bin_cols, X_bin = load_binary("test")

with pu.profiler("loading multi value count features"):
    multicount_cols, X_multicount = load_multicount("test")
    
assert X_bin.shape[1] == len(bin_cols)
assert X_multicount.shape[1] == len(multicount_cols)
assert X_bin.shape[0] == X_multicount.shape[0]
print("Binary Feature Shape: {}".format(X_bin.shape))
print("Multi-value Count Feature Shape: {}".format(X_multicount.shape))

with pu.profiler("joining data"):
    cols = bin_cols + multicount_cols
    X = sparse.hstack((X_bin, X_multicount))
    X = sparse.csr_matrix(X)
    del X_bin
    del X_multicount
    gc.collect()
    
print("Joined Data Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[15:56:48] Finish loading binary features. △M: +699.77MB. △T: 2.3 seconds.
[15:56:48] Finish loading multi value count features. △M: +8.0KB. △T: 0.3 seconds.
Binary Feature Shape: (2265989, 419701)
Multi-value Count Feature Shape: (2265989, 16)
[15:57:04] Finish joining data. △M: +836.12MB. △T: 16.0 seconds.
Joined Data Shape: (2265989, 419717)
Feature Names Count: 419717
Memory usage at this moment :4.44GB


In [18]:
df_test = du.load_raw_data("test")
X = X.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[15:59:15] Finish making prediction on testing set. △M: +1.23MB. △T: 1.1 minutes.


In [19]:
subm_folder = '../subm/lgbm/0505_v1'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)