In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_utils as du
import perf_utils as pu
import io_utils as iu
import config

In [2]:
input_folder = config.INPUT_DIR

def load_binary(mode="train"):
    if mode == "train":
        input_file = "train.raw.binary.pkl"
    elif mode == "test":
        input_file = "test1.raw.binary.pkl"
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    input_path = os.path.join(input_folder, input_file)
    return du.load_pickle(input_path)


def load_multicount(mode="train"):
    if mode == "train":
        input_file = "train.raw.rowCount.pkl"
    elif mode == "test":
        input_file = "test1.raw.rowCount.pkl"
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    input_path = os.path.join(input_folder, input_file)
    return du.load_pickle(input_path)


def load_stacking_bs_clickrate(mode="train"):
    if mode == "train":
        input_file = "train.cross.clickStats_v1.pkl"
    elif mode == "test":
        input_file = "test1.cross.clickStats_v1.pkl"
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    input_path = os.path.join(input_folder, input_file)
    return du.load_pickle(input_path)

In [3]:
with pu.profiler("loading binary features"):
    bin_cols, X_bin = load_binary("train")

with pu.profiler("loading multi value count features"):
    ckr_cols, X_ckr = load_stacking_bs_clickrate("train")
    
assert X_bin.shape[1] == len(bin_cols)
assert X_ckr.shape[1] == len(ckr_cols)
assert X_bin.shape[0] == X_ckr.shape[0]
print("Binary Feature Shape: {}".format(X_bin.shape))
print("Cross Clicks Feature Shape: {}".format(X_ckr.shape))

[15:57:15] Finish loading binary features. △M: +3.26GB. △T: 9.8 seconds.
[15:57:15] Finish loading multi value count features. △M: +268.54MB. △T: 0.8 seconds.
Binary Feature Shape: (8798814, 419701)
Cross Clicks Feature Shape: (8798814, 8)


In [4]:
with pu.profiler("joining data"):
    cols = bin_cols + ckr_cols
    X = sparse.hstack((X_bin, X_ckr))
    X = sparse.csr_matrix(X)
    del X_bin
    del X_ckr
    gc.collect()
    
print("Joined Data Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[16:00:11] Finish joining data. △M: +2.19GB. △T: 1.2 minutes.
Joined Data Shape: (8798814, 419709)
Feature Names Count: 419709
Memory usage at this moment :5.81GB


In [5]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [6]:
n_splits = 3
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)
split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

# n_splits = 3  # use 3 instead of 5 to save time
# skf = StratifiedKFold(n_splits=n_splits)
# split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [7]:
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X.shape[0]
    
    del X
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[16:04:09] Finish splitting train/valid set. △M: +79.91MB. △T: 48.4 seconds.
Training Set Size: (5865876, 419709)
Validation Set Size: (2932938, 419709)


In [8]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols)
    gc.collect()

[16:04:17] Finish preparing LightGBM data. △M: +5.65GB. △T: 8.0 seconds.


In [9]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0505/')
log_file = 'v2.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [10]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 500

with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     early_stopping_rounds=50)

[1]	train's auc: 0.664869	valid1's auc: 0.663328
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.684749	valid1's auc: 0.683192
[3]	train's auc: 0.686767	valid1's auc: 0.68536
[4]	train's auc: 0.687861	valid1's auc: 0.686481
[5]	train's auc: 0.688993	valid1's auc: 0.687545
[6]	train's auc: 0.690176	valid1's auc: 0.688859
[7]	train's auc: 0.691042	valid1's auc: 0.689639
[8]	train's auc: 0.693546	valid1's auc: 0.691963
[9]	train's auc: 0.694649	valid1's auc: 0.69309
[10]	train's auc: 0.695311	valid1's auc: 0.693668
[11]	train's auc: 0.697143	valid1's auc: 0.695521
[12]	train's auc: 0.698508	valid1's auc: 0.696729
[13]	train's auc: 0.698889	valid1's auc: 0.696939
[14]	train's auc: 0.701022	valid1's auc: 0.698899
[15]	train's auc: 0.702004	valid1's auc: 0.699779
[16]	train's auc: 0.702665	valid1's auc: 0.700349
[17]	train's auc: 0.703336	valid1's auc: 0.700943
[18]	train's auc: 0.704249	valid1's auc: 0.701715
[19]	train's auc: 0.70549	valid1's auc: 0.702823


[164]	train's auc: 0.768367	valid1's auc: 0.734771
[165]	train's auc: 0.768622	valid1's auc: 0.734768
[166]	train's auc: 0.769141	valid1's auc: 0.73508
[167]	train's auc: 0.769274	valid1's auc: 0.735092
[168]	train's auc: 0.769514	valid1's auc: 0.735082
[169]	train's auc: 0.769854	valid1's auc: 0.735141
[170]	train's auc: 0.770032	valid1's auc: 0.735159
[171]	train's auc: 0.770162	valid1's auc: 0.735189
[172]	train's auc: 0.77027	valid1's auc: 0.735196
[173]	train's auc: 0.770401	valid1's auc: 0.735208
[174]	train's auc: 0.770581	valid1's auc: 0.73521
[175]	train's auc: 0.770794	valid1's auc: 0.735225
[176]	train's auc: 0.770999	valid1's auc: 0.735229
[177]	train's auc: 0.771186	valid1's auc: 0.735239
[178]	train's auc: 0.771463	valid1's auc: 0.735232
[179]	train's auc: 0.771721	valid1's auc: 0.735213
[180]	train's auc: 0.771926	valid1's auc: 0.735213
[181]	train's auc: 0.772241	valid1's auc: 0.735292
[182]	train's auc: 0.772515	valid1's auc: 0.735307
[183]	train's auc: 0.772692	valid1

[326]	train's auc: 0.796688	valid1's auc: 0.736081
[327]	train's auc: 0.796792	valid1's auc: 0.736067
[328]	train's auc: 0.796914	valid1's auc: 0.736072
[329]	train's auc: 0.797083	valid1's auc: 0.736072
[330]	train's auc: 0.797163	valid1's auc: 0.736061
[331]	train's auc: 0.797273	valid1's auc: 0.736047
[332]	train's auc: 0.797413	valid1's auc: 0.73607
[333]	train's auc: 0.7976	valid1's auc: 0.736076
[334]	train's auc: 0.797773	valid1's auc: 0.736188
[335]	train's auc: 0.797988	valid1's auc: 0.736194
[336]	train's auc: 0.798136	valid1's auc: 0.736206
[337]	train's auc: 0.798317	valid1's auc: 0.73619
[338]	train's auc: 0.798529	valid1's auc: 0.736184
[339]	train's auc: 0.798577	valid1's auc: 0.736178
[340]	train's auc: 0.798727	valid1's auc: 0.736171
[341]	train's auc: 0.798849	valid1's auc: 0.736145
[342]	train's auc: 0.798947	valid1's auc: 0.736146
[343]	train's auc: 0.799074	valid1's auc: 0.736179
[344]	train's auc: 0.799214	valid1's auc: 0.736158
[345]	train's auc: 0.799445	valid1'

In [11]:
log_file = 'v2.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame({"feature": cols, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419701,bsClickrate@aid_x_age,1301
419707,bsClickrate@aid_x_LBS,1138
419705,bsClickrate@aid_x_consumptionAbility,1111
419703,bsClickrate@aid_x_education,994
419708,impression@aid_x_LBS,858
419702,impression@aid_x_age,696
419704,impression@aid_x_education,639
419706,impression@aid_x_consumptionAbility,635
419614,creativeSize_59,287
419610,creativeSize_22,196


In [12]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    del X_train
    del X_valid
    gc.collect()

[16:56:40] Finish cleaning memory. △M: -12.12GB. △T: 1.8 seconds.


In [15]:
with pu.profiler("loading binary features"):
    bin_cols, X_bin = load_binary("test")

with pu.profiler("loading multi value count features"):
    ckr_cols, X_ckr = load_stacking_bs_clickrate("test")
    
assert X_bin.shape[1] == len(bin_cols)
assert X_ckr.shape[1] == len(ckr_cols)
assert X_bin.shape[0] == X_ckr.shape[0]
print("Binary Feature Shape: {}".format(X_bin.shape))
print("Cross Clicks Feature Shape: {}".format(X_ckr.shape))

with pu.profiler("joining data"):
    cols = bin_cols + ckr_cols
    X = sparse.hstack((X_bin, X_ckr))
    X = sparse.csr_matrix(X)
    del X_bin
    del X_ckr
    gc.collect()
    
print("Joined Data Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[16:59:54] Finish loading binary features. △M: +676.47MB. △T: 1.1 seconds.
[16:59:54] Finish loading multi value count features. △M: +0B. △T: 0.0 seconds.
Binary Feature Shape: (2265989, 419701)
Cross Clicks Feature Shape: (2265989, 8)
[17:00:02] Finish joining data. △M: +814.61MB. △T: 8.4 seconds.
Joined Data Shape: (2265989, 419709)
Feature Names Count: 419709
Memory usage at this moment :4.51GB


In [16]:
df_test = du.load_raw_data("test")
X = X.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[17:00:44] Finish making prediction on testing set. △M: +16.0KB. △T: 32.0 seconds.


In [17]:
subm_folder = '../../../subm/lgbm/0505_v2'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)