In [1]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from contextlib import redirect_stdout
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/')
import data_utils as du
import perf_utils as pu
import io_utils as iu
import config

In [2]:
input_folder = config.INPUT_DIR

def load_binary(mode="train"):
    if mode == "train":
        input_file = "train.raw.binary.pkl"
    elif mode == "test":
        input_file = "test1.raw.binary.pkl"
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    input_path = os.path.join(input_folder, input_file)
    return du.load_pickle(input_path)


def load_multicount(mode="train"):
    if mode == "train":
        input_file = "train.raw.rowCount.pkl"
    elif mode == "test":
        input_file = "test1.raw.rowCount.pkl"
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    input_path = os.path.join(input_folder, input_file)
    return du.load_pickle(input_path)


def load_stacking_bs_clickrate(mode="train"):
    if mode == "train":
        input_file = "train.cross.clickStats_v1.pkl"
    elif mode == "test":
        input_file = "test1.cross.clickStats_v1.pkl"
    else:
        raise ValueError("Unknown mode: {}".format(mode))
    input_path = os.path.join(input_folder, input_file)
    return du.load_pickle(input_path)

In [3]:
with pu.profiler("joining data"):
    cols, X = load_binary("train")
    gc.collect()
    
print("Data Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[17:12:10] Finish joining data. △M: +3.26GB. △T: 6.2 seconds.
Data Shape: (8798814, 419701)
Feature Names Count: 419701
Memory usage at this moment :3.36GB


In [4]:
df_train = du.load_raw_data("train")
y = df_train['label'].values.copy()
y = (y + 1) / 2  # -1, 1 -> 0, 1

In [5]:
# n_splits = 3
# sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1 / 3, random_state=20180505)
# split_indices = [(train_index, valid_index) for train_index, valid_index in sss.split(df_train, y)]

n_splits = 3  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [6]:
with pu.profiler("splitting train/valid set"):
    train_index, valid_index = split_indices[0]
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    assert X_train.shape[0] + X_valid.shape[0] == X.shape[0]
    
    del X
    gc.collect()
    
print("Training Set Size: {}".format(X_train.shape))
print("Validation Set Size: {}".format(X_valid.shape))

[17:12:33] Finish splitting train/valid set. △M: +92.32MB. △T: 18.3 seconds.
Training Set Size: (5865875, 419701)
Validation Set Size: (2932939, 419701)


In [7]:
with pu.profiler("preparing LightGBM data"):
    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols)
    gc.collect()

[17:12:42] Finish preparing LightGBM data. △M: +5.12GB. △T: 9.2 seconds.


In [8]:
log_folder = os.path.join(config.LOG_DIR, 'lgbm/pipeline/0505/')
log_file = 'v3.log'
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [9]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 6,
    'num_leaves': 64,
    'learning_rate': 0.1,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
    'verbose': 0
}
num_rounds = 2000

with iu.DuplicatedLogger(log_path):
    lgbm = lgb.train(params,
                     lgb_train,
                     num_boost_round=num_rounds,
                     valid_sets=[lgb_train, lgb_valid], 
                     valid_names=['train', 'valid1'],
                     early_stopping_rounds=50)

[1]	train's auc: 0.607895	valid1's auc: 0.608426
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.642385	valid1's auc: 0.642872
[3]	train's auc: 0.646879	valid1's auc: 0.64753
[4]	train's auc: 0.649538	valid1's auc: 0.65037
[5]	train's auc: 0.650952	valid1's auc: 0.651717
[6]	train's auc: 0.651994	valid1's auc: 0.652752
[7]	train's auc: 0.65309	valid1's auc: 0.653876
[8]	train's auc: 0.654381	valid1's auc: 0.655069
[9]	train's auc: 0.655236	valid1's auc: 0.656008
[10]	train's auc: 0.656604	valid1's auc: 0.657342
[11]	train's auc: 0.657416	valid1's auc: 0.658076
[12]	train's auc: 0.660376	valid1's auc: 0.660832
[13]	train's auc: 0.659119	valid1's auc: 0.659554
[14]	train's auc: 0.658681	valid1's auc: 0.659167
[15]	train's auc: 0.660216	valid1's auc: 0.660685
[16]	train's auc: 0.661736	valid1's auc: 0.662045
[17]	train's auc: 0.662006	valid1's auc: 0.662229
[18]	train's auc: 0.662962	valid1's auc: 0.663184
[19]	train's auc: 0.662775	valid1's auc: 0.663059


[164]	train's auc: 0.720395	valid1's auc: 0.714795
[165]	train's auc: 0.720446	valid1's auc: 0.714824
[166]	train's auc: 0.720567	valid1's auc: 0.714945
[167]	train's auc: 0.720702	valid1's auc: 0.715008
[168]	train's auc: 0.720768	valid1's auc: 0.715051
[169]	train's auc: 0.720814	valid1's auc: 0.715085
[170]	train's auc: 0.721015	valid1's auc: 0.715247
[171]	train's auc: 0.721085	valid1's auc: 0.715271
[172]	train's auc: 0.721171	valid1's auc: 0.715318
[173]	train's auc: 0.721251	valid1's auc: 0.715356
[174]	train's auc: 0.721362	valid1's auc: 0.715457
[175]	train's auc: 0.721417	valid1's auc: 0.7155
[176]	train's auc: 0.72152	valid1's auc: 0.715578
[177]	train's auc: 0.721627	valid1's auc: 0.715652
[178]	train's auc: 0.721717	valid1's auc: 0.715722
[179]	train's auc: 0.721795	valid1's auc: 0.715767
[180]	train's auc: 0.721833	valid1's auc: 0.715783
[181]	train's auc: 0.721973	valid1's auc: 0.715914
[182]	train's auc: 0.722127	valid1's auc: 0.716047
[183]	train's auc: 0.722242	valid1

[326]	train's auc: 0.734745	valid1's auc: 0.724092
[327]	train's auc: 0.734794	valid1's auc: 0.724116
[328]	train's auc: 0.734884	valid1's auc: 0.724143
[329]	train's auc: 0.734944	valid1's auc: 0.724188
[330]	train's auc: 0.734962	valid1's auc: 0.724194
[331]	train's auc: 0.735028	valid1's auc: 0.724222
[332]	train's auc: 0.735068	valid1's auc: 0.724241
[333]	train's auc: 0.735159	valid1's auc: 0.724298
[334]	train's auc: 0.735222	valid1's auc: 0.724338
[335]	train's auc: 0.735258	valid1's auc: 0.724354
[336]	train's auc: 0.735356	valid1's auc: 0.724377
[337]	train's auc: 0.735423	valid1's auc: 0.724392
[338]	train's auc: 0.735485	valid1's auc: 0.724421
[339]	train's auc: 0.735531	valid1's auc: 0.72444
[340]	train's auc: 0.735562	valid1's auc: 0.724454
[341]	train's auc: 0.735584	valid1's auc: 0.724459
[342]	train's auc: 0.735633	valid1's auc: 0.724471
[343]	train's auc: 0.73567	valid1's auc: 0.724488
[344]	train's auc: 0.735734	valid1's auc: 0.724519
[345]	train's auc: 0.735836	valid

[488]	train's auc: 0.743957	valid1's auc: 0.728282
[489]	train's auc: 0.744064	valid1's auc: 0.728397
[490]	train's auc: 0.744138	valid1's auc: 0.728431
[491]	train's auc: 0.744193	valid1's auc: 0.72844
[492]	train's auc: 0.744219	valid1's auc: 0.728465
[493]	train's auc: 0.744253	valid1's auc: 0.72848
[494]	train's auc: 0.744277	valid1's auc: 0.72849
[495]	train's auc: 0.74431	valid1's auc: 0.728505
[496]	train's auc: 0.744333	valid1's auc: 0.728512
[497]	train's auc: 0.74435	valid1's auc: 0.728513
[498]	train's auc: 0.744422	valid1's auc: 0.72855
[499]	train's auc: 0.744449	valid1's auc: 0.728555
[500]	train's auc: 0.74449	valid1's auc: 0.728583
[501]	train's auc: 0.744507	valid1's auc: 0.728592
[502]	train's auc: 0.744541	valid1's auc: 0.728601
[503]	train's auc: 0.744553	valid1's auc: 0.728607
[504]	train's auc: 0.744595	valid1's auc: 0.728611
[505]	train's auc: 0.74463	valid1's auc: 0.728619
[506]	train's auc: 0.744717	valid1's auc: 0.728695
[507]	train's auc: 0.744765	valid1's au

[650]	train's auc: 0.750839	valid1's auc: 0.731052
[651]	train's auc: 0.75086	valid1's auc: 0.731056
[652]	train's auc: 0.750884	valid1's auc: 0.731059
[653]	train's auc: 0.750944	valid1's auc: 0.731073
[654]	train's auc: 0.750971	valid1's auc: 0.731086
[655]	train's auc: 0.751018	valid1's auc: 0.731097
[656]	train's auc: 0.751038	valid1's auc: 0.731102
[657]	train's auc: 0.751063	valid1's auc: 0.731112
[658]	train's auc: 0.75108	valid1's auc: 0.73111
[659]	train's auc: 0.751109	valid1's auc: 0.731124
[660]	train's auc: 0.751133	valid1's auc: 0.73113
[661]	train's auc: 0.75118	valid1's auc: 0.731145
[662]	train's auc: 0.75121	valid1's auc: 0.731153
[663]	train's auc: 0.751241	valid1's auc: 0.731155
[664]	train's auc: 0.751302	valid1's auc: 0.731165
[665]	train's auc: 0.751354	valid1's auc: 0.73119
[666]	train's auc: 0.751373	valid1's auc: 0.731191
[667]	train's auc: 0.751396	valid1's auc: 0.731196
[668]	train's auc: 0.751458	valid1's auc: 0.731205
[669]	train's auc: 0.751483	valid1's a

[812]	train's auc: 0.755807	valid1's auc: 0.732205
[813]	train's auc: 0.755838	valid1's auc: 0.732211
[814]	train's auc: 0.755869	valid1's auc: 0.732205
[815]	train's auc: 0.755924	valid1's auc: 0.732221
[816]	train's auc: 0.755955	valid1's auc: 0.732219
[817]	train's auc: 0.755984	valid1's auc: 0.732214
[818]	train's auc: 0.756003	valid1's auc: 0.732215
[819]	train's auc: 0.756033	valid1's auc: 0.732222
[820]	train's auc: 0.756056	valid1's auc: 0.732223
[821]	train's auc: 0.7561	valid1's auc: 0.732241
[822]	train's auc: 0.756135	valid1's auc: 0.73224
[823]	train's auc: 0.756147	valid1's auc: 0.732238
[824]	train's auc: 0.75617	valid1's auc: 0.732243
[825]	train's auc: 0.756198	valid1's auc: 0.732247
[826]	train's auc: 0.756213	valid1's auc: 0.732243
[827]	train's auc: 0.756236	valid1's auc: 0.73225
[828]	train's auc: 0.756256	valid1's auc: 0.732256
[829]	train's auc: 0.756299	valid1's auc: 0.732276
[830]	train's auc: 0.756324	valid1's auc: 0.732276
[831]	train's auc: 0.756421	valid1's

[974]	train's auc: 0.760351	valid1's auc: 0.733171
[975]	train's auc: 0.760389	valid1's auc: 0.73319
[976]	train's auc: 0.76042	valid1's auc: 0.733192
[977]	train's auc: 0.760441	valid1's auc: 0.733194
[978]	train's auc: 0.760455	valid1's auc: 0.733201
[979]	train's auc: 0.76049	valid1's auc: 0.733204
[980]	train's auc: 0.760506	valid1's auc: 0.733201
[981]	train's auc: 0.760529	valid1's auc: 0.733202
[982]	train's auc: 0.760537	valid1's auc: 0.7332
[983]	train's auc: 0.760541	valid1's auc: 0.733203
[984]	train's auc: 0.760555	valid1's auc: 0.733211
[985]	train's auc: 0.76057	valid1's auc: 0.733208
[986]	train's auc: 0.760596	valid1's auc: 0.733215
[987]	train's auc: 0.760622	valid1's auc: 0.73322
[988]	train's auc: 0.760645	valid1's auc: 0.733225
[989]	train's auc: 0.760688	valid1's auc: 0.733225
[990]	train's auc: 0.760708	valid1's auc: 0.733227
[991]	train's auc: 0.760728	valid1's auc: 0.733231
[992]	train's auc: 0.760752	valid1's auc: 0.733233
[993]	train's auc: 0.760787	valid1's a

[1133]	train's auc: 0.764537	valid1's auc: 0.734002
[1134]	train's auc: 0.764555	valid1's auc: 0.734003
[1135]	train's auc: 0.764567	valid1's auc: 0.734
[1136]	train's auc: 0.76459	valid1's auc: 0.733994
[1137]	train's auc: 0.764621	valid1's auc: 0.733998
[1138]	train's auc: 0.764645	valid1's auc: 0.734001
[1139]	train's auc: 0.764663	valid1's auc: 0.733995
[1140]	train's auc: 0.764684	valid1's auc: 0.733996
[1141]	train's auc: 0.764704	valid1's auc: 0.733996
[1142]	train's auc: 0.764735	valid1's auc: 0.733997
[1143]	train's auc: 0.764746	valid1's auc: 0.733995
[1144]	train's auc: 0.764775	valid1's auc: 0.734
[1145]	train's auc: 0.764791	valid1's auc: 0.734006
[1146]	train's auc: 0.764835	valid1's auc: 0.734005
[1147]	train's auc: 0.764855	valid1's auc: 0.734007
[1148]	train's auc: 0.764883	valid1's auc: 0.734007
[1149]	train's auc: 0.764906	valid1's auc: 0.734006
[1150]	train's auc: 0.764934	valid1's auc: 0.734008
[1151]	train's auc: 0.764951	valid1's auc: 0.734009
[1152]	train's auc:

[1292]	train's auc: 0.767994	valid1's auc: 0.734364
[1293]	train's auc: 0.768022	valid1's auc: 0.734368
[1294]	train's auc: 0.768039	valid1's auc: 0.734373
[1295]	train's auc: 0.768071	valid1's auc: 0.734387
[1296]	train's auc: 0.768088	valid1's auc: 0.734386
[1297]	train's auc: 0.768117	valid1's auc: 0.73439
[1298]	train's auc: 0.768135	valid1's auc: 0.734389
[1299]	train's auc: 0.768153	valid1's auc: 0.734388
[1300]	train's auc: 0.768159	valid1's auc: 0.734387
[1301]	train's auc: 0.768175	valid1's auc: 0.73439
[1302]	train's auc: 0.768197	valid1's auc: 0.734384
[1303]	train's auc: 0.768248	valid1's auc: 0.734409
[1304]	train's auc: 0.768272	valid1's auc: 0.734403
[1305]	train's auc: 0.768285	valid1's auc: 0.734408
[1306]	train's auc: 0.768304	valid1's auc: 0.734411
[1307]	train's auc: 0.768317	valid1's auc: 0.734416
[1308]	train's auc: 0.768344	valid1's auc: 0.734415
[1309]	train's auc: 0.768506	valid1's auc: 0.734536
[1310]	train's auc: 0.768528	valid1's auc: 0.734536
[1311]	train's

[1450]	train's auc: 0.771428	valid1's auc: 0.7348
[1451]	train's auc: 0.771449	valid1's auc: 0.734809
[1452]	train's auc: 0.771498	valid1's auc: 0.734805
[1453]	train's auc: 0.771521	valid1's auc: 0.734809
[1454]	train's auc: 0.771577	valid1's auc: 0.734829
[1455]	train's auc: 0.77159	valid1's auc: 0.73483
[1456]	train's auc: 0.771607	valid1's auc: 0.734833
[1457]	train's auc: 0.771625	valid1's auc: 0.734833
[1458]	train's auc: 0.771632	valid1's auc: 0.734831
[1459]	train's auc: 0.771641	valid1's auc: 0.73483
[1460]	train's auc: 0.771661	valid1's auc: 0.734828
[1461]	train's auc: 0.771681	valid1's auc: 0.734827
[1462]	train's auc: 0.77169	valid1's auc: 0.734826
[1463]	train's auc: 0.771698	valid1's auc: 0.734825
[1464]	train's auc: 0.771721	valid1's auc: 0.734829
[1465]	train's auc: 0.771734	valid1's auc: 0.73483
[1466]	train's auc: 0.771758	valid1's auc: 0.734834
[1467]	train's auc: 0.77178	valid1's auc: 0.734829
[1468]	train's auc: 0.771798	valid1's auc: 0.734834
[1469]	train's auc: 

[1609]	train's auc: 0.774779	valid1's auc: 0.735379
[1610]	train's auc: 0.774788	valid1's auc: 0.735376
[1611]	train's auc: 0.77481	valid1's auc: 0.735377
[1612]	train's auc: 0.774826	valid1's auc: 0.735372
[1613]	train's auc: 0.774845	valid1's auc: 0.735372
[1614]	train's auc: 0.77487	valid1's auc: 0.735376
[1615]	train's auc: 0.774903	valid1's auc: 0.735375
[1616]	train's auc: 0.774915	valid1's auc: 0.735378
[1617]	train's auc: 0.774934	valid1's auc: 0.735378
[1618]	train's auc: 0.774957	valid1's auc: 0.735375
[1619]	train's auc: 0.774975	valid1's auc: 0.735373
[1620]	train's auc: 0.774993	valid1's auc: 0.73537
[1621]	train's auc: 0.774999	valid1's auc: 0.735371
[1622]	train's auc: 0.775011	valid1's auc: 0.73537
[1623]	train's auc: 0.775022	valid1's auc: 0.735371
[1624]	train's auc: 0.775028	valid1's auc: 0.735372
[1625]	train's auc: 0.775039	valid1's auc: 0.73537
[1626]	train's auc: 0.775063	valid1's auc: 0.735369
[1627]	train's auc: 0.775082	valid1's auc: 0.735368
[1628]	train's au

[1768]	train's auc: 0.777906	valid1's auc: 0.735979
[1769]	train's auc: 0.777933	valid1's auc: 0.735986
[1770]	train's auc: 0.778033	valid1's auc: 0.73605
[1771]	train's auc: 0.77805	valid1's auc: 0.736055
[1772]	train's auc: 0.778066	valid1's auc: 0.736058
[1773]	train's auc: 0.778086	valid1's auc: 0.736077
[1774]	train's auc: 0.778098	valid1's auc: 0.73608
[1775]	train's auc: 0.77812	valid1's auc: 0.736078
[1776]	train's auc: 0.778148	valid1's auc: 0.736076
[1777]	train's auc: 0.778197	valid1's auc: 0.736081
[1778]	train's auc: 0.778208	valid1's auc: 0.736084
[1779]	train's auc: 0.778223	valid1's auc: 0.736087
[1780]	train's auc: 0.778269	valid1's auc: 0.736109
[1781]	train's auc: 0.778293	valid1's auc: 0.736106
[1782]	train's auc: 0.778308	valid1's auc: 0.736103
[1783]	train's auc: 0.778313	valid1's auc: 0.736105
[1784]	train's auc: 0.778322	valid1's auc: 0.736105
[1785]	train's auc: 0.778335	valid1's auc: 0.736116
[1786]	train's auc: 0.778385	valid1's auc: 0.73613
[1787]	train's au

In [10]:
log_file = 'v3.feature_importance.csv'
log_path = os.path.join(log_folder, log_file)

df_feature_importance = pd.DataFrame({"feature": cols, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419614,creativeSize_59,511
3,age_2,416
11,marriageStatus_10,329
419207,ct_1,326
1057,interest2_54,320
419610,creativeSize_22,317
419611,creativeSize_35,317
915,interest1_49,302
419697,productType_4,289
1002,interest1_76,289


In [11]:
with pu.profiler("cleaning memory"):
    del lgb_train
    del lgb_valid
    del X_train
    del X_valid
    gc.collect()

[01:05:46] Finish cleaning memory. △M: -8.79GB. △T: 1.4 seconds.


In [12]:
with pu.profiler("loading binary features"):
    cols, X = load_binary("test")

assert X.shape[1] == len(cols)
    
print("Data Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[01:07:12] Finish loading binary features. △M: +698.97MB. △T: 1.0 seconds.
Data Shape: (2265989, 419701)
Feature Names Count: 419701
Memory usage at this moment :3.58GB


In [13]:
df_test = du.load_raw_data("test")
X = X.astype(np.float32)

with pu.profiler("making prediction on testing set"):
    proba_test = lgbm.predict(X)
    assert len(proba_test.shape) == 1
    assert proba_test.shape[0] == df_test.shape[0]

[01:08:38] Finish making prediction on testing set. △M: -127.52MB. △T: 1.2 minutes.


In [14]:
subm_folder = '../../../subm/lgbm/0505_v3'
subm_file = 'submission.csv'
subm_path = os.path.join(subm_folder, subm_file)
os.makedirs(subm_folder, exist_ok=True)

subm = df_test.copy()
subm["score"] = proba_test
subm.to_csv(subm_path, index=False)