In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import lightgbm as lgb
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../code/utils')
import data_utils as du
import perf_utils as pu

In [2]:
# Load the raw "train", "test" and "ad" tables through the project data helper,
# in that order.
df_train, df_test, df_ad = (
    du.load_raw_data(table_name) for table_name in ("train", "test", "ad")
)

In [3]:
# Ad-side feature columns, split into numeric and categorical groups.
ad_num_feat_names = ['creativeSize']
ad_cat_feat_names = [
    'aid',
    'advertiserId',
    'campaignId',
    'creativeId',
    'adCategoryId',
    'productId',
    'productType',
]

# Full ad feature list: numeric names first, then categorical ones.
ad_feat_names = list(ad_num_feat_names)
ad_feat_names.extend(ad_cat_feat_names)

In [4]:
# User features for which one user has only one value.
user_one_feat_names = [
    'age', 'gender', 'education', 'consumptionAbility',
    'LBS', 'carrier', 'house',
]

# User features for which one user can have more than one value.
user_multi_feat_names = [
    'marriageStatus',
    'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
    'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
    'appIdInstall', 'appIdAction',
    'ct', 'os',
]

# Full user feature list: single-valued names first, then multi-valued ones.
user_feat_names = list(user_one_feat_names)
user_feat_names.extend(user_multi_feat_names)

In [5]:
# Join the user and ad feature matrices onto the training rows. The profiler
# context prints the elapsed time and memory delta when the block finishes.
join_kwargs = dict(
    ad_user=df_train,
    user_feat_names=user_feat_names,
    ad_feat_names=ad_feat_names,
)
with pu.profiler("loading and joining user and ad data"):
    X, feat_names = du.quick_join(**join_kwargs)

# The joined feature names are not used later in this notebook; drop them
# (and the temporary kwargs) before reporting memory usage.
del feat_names, join_kwargs
gc.collect()
memory_message = "Memory usage at this moment :{}"
print(memory_message.format(pu.get_memory_str()))

loading user matrices: 100%|██████████| 23/23 [12:04<00:00, 31.51s/it]
loading ad matrices: 100%|██████████| 8/8 [01:53<00:00, 14.20s/it]

[07:05:12] Finish loading and joining user and ad data. △M: +5.89GB. △T: 14.0 minutes.
Memory usage at this moment :6.19GB





In [6]:
# Store the joined matrix in CSR format; later cells index it by row
# (X[train_index, :]) when building the folds.
X = sparse.csr_matrix(X)
shape_message = "Combined Matrix Shape: {}".format(X.shape)
print(shape_message)
gc.collect()

Combined Matrix Shape: (8798814, 419874)


0

In [7]:
# Rescale the raw labels with (label + 1) / 2 -- presumably mapping -1/+1 to
# 0/1 (TODO confirm against the raw data) -- and check there is exactly one
# label per row of X.
y = (df_train['label'].values + 1) / 2
assert y.shape[0] == X.shape[0]

In [8]:
n_splits = 3  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
# Materialise every (train_index, valid_index) pair so a fold can be picked by position.
split_indices = list(skf.split(df_train, y))

In [9]:
# Only the first fold is used: its training part fits the GBDT and its
# validation part is held out for evaluation.
train_index, valid_index = split_indices[0]

X_train, X_valid = X[train_index, :], X[valid_index, :]
y_train, y_valid = y[train_index], y[valid_index]

# LightGBM datasets are built from float32 casts of the sparse slices.
lgb_train = lgb.Dataset(X_train.astype(np.float32), label=y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), label=y_valid)

In [10]:
# LightGBM hyper-parameters: gradient-boosted trees, binary objective,
# AUC as the evaluation metric.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq is
    # non-zero. bagging_freq is not set here, so this value currently has no
    # effect. It is left as-is so the recorded results stay reproducible.
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 500  # upper bound on boosting rounds; early stopping may end sooner
early_stopping_patience = 50  # rounds without validation improvement before stopping

# Early stopping is configured through a callback: the `early_stopping_rounds`
# keyword of lgb.train was removed in LightGBM 4.x, while the callback is
# available in both old and new releases.
callbacks = [lgb.early_stopping(stopping_rounds=early_stopping_patience)]
# Newer LightGBM releases only print per-iteration metrics when the
# log_evaluation callback is supplied; older releases do not have it and print
# by default, so add it only when it exists.
if hasattr(lgb, 'log_evaluation'):
    callbacks.append(lgb.log_evaluation(period=1))

lgbm = lgb.train(params,
                 lgb_train,
                 num_boost_round=num_rounds,
                 valid_sets=[lgb_train, lgb_valid],
                 valid_names=['train', 'valid1'],
                 callbacks=callbacks)

[1]	train's auc: 0.667081	valid1's auc: 0.667503
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.673297	valid1's auc: 0.673049
[3]	train's auc: 0.674735	valid1's auc: 0.674414
[4]	train's auc: 0.675928	valid1's auc: 0.675818
[5]	train's auc: 0.677962	valid1's auc: 0.677781
[6]	train's auc: 0.679204	valid1's auc: 0.679102
[7]	train's auc: 0.68102	valid1's auc: 0.680923
[8]	train's auc: 0.684798	valid1's auc: 0.684689
[9]	train's auc: 0.68489	valid1's auc: 0.684822
[10]	train's auc: 0.685693	valid1's auc: 0.685343
[11]	train's auc: 0.687817	valid1's auc: 0.687525
[12]	train's auc: 0.688626	valid1's auc: 0.688311
[13]	train's auc: 0.688999	valid1's auc: 0.688591
[14]	train's auc: 0.690043	valid1's auc: 0.68957
[15]	train's auc: 0.690701	valid1's auc: 0.690284
[16]	train's auc: 0.691926	valid1's auc: 0.691485
[17]	train's auc: 0.69298	valid1's auc: 0.692425
[18]	train's auc: 0.694334	valid1's auc: 0.693883
[19]	train's auc: 0.696208	valid1's auc: 0.69572
[2

[164]	train's auc: 0.764497	valid1's auc: 0.734167
[165]	train's auc: 0.764654	valid1's auc: 0.734173
[166]	train's auc: 0.764832	valid1's auc: 0.734187
[167]	train's auc: 0.765003	valid1's auc: 0.734175
[168]	train's auc: 0.76522	valid1's auc: 0.734185
[169]	train's auc: 0.765449	valid1's auc: 0.734207
[170]	train's auc: 0.7656	valid1's auc: 0.734246
[171]	train's auc: 0.765785	valid1's auc: 0.734273
[172]	train's auc: 0.765951	valid1's auc: 0.734285
[173]	train's auc: 0.766137	valid1's auc: 0.734291
[174]	train's auc: 0.766214	valid1's auc: 0.734295
[175]	train's auc: 0.766474	valid1's auc: 0.734311
[176]	train's auc: 0.766751	valid1's auc: 0.734342
[177]	train's auc: 0.767003	valid1's auc: 0.734343
[178]	train's auc: 0.767192	valid1's auc: 0.734344
[179]	train's auc: 0.76741	valid1's auc: 0.734346
[180]	train's auc: 0.76765	valid1's auc: 0.73435
[181]	train's auc: 0.767889	valid1's auc: 0.734379
[182]	train's auc: 0.768057	valid1's auc: 0.734405
[183]	train's auc: 0.768311	valid1's 

[326]	train's auc: 0.792435	valid1's auc: 0.736219
[327]	train's auc: 0.792518	valid1's auc: 0.736226
[328]	train's auc: 0.792651	valid1's auc: 0.736224
[329]	train's auc: 0.792776	valid1's auc: 0.736222
[330]	train's auc: 0.792979	valid1's auc: 0.736233
[331]	train's auc: 0.793104	valid1's auc: 0.736228
[332]	train's auc: 0.79322	valid1's auc: 0.736241
[333]	train's auc: 0.793333	valid1's auc: 0.736225
[334]	train's auc: 0.79343	valid1's auc: 0.736207
[335]	train's auc: 0.793466	valid1's auc: 0.736204
[336]	train's auc: 0.793618	valid1's auc: 0.7362
[337]	train's auc: 0.79373	valid1's auc: 0.736204
[338]	train's auc: 0.79381	valid1's auc: 0.736184
[339]	train's auc: 0.793934	valid1's auc: 0.73619
[340]	train's auc: 0.794053	valid1's auc: 0.736177
[341]	train's auc: 0.794203	valid1's auc: 0.736139
[342]	train's auc: 0.794316	valid1's auc: 0.736135
[343]	train's auc: 0.79444	valid1's auc: 0.736151
[344]	train's auc: 0.794551	valid1's auc: 0.736134
[345]	train's auc: 0.794638	valid1's au

In [11]:
# With pred_leaf=True the booster returns, for every validation row, the index
# of the leaf it lands in for each tree (a 2-D array, unpacked as
# records x rounds in the next cell).
leaves_valid = lgbm.predict(
    data=X_valid.astype(np.float32),
    pred_leaf=True,
)

In [12]:
with pu.profiler("transforming dense leaves to sparse matrix"):
    # One-hot encode the leaf assignments: the leaf `l` reached in tree `t`
    # becomes column `t * n_leaves + l`, so every row has exactly one non-zero
    # entry per tree.
    # Use the value the model was trained with instead of repeating the literal,
    # so the column offsets cannot drift from the booster configuration.
    n_leaves = params['num_leaves']
    n_records, n_rounds = leaves_valid.shape
    n_columns = n_rounds * n_leaves
    indices = np.arange(0, n_columns, n_leaves) + leaves_valid

    # The dense leaf array is no longer needed; release it before allocating
    # the CSR buffers. (This makes the cell non-rerunnable without re-running
    # the prediction cell.)
    del leaves_valid
    gc.collect()

    # Allocate int8 directly; np.ones(...).astype(np.int8) would first build a
    # float64 array eight times larger.
    data = np.ones(n_records * n_rounds, dtype=np.int8)
    # ravel() returns a view for a contiguous array, avoiding the full copy
    # that flatten() always makes.
    indices = indices.ravel()
    # Row i owns entries [i * n_rounds, (i + 1) * n_rounds).
    indptr = np.arange(n_records + 1) * n_rounds
    # Pass the shape explicitly so the column count is always
    # n_rounds * n_leaves rather than being inferred from the largest leaf
    # index that happens to occur in this data.
    leaves_sparse_matrix = sparse.csr_matrix((data, indices, indptr),
                                             shape=(n_records, n_columns),
                                             dtype=np.int8)
    del data
    del indices
    del indptr
    gc.collect()

[10:22:30] Finish transforming dense leaves to sparse matrix. △M: +895.07MB. △T: 36.5 seconds.


In [13]:
# Cross-validate a logistic regression on the one-hot leaf features, using
# only the rows that were held out from GBDT training (y_valid).
n_splits2 = 3  # use 3 instead of 5 to save time
skf2 = StratifiedKFold(n_splits=n_splits2)
split_indices2 = list(skf2.split(leaves_sparse_matrix, y_valid))
aucs = np.zeros(n_splits2)  # one AUC per fold

for i, (train_index2, valid_index2) in enumerate(split_indices2):
    X_vtr, y_vtr = leaves_sparse_matrix[train_index2, :], y_valid[train_index2]
    X_vv, y_vv = leaves_sparse_matrix[valid_index2, :], y_valid[valid_index2]

    print("[{}] start training fold {}/{}".format(pu.get_time_str(), i + 1, n_splits2))
    # The 'sag' solver shuffles the data, so fix random_state to make the
    # reported AUCs reproducible across runs.
    lr = LogisticRegression(solver='sag', random_state=0)
    lr.fit(X_vtr, y_vtr)
    proba_vv = lr.predict_proba(X_vv)

    # Column 1 of predict_proba holds the probability of the second class.
    aucs[i] = metrics.roc_auc_score(y_vv, proba_vv[:, 1])
    print("[{}] AUC: {}".format(pu.get_time_str(), aucs[i]))

print("AUC: {:.6f}(+/-{:.6f})".format(aucs.mean(), aucs.std()))

[10:23:06] start training fold 1/3




[11:14:12] AUC: 0.7315755465005951
[11:14:44] start training fold 2/3
[11:27:53] AUC: 0.7335304257180298
[11:28:16] start training fold 3/3
[11:40:01] AUC: 0.7339445326761025
AUC: 0.733017(+/-0.001033)
