In [1]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import lightgbm as lgb
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../../../code/utils')
import data_utils as du
import perf_utils as pu

In [2]:
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
df_ad = du.load_raw_data("ad")

In [3]:
user_one_feat_names = ['age', 'gender', 'education', 'consumptionAbility', 'LBS',
                  'carrier', 'house']  # one user has only one value
user_multi_feat_names = ['marriageStatus', 'interest1', 'interest2', 'interest3',
                   'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1',
                   'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os']  # one user can have more than one value
user_feat_names = user_one_feat_names + user_multi_feat_names

In [4]:
ad_num_feat_names = ['creativeSize']
ad_cat_feat_names = ['advertiserId', 'campaignId', 'creativeId', 'adCategoryId', 'productId', 'productType']
ad_feat_names = ad_num_feat_names + ad_cat_feat_names

In [5]:
# both 'creativeId' and 'aid' are unique to advertise. so you can do nothing actually
# I just remove 'creativeId' and add 'aid' to make my intention clearer
ad_feat_names.remove("creativeId")  # every ad has a unique 'creativeId', so it's useless
ad_feat_names.append("aid")

In [6]:
with pu.profiler("loading and joining user and ad data"):
    X, feat_names = du.quick_join(ad_user=df_train, 
                                  user_feat_names=user_feat_names, 
                                  ad_feat_names=ad_feat_names)
    gc.collect()
print("Combined Matrix Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(feat_names)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

loading user matrices: 100%|██████████| 23/23 [12:34<00:00, 32.81s/it]
loading ad matrices: 100%|██████████| 7/7 [01:44<00:00, 14.89s/it]

[00:59:49] Finish loading and joining user and ad data. △M: +5.81GB. △T: 14.3 minutes.
Combined Matrix Shape: (8798814, 419701)
Feature Names Count: 419701
Memory usage at this moment :6.14GB





In [7]:
y = df_train['label'].values
y = (y + 1) / 2
assert X.shape[0] == y.shape[0]

In [8]:
n_splits = 3  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [9]:
X = sparse.csr_matrix(X)

In [10]:
train_index, valid_index = split_indices[0]

X_train, y_train = X[train_index, :], y[train_index]
X_valid, y_valid = X[valid_index, :], y[valid_index]

lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=feat_names)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=feat_names)

In [11]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 500

lgbm = lgb.train(params,
                 lgb_train,
                 num_boost_round=num_rounds,
                 valid_sets=[lgb_train, lgb_valid], 
                 valid_names=['train', 'valid1'],
                 early_stopping_rounds=50)

[1]	train's auc: 0.663996	valid1's auc: 0.664082
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.674166	valid1's auc: 0.674161
[3]	train's auc: 0.676528	valid1's auc: 0.676847
[4]	train's auc: 0.677957	valid1's auc: 0.678417
[5]	train's auc: 0.679264	valid1's auc: 0.679473
[6]	train's auc: 0.679408	valid1's auc: 0.679541
[7]	train's auc: 0.681295	valid1's auc: 0.681196
[8]	train's auc: 0.682295	valid1's auc: 0.682272
[9]	train's auc: 0.683165	valid1's auc: 0.683265
[10]	train's auc: 0.684418	valid1's auc: 0.684477
[11]	train's auc: 0.687867	valid1's auc: 0.68775
[12]	train's auc: 0.688314	valid1's auc: 0.688098
[13]	train's auc: 0.689342	valid1's auc: 0.689092
[14]	train's auc: 0.690479	valid1's auc: 0.690262
[15]	train's auc: 0.691191	valid1's auc: 0.690887
[16]	train's auc: 0.693681	valid1's auc: 0.693229
[17]	train's auc: 0.69461	valid1's auc: 0.694095
[18]	train's auc: 0.695607	valid1's auc: 0.695136
[19]	train's auc: 0.696289	valid1's auc: 0.695712

[164]	train's auc: 0.764822	valid1's auc: 0.733848
[165]	train's auc: 0.765036	valid1's auc: 0.733901
[166]	train's auc: 0.7652	valid1's auc: 0.733914
[167]	train's auc: 0.765391	valid1's auc: 0.733932
[168]	train's auc: 0.765541	valid1's auc: 0.733925
[169]	train's auc: 0.765809	valid1's auc: 0.733936
[170]	train's auc: 0.766018	valid1's auc: 0.733942
[171]	train's auc: 0.766189	valid1's auc: 0.733936
[172]	train's auc: 0.766477	valid1's auc: 0.733938
[173]	train's auc: 0.766636	valid1's auc: 0.733984
[174]	train's auc: 0.76675	valid1's auc: 0.73398
[175]	train's auc: 0.766898	valid1's auc: 0.733984
[176]	train's auc: 0.767059	valid1's auc: 0.733994
[177]	train's auc: 0.767331	valid1's auc: 0.734004
[178]	train's auc: 0.767476	valid1's auc: 0.733993
[179]	train's auc: 0.767723	valid1's auc: 0.734016
[180]	train's auc: 0.767903	valid1's auc: 0.734006
[181]	train's auc: 0.7681	valid1's auc: 0.734023
[182]	train's auc: 0.768337	valid1's auc: 0.734099
[183]	train's auc: 0.768478	valid1's 

[326]	train's auc: 0.792862	valid1's auc: 0.736065
[327]	train's auc: 0.793138	valid1's auc: 0.736086
[328]	train's auc: 0.793321	valid1's auc: 0.736069
[329]	train's auc: 0.793582	valid1's auc: 0.736183
[330]	train's auc: 0.793644	valid1's auc: 0.736184
[331]	train's auc: 0.793799	valid1's auc: 0.736201
[332]	train's auc: 0.793927	valid1's auc: 0.7362
[333]	train's auc: 0.794087	valid1's auc: 0.736196
[334]	train's auc: 0.794271	valid1's auc: 0.736185
[335]	train's auc: 0.794387	valid1's auc: 0.736172
[336]	train's auc: 0.794542	valid1's auc: 0.736173
[337]	train's auc: 0.794737	valid1's auc: 0.73617
[338]	train's auc: 0.794805	valid1's auc: 0.736162
[339]	train's auc: 0.794867	valid1's auc: 0.73615
[340]	train's auc: 0.79505	valid1's auc: 0.73615
[341]	train's auc: 0.79514	valid1's auc: 0.736143
[342]	train's auc: 0.795236	valid1's auc: 0.73613
[343]	train's auc: 0.795389	valid1's auc: 0.736109
[344]	train's auc: 0.795555	valid1's auc: 0.736105
[345]	train's auc: 0.795597	valid1's au

In [12]:
df_feature_importance = pd.DataFrame({"feature": feat_names, "importance": lgbm.feature_importance()})

In [13]:
log_folder = "../../../log/lgbm/feature_importance"
log_file = "0430.csv"
log_path = os.path.join(log_folder, log_file)
os.makedirs(log_folder, exist_ok=True)

In [14]:
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.to_csv(log_path, index=False)

In [15]:
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419224,creativeSize_59,360
3,age_2,256
921,interest1_49,230
419220,creativeSize_22,228
419526,productType_9,206
419221,creativeSize_35,195
884,marriageStatus_10,192
419213,ct_1,189
419524,productType_4,178
1063,interest2_54,171
