In [1]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import lightgbm as lgb
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../../../code/utils')
import data_utils as du
import perf_utils as pu

In [2]:
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
df_ad = du.load_raw_data("ad")

In [3]:
user_one_feat_names = ['age', 'gender', 'education', 'consumptionAbility', 'LBS',
                  'carrier', 'house']  # one user has only one value
user_multi_feat_names = ['marriageStatus', 'interest1', 'interest2', 'interest3',
                   'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1',
                   'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os']  # one user can have more than one value
user_feat_names = user_one_feat_names + user_multi_feat_names

In [4]:
ad_num_feat_names = ['creativeSize']
ad_cat_feat_names = ['advertiserId', 'campaignId', 'creativeId', 'adCategoryId', 'productId', 'productType']
ad_feat_names = ad_num_feat_names + ad_cat_feat_names

In [6]:
with pu.profiler("loading and joining user and ad data"):
    X, feat_names = du.quick_join(ad_user=df_train, 
                                  user_feat_names=user_one_feat_names, 
                                  ad_feat_names=ad_cat_feat_names)
gc.collect()
print("Combined Matrix Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(feat_names)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

loading user matrices: 100%|██████████| 7/7 [02:44<00:00, 23.50s/it]
loading ad matrices: 100%|██████████| 6/6 [00:22<00:00,  3.69s/it]

[06:55:19] Finish loading and joining user and ad data. △M: +1002.48MB. △T: 3.1 minutes.
Combined Matrix Shape: (8798814, 1349)
Feature Names Count: 1349
Memory usage at this moment :1.3GB





In [7]:
y = df_train['label'].values
y = (y + 1) / 2
assert X.shape[0] == y.shape[0]

In [8]:
n_splits = 3  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [9]:
X = sparse.csr_matrix(X)

In [10]:
train_index, valid_index = split_indices[0]

X_train, y_train = X[train_index, :], y[train_index]
X_valid, y_valid = X[valid_index, :], y[valid_index]

lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=feat_names)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=feat_names)

In [11]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 6,
    'num_leaves': 31,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 100

lgbm = lgb.train(params,
                 lgb_train,
                 num_boost_round=num_rounds,
                 valid_sets=[lgb_train, lgb_valid], 
                 valid_names=['train', 'valid1'],
                 early_stopping_rounds=50)

[1]	train's auc: 0.604395	valid1's auc: 0.605676
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.606304	valid1's auc: 0.607287
[3]	train's auc: 0.609834	valid1's auc: 0.61127
[4]	train's auc: 0.611244	valid1's auc: 0.612525
[5]	train's auc: 0.614005	valid1's auc: 0.615161
[6]	train's auc: 0.615834	valid1's auc: 0.617013
[7]	train's auc: 0.616394	valid1's auc: 0.617442
[8]	train's auc: 0.616818	valid1's auc: 0.618057
[9]	train's auc: 0.619163	valid1's auc: 0.620461
[10]	train's auc: 0.618998	valid1's auc: 0.620193
[11]	train's auc: 0.619531	valid1's auc: 0.62076
[12]	train's auc: 0.619286	valid1's auc: 0.620556
[13]	train's auc: 0.619629	valid1's auc: 0.620829
[14]	train's auc: 0.620364	valid1's auc: 0.621582
[15]	train's auc: 0.620877	valid1's auc: 0.622107
[16]	train's auc: 0.621327	valid1's auc: 0.622707
[17]	train's auc: 0.622288	valid1's auc: 0.623592
[18]	train's auc: 0.622655	valid1's auc: 0.623963
[19]	train's auc: 0.623423	valid1's auc: 0.62477


In [16]:
feat_importance_dict = dict(zip(feat_names, lgbm.feature_importance()))
print(feat_importance_dict)

{'LBS_117': 0, 'adCategoryId_9': 0, 'LBS_161': 0, 'LBS_657': 0, 'campaignId_174407': 2, 'LBS_735': 0, 'LBS_692': 0, 'LBS_203': 2, 'LBS_695': 0, 'LBS_636': 0, 'LBS_428': 0, 'LBS_791': 0, 'LBS_319': 1, 'LBS_118': 0, 'LBS_578': 0, 'LBS_649': 0, 'campaignId_159118': 0, 'campaignId_4833': 2, 'advertiserId_9106': 22, 'creativeId_43877': 0, 'LBS_125': 0, 'LBS_289': 0, 'productId_38': 0, 'LBS_201': 1, 'campaignId_59293': 0, 'LBS_897': 0, 'LBS_186': 0, 'LBS_987': 0, 'LBS_917': 0, 'LBS_955': 0, 'LBS_813': 1, 'creativeId_832650': 0, 'advertiserId_9571': 0, 'adCategoryId_81': 0, 'LBS_650': 0, 'LBS_589': 0, 'LBS_758': 0, 'creativeId_1160702': 0, 'LBS_224': 0, 'campaignId_7527': 0, 'campaignId_84178': 0, 'LBS_335': 3, 'LBS_549': 0, 'advertiserId_8203': 28, 'LBS_965': 1, 'LBS_429': 0, 'creativeId_1724522': 1, 'LBS_932': 0, 'LBS_581': 0, 'LBS_716': 0, 'LBS_114': 6, 'LBS_30': 0, 'LBS_569': 0, 'LBS_953': 0, 'productId_17614': 0, 'LBS_56': 3, 'creativeId_1147463': 14, 'LBS_792': 0, 'creativeId_585909': 0

In [17]:
feat_importance_counter = Counter(feat_importance_dict)
feat_importance_counter.most_common(1000)

[('carrier_0', 118),
 ('gender_2', 115),
 ('age_3', 108),
 ('age_2', 105),
 ('consumptionAbility_2', 91),
 ('age_1', 91),
 ('consumptionAbility_0', 90),
 ('age_5', 80),
 ('age_4', 71),
 ('education_2', 71),
 ('productType_9', 64),
 ('consumptionAbility_1', 64),
 ('gender_1', 43),
 ('adCategoryId_51', 39),
 ('campaignId_40405', 38),
 ('education_6', 35),
 ('education_7', 34),
 ('advertiserId_6946', 34),
 ('house_1', 31),
 ('advertiserId_8203', 28),
 ('advertiserId_370', 28),
 ('LBS_258', 26),
 ('advertiserId_17597', 26),
 ('adCategoryId_67', 25),
 ('advertiserId_9106', 22),
 ('advertiserId_8864', 22),
 ('productId_113', 22),
 ('adCategoryId_24', 21),
 ('campaignId_5616', 20),
 ('carrier_1', 20),
 ('education_1', 20),
 ('productType_11', 20),
 ('creativeId_1610899', 19),
 ('advertiserId_25485', 17),
 ('advertiserId_83042', 17),
 ('creativeId_27461', 17),
 ('productType_4', 16),
 ('advertiserId_158679', 16),
 ('advertiserId_8350', 16),
 ('advertiserId_11195', 15),
 ('creativeId_1147463', 