In [1]:
from sklearn.model_selection import StratifiedKFold
import scipy.sparse as sparse
import lightgbm as lgb
import pandas as pd
import numpy as np
import tqdm
import os
import gc
import sys
sys.path.append('../code/utils/')
import data_utils as du
import perf_utils as pu

In [2]:
user_one_feat_names = ['age', 'gender', 'education', 'consumptionAbility', 'LBS',
                  'carrier', 'house']  # one user has only one value
user_multi_feat_names = ['marriageStatus', 'interest1', 'interest2', 'interest3',
                   'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1',
                   'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os']  # one user can have more than one value
user_feat_names = user_one_feat_names + user_multi_feat_names

ad_num_feat_names = ['creativeSize']
ad_cat_feat_names = ['advertiserId', 'campaignId', 'creativeId', 'adCategoryId', 'productId', 'productType']
ad_feat_names = ad_num_feat_names + ad_cat_feat_names


# both 'creativeId' and 'aid' are unique to advertise. so you can do nothing actually
# I just remove 'creativeId' and add 'aid' to make my intention clearer
ad_feat_names.remove("creativeId")  # every ad has a unique 'creativeId', so it's useless
ad_feat_names.append("aid")

In [3]:
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
df_ad = du.load_raw_data("ad")
ad_user = pd.merge(df_train, df_ad, on='aid', how='left')

In [4]:
clickrate_folder = "../data/clickrate_bs/simple_cross/byUserFeatureName/"

In [5]:
def sort_ad_user_pairs(pairs):
    new_pairs = {}
    for ad_feat_name, user_feat_name in pairs:
        if user_feat_name not in new_pairs:
            new_pairs[user_feat_name] = [ad_feat_name]
        else:
            new_pairs[user_feat_name].append(ad_feat_name)
    return new_pairs

In [6]:
def join_clickrate_bs(ad_user, ad_feat_name, user_feat_name, stage='preliminary'):
    # given ad_feat_name and user_feat_name
    ad_user = ad_user.copy()  # copy instance to avoid modifying the original instance
    
    # prepare ad feature
    if ad_feat_name not in ad_user.columns:
        df_ad = du.load_raw_data("ad", stage=stage)
        ad_user = pd.merge(ad_user, df_ad, on='aid', how='left')
        del df_ad
        gc.collect()
        print(0)
        print(ad_user.head(5))
        
    # prepare user feature (original value)
    if user_feat_name not in ad_user.columns:
        df_feat = du.load_user_feature(user_feat_name)
        ad_user = pd.merge(ad_user, df_feat, on="uid", how="left")
        del df_feat
        gc.collect()
    ad_user[user_feat_name] = ad_user[user_feat_name].astype('str')   # for joining convenience
    
    # prepare click rates
    in_folder = os.path.join(clickrate_folder, "[featureName='{}']".format(user_feat_name))
    in_file = "[adFeatureName='{}'].csv".format(ad_feat_name)
    in_path = os.path.join(in_folder, in_file)
    df_clickrate = pd.read_csv(in_path)
    df_clickrate = df_clickrate.rename(columns={"ad_val": ad_feat_name, 
                                            "user_val": user_feat_name})
    df_clickrate[user_feat_name] = df_clickrate[user_feat_name].astype(str)  # for joining convenience
    assert ad_user[user_feat_name].dtype == df_clickrate[user_feat_name].dtype
    
    # join click rate
    ad_user = pd.merge(ad_user, df_clickrate, 
                         on=[ad_feat_name, user_feat_name], how='left')
    ad_user = ad_user[['aid', 'uid', 'bs_clickrate', 'click', 'impression']]
    gc.collect()
    
    return ad_user

In [7]:
with pu.profiler("loading and joining cross statistics data"):
    pairs = [("aid", "age"), ("aid", "education"), ("aid", "consumptionAbility"), ("aid", "LBS")]
    cross_stats_cols = []
    for ad_feat_name, user_feat_name in pairs:
        clickrate_col = "bs_clickrate_{}_x_{}".format(ad_feat_name, user_feat_name)
        impression_col = "impression_{}_x_{}".format(ad_feat_name, user_feat_name)
        df_new = join_clickrate_bs(ad_user, ad_feat_name, user_feat_name)
        df_new = df_new.drop(["aid", "uid", "click"], axis=1)
        df_new = df_new.rename(columns={"bs_clickrate": clickrate_col,
                                        "impression": impression_col})
        df_new = df_new[[clickrate_col, impression_col]]
        ad_user = pd.concat([ad_user, df_new], axis=1)
        cross_stats_cols += [clickrate_col, impression_col]

        del df_new
        gc.collect()

    X_cross_stats = sparse.csr_matrix(ad_user[cross_stats_cols].values).astype(np.float32)
    del ad_user
    gc.collect()
print("Combined Matrix Shape: {}".format(X_cross_stats.shape))
print("Feature Names Count: {}".format(len(cross_stats_cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[16:41:20] Finish loading and joining cross statistics data. △M: -147.57MB. △T: 2.4 minutes.
Combined Matrix Shape: (8798814, 8)
Feature Names Count: 8
Memory usage at this moment :912.39MB


In [8]:
with pu.profiler("loading and joining user and ad data"):
    X_bin, bin_cols = du.quick_join(ad_user=df_train, 
                                    user_feat_names=user_feat_names, 
                                    ad_feat_names=ad_feat_names)
    gc.collect()
print("Combined Matrix Shape: {}".format(X_bin.shape))
print("Feature Names Count: {}".format(len(bin_cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

loading user matrices: 100%|██████████| 23/23 [17:03<00:00, 44.52s/it]
loading ad matrices: 100%|██████████| 7/7 [03:00<00:00, 25.83s/it]

[17:01:25] Finish loading and joining user and ad data. △M: +5.79GB. △T: 20.1 minutes.
Combined Matrix Shape: (8798814, 419701)
Feature Names Count: 419701
Memory usage at this moment :6.68GB





In [9]:
with pu.profiler("joining binary matrix and cross statistics matrix"):
    cols = bin_cols + cross_stats_cols
    X = sparse.hstack((X_bin, X_cross_stats))
    del X_bin
    del X_cross_stats
    gc.collect()
print("Combined Matrix Shape: {}".format(X.shape))
print("Feature Names Count: {}".format(len(cols)))
print("Memory usage at this moment :{}".format(pu.get_memory_str()))

[17:01:52] Finish joining binary matrix and cross statistics matrix. △M: +2.15GB. △T: 27.2 seconds.
Combined Matrix Shape: (8798814, 419709)
Feature Names Count: 419709
Memory usage at this moment :8.83GB


In [10]:
X = sparse.csr_matrix(X)
y = df_train['label'].values
y = (y + 1) / 2
assert X.shape[0] == y.shape[0]

In [11]:
n_splits = 3  # use 3 instead of 5 to save time
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(df_train, y)]

In [13]:
train_index, valid_index = split_indices[0]

X_train, y_train = X[train_index, :], y[train_index]
X_valid, y_valid = X[valid_index, :], y[valid_index]

lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train, feature_name=cols)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid, feature_name=cols)

In [14]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 120,
    'learning_rate': 0.15,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'verbose': 0
}
num_rounds = 500

lgbm = lgb.train(params,
                 lgb_train,
                 num_boost_round=num_rounds,
                 valid_sets=[lgb_train, lgb_valid], 
                 valid_names=['train', 'valid1'],
                 early_stopping_rounds=50)

[1]	train's auc: 0.68484	valid1's auc: 0.685018
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.691126	valid1's auc: 0.691331
[3]	train's auc: 0.694338	valid1's auc: 0.694385
[4]	train's auc: 0.696307	valid1's auc: 0.696218
[5]	train's auc: 0.697171	valid1's auc: 0.697025
[6]	train's auc: 0.698671	valid1's auc: 0.698426
[7]	train's auc: 0.700093	valid1's auc: 0.699786
[8]	train's auc: 0.701593	valid1's auc: 0.701201
[9]	train's auc: 0.702446	valid1's auc: 0.701967
[10]	train's auc: 0.703492	valid1's auc: 0.703022
[11]	train's auc: 0.705684	valid1's auc: 0.705042
[12]	train's auc: 0.706682	valid1's auc: 0.705953
[13]	train's auc: 0.708131	valid1's auc: 0.707339
[14]	train's auc: 0.710096	valid1's auc: 0.709224
[15]	train's auc: 0.712041	valid1's auc: 0.71111
[16]	train's auc: 0.712576	valid1's auc: 0.711544
[17]	train's auc: 0.714292	valid1's auc: 0.713344
[18]	train's auc: 0.71521	valid1's auc: 0.714162
[19]	train's auc: 0.717318	valid1's auc: 0.716141


[164]	train's auc: 0.781828	valid1's auc: 0.751495
[165]	train's auc: 0.781986	valid1's auc: 0.751519
[166]	train's auc: 0.782099	valid1's auc: 0.751547
[167]	train's auc: 0.782227	valid1's auc: 0.751564
[168]	train's auc: 0.782433	valid1's auc: 0.751566
[169]	train's auc: 0.782688	valid1's auc: 0.751542
[170]	train's auc: 0.782993	valid1's auc: 0.751573
[171]	train's auc: 0.783204	valid1's auc: 0.751623
[172]	train's auc: 0.783331	valid1's auc: 0.751626
[173]	train's auc: 0.783465	valid1's auc: 0.751619
[174]	train's auc: 0.78361	valid1's auc: 0.751644
[175]	train's auc: 0.78374	valid1's auc: 0.751679
[176]	train's auc: 0.783868	valid1's auc: 0.751713
[177]	train's auc: 0.784024	valid1's auc: 0.751728
[178]	train's auc: 0.784319	valid1's auc: 0.751734
[179]	train's auc: 0.784494	valid1's auc: 0.751774
[180]	train's auc: 0.784746	valid1's auc: 0.751766
[181]	train's auc: 0.784899	valid1's auc: 0.751785
[182]	train's auc: 0.7851	valid1's auc: 0.751794
[183]	train's auc: 0.785318	valid1'

[326]	train's auc: 0.808668	valid1's auc: 0.753166
[327]	train's auc: 0.808798	valid1's auc: 0.753181
[328]	train's auc: 0.808957	valid1's auc: 0.75318
[329]	train's auc: 0.809104	valid1's auc: 0.753178
[330]	train's auc: 0.809197	valid1's auc: 0.753173
[331]	train's auc: 0.809341	valid1's auc: 0.753166
[332]	train's auc: 0.809513	valid1's auc: 0.75316
[333]	train's auc: 0.809669	valid1's auc: 0.753167
[334]	train's auc: 0.809805	valid1's auc: 0.753172
[335]	train's auc: 0.809944	valid1's auc: 0.753156
[336]	train's auc: 0.810019	valid1's auc: 0.753168
[337]	train's auc: 0.810149	valid1's auc: 0.753156
[338]	train's auc: 0.810306	valid1's auc: 0.753133
[339]	train's auc: 0.810414	valid1's auc: 0.753132
[340]	train's auc: 0.810549	valid1's auc: 0.753148
[341]	train's auc: 0.810669	valid1's auc: 0.753151
[342]	train's auc: 0.810807	valid1's auc: 0.75315
[343]	train's auc: 0.810935	valid1's auc: 0.75315
[344]	train's auc: 0.811093	valid1's auc: 0.753118
[345]	train's auc: 0.811255	valid1'

In [16]:
df_feature_importance = pd.DataFrame({"feature": cols, "importance": lgbm.feature_importance()})
df_feature_importance = df_feature_importance.sort_values("importance", ascending=False)
df_feature_importance.head(30)

Unnamed: 0,feature,importance
419707,bs_clickrate_aid_x_LBS,1296
419701,bs_clickrate_aid_x_age,1038
419708,impression_aid_x_LBS,1032
419703,bs_clickrate_aid_x_education,778
419705,bs_clickrate_aid_x_consumptionAbility,763
419704,impression_aid_x_education,504
419702,impression_aid_x_age,478
419706,impression_aid_x_consumptionAbility,456
419224,creativeSize_59,274
419220,creativeSize_22,178


In [None]:
# out_folder = "../data/clickrate_bs/simple_cross/joined/preliminary_contest_data/simple/"  
# out_file = "train[userFeatureName='{}'].pkl".format(ad_feat_name, user_feat_name)
# out_path = os.path.join(out_folder, out_file)
# os.makedirs(out_folder, makedirs=True)
# df_new.to_pickle(out_path, columns=["bs_clickrate"], index=False)

In [None]:
# given DataFrame `ad_user` with 'aid' and 'uid'
# load all ad feature and join `ad_user` on 'aid'
# for each adFeature x userMultiFeature:
#     load user feature and join `ad_user` on 'uid'
#     load adFeat x userFeat clickrate
#     for each record ('aid', 'uid') in `ad_user`:
#        find all cross clickrate for the feature values 