In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os
from scipy.sparse import hstack

In [2]:
# Input locations, all relative to the notebook's working directory.
# data_dir       : CSV files read through load()
# ad_cnt_dir     : pickles read by load_ad_cnt()
# user_cnt_dir   : pickles read by load_user_cnt()
# user_tfidf_dir : pickles read by load_user_tfidf()
data_dir = '../data/raw/preliminary_contest_data/'
ad_cnt_dir = '../data/nlp_count/preliminary_contest_data/byAdFeatureName/'
user_cnt_dir = '../data/nlp_count/preliminary_contest_data/byUserFeatureName/'
user_tfidf_dir = '../data/nlp_tfidf/preliminary_contest_data/byUserFeatureName/'

In [3]:
def load(filename, **kw):
    """Read a CSV file stored under ``data_dir`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name, joined onto the module-level ``data_dir``.
    **kw
        Extra keyword arguments passed through unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` returns for that path.
    """
    csv_path = os.path.join(data_dir, filename)
    return pd.read_csv(csv_path, **kw)

In [4]:
def load_pickle(filepath):
    """Return the object stored in the pickle file at ``filepath``.

    The file is opened in binary mode and closed again before returning.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing, so
    only point this at files you produced yourself or otherwise trust.
    """
    with open(filepath, "rb") as fh:
        return pickle.load(fh)

In [5]:
def load_ad_cnt(feat_name):
    """Load the pickled count data of one ad feature together with the ad ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``adFeature.[featureName='{}'].pkl``.

    Returns
    -------
    tuple
        ``(ids, (index, matrix))`` where ``index, matrix`` are the two objects
        unpacked from the feature pickle and ``ids`` is the content of
        ``aid.pkl``; both files are read from ``ad_cnt_dir``.
    """
    feat_file = "adFeature.[featureName='{}'].pkl".format(feat_name)
    index, matrix = load_pickle(os.path.join(ad_cnt_dir, feat_file))

    # The id file name does not depend on feat_name.
    ad_ids = load_pickle(os.path.join(ad_cnt_dir, "aid.pkl"))

    return ad_ids, (index, matrix)

In [6]:
def load_user_cnt(feat_name):
    """Load the pickled count data of one user feature together with the user ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``userFeature.[featureName='{}'].pkl``.

    Returns
    -------
    tuple
        ``(ids, (index, matrix))`` where ``index, matrix`` are the two objects
        unpacked from the feature pickle and ``ids`` is the content of
        ``uid.pkl``; both files are read from ``user_cnt_dir``.
    """
    feat_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    index, matrix = load_pickle(os.path.join(user_cnt_dir, feat_file))

    # The id file name does not depend on feat_name.
    user_ids = load_pickle(os.path.join(user_cnt_dir, "uid.pkl"))

    return user_ids, (index, matrix)

In [7]:
def load_user_tfidf(feat_name):
    """Load the pickled TF-IDF data of one user feature together with the user ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``userFeature.[featureName='{}'].pkl``.

    Returns
    -------
    tuple
        ``(ids, (index, idf, matrix))`` where ``index, idf, matrix`` are the
        three objects unpacked from the feature pickle and ``ids`` is the
        content of ``uid.pkl``; both files are read from ``user_tfidf_dir``.
    """
    feat_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    index, idf, matrix = load_pickle(os.path.join(user_tfidf_dir, feat_file))

    # The id file name does not depend on feat_name.
    user_ids = load_pickle(os.path.join(user_tfidf_dir, "uid.pkl"))

    return user_ids, (index, idf, matrix)

In [8]:
# Training samples read from data_dir; the training loop further down uses
# its 'aid', 'uid' and 'label' columns.
df_train = load("train.csv")

In [9]:
# Count data for the user feature "interest1" and for the ad feature "aid".
uids, (ufeat_index, uvec) = load_user_cnt("interest1")
aids, (afeat_index, avec) = load_ad_cnt('aid')

# Map every uid to its position within `uids`; the training loop uses these
# positions as row indices into `uvec`.
uid_to_index = {uid: position for position, uid in enumerate(uids)}

In [10]:
# Train one small LightGBM binary classifier per ad on the rows of df_train
# that belong to that ad, printing train/validation AUC while training.

# Settings that do not depend on the ad: built once, not on every iteration.
valid_ratio = 0.3  # share of each ad's rows (the trailing ones) held out for validation
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 3,
    'num_leaves': 7,
    'learning_rate': 0.05,
    # 'feature_fraction': 0.9,
    # 'bagging_fraction': 0.8,
    'verbose': 0
}

for aid in aids:
    df_selected = df_train[df_train['aid'] == aid]

    # Positional split: the leading rows train, the remaining rows validate.
    train_size = int(df_selected.shape[0] * (1 - valid_ratio))
    if train_size == 0:
        # Zero or one row for this ad leaves the training slice empty; training
        # on it would raise and abort every remaining ad, so skip it instead.
        print("Skipping [aid='{}']: only {} row(s) in the training data.".format(
            aid, df_selected.shape[0]))
        print()
        continue

    # Translate user ids into row positions of the user feature matrix.
    u_index = df_selected['uid'].map(uid_to_index)
    unmapped = u_index.isnull()
    if unmapped.any():
        # A uid missing from uid_to_index maps to NaN, which turns the whole
        # index array into floats and fails obscurely when indexing `uvec`.
        missing = df_selected.loc[unmapped, 'uid'].unique()
        raise KeyError(
            "{} uid(s) for [aid='{}'] are missing from uid_to_index, e.g. {}".format(
                len(missing), aid, missing[:5].tolist()))
    u_index = u_index.values

    X = uvec[u_index, :]
    # Shift/scale the labels; presumably they are -1/+1 and become 0/1 here
    # (TODO confirm against train.csv).
    y = (df_selected['label'].values + 1) / 2
    X_train, y_train = X[:train_size], y[:train_size]
    X_valid, y_valid = X[train_size:], y[train_size:]

    lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
    lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)

    print("Training LightGBM on [aid='{}']...".format(aid))
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are the arguments
    # used by the LightGBM version that produced the recorded output. LightGBM
    # 4.x dropped them in favour of
    # `callbacks=[lgb.early_stopping(15), lgb.log_evaluation(10)]` -- switch
    # when upgrading.
    # `gbm` is rebound on every iteration, so only the last ad's booster is
    # still available after the loop.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=100,
                    valid_sets=[lgb_train, lgb_valid],
                    valid_names=['train', 'valid1'],
                    early_stopping_rounds=15,
                    verbose_eval=10)
    print()

Training LightGBM on [aid='177']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.616652	valid1's auc: 0.546509
Early stopping, best iteration is:
[3]	train's auc: 0.616404	valid1's auc: 0.54939

Training LightGBM on [aid='2050']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.646845	valid1's auc: 0.63354
Early stopping, best iteration is:
[4]	train's auc: 0.636516	valid1's auc: 0.634415

Training LightGBM on [aid='1716']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.594638	valid1's auc: 0.585364
[20]	train's auc: 0.599973	valid1's auc: 0.591918
[30]	train's auc: 0.605756	valid1's auc: 0.595951
[40]	train's auc: 0.605321	valid1's auc: 0.598876
[50]	train's auc: 0.607128	valid1's auc: 0.597948
[60]	train's auc: 0.609635	valid1's auc: 0.598187
Early stopping, best iteration is:
[46]	train's auc: 0.606983	valid1's auc: 0.599751

Training LightGBM on [aid='336']...
Training until

[10]	train's auc: 0.66867	valid1's auc: 0.653312
[20]	train's auc: 0.669417	valid1's auc: 0.653334
[30]	train's auc: 0.671532	valid1's auc: 0.65515
[40]	train's auc: 0.674586	valid1's auc: 0.657028
[50]	train's auc: 0.677678	valid1's auc: 0.660734
[60]	train's auc: 0.679361	valid1's auc: 0.661563
[70]	train's auc: 0.681887	valid1's auc: 0.663743
[80]	train's auc: 0.685636	valid1's auc: 0.665267
[90]	train's auc: 0.689098	valid1's auc: 0.666035
[100]	train's auc: 0.691088	valid1's auc: 0.66637

Training LightGBM on [aid='1254']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.577546	valid1's auc: 0.557953
[20]	train's auc: 0.580188	valid1's auc: 0.553629
Early stopping, best iteration is:
[7]	train's auc: 0.578904	valid1's auc: 0.560059

Training LightGBM on [aid='231']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.555629	valid1's auc: 0.535574
[20]	train's auc: 0.560471	valid1's auc: 0.538502
[30]	train's auc: 0

[70]	train's auc: 0.681972	valid1's auc: 0.605277
[80]	train's auc: 0.68844	valid1's auc: 0.600157
Early stopping, best iteration is:
[69]	train's auc: 0.681547	valid1's auc: 0.606319

Training LightGBM on [aid='1350']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.573627	valid1's auc: 0.502897
Early stopping, best iteration is:
[4]	train's auc: 0.566558	valid1's auc: 0.505607

Training LightGBM on [aid='1415']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.56503	valid1's auc: 0.560282
Early stopping, best iteration is:
[4]	train's auc: 0.56503	valid1's auc: 0.560282

Training LightGBM on [aid='420']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.596531	valid1's auc: 0.584546
[20]	train's auc: 0.60119	valid1's auc: 0.586793
[30]	train's auc: 0.605194	valid1's auc: 0.586098
Early stopping, best iteration is:
[23]	train's auc: 0.602312	valid1's auc: 0.58753

Training LightGBM

[30]	train's auc: 0.638658	valid1's auc: 0.635543
Early stopping, best iteration is:
[24]	train's auc: 0.638658	valid1's auc: 0.635543

Training LightGBM on [aid='681']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.639233	valid1's auc: 0.604421
[20]	train's auc: 0.646834	valid1's auc: 0.606683
[30]	train's auc: 0.65813	valid1's auc: 0.611642
[40]	train's auc: 0.66307	valid1's auc: 0.605858
Early stopping, best iteration is:
[30]	train's auc: 0.65813	valid1's auc: 0.611642

Training LightGBM on [aid='1957']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.69803	valid1's auc: 0.656058
[20]	train's auc: 0.700054	valid1's auc: 0.658159
[30]	train's auc: 0.701363	valid1's auc: 0.659199
[40]	train's auc: 0.707904	valid1's auc: 0.662924
[50]	train's auc: 0.710897	valid1's auc: 0.661981
[60]	train's auc: 0.712023	valid1's auc: 0.661754
[70]	train's auc: 0.719802	valid1's auc: 0.663037
[80]	train's auc: 0.725543	valid1's

Early stopping, best iteration is:
[49]	train's auc: 0.700449	valid1's auc: 0.703744

Training LightGBM on [aid='966']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.577128	valid1's auc: 0.55381
[20]	train's auc: 0.582819	valid1's auc: 0.558917
[30]	train's auc: 0.587175	valid1's auc: 0.558081
Early stopping, best iteration is:
[20]	train's auc: 0.582819	valid1's auc: 0.558917

Training LightGBM on [aid='2216']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.582195	valid1's auc: 0.518962
Early stopping, best iteration is:
[4]	train's auc: 0.576924	valid1's auc: 0.522551

Training LightGBM on [aid='1904']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.559008	valid1's auc: 0.539434
Early stopping, best iteration is:
[1]	train's auc: 0.55025	valid1's auc: 0.540939

Training LightGBM on [aid='1277']...
Training until validation scores don't improve for 15 rounds.
[10]	train's au

[40]	train's auc: 0.627022	valid1's auc: 0.608242
[50]	train's auc: 0.630379	valid1's auc: 0.612535
[60]	train's auc: 0.633398	valid1's auc: 0.614703
[70]	train's auc: 0.636418	valid1's auc: 0.616053
[80]	train's auc: 0.639205	valid1's auc: 0.617527
[90]	train's auc: 0.641518	valid1's auc: 0.619115
[100]	train's auc: 0.645624	valid1's auc: 0.621552

Training LightGBM on [aid='727']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.591834	valid1's auc: 0.576823
[20]	train's auc: 0.595176	valid1's auc: 0.579594
[30]	train's auc: 0.598212	valid1's auc: 0.58172
[40]	train's auc: 0.60239	valid1's auc: 0.590449
[50]	train's auc: 0.608365	valid1's auc: 0.59139
[60]	train's auc: 0.615004	valid1's auc: 0.595859
[70]	train's auc: 0.619749	valid1's auc: 0.597679
[80]	train's auc: 0.622884	valid1's auc: 0.597518
Early stopping, best iteration is:
[72]	train's auc: 0.620401	valid1's auc: 0.59804

Training LightGBM on [aid='699']...
Training until validation scores

[50]	train's auc: 0.705082	valid1's auc: 0.70274
[60]	train's auc: 0.70931	valid1's auc: 0.704857
[70]	train's auc: 0.714287	valid1's auc: 0.708183
[80]	train's auc: 0.71846	valid1's auc: 0.709101
[90]	train's auc: 0.721824	valid1's auc: 0.709443
[100]	train's auc: 0.725074	valid1's auc: 0.709979

Training LightGBM on [aid='613']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.762303	valid1's auc: 0.74011
Early stopping, best iteration is:
[1]	train's auc: 0.757044	valid1's auc: 0.740694

Training LightGBM on [aid='1746']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.685252	valid1's auc: 0.566582
Early stopping, best iteration is:
[3]	train's auc: 0.647864	valid1's auc: 0.573403

Training LightGBM on [aid='1790']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.655876	valid1's auc: 0.63841
[20]	train's auc: 0.6593	valid1's auc: 0.636547
[30]	train's auc: 0.66579	valid1's auc:

Training LightGBM on [aid='1841']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.612731	valid1's auc: 0.563593
Early stopping, best iteration is:
[3]	train's auc: 0.612525	valid1's auc: 0.570515

Training LightGBM on [aid='6']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.597148	valid1's auc: 0.516151
[20]	train's auc: 0.613087	valid1's auc: 0.51768
Early stopping, best iteration is:
[14]	train's auc: 0.608723	valid1's auc: 0.520223

Training LightGBM on [aid='516']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.604426	valid1's auc: 0.511146
Early stopping, best iteration is:
[3]	train's auc: 0.58208	valid1's auc: 0.526495

Training LightGBM on [aid='2066']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.588314	valid1's auc: 0.536139
Early stopping, best iteration is:
[3]	train's auc: 0.573309	valid1's auc: 0.547556

Training LightGBM o

Training LightGBM on [aid='2044']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.620512	valid1's auc: 0.526944
Early stopping, best iteration is:
[1]	train's auc: 0.583255	valid1's auc: 0.544973

Training LightGBM on [aid='1085']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.669229	valid1's auc: 0.528933
Early stopping, best iteration is:
[1]	train's auc: 0.617029	valid1's auc: 0.549589

Training LightGBM on [aid='454']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.646952	valid1's auc: 0.62203
[20]	train's auc: 0.646785	valid1's auc: 0.621836
Early stopping, best iteration is:
[8]	train's auc: 0.646467	valid1's auc: 0.622142

Training LightGBM on [aid='916']...
Training until validation scores don't improve for 15 rounds.
[10]	train's auc: 0.661321	valid1's auc: 0.651017
[20]	train's auc: 0.66348	valid1's auc: 0.652318
[30]	train's auc: 0.665339	valid1's auc: 0.653204
[40