In [1]:
import pandas as pd
import warnings
from tqdm import tqdm
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from pandas.tseries.offsets import Hour
from gensim.models import Word2Vec
import numpy as np
import gc
from collections import OrderedDict, defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import geohash
from pandarallel import pandarallel
pandarallel.initialize()

warnings.simplefilter('ignore')
tqdm.pandas()

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

INFO: Pandarallel will run on 18 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
seed = 2020

In [3]:
# !pip install tqdm
# !pip install lightgbm

In [4]:
%%time
df_train = pd.read_csv('/home/kesci/work/raw_data/train.csv', encoding='GB18030')
df_test = pd.read_csv('/home/kesci/work/raw_data/test.csv', encoding='GB18030')
df_log = pd.read_pickle('/home/kesci/work/raw_data/md.pkl')

CPU times: user 10 s, sys: 4.2 s, total: 14.2 s
Wall time: 14.2 s


In [5]:
df_log['eventtime_date'] = pd.to_datetime(df_log['eventtime'], unit='ms') + Hour(8)
df_log['hour'] = df_log['eventtime_date'].dt.hour
df_log['eventname'] = df_log['eventname'].astype('str')

df_log.sort_values(['aopsid', 'eventtime_date'], inplace=True)
df_log.reset_index(drop=True, inplace=True)

In [6]:
df_feature = df_train.append(df_test)
df_feature.head()

Unnamed: 0,aopsid,xz,xb,x_year,is_xb
0,10000056,商交,损三,连续5年以上续保,1.0
1,10000152,商交,损三,连续4年续保,0.0
2,10000419,商交,损三,连续2年续保,1.0
3,10000465,商交,损三,连续5年续保,1.0
4,10000472,单交,单交,转保首续,1.0


In [7]:
df_eventbody = pd.read_pickle('/home/kesci/work/data/eventbody.pkl')
df_log = df_log.merge(df_eventbody, left_index=True, right_index=True)
df_log = df_log.drop(['eventbody'], axis=1)

In [8]:
df_log.head(2)

Unnamed: 0,city,country,eventname,event_alias_name,eventtime,lat,lgt,nettype,refererurltime,region,resolution,sestarttime,title,uadevice,uaname,uaos,useragent,aopsid,module_id,module_name,classify1_id,classify2_id,classify3_id,classify1,classify2,classify3,page_id,page_name,event_type,dt,eventtime_date,hour,qudao
0,广州,中国,HCZ_O00050887,开启指纹登录引导页-点击指纹验证,1590394637209,23.1167,113.25,4G,,广东,1080*2400,1590394603729,平安好车主,OXF-AN10,Other,Android,Dalvik/2.1.0 (Linux; U; Android 10; OXF-AN10 B...,16303,178,注册登录,4178,0,0,登录,,,5863,开启指纹登录引导页,点击图片类事件,20200525,2020-05-25 16:17:17.209,16,HueWeiHuiYun
1,广州,中国,119000114,点击关闭按钮,1590394651068,23.1167,113.25,4G,,广东,1080*2400,1590394603729,平安好车主,OXF-AN10,Other,Android,Dalvik/2.1.0 (Linux; U; Android 10; OXF-AN10 B...,16303,183,首页,3519,0,0,首页其他,,,3872,首页其他,点击按钮类事件,20200525,2020-05-25 16:17:31.068,16,HueWeiHuiYun


# 特征工程

In [9]:
os.makedirs('model', exist_ok=True)
os.makedirs('embedding', exist_ok=True)

# embedding
def emb_mean(df, f1, f2, emb_size=16):
    if os.path.exists('embedding/{}_{}.pkl'.format(f2, emb_size)):
        tmp = pd.read_pickle('embedding/{}_{}.pkl'.format(f2, emb_size))
        return tmp
    
    tmp = df.groupby(f1, as_index=False)[f2].agg(
        {'{}_{}_list'.format(f1, f2): list})
    sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist()
    del tmp['{}_{}_list'.format(f1, f2)]
    for i in range(len(sentences)):
        sentences[i] = [str(x) for x in sentences[i]]
    
    if os.path.exists('model/w2v_{}_{}.m'.format(f2, emb_size)):
        model = Word2Vec.load('model/w2v_{}_{}.m'.format(f2, emb_size))
    else:
        model = Word2Vec(sentences, size=emb_size, window=5,
                     min_count=1, sg=0, hs=1, seed=seed)
        model.save('model/w2v_{}_{}.m'.format(f2, emb_size))

    emb_matrix = []
    for seq in sentences:
        vec = []
        for w in seq:
            if w in model:
                vec.append(model[w])
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)

    df_emb = pd.DataFrame(emb_matrix)
    df_emb.columns = ['{}_{}_emb_{}'.format(
        f1, f2, i) for i in range(emb_size)]

    tmp = pd.concat([tmp, df_emb], axis=1)
    tmp.to_pickle('embedding/{}_{}.pkl'.format(f2, emb_size))
    
    del model, emb_matrix, sentences
    return tmp

In [10]:
def countvec_emb(df, f1, f2, emb_size=10):
    if os.path.exists('embedding/countvec_{}_{}.pkl'.format(f2, emb_size)):
        group_df = pd.read_pickle('embedding/countvec_{}_{}.pkl'.format(f2, emb_size))
        return group_df
    
    df[f2] = df[f2].astype(str)
    df[f2].fillna('-1', inplace=True)
    group_df = df.groupby([f1]).apply(
        lambda x: x[f2].tolist()).reset_index()
    group_df.columns = [f1, 'list']
    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))
    enc_vec = CountVectorizer()
    tfidf_vec = enc_vec.fit_transform(group_df['list'])
    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)
    vec_svd = svd_enc.fit_transform(tfidf_vec)
    vec_svd = pd.DataFrame(vec_svd)
    vec_svd.columns = ['svd_countvec_{}_{}'.format(
        f2, i) for i in range(emb_size)]
    group_df = pd.concat([group_df, vec_svd], axis=1)
    del group_df['list']
    
    group_df.to_pickle('embedding/countvec_{}_{}.pkl'.format(f2, emb_size))
    
    return group_df

In [11]:
df_log['aopsid_1000'] = df_log['aopsid'] // 1000
df_feature['aopsid_1000'] = df_feature['aopsid'] // 1000

df_log['aopsid_10000'] = df_log['aopsid'] // 10000
df_feature['aopsid_10000'] = df_feature['aopsid'] // 10000

df_log['aopsid_100000'] = df_log['aopsid'] // 100000
df_feature['aopsid_100000'] = df_feature['aopsid'] // 100000

In [12]:
x_year_cnt_dict = {
    '转保': 0,
    '新车': 0,
    '转保首续': 1,
    '新车首续': 1,
    '连续2年续保': 2,
    '连续3年续保': 3,
    '连续4年续保':4,
    '连续5年续保': 5,
    '连续5年以上续保': 6,
    '退保或注销': -1,
}


df_feature['x_year_cnt'] = df_feature['x_year'].map(x_year_cnt_dict)

In [13]:
df_lstm_prob = pd.read_pickle('/home/kesci/work/prob/lstm_prob.pkl')
df_lstm_prob.columns = ['aopsid', 'lstm_pred']
df_feature = df_feature.merge(df_lstm_prob, how='left')

df_gru_prob = pd.read_pickle('/home/kesci/work/prob/gru_prob.pkl')
df_gru_prob.columns = ['aopsid', 'gru_pred']
df_feature = df_feature.merge(df_gru_prob, how='left')

In [14]:
# 众数
for f in tqdm(['uaos', 'nettype', 'uadevice', 'region', 'city', 'hour', 'qudao', 'resolution', 'lat', 'lgt']):
    df_temp = df_log.groupby(['aopsid', f]).size().reset_index()
    df_temp.drop([0], axis=1, inplace=True)
    df_temp = df_temp.sort_values(by = ['aopsid', f] ,ascending =['asc', 'asc'])
    df_temp.drop_duplicates('aopsid', keep='last', inplace=True)
    
    df_feature = df_feature.merge(df_temp, how='left')

100%|██████████| 10/10 [00:11<00:00,  1.11s/it]


In [15]:
# def geohash_encode(row):
#     prec = 4

#     if not np.isnan(row['lat']):
#         row['geohash_p_' + str(prec)] = geohash.encode(row['lat'], row['lgt'], precision=prec)
#     else:
#         row['geohash_p_' + str(prec)] = None
        
#     return row

# df_feature = df_feature.parallel_apply(geohash_encode, axis=1)
# df_feature.head()

In [16]:
df_temp = df_log.groupby('aopsid').size().reset_index()
df_temp.columns = ['aopsid', 'aopsid_behavior_count']
df_feature = df_feature.merge(df_temp, how='left')

In [17]:
# std
for f in ['hour']:
    df_temp = df_log.groupby('aopsid')[f].std().reset_index()
    df_temp.columns = ['aopsid'] + ['{}_std'.format(f)]
    df_feature = df_feature.merge(df_temp, how='left')

In [18]:
# max
for f in ['eventtime']:
    df_temp = df_log.groupby('aopsid')[f].max().reset_index()
    df_temp.columns = ['aopsid'] + ['{}_max'.format(f)]
    df_feature = df_feature.merge(df_temp, how='left')

In [19]:
df_temp = df_log.groupby(['aopsid'])['sestarttime'].nunique().reset_index()
df_temp.columns = ['aopsid', 'session_num']
df_feature = df_feature.merge(df_temp, how='left')

In [20]:
df_t = df_log[df_log['eventname'] == '11010440']
df_t = df_t.groupby(['aopsid']).size().reset_index()
df_t.columns = ['aopsid', 'aopsid_lpfw_count']
df_feature = df_feature.merge(df_t, how='left')

# 首页主页-点击用卡券
df_t = df_log[df_log['eventname'] == '119000138']
df_t = df_t.groupby(['aopsid']).size().reset_index()
df_t.columns = ['aopsid', 'aopsid_kqykq_count']
df_feature = df_feature.merge(df_t, how='left')

# 点击我的卡券列表里的卡券
df_t = df_log[df_log['eventname'] == 'HCZ_H16000517']
df_t = df_t.groupby(['aopsid']).size().reset_index()
df_t.columns = ['aopsid', 'aopsid_kqdjwdkqlbldkq_count']
df_feature = df_feature.merge(df_t, how='left')

# df_feature['aopsid_kq_count'] = df_feature['aopsid_kqykq_count'] + df_feature['aopsid_kqdjwdkqlbldkq_count']

In [21]:
for f1, f2, dim in tqdm([['aopsid', 'title', 16], ['aopsid', 'eventname', 16], ['aopsid', 'module_id', 16]]):
    df_feature = df_feature.merge(emb_mean(df_log, f1, f2, dim), on=f1, how='left')

100%|██████████| 3/3 [00:00<00:00,  6.71it/s]


In [22]:
# for f1, f2, dim in tqdm([['aopsid', 'title', 16], ['aopsid', 'eventname', 16], ['aopsid', 'module_id', 16]]):
#     df_feature = df_feature.merge(countvec_emb(df_log, f1, f2, dim), on=f1, how='left')

In [23]:
# label encoder
def stat(df, df_merge, group_by, agg):
    group = df.groupby(group_by).agg(agg)

    columns = []
    for on, methods in agg.items():
        for method in methods:
            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))
    group.columns = columns
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    del (group)
    gc.collect()
    return df_merge
    
def statis_feat(df_know, df_unknow):
    df_unknow = stat(df_know, df_unknow, ['region'], {'is_xb': ['mean']})
    df_unknow = stat(df_know, df_unknow, ['x_year'], {'is_xb': ['mean']})
    df_unknow = stat(df_know, df_unknow, ['uadevice'], {'is_xb': ['mean']})
    df_unknow = stat(df_know, df_unknow, ['uadevice', 'x_year'], {'is_xb': ['mean']})
    df_unknow = stat(df_know, df_unknow, ['aopsid_10000'], {'is_xb': ['mean']})
    df_unknow = stat(df_know, df_unknow, ['lat'], {'is_xb': ['mean']})
    df_unknow = stat(df_know, df_unknow, ['lgt'], {'is_xb': ['mean']})
 
    return df_unknow

df_train = df_feature[~df_feature['is_xb'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['is_xb'].isnull()]
df_test = df_test.reset_index(drop=True)
   
df_stas_feat = None
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for train_index, val_index in kf.split(df_train, df_train['is_xb']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]
    
    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)
    
    del(df_fold_train)
    del(df_fold_val)
    gc.collect()

df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

del(df_stas_feat)
del(df_train)
del(df_test)
gc.collect()

0

In [43]:
df_feature.head()

Unnamed: 0,aopsid,xz,xb,x_year,is_xb,aopsid_1000,aopsid_10000,aopsid_100000,x_year_cnt,lstm_pred,gru_pred,uaos,nettype,uadevice,region,city,hour,qudao,resolution,lat,lgt,aopsid_behavior_count,hour_std,eventtime_max,session_num,aopsid_lpfw_count,aopsid_kqykq_count,aopsid_kqdjwdkqlbldkq_count,aopsid_kq_count,aopsid_title_emb_0,aopsid_title_emb_1,aopsid_title_emb_2,aopsid_title_emb_3,aopsid_title_emb_4,aopsid_title_emb_5,aopsid_title_emb_6,aopsid_title_emb_7,aopsid_title_emb_8,aopsid_title_emb_9,aopsid_title_emb_10,aopsid_title_emb_11,aopsid_title_emb_12,aopsid_title_emb_13,aopsid_title_emb_14,aopsid_title_emb_15,aopsid_eventname_emb_0,aopsid_eventname_emb_1,aopsid_eventname_emb_2,aopsid_eventname_emb_3,aopsid_eventname_emb_4,aopsid_eventname_emb_5,aopsid_eventname_emb_6,aopsid_eventname_emb_7,aopsid_eventname_emb_8,aopsid_eventname_emb_9,aopsid_eventname_emb_10,aopsid_eventname_emb_11,aopsid_eventname_emb_12,aopsid_eventname_emb_13,aopsid_eventname_emb_14,aopsid_eventname_emb_15,aopsid_module_id_emb_0,aopsid_module_id_emb_1,aopsid_module_id_emb_2,aopsid_module_id_emb_3,aopsid_module_id_emb_4,aopsid_module_id_emb_5,aopsid_module_id_emb_6,aopsid_module_id_emb_7,aopsid_module_id_emb_8,aopsid_module_id_emb_9,aopsid_module_id_emb_10,aopsid_module_id_emb_11,aopsid_module_id_emb_12,aopsid_module_id_emb_13,aopsid_module_id_emb_14,aopsid_module_id_emb_15,region_is_xb_mean,x_year_is_xb_mean,uadevice_is_xb_mean,uadevice_x_year_is_xb_mean,aopsid_10000_is_xb_mean,lat_is_xb_mean,lgt_is_xb_mean
0,10000855,2,4,8,1.0,10000,1000,100,6.0,0.710813,0.637092,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.844435,,,0.904762,,
1,10002155,2,4,7,1.0,10002,1000,100,4.0,0.720984,0.630133,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.852398,,,0.904762,,
2,10005745,2,4,8,1.0,10005,1000,100,6.0,0.710813,0.637092,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.844435,,,0.904762,,
3,10007191,2,4,9,1.0,10007,1000,100,5.0,0.712727,0.651464,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.853152,,,0.904762,,
4,10007701,2,1,3,0.0,10007,1000,100,0.0,0.601563,0.568157,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.710814,,,0.904762,,


In [25]:
df_feature.to_pickle('/home/kesci/work/data/feature.pkl')

In [39]:
df_feature['is_xb'].value_counts()

1.0    120044
0.0     32870
Name: is_xb, dtype: int64

In [44]:
df_feature[df_feature['is_xb'].notnull()]['lat'].notnull().sum()

55084

# 模型训练

In [26]:
os.makedirs('/home/kesci/work/sub', exist_ok=True)

In [27]:
for f in df_feature.select_dtypes('object').columns:
    lbl = LabelEncoder()
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

In [28]:
df_feature.head()

Unnamed: 0,aopsid,xz,xb,x_year,is_xb,aopsid_1000,aopsid_10000,aopsid_100000,x_year_cnt,lstm_pred,gru_pred,uaos,nettype,uadevice,region,city,hour,qudao,resolution,lat,lgt,aopsid_behavior_count,hour_std,eventtime_max,session_num,aopsid_lpfw_count,aopsid_kqykq_count,aopsid_kqdjwdkqlbldkq_count,aopsid_kq_count,aopsid_title_emb_0,aopsid_title_emb_1,aopsid_title_emb_2,aopsid_title_emb_3,aopsid_title_emb_4,aopsid_title_emb_5,aopsid_title_emb_6,aopsid_title_emb_7,aopsid_title_emb_8,aopsid_title_emb_9,aopsid_title_emb_10,aopsid_title_emb_11,aopsid_title_emb_12,aopsid_title_emb_13,aopsid_title_emb_14,aopsid_title_emb_15,aopsid_eventname_emb_0,aopsid_eventname_emb_1,aopsid_eventname_emb_2,aopsid_eventname_emb_3,aopsid_eventname_emb_4,aopsid_eventname_emb_5,aopsid_eventname_emb_6,aopsid_eventname_emb_7,aopsid_eventname_emb_8,aopsid_eventname_emb_9,aopsid_eventname_emb_10,aopsid_eventname_emb_11,aopsid_eventname_emb_12,aopsid_eventname_emb_13,aopsid_eventname_emb_14,aopsid_eventname_emb_15,aopsid_module_id_emb_0,aopsid_module_id_emb_1,aopsid_module_id_emb_2,aopsid_module_id_emb_3,aopsid_module_id_emb_4,aopsid_module_id_emb_5,aopsid_module_id_emb_6,aopsid_module_id_emb_7,aopsid_module_id_emb_8,aopsid_module_id_emb_9,aopsid_module_id_emb_10,aopsid_module_id_emb_11,aopsid_module_id_emb_12,aopsid_module_id_emb_13,aopsid_module_id_emb_14,aopsid_module_id_emb_15,region_is_xb_mean,x_year_is_xb_mean,uadevice_is_xb_mean,uadevice_x_year_is_xb_mean,aopsid_10000_is_xb_mean,lat_is_xb_mean,lgt_is_xb_mean
0,10000855,2,4,8,1.0,10000,1000,100,6.0,0.710813,0.637092,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.844435,,,0.904762,,
1,10002155,2,4,7,1.0,10002,1000,100,4.0,0.720984,0.630133,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.852398,,,0.904762,,
2,10005745,2,4,8,1.0,10005,1000,100,6.0,0.710813,0.637092,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.844435,,,0.904762,,
3,10007191,2,4,9,1.0,10007,1000,100,5.0,0.712727,0.651464,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.853152,,,0.904762,,
4,10007701,2,1,3,0.0,10007,1000,100,0.0,0.601563,0.568157,6,9,1140,12,41,,13,187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.710814,,,0.904762,,


In [29]:
df_train = df_feature[df_feature.is_xb.notna()].copy()
df_test = df_feature[df_feature.is_xb.isna()].copy()

df_train.shape, df_test.shape

((152914, 84), (100965, 84))

In [30]:
ycol = 'is_xb'
feature_names = list(
    filter(lambda x: x not in [ycol], df_train.columns))

model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.01,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=10,
                           reg_lambda=12,
                           random_state=seed,
                           is_unbalance=True,
                           metric='auc')

df_oof = pd.DataFrame()
prediction = df_test[['aopsid']]
prediction['pred'] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(
        kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=100,
                          early_stopping_rounds=100)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    oof = df_train.iloc[val_idx][['aopsid', ycol]].copy()
    oof['pred'] = pred_val
    df_oof = df_oof.append(oof)
    
    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['pred'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val



Training until validation scores don't improve for 100 rounds
[100]	train's auc: 0.666952	valid's auc: 0.650294
[200]	train's auc: 0.671475	valid's auc: 0.651942
[300]	train's auc: 0.675785	valid's auc: 0.65285
[400]	train's auc: 0.678752	valid's auc: 0.653259
[500]	train's auc: 0.682362	valid's auc: 0.653692
[600]	train's auc: 0.686048	valid's auc: 0.6539
[700]	train's auc: 0.689524	valid's auc: 0.654036
[800]	train's auc: 0.692814	valid's auc: 0.654082
[900]	train's auc: 0.696178	valid's auc: 0.654024
Early stopping, best iteration is:
[803]	train's auc: 0.692894	valid's auc: 0.654106


Training until validation scores don't improve for 100 rounds
[100]	train's auc: 0.666291	valid's auc: 0.65358
[200]	train's auc: 0.670703	valid's auc: 0.655305
[300]	train's auc: 0.675026	valid's auc: 0.656434
[400]	train's auc: 0.677898	valid's auc: 0.656914
[500]	train's auc: 0.681557	valid's auc: 0.657181
[600]	train's auc: 0.685398	valid's auc: 0.657298
[700]	train's auc: 0.68912	valid's auc: 0

In [31]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby([
    'column'
])['importance'].agg('mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,aopsid_10000_is_xb_mean,863.4
1,aopsid_10000,806.4
2,lstm_pred,795.8
3,aopsid,772.4
4,aopsid_1000,726.0
5,x_year_is_xb_mean,721.2
6,gru_pred,715.6
7,aopsid_100000,668.8
8,x_year_cnt,525.2
9,xb,473.2


In [32]:
auc = roc_auc_score(df_oof[ycol].values, df_oof['pred'].values)
print('auc:', auc)

auc: 0.6608174475058366


In [33]:
# 0.6607929416094739
# 0.6608212446633882
# 0.6608387430642635

In [34]:
df_oof = df_oof.sort_values('aopsid')
df_oof.head()

Unnamed: 0,aopsid,is_xb,pred
3925,16303,1.0,0.644267
5996,26515,0.0,0.513874
5985,26695,1.0,0.60922
6152,28071,1.0,0.612937
8701,46678,1.0,0.718922


In [35]:
df_oof.to_csv('/home/kesci/work/prob/oof_seed_{}.csv'.format(seed), index=False)
prediction[['aopsid', 'pred']].to_csv('prob/sub_seed_{}.csv'.format(seed), index=False)

In [36]:
prediction['pred'] = 1 - prediction['pred']
prediction[['aopsid', 'pred']].to_csv('/home/kesci/work/sub/baoxian_{}.csv'.format(auc), index=False)
prediction[['aopsid', 'pred']].to_csv('/home/kesci/work/sub/sub.txt', index=False, encoding='utf-8', sep='\t')