In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from gensim.models import Word2Vec
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, mean_squared_log_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
import tqdm
import sys
import os
import gc
import argparse
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide pandas display settings.
pd.set_option('display.max_rows', None)  # show all rows
pd.set_option('display.max_columns', None)  # show all columns
pd.set_option('expand_frame_repr', False)  # do not wrap wide frames across lines
# pd.set_option('display.precision', 2)  # show two decimal places
pd.set_option('display.float_format', lambda x: '%.2f' % x)  # render floats with two decimals

In [3]:
# Load the raw inputs from ./data (paths are relative to the notebook's working directory).
# trans_info: account transaction records; static_info: account static information.
trans_info = pd.read_csv('./data/账户交易信息.csv')
static_info = pd.read_csv('./data/账户静态信息.csv')

# train_label: training-set labels (zhdh + black_flag); test_label: accounts to score (zhdh only).
train_label = pd.read_csv('./data/训练集标签.csv')
test_label = pd.read_csv('./data/test_dataset.csv')

In [4]:
def get_time_feature(df, col, time_col='jysj'):
    """Return a copy of ``df`` with calendar and time-of-day features added.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the date column, holding values formatted as ``YYYY-MM-DD``.
        Every generated column is named ``<col>_<feature>``.
    time_col : str, default 'jysj'
        Name of the time column, holding strings formatted as ``HH:MM[:SS]``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these extra columns (in this order): month, day,
        weekofyear, dayofyear, dayofweek, is_wknd, is_month_start,
        is_month_end, hour, minu and date.  ``date`` is the number of minutes
        since midnight (``hour * 60 + minu``).
    """
    df_copy = df.copy()
    prefix = col + "_"
    dates = pd.to_datetime(df_copy[col].astype(str), format='%Y-%m-%d')

    df_copy[prefix + 'month'] = dates.dt.month
    df_copy[prefix + 'day'] = dates.dt.day

    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week yields the same ISO week number.
    if hasattr(dates.dt, 'isocalendar'):
        week = dates.dt.isocalendar().week
        # The nullable UInt32 result cannot be cast to int64 when dates are missing.
        week = week.astype('float64') if week.isna().any() else week.astype('int64')
    else:  # pandas < 1.1
        week = dates.dt.weekofyear
    df_copy[prefix + 'weekofyear'] = week

    df_copy[prefix + 'dayofyear'] = dates.dt.dayofyear
    df_copy[prefix + 'dayofweek'] = dates.dt.dayofweek
    # dayofweek is Monday=0 ... Sunday=6, so (d + 1) // 6 is 1 only for Sat/Sun.
    df_copy[prefix + 'is_wknd'] = (dates.dt.dayofweek + 1) // 6
    df_copy[prefix + 'is_month_start'] = dates.dt.is_month_start.astype(int)
    df_copy[prefix + 'is_month_end'] = dates.dt.is_month_end.astype(int)

    # Split the time string once instead of re-splitting it for every feature.
    time_parts = df_copy[time_col].str.split(':', expand=True)
    hour = time_parts[0].astype(int)
    minute = time_parts[1].astype(int)
    df_copy[prefix + 'hour'] = hour
    df_copy[prefix + 'minu'] = minute
    df_copy[prefix + 'date'] = hour * 60 + minute

    return df_copy

# Add the date/time features derived from the transaction date column "jyrq".
trans_info = get_time_feature(trans_info, "jyrq")
# Names of the generated feature columns (all contain the "jyrq_" prefix);
# get_base_feat reads this list as a global.
time_cols = [f for f in trans_info.columns if 'jyrq_' in f]
print(time_cols)
print(train_label.shape, test_label.shape)

['jyrq_month', 'jyrq_day', 'jyrq_weekofyear', 'jyrq_dayofyear', 'jyrq_dayofweek', 'jyrq_is_wknd', 'jyrq_is_month_start', 'jyrq_is_month_end', 'jyrq_hour', 'jyrq_minu', 'jyrq_date']
(1200, 2) (4800, 1)


In [5]:
%%time
def get_base_feat(df1_, df2_):
    """Attach per-account transaction aggregates to a label frame.

    ``df1_`` holds the transaction rows and ``df2_`` the accounts to describe
    (it must contain ``zhdh``).  Transactions are aggregated per ``zhdh``
    separately for ``jdbj == 0`` and ``jdbj == 1``; the resulting columns are
    named ``zhdh_jdbj<flag>_<column>_<statistic>`` and left-joined onto a copy
    of ``df2_``.  Neither input is modified.  The time feature columns to
    aggregate are taken from the module-level ``time_cols`` list.
    """
    transactions = df1_.copy()  # source rows for the features
    enriched = df2_.copy()

    spread_stats = ['mean', 'max', 'min', 'std', np.ptp]
    agg_func = {
        'dfzh': ['nunique', 'count'],      # counterparty account
        'dfhh': ['nunique'],               # counterparty bank number
        'jyqd': ['nunique'],               # transaction channel
        'zydh': ['nunique'],               # summary code
        'jyje': ['sum'] + spread_stats,    # transaction amount
        'zhye': ['sum'] + spread_stats,    # account balance
        'dfmccd': list(spread_stats),      # counterparty name length
    }
    agg_func.update({name: ['mean', 'min', 'max', np.ptp] for name in time_cols})

    for flag in (0, 1):
        subset = transactions.loc[transactions['jdbj'] == flag]
        agg_df = subset.groupby(['zhdh']).agg(agg_func).reset_index()
        tag = 'zhdh_jdbj{}_'.format(flag)
        # Flatten the (column, statistic) MultiIndex into single names.
        flat_names = [tag + '_'.join(pair).strip()
                      for pair in agg_df.columns.values
                      if pair[0] not in ['zhdh']]
        agg_df.columns = ['zhdh'] + flat_names
        enriched = enriched.merge(agg_df, on=['zhdh'], how='left')

    return enriched

# Add the per-account transaction aggregates to both label frames.
# NOTE(review): both names are overwritten in place, so re-running this cell
# merges the aggregates a second time — re-load the labels first.
train_label = get_base_feat(trans_info, train_label)
test_label = get_base_feat(trans_info, test_label)

Wall time: 28.5 s


In [6]:
# Preview the transaction table, now including the jyrq_* time features.
trans_info.head()

Unnamed: 0,jylsxh,zhdh,dfzh,jdbj,jyje,zhye,dfhh,jyrq,jysj,jyqd,zydh,dfmccd,jyrq_month,jyrq_day,jyrq_weekofyear,jyrq_dayofyear,jyrq_dayofweek,jyrq_is_wknd,jyrq_is_month_start,jyrq_is_month_end,jyrq_hour,jyrq_minu,jyrq_date
0,5D252156AE9F6B6595A1C56F56D4F91C,86C379D938234BAA,14BEFED1370B730A,0,310.0,57806.83,834E1F06,2020-03-01,00:18:06,E96ED478,4E0CB6FB,45,3,1,9,61,6,1,1,0,0,18,18
1,8BB3D82CA8E5F95577CA3E2DF432DF64,8EB373F073727157,FD7F11B33576339B,1,599.99,7099.73,B3D461D4,2020-03-01,00:18:17,621461AF,A3C65C29,6,3,1,9,61,6,1,1,0,0,18,18
2,412B7E903BC06882EEB9FB6A484D0773,997DED969A377D40,014F2782648E7FDA,1,4000.0,34448.04,A71C76B8,2020-03-01,00:18:30,621461AF,A3C65C29,6,3,1,9,61,6,1,1,0,0,18,18
3,F1122F893AC75DC8751190C67E1C3DB6,8EB373F073727157,129FAF9FD9D03346,1,299.98,7399.71,8A1BC467,2020-03-01,00:19:06,621461AF,A3C65C29,6,3,1,9,61,6,1,1,0,0,19,19
4,8BD9575EA55E67D4E99AC43B2A444172,8EB373F073727157,3B9CD92F13274EBA,1,999.96,8399.67,A8DA3378,2020-03-01,00:19:17,091D584F,2618045A,6,3,1,9,61,6,1,1,0,0,19,19


In [7]:
%%time
# Merge account static information: first parse the khrq date column and
# split it into year / month / day feature columns.
khrq_parsed = pd.to_datetime(static_info['khrq'], format='%Y-%m-%d')
static_info['khrq'] = khrq_parsed
for part in ('year', 'month', 'day'):
    static_info[part] = getattr(khrq_parsed.dt, part)

# Natural-number (ordinal) encoding
def label_encode(series):
    """Encode the distinct values of ``series`` as 0, 1, 2, ... in order of first appearance.

    Parameters
    ----------
    series : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        Same index as ``series``.  Missing values stay missing (NaN); every
        other value is replaced by its integer code.

    Notes
    -----
    The codes are built from the non-null values only.  Pairing
    ``series.unique()`` (which includes NaN) with ``range(series.nunique())``
    (which does not count NaN) would truncate the mapping and silently encode
    the last real category as NaN whenever the column contains a missing value.
    """
    codes = {value: code for code, value in enumerate(series.dropna().unique())}
    return series.map(codes)

# Account-opening institution code: replace the raw codes with integer labels.
for col in ['khjgdh']:
    static_info[col] = label_encode(static_info[col])

# Static columns to join onto the label frames (zhdh is the join key).
keep_cols = ['zhdh','year','month','day','khjgdh','xb','年龄']

# NOTE(review): train_label / test_label are overwritten, so re-running this
# cell would merge the static columns again — confirm it is only run once.
train_label = train_label.merge(static_info[keep_cols], on=['zhdh'], how='left')
test_label  = test_label.merge(static_info[keep_cols], on=['zhdh'], how='left')

Wall time: 13.7 ms


In [8]:
# Preview the training frame after the base aggregates and static columns were merged.
train_label.head()

Unnamed: 0,zhdh,black_flag,zhdh_jdbj0_dfzh_nunique,zhdh_jdbj0_dfzh_count,zhdh_jdbj0_dfhh_nunique,zhdh_jdbj0_jyqd_nunique,zhdh_jdbj0_zydh_nunique,zhdh_jdbj0_jyje_sum,zhdh_jdbj0_jyje_mean,zhdh_jdbj0_jyje_max,zhdh_jdbj0_jyje_min,zhdh_jdbj0_jyje_std,zhdh_jdbj0_jyje_ptp,zhdh_jdbj0_zhye_sum,zhdh_jdbj0_zhye_mean,zhdh_jdbj0_zhye_max,zhdh_jdbj0_zhye_min,zhdh_jdbj0_zhye_std,zhdh_jdbj0_zhye_ptp,zhdh_jdbj0_dfmccd_mean,zhdh_jdbj0_dfmccd_max,zhdh_jdbj0_dfmccd_min,zhdh_jdbj0_dfmccd_std,zhdh_jdbj0_dfmccd_ptp,zhdh_jdbj0_jyrq_month_mean,zhdh_jdbj0_jyrq_month_min,zhdh_jdbj0_jyrq_month_max,zhdh_jdbj0_jyrq_month_ptp,zhdh_jdbj0_jyrq_day_mean,zhdh_jdbj0_jyrq_day_min,zhdh_jdbj0_jyrq_day_max,zhdh_jdbj0_jyrq_day_ptp,zhdh_jdbj0_jyrq_weekofyear_mean,zhdh_jdbj0_jyrq_weekofyear_min,zhdh_jdbj0_jyrq_weekofyear_max,zhdh_jdbj0_jyrq_weekofyear_ptp,zhdh_jdbj0_jyrq_dayofyear_mean,zhdh_jdbj0_jyrq_dayofyear_min,zhdh_jdbj0_jyrq_dayofyear_max,zhdh_jdbj0_jyrq_dayofyear_ptp,zhdh_jdbj0_jyrq_dayofweek_mean,zhdh_jdbj0_jyrq_dayofweek_min,zhdh_jdbj0_jyrq_dayofweek_max,zhdh_jdbj0_jyrq_dayofweek_ptp,zhdh_jdbj0_jyrq_is_wknd_mean,zhdh_jdbj0_jyrq_is_wknd_min,zhdh_jdbj0_jyrq_is_wknd_max,zhdh_jdbj0_jyrq_is_wknd_ptp,zhdh_jdbj0_jyrq_is_month_start_mean,zhdh_jdbj0_jyrq_is_month_start_min,zhdh_jdbj0_jyrq_is_month_start_max,zhdh_jdbj0_jyrq_is_month_start_ptp,zhdh_jdbj0_jyrq_is_month_end_mean,zhdh_jdbj0_jyrq_is_month_end_min,zhdh_jdbj0_jyrq_is_month_end_max,zhdh_jdbj0_jyrq_is_month_end_ptp,zhdh_jdbj0_jyrq_hour_mean,zhdh_jdbj0_jyrq_hour_min,zhdh_jdbj0_jyrq_hour_max,zhdh_jdbj0_jyrq_hour_ptp,zhdh_jdbj0_jyrq_minu_mean,zhdh_jdbj0_jyrq_minu_min,zhdh_jdbj0_jyrq_minu_max,zhdh_jdbj0_jyrq_minu_ptp,zhdh_jdbj0_jyrq_date_mean,zhdh_jdbj0_jyrq_date_min,zhdh_jdbj0_jyrq_date_max,zhdh_jdbj0_jyrq_date_ptp,zhdh_jdbj1_dfzh_nunique,zhdh_jdbj1_dfzh_count,zhdh_jdbj1_dfhh_nunique,zhdh_jdbj1_jyqd_nunique,zhdh_jdbj1_zydh_nunique,zhdh_jdbj1_jyje_sum,zhdh_jdbj1_jyje_mean,zhdh_jdbj1_jyje_max,zhdh_jdbj1_jyje_min,zhdh_jdbj1_jyje_std,zhdh_jdbj1_jyje_ptp,zhdh
_jdbj1_zhye_sum,zhdh_jdbj1_zhye_mean,zhdh_jdbj1_zhye_max,zhdh_jdbj1_zhye_min,zhdh_jdbj1_zhye_std,zhdh_jdbj1_zhye_ptp,zhdh_jdbj1_dfmccd_mean,zhdh_jdbj1_dfmccd_max,zhdh_jdbj1_dfmccd_min,zhdh_jdbj1_dfmccd_std,zhdh_jdbj1_dfmccd_ptp,zhdh_jdbj1_jyrq_month_mean,zhdh_jdbj1_jyrq_month_min,zhdh_jdbj1_jyrq_month_max,zhdh_jdbj1_jyrq_month_ptp,zhdh_jdbj1_jyrq_day_mean,zhdh_jdbj1_jyrq_day_min,zhdh_jdbj1_jyrq_day_max,zhdh_jdbj1_jyrq_day_ptp,zhdh_jdbj1_jyrq_weekofyear_mean,zhdh_jdbj1_jyrq_weekofyear_min,zhdh_jdbj1_jyrq_weekofyear_max,zhdh_jdbj1_jyrq_weekofyear_ptp,zhdh_jdbj1_jyrq_dayofyear_mean,zhdh_jdbj1_jyrq_dayofyear_min,zhdh_jdbj1_jyrq_dayofyear_max,zhdh_jdbj1_jyrq_dayofyear_ptp,zhdh_jdbj1_jyrq_dayofweek_mean,zhdh_jdbj1_jyrq_dayofweek_min,zhdh_jdbj1_jyrq_dayofweek_max,zhdh_jdbj1_jyrq_dayofweek_ptp,zhdh_jdbj1_jyrq_is_wknd_mean,zhdh_jdbj1_jyrq_is_wknd_min,zhdh_jdbj1_jyrq_is_wknd_max,zhdh_jdbj1_jyrq_is_wknd_ptp,zhdh_jdbj1_jyrq_is_month_start_mean,zhdh_jdbj1_jyrq_is_month_start_min,zhdh_jdbj1_jyrq_is_month_start_max,zhdh_jdbj1_jyrq_is_month_start_ptp,zhdh_jdbj1_jyrq_is_month_end_mean,zhdh_jdbj1_jyrq_is_month_end_min,zhdh_jdbj1_jyrq_is_month_end_max,zhdh_jdbj1_jyrq_is_month_end_ptp,zhdh_jdbj1_jyrq_hour_mean,zhdh_jdbj1_jyrq_hour_min,zhdh_jdbj1_jyrq_hour_max,zhdh_jdbj1_jyrq_hour_ptp,zhdh_jdbj1_jyrq_minu_mean,zhdh_jdbj1_jyrq_minu_min,zhdh_jdbj1_jyrq_minu_max,zhdh_jdbj1_jyrq_minu_ptp,zhdh_jdbj1_jyrq_date_mean,zhdh_jdbj1_jyrq_date_min,zhdh_jdbj1_jyrq_date_max,zhdh_jdbj1_jyrq_date_ptp,year,month,day,khjgdh,xb,年龄
0,2029FF26D4E2CA79,0,2,15,2,2,2,212093.3,14139.55,116724.0,500.0,28937.85,116224.0,9201259.08,613417.27,735585.39,418851.49,117111.85,316733.9,4.8,6,0,2.48,6,4.27,3,5,2,13.67,1,29,28,16.8,11,22,11,112.53,71,149,78,2.93,0,6,6,0.2,0,1,1,0.07,0,1,1,0.0,0,0,0,12.6,3,21,18,29.4,2,56,54,785.4,233,1314,1081,2.0,13.0,2.0,2.0,2.0,566358.37,43566.03,120000.0,1288.14,48893.58,118711.86,7668300.93,589869.3,755477.54,365859.88,143758.73,389617.66,4.62,6.0,0.0,2.63,6.0,4.08,3.0,5.0,2.0,7.54,1.0,26.0,25.0,15.0,9.0,22.0,13.0,100.54,61.0,147.0,86.0,3.54,0.0,6.0,6.0,0.31,0.0,1.0,1.0,0.31,0.0,1.0,1.0,0.0,0.0,0.0,0.0,12.08,0.0,20.0,20.0,36.0,4.0,53.0,49.0,760.62,34.0,1204.0,1170.0,2018,3,28,42,1,40
1,09F5B90D46FB7CCE,1,12,31,11,5,7,634809.54,20477.73,100000.0,-100000.0,38779.84,200000.0,362731.96,11701.03,116299.44,0.19,25148.07,116299.25,10.26,45,0,8.82,45,4.23,3,5,2,18.45,1,30,29,17.23,12,20,8,116.16,77,137,60,3.58,1,6,5,0.16,0,1,1,0.16,0,1,1,0.26,0,1,1,11.29,3,16,13,33.0,0,59,59,710.42,185,978,793,84.0,98.0,17.0,4.0,4.0,638908.0,6519.47,52500.0,100.0,9018.45,52400.0,4819268.57,49176.21,173292.44,340.44,45662.47,172952.0,9.7,33.0,6.0,5.55,27.0,4.27,3.0,5.0,2.0,18.06,1.0,30.0,29.0,17.38,12.0,18.0,6.0,117.01,80.0,122.0,42.0,3.37,2.0,5.0,3.0,0.18,0.0,1.0,1.0,0.28,0.0,1.0,1.0,0.2,0.0,1.0,1.0,12.1,9.0,16.0,7.0,26.93,0.0,57.0,57.0,753.05,542.0,976.0,434.0,2018,2,27,155,0,39
2,E0C880EB18F4EFE8,1,7,23,6,5,5,1083609.1,47113.44,210000.0,0.1,66230.8,209999.9,3705.62,161.11,658.5,0.91,245.15,657.59,25.43,30,9,7.72,21,4.87,3,5,2,11.39,9,19,10,19.3,12,20,8,128.43,78,132,54,1.3,0,6,6,0.22,0,1,1,0.0,0,0,0,0.0,0,0,0,9.13,0,18,18,29.09,1,55,54,576.91,19,1111,1092,13.0,16.0,9.0,2.0,2.0,1082972.0,67685.75,200000.0,1.0,61249.93,199999.0,1234410.56,77150.66,210062.91,22.91,70690.05,210040.0,11.44,33.0,6.0,8.5,27.0,5.0,5.0,5.0,0.0,11.06,9.0,14.0,5.0,19.94,19.0,20.0,1.0,132.06,130.0,135.0,5.0,0.5,0.0,5.0,5.0,0.06,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.88,9.0,18.0,9.0,30.06,0.0,59.0,59.0,622.56,545.0,1111.0,566.0,2019,5,25,374,0,24
3,ED8FFDCCF93C2F11,1,4,9,2,2,5,36105.5,4011.72,24400.0,-2.0,7913.63,24402.0,3439.75,382.19,3140.75,4.75,1035.57,3136.0,12.0,45,0,18.91,45,4.67,4,5,1,12.11,3,30,27,18.11,18,19,1,123.11,121,125,4,4.33,0,6,6,0.56,0,1,1,0.0,0,0,0,0.33,0,1,1,15.89,0,22,22,14.0,0,53,53,967.33,3,1339,1336,28.0,36.0,11.0,3.0,3.0,36237.0,1006.58,6000.0,10.0,1459.22,5990.0,347005.5,9639.04,24431.75,10.25,7753.12,24421.5,8.25,12.0,6.0,1.66,6.0,4.97,4.0,5.0,1.0,3.78,3.0,30.0,27.0,18.03,18.0,19.0,1.0,123.94,121.0,125.0,4.0,5.75,0.0,6.0,6.0,0.94,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.03,0.0,1.0,1.0,21.39,10.0,23.0,13.0,39.06,5.0,58.0,53.0,1322.39,654.0,1391.0,737.0,2017,1,17,0,0,30
4,8D2F48B0BFC40AB2,1,1,54,2,2,5,117023.0,2167.09,10000.0,-5000.0,2715.06,15000.0,202576.0,3751.41,20074.0,0.0,5323.44,20074.0,0.44,6,0,1.59,6,5.0,5,5,0,23.48,20,26,6,21.3,21,22,1,144.48,141,147,6,3.41,0,6,6,0.43,0,1,1,0.0,0,0,0,0.0,0,0,0,3.44,0,23,23,26.0,2,49,47,232.67,2,1429,1427,18.0,23.0,8.0,2.0,2.0,117126.0,5092.43,20000.0,10.0,5064.12,19990.0,142090.5,6177.85,20074.0,10.0,5670.51,20064.0,13.7,45.0,6.0,14.75,39.0,5.0,5.0,5.0,0.0,23.57,20.0,27.0,7.0,21.43,21.0,22.0,1.0,144.57,141.0,148.0,7.0,2.52,0.0,6.0,6.0,0.17,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.39,0.0,5.0,5.0,23.96,7.0,50.0,43.0,167.43,9.0,341.0,332.0,2020,5,13,60,0,30


In [48]:
# Model inputs: every column except the account id (zhdh) and the target (black_flag).
# NOTE(review): the recorded count of 140 includes zdjy_to_zhjy_sum / zdjy_to_zhjy_count,
# which are only merged in a later cell (this cell's execution count is 48). On a
# top-to-bottom run this list would be built before that merge — confirm the intended order.
cols = [f for f in train_label.columns if f not in ['zhdh','black_flag']]
len(cols)

140

In [49]:
# Show the selected feature names.
cols

['zhdh_jdbj0_dfzh_nunique',
 'zhdh_jdbj0_dfzh_count',
 'zhdh_jdbj0_dfhh_nunique',
 'zhdh_jdbj0_jyqd_nunique',
 'zhdh_jdbj0_zydh_nunique',
 'zhdh_jdbj0_jyje_sum',
 'zhdh_jdbj0_jyje_mean',
 'zhdh_jdbj0_jyje_max',
 'zhdh_jdbj0_jyje_min',
 'zhdh_jdbj0_jyje_std',
 'zhdh_jdbj0_jyje_ptp',
 'zhdh_jdbj0_zhye_sum',
 'zhdh_jdbj0_zhye_mean',
 'zhdh_jdbj0_zhye_max',
 'zhdh_jdbj0_zhye_min',
 'zhdh_jdbj0_zhye_std',
 'zhdh_jdbj0_zhye_ptp',
 'zhdh_jdbj0_dfmccd_mean',
 'zhdh_jdbj0_dfmccd_max',
 'zhdh_jdbj0_dfmccd_min',
 'zhdh_jdbj0_dfmccd_std',
 'zhdh_jdbj0_dfmccd_ptp',
 'zhdh_jdbj0_jyrq_month_mean',
 'zhdh_jdbj0_jyrq_month_min',
 'zhdh_jdbj0_jyrq_month_max',
 'zhdh_jdbj0_jyrq_month_ptp',
 'zhdh_jdbj0_jyrq_day_mean',
 'zhdh_jdbj0_jyrq_day_min',
 'zhdh_jdbj0_jyrq_day_max',
 'zhdh_jdbj0_jyrq_day_ptp',
 'zhdh_jdbj0_jyrq_weekofyear_mean',
 'zhdh_jdbj0_jyrq_weekofyear_min',
 'zhdh_jdbj0_jyrq_weekofyear_max',
 'zhdh_jdbj0_jyrq_weekofyear_ptp',
 'zhdh_jdbj0_jyrq_dayofyear_mean',
 'zhdh_jdbj0_jyrq_dayofyear_min

In [10]:
# For every account (zhdh), record the date (jyrq) of its largest transaction (jyje).
# groupby().idxmax() returns the label of the first row holding the group maximum,
# which matches nlargest(1, "jyje") taken per group, without growing a DataFrame
# row by row (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
# Assumes trans_info has a unique index (the default RangeIndex from read_csv) and
# that every account has at least one non-null jyje — TODO confirm.
largest_row_labels = trans_info.groupby("zhdh")["jyje"].idxmax()
df_tmp = (
    trans_info.loc[largest_row_labels, ["zhdh", "jyrq"]]
    .rename(columns={"jyrq": "最大交易金额发生日期"})
    .reset_index(drop=True)
)
df_tz_last_and_biggist = df_tmp.copy()

In [11]:
# Latest transaction date per account, appended as a third column.
df_tmp = trans_info.groupby("zhdh", as_index=False)["jyrq"].max()
df_tz_last_and_biggist = pd.merge(df_tz_last_and_biggist, df_tmp, on="zhdh", how="left")
df_tz_last_and_biggist = df_tz_last_and_biggist.set_axis(
    ['zhdh', '最大交易金额发生日期', '最后交易发生日期'], axis=1
)

In [None]:
# Display the per-account date table (this cell has no recorded output).
df_tz_last_and_biggist

In [29]:
# Empty accumulator for the per-account window aggregates built in the next cell.
df_gogo = pd.DataFrame()

In [30]:
# Per account: sum and count of jyje over the transactions dated strictly after the
# largest-transaction date and strictly before the last-transaction date.
# Accounts with no transaction in that window get no row here (NaN after the
# later left merge).
# Vectorized: the per-account dates are mapped onto the transaction rows once,
# instead of filtering the whole lookup table and concatenating inside a loop
# (both quadratic). Assigning df_gogo outright also makes the cell safe to re-run.
date_lookup = df_tz_last_and_biggist.set_index("zhdh")
row_biggist_date = trans_info["zhdh"].map(date_lookup["最大交易金额发生日期"])
row_lastest_date = trans_info["zhdh"].map(date_lookup["最后交易发生日期"])
in_window = (trans_info["jyrq"] > row_biggist_date) & (trans_info["jyrq"] < row_lastest_date)
df_gogo = (
    trans_info.loc[in_window]
    .groupby("zhdh")["jyje"]
    .agg(['sum', 'count'])
    .reset_index()
)

In [32]:
# Attach the window sum/count columns from df_gogo to the per-account date table.
# NOTE(review): the frame is overwritten, so re-running this cell merges the same
# columns again — confirm it is only run once.
df_tz_last_and_biggist = df_tz_last_and_biggist.merge(df_gogo , on="zhdh" , how="left")

In [36]:
# Preview the per-account table (the recorded output already shows the renamed columns).
df_tz_last_and_biggist.head()

Unnamed: 0,zhdh,最大交易金额发生日期,最后交易发生日期,zdjy_to_zhjy_sum,zdjy_to_zhjy_count
0,00037295453A928A,2020-04-23,2020-05-31,220541.25,55.0
1,0004CC075464D54B,2020-03-03,2020-05-21,69132.0,22.0
2,000AA77144DC1BCC,2020-04-25,2020-05-31,102288.91,263.0
3,001ABBF3373AFC5B,2020-05-17,2020-05-24,3000.0,1.0
4,00310769938BC172,2020-05-12,2020-05-29,964600.0,8.0


In [35]:
# Rename all five columns by position; the last two become zdjy_to_zhjy_sum / zdjy_to_zhjy_count.
# NOTE(review): execution count 35 is lower than the preview cell above (36), i.e. this ran first.
df_tz_last_and_biggist.columns = ['zhdh', '最大交易金额发生日期', '最后交易发生日期', 'zdjy_to_zhjy_sum', 'zdjy_to_zhjy_count']

In [42]:
# Keep only the join key and the two window features; the two date columns are dropped.
df_tz_last_and_biggist = df_tz_last_and_biggist.loc[: , ["zhdh" , "zdjy_to_zhjy_sum" , "zdjy_to_zhjy_count"]]

In [44]:
# Join the window features onto both label frames by account id.
# NOTE(review): not idempotent — a second run would merge the columns again.
train_label = train_label.merge(df_tz_last_and_biggist, on=['zhdh'], how='left')
test_label  = test_label.merge(df_tz_last_and_biggist, on=['zhdh'], how='left')

In [45]:
# Preview the training frame with the zdjy_to_zhjy_* columns appended.
train_label.head()

Unnamed: 0,zhdh,black_flag,zhdh_jdbj0_dfzh_nunique,zhdh_jdbj0_dfzh_count,zhdh_jdbj0_dfhh_nunique,zhdh_jdbj0_jyqd_nunique,zhdh_jdbj0_zydh_nunique,zhdh_jdbj0_jyje_sum,zhdh_jdbj0_jyje_mean,zhdh_jdbj0_jyje_max,zhdh_jdbj0_jyje_min,zhdh_jdbj0_jyje_std,zhdh_jdbj0_jyje_ptp,zhdh_jdbj0_zhye_sum,zhdh_jdbj0_zhye_mean,zhdh_jdbj0_zhye_max,zhdh_jdbj0_zhye_min,zhdh_jdbj0_zhye_std,zhdh_jdbj0_zhye_ptp,zhdh_jdbj0_dfmccd_mean,zhdh_jdbj0_dfmccd_max,zhdh_jdbj0_dfmccd_min,zhdh_jdbj0_dfmccd_std,zhdh_jdbj0_dfmccd_ptp,zhdh_jdbj0_jyrq_month_mean,zhdh_jdbj0_jyrq_month_min,zhdh_jdbj0_jyrq_month_max,zhdh_jdbj0_jyrq_month_ptp,zhdh_jdbj0_jyrq_day_mean,zhdh_jdbj0_jyrq_day_min,zhdh_jdbj0_jyrq_day_max,zhdh_jdbj0_jyrq_day_ptp,zhdh_jdbj0_jyrq_weekofyear_mean,zhdh_jdbj0_jyrq_weekofyear_min,zhdh_jdbj0_jyrq_weekofyear_max,zhdh_jdbj0_jyrq_weekofyear_ptp,zhdh_jdbj0_jyrq_dayofyear_mean,zhdh_jdbj0_jyrq_dayofyear_min,zhdh_jdbj0_jyrq_dayofyear_max,zhdh_jdbj0_jyrq_dayofyear_ptp,zhdh_jdbj0_jyrq_dayofweek_mean,zhdh_jdbj0_jyrq_dayofweek_min,zhdh_jdbj0_jyrq_dayofweek_max,zhdh_jdbj0_jyrq_dayofweek_ptp,zhdh_jdbj0_jyrq_is_wknd_mean,zhdh_jdbj0_jyrq_is_wknd_min,zhdh_jdbj0_jyrq_is_wknd_max,zhdh_jdbj0_jyrq_is_wknd_ptp,zhdh_jdbj0_jyrq_is_month_start_mean,zhdh_jdbj0_jyrq_is_month_start_min,zhdh_jdbj0_jyrq_is_month_start_max,zhdh_jdbj0_jyrq_is_month_start_ptp,zhdh_jdbj0_jyrq_is_month_end_mean,zhdh_jdbj0_jyrq_is_month_end_min,zhdh_jdbj0_jyrq_is_month_end_max,zhdh_jdbj0_jyrq_is_month_end_ptp,zhdh_jdbj0_jyrq_hour_mean,zhdh_jdbj0_jyrq_hour_min,zhdh_jdbj0_jyrq_hour_max,zhdh_jdbj0_jyrq_hour_ptp,zhdh_jdbj0_jyrq_minu_mean,zhdh_jdbj0_jyrq_minu_min,zhdh_jdbj0_jyrq_minu_max,zhdh_jdbj0_jyrq_minu_ptp,zhdh_jdbj0_jyrq_date_mean,zhdh_jdbj0_jyrq_date_min,zhdh_jdbj0_jyrq_date_max,zhdh_jdbj0_jyrq_date_ptp,zhdh_jdbj1_dfzh_nunique,zhdh_jdbj1_dfzh_count,zhdh_jdbj1_dfhh_nunique,zhdh_jdbj1_jyqd_nunique,zhdh_jdbj1_zydh_nunique,zhdh_jdbj1_jyje_sum,zhdh_jdbj1_jyje_mean,zhdh_jdbj1_jyje_max,zhdh_jdbj1_jyje_min,zhdh_jdbj1_jyje_std,zhdh_jdbj1_jyje_ptp,zhdh
_jdbj1_zhye_sum,zhdh_jdbj1_zhye_mean,zhdh_jdbj1_zhye_max,zhdh_jdbj1_zhye_min,zhdh_jdbj1_zhye_std,zhdh_jdbj1_zhye_ptp,zhdh_jdbj1_dfmccd_mean,zhdh_jdbj1_dfmccd_max,zhdh_jdbj1_dfmccd_min,zhdh_jdbj1_dfmccd_std,zhdh_jdbj1_dfmccd_ptp,zhdh_jdbj1_jyrq_month_mean,zhdh_jdbj1_jyrq_month_min,zhdh_jdbj1_jyrq_month_max,zhdh_jdbj1_jyrq_month_ptp,zhdh_jdbj1_jyrq_day_mean,zhdh_jdbj1_jyrq_day_min,zhdh_jdbj1_jyrq_day_max,zhdh_jdbj1_jyrq_day_ptp,zhdh_jdbj1_jyrq_weekofyear_mean,zhdh_jdbj1_jyrq_weekofyear_min,zhdh_jdbj1_jyrq_weekofyear_max,zhdh_jdbj1_jyrq_weekofyear_ptp,zhdh_jdbj1_jyrq_dayofyear_mean,zhdh_jdbj1_jyrq_dayofyear_min,zhdh_jdbj1_jyrq_dayofyear_max,zhdh_jdbj1_jyrq_dayofyear_ptp,zhdh_jdbj1_jyrq_dayofweek_mean,zhdh_jdbj1_jyrq_dayofweek_min,zhdh_jdbj1_jyrq_dayofweek_max,zhdh_jdbj1_jyrq_dayofweek_ptp,zhdh_jdbj1_jyrq_is_wknd_mean,zhdh_jdbj1_jyrq_is_wknd_min,zhdh_jdbj1_jyrq_is_wknd_max,zhdh_jdbj1_jyrq_is_wknd_ptp,zhdh_jdbj1_jyrq_is_month_start_mean,zhdh_jdbj1_jyrq_is_month_start_min,zhdh_jdbj1_jyrq_is_month_start_max,zhdh_jdbj1_jyrq_is_month_start_ptp,zhdh_jdbj1_jyrq_is_month_end_mean,zhdh_jdbj1_jyrq_is_month_end_min,zhdh_jdbj1_jyrq_is_month_end_max,zhdh_jdbj1_jyrq_is_month_end_ptp,zhdh_jdbj1_jyrq_hour_mean,zhdh_jdbj1_jyrq_hour_min,zhdh_jdbj1_jyrq_hour_max,zhdh_jdbj1_jyrq_hour_ptp,zhdh_jdbj1_jyrq_minu_mean,zhdh_jdbj1_jyrq_minu_min,zhdh_jdbj1_jyrq_minu_max,zhdh_jdbj1_jyrq_minu_ptp,zhdh_jdbj1_jyrq_date_mean,zhdh_jdbj1_jyrq_date_min,zhdh_jdbj1_jyrq_date_max,zhdh_jdbj1_jyrq_date_ptp,year,month,day,khjgdh,xb,年龄,zdjy_to_zhjy_sum,zdjy_to_zhjy_count
0,2029FF26D4E2CA79,0,2,15,2,2,2,212093.3,14139.55,116724.0,500.0,28937.85,116224.0,9201259.08,613417.27,735585.39,418851.49,117111.85,316733.9,4.8,6,0,2.48,6,4.27,3,5,2,13.67,1,29,28,16.8,11,22,11,112.53,71,149,78,2.93,0,6,6,0.2,0,1,1,0.07,0,1,1,0.0,0,0,0,12.6,3,21,18,29.4,2,56,54,785.4,233,1314,1081,2.0,13.0,2.0,2.0,2.0,566358.37,43566.03,120000.0,1288.14,48893.58,118711.86,7668300.93,589869.3,755477.54,365859.88,143758.73,389617.66,4.62,6.0,0.0,2.63,6.0,4.08,3.0,5.0,2.0,7.54,1.0,26.0,25.0,15.0,9.0,22.0,13.0,100.54,61.0,147.0,86.0,3.54,0.0,6.0,6.0,0.31,0.0,1.0,1.0,0.31,0.0,1.0,1.0,0.0,0.0,0.0,0.0,12.08,0.0,20.0,20.0,36.0,4.0,53.0,49.0,760.62,34.0,1204.0,1170.0,2018,3,28,42,1,40,431956.3,15.0
1,09F5B90D46FB7CCE,1,12,31,11,5,7,634809.54,20477.73,100000.0,-100000.0,38779.84,200000.0,362731.96,11701.03,116299.44,0.19,25148.07,116299.25,10.26,45,0,8.82,45,4.23,3,5,2,18.45,1,30,29,17.23,12,20,8,116.16,77,137,60,3.58,1,6,5,0.16,0,1,1,0.16,0,1,1,0.26,0,1,1,11.29,3,16,13,33.0,0,59,59,710.42,185,978,793,84.0,98.0,17.0,4.0,4.0,638908.0,6519.47,52500.0,100.0,9018.45,52400.0,4819268.57,49176.21,173292.44,340.44,45662.47,172952.0,9.7,33.0,6.0,5.55,27.0,4.27,3.0,5.0,2.0,18.06,1.0,30.0,29.0,17.38,12.0,18.0,6.0,117.01,80.0,122.0,42.0,3.37,2.0,5.0,3.0,0.18,0.0,1.0,1.0,0.28,0.0,1.0,1.0,0.2,0.0,1.0,1.0,12.1,9.0,16.0,7.0,26.93,0.0,57.0,57.0,753.05,542.0,976.0,434.0,2018,2,27,155,0,39,1028825.29,83.0
2,E0C880EB18F4EFE8,1,7,23,6,5,5,1083609.1,47113.44,210000.0,0.1,66230.8,209999.9,3705.62,161.11,658.5,0.91,245.15,657.59,25.43,30,9,7.72,21,4.87,3,5,2,11.39,9,19,10,19.3,12,20,8,128.43,78,132,54,1.3,0,6,6,0.22,0,1,1,0.0,0,0,0,0.0,0,0,0,9.13,0,18,18,29.09,1,55,54,576.91,19,1111,1092,13.0,16.0,9.0,2.0,2.0,1082972.0,67685.75,200000.0,1.0,61249.93,199999.0,1234410.56,77150.66,210062.91,22.91,70690.05,210040.0,11.44,33.0,6.0,8.5,27.0,5.0,5.0,5.0,0.0,11.06,9.0,14.0,5.0,19.94,19.0,20.0,1.0,132.06,130.0,135.0,5.0,0.5,0.0,5.0,5.0,0.06,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.88,9.0,18.0,9.0,30.06,0.0,59.0,59.0,622.56,545.0,1111.0,566.0,2019,5,25,374,0,24,,
3,ED8FFDCCF93C2F11,1,4,9,2,2,5,36105.5,4011.72,24400.0,-2.0,7913.63,24402.0,3439.75,382.19,3140.75,4.75,1035.57,3136.0,12.0,45,0,18.91,45,4.67,4,5,1,12.11,3,30,27,18.11,18,19,1,123.11,121,125,4,4.33,0,6,6,0.56,0,1,1,0.0,0,0,0,0.33,0,1,1,15.89,0,22,22,14.0,0,53,53,967.33,3,1339,1336,28.0,36.0,11.0,3.0,3.0,36237.0,1006.58,6000.0,10.0,1459.22,5990.0,347005.5,9639.04,24431.75,10.25,7753.12,24421.5,8.25,12.0,6.0,1.66,6.0,4.97,4.0,5.0,1.0,3.78,3.0,30.0,27.0,18.03,18.0,19.0,1.0,123.94,121.0,125.0,4.0,5.75,0.0,6.0,6.0,0.94,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.03,0.0,1.0,1.0,21.39,10.0,23.0,13.0,39.06,5.0,58.0,53.0,1322.39,654.0,1391.0,737.0,2017,1,17,0,0,30,,
4,8D2F48B0BFC40AB2,1,1,54,2,2,5,117023.0,2167.09,10000.0,-5000.0,2715.06,15000.0,202576.0,3751.41,20074.0,0.0,5323.44,20074.0,0.44,6,0,1.59,6,5.0,5,5,0,23.48,20,26,6,21.3,21,22,1,144.48,141,147,6,3.41,0,6,6,0.43,0,1,1,0.0,0,0,0,0.0,0,0,0,3.44,0,23,23,26.0,2,49,47,232.67,2,1429,1427,18.0,23.0,8.0,2.0,2.0,117126.0,5092.43,20000.0,10.0,5064.12,19990.0,142090.5,6177.85,20074.0,10.0,5670.51,20064.0,13.7,45.0,6.0,14.75,39.0,5.0,5.0,5.0,0.0,23.57,20.0,27.0,7.0,21.43,21.0,22.0,1.0,144.57,141.0,148.0,7.0,2.52,0.0,6.0,6.0,0.17,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.39,0.0,5.0,5.0,23.96,7.0,50.0,43.0,167.43,9.0,341.0,332.0,2020,5,13,60,0,30,118356.5,38.0


In [46]:
def _take_rows(values, positions):
    """Select rows by integer position from a pandas object or any array-like."""
    if hasattr(values, "iloc"):
        return values.iloc[positions]
    return np.asarray(values)[positions]


def cv_model(clf, train_x, train_y, test_x, clf_name, folds=5, seed=2023):
    """Train a gradient-boosting model with K-fold CV and average its test predictions.

    Parameters
    ----------
    clf : module or class
        Used as ``clf.Dataset`` / ``clf.train`` for ``"lgb"``, as
        ``clf.DMatrix`` / ``clf.train`` for ``"xgb"``, and called as a
        constructor for ``"cat"``.
    train_x : pandas.DataFrame
        Training features.
    train_y : pandas.Series or array-like
        Binary target aligned row-by-row with ``train_x``.
    test_x : pandas.DataFrame
        Features to predict, with the same columns as ``train_x``.
    clf_name : {"lgb", "xgb", "cat"}
        Selects which training branch is used.
    folds : int, default 5
        Number of K-fold splits.
    seed : int, default 2023
        Random state for the shuffled K-fold split.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold predictions for ``train_x`` and the mean of the per-fold
        predictions for ``test_x``.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported names.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        # Fail early instead of hitting an unbound ``val_pred`` after the first fold.
        raise ValueError("clf_name must be 'lgb', 'xgb' or 'cat', got {!r}".format(clf_name))

    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    oof = np.zeros(train_x.shape[0])
    predict = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        # KFold yields positions, so index the target by position as well;
        # label-based ``train_y[idx]`` is only correct for a default RangeIndex.
        trn_y, val_y = _take_rows(train_y, train_index), _take_rows(train_y, valid_index)

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.01,
                'seed': 2020,
                'n_jobs':8
            }

            # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
            # older lightgbm.train versions; newer releases use callbacks — confirm the
            # installed version.
            model = clf.train(params, train_matrix, 10000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[], verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

            # Top-20 features by gain; names come from the training frame
            # (the previously referenced global ``features`` is not defined).
            print(list(sorted(zip(train_x.columns, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)

            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.05,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 8
                      }

            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=10000, evals=watchlist, verbose_eval=1000, early_stopping_rounds=500)
            # NOTE(review): ntree_limit / best_ntree_limit belong to the XGBoost 1.x API;
            # newer releases use iteration_range — confirm the installed version.
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat"
            model = clf(
                        n_estimators=10000,
                        random_seed=1024,
                        eval_metric='AUC',
                        learning_rate=0.05,
                        max_depth=5,
                        early_stopping_rounds=200,
                        metric_period=500,
                    )

            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      use_best_model=True,
                      verbose=1)

            val_pred  = model.predict_proba(val_x)[:,1]
            test_pred = model.predict_proba(test_x)[:,1]

        oof[valid_index] = val_pred
        # Average the test predictions over the folds.
        predict += test_pred / kf.n_splits

        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    return oof, predict

In [47]:
# Final look at the training frame before modelling (same preview as shown after the window-feature merge).
train_label.head()

Unnamed: 0,zhdh,black_flag,zhdh_jdbj0_dfzh_nunique,zhdh_jdbj0_dfzh_count,zhdh_jdbj0_dfhh_nunique,zhdh_jdbj0_jyqd_nunique,zhdh_jdbj0_zydh_nunique,zhdh_jdbj0_jyje_sum,zhdh_jdbj0_jyje_mean,zhdh_jdbj0_jyje_max,zhdh_jdbj0_jyje_min,zhdh_jdbj0_jyje_std,zhdh_jdbj0_jyje_ptp,zhdh_jdbj0_zhye_sum,zhdh_jdbj0_zhye_mean,zhdh_jdbj0_zhye_max,zhdh_jdbj0_zhye_min,zhdh_jdbj0_zhye_std,zhdh_jdbj0_zhye_ptp,zhdh_jdbj0_dfmccd_mean,zhdh_jdbj0_dfmccd_max,zhdh_jdbj0_dfmccd_min,zhdh_jdbj0_dfmccd_std,zhdh_jdbj0_dfmccd_ptp,zhdh_jdbj0_jyrq_month_mean,zhdh_jdbj0_jyrq_month_min,zhdh_jdbj0_jyrq_month_max,zhdh_jdbj0_jyrq_month_ptp,zhdh_jdbj0_jyrq_day_mean,zhdh_jdbj0_jyrq_day_min,zhdh_jdbj0_jyrq_day_max,zhdh_jdbj0_jyrq_day_ptp,zhdh_jdbj0_jyrq_weekofyear_mean,zhdh_jdbj0_jyrq_weekofyear_min,zhdh_jdbj0_jyrq_weekofyear_max,zhdh_jdbj0_jyrq_weekofyear_ptp,zhdh_jdbj0_jyrq_dayofyear_mean,zhdh_jdbj0_jyrq_dayofyear_min,zhdh_jdbj0_jyrq_dayofyear_max,zhdh_jdbj0_jyrq_dayofyear_ptp,zhdh_jdbj0_jyrq_dayofweek_mean,zhdh_jdbj0_jyrq_dayofweek_min,zhdh_jdbj0_jyrq_dayofweek_max,zhdh_jdbj0_jyrq_dayofweek_ptp,zhdh_jdbj0_jyrq_is_wknd_mean,zhdh_jdbj0_jyrq_is_wknd_min,zhdh_jdbj0_jyrq_is_wknd_max,zhdh_jdbj0_jyrq_is_wknd_ptp,zhdh_jdbj0_jyrq_is_month_start_mean,zhdh_jdbj0_jyrq_is_month_start_min,zhdh_jdbj0_jyrq_is_month_start_max,zhdh_jdbj0_jyrq_is_month_start_ptp,zhdh_jdbj0_jyrq_is_month_end_mean,zhdh_jdbj0_jyrq_is_month_end_min,zhdh_jdbj0_jyrq_is_month_end_max,zhdh_jdbj0_jyrq_is_month_end_ptp,zhdh_jdbj0_jyrq_hour_mean,zhdh_jdbj0_jyrq_hour_min,zhdh_jdbj0_jyrq_hour_max,zhdh_jdbj0_jyrq_hour_ptp,zhdh_jdbj0_jyrq_minu_mean,zhdh_jdbj0_jyrq_minu_min,zhdh_jdbj0_jyrq_minu_max,zhdh_jdbj0_jyrq_minu_ptp,zhdh_jdbj0_jyrq_date_mean,zhdh_jdbj0_jyrq_date_min,zhdh_jdbj0_jyrq_date_max,zhdh_jdbj0_jyrq_date_ptp,zhdh_jdbj1_dfzh_nunique,zhdh_jdbj1_dfzh_count,zhdh_jdbj1_dfhh_nunique,zhdh_jdbj1_jyqd_nunique,zhdh_jdbj1_zydh_nunique,zhdh_jdbj1_jyje_sum,zhdh_jdbj1_jyje_mean,zhdh_jdbj1_jyje_max,zhdh_jdbj1_jyje_min,zhdh_jdbj1_jyje_std,zhdh_jdbj1_jyje_ptp,zhdh
_jdbj1_zhye_sum,zhdh_jdbj1_zhye_mean,zhdh_jdbj1_zhye_max,zhdh_jdbj1_zhye_min,zhdh_jdbj1_zhye_std,zhdh_jdbj1_zhye_ptp,zhdh_jdbj1_dfmccd_mean,zhdh_jdbj1_dfmccd_max,zhdh_jdbj1_dfmccd_min,zhdh_jdbj1_dfmccd_std,zhdh_jdbj1_dfmccd_ptp,zhdh_jdbj1_jyrq_month_mean,zhdh_jdbj1_jyrq_month_min,zhdh_jdbj1_jyrq_month_max,zhdh_jdbj1_jyrq_month_ptp,zhdh_jdbj1_jyrq_day_mean,zhdh_jdbj1_jyrq_day_min,zhdh_jdbj1_jyrq_day_max,zhdh_jdbj1_jyrq_day_ptp,zhdh_jdbj1_jyrq_weekofyear_mean,zhdh_jdbj1_jyrq_weekofyear_min,zhdh_jdbj1_jyrq_weekofyear_max,zhdh_jdbj1_jyrq_weekofyear_ptp,zhdh_jdbj1_jyrq_dayofyear_mean,zhdh_jdbj1_jyrq_dayofyear_min,zhdh_jdbj1_jyrq_dayofyear_max,zhdh_jdbj1_jyrq_dayofyear_ptp,zhdh_jdbj1_jyrq_dayofweek_mean,zhdh_jdbj1_jyrq_dayofweek_min,zhdh_jdbj1_jyrq_dayofweek_max,zhdh_jdbj1_jyrq_dayofweek_ptp,zhdh_jdbj1_jyrq_is_wknd_mean,zhdh_jdbj1_jyrq_is_wknd_min,zhdh_jdbj1_jyrq_is_wknd_max,zhdh_jdbj1_jyrq_is_wknd_ptp,zhdh_jdbj1_jyrq_is_month_start_mean,zhdh_jdbj1_jyrq_is_month_start_min,zhdh_jdbj1_jyrq_is_month_start_max,zhdh_jdbj1_jyrq_is_month_start_ptp,zhdh_jdbj1_jyrq_is_month_end_mean,zhdh_jdbj1_jyrq_is_month_end_min,zhdh_jdbj1_jyrq_is_month_end_max,zhdh_jdbj1_jyrq_is_month_end_ptp,zhdh_jdbj1_jyrq_hour_mean,zhdh_jdbj1_jyrq_hour_min,zhdh_jdbj1_jyrq_hour_max,zhdh_jdbj1_jyrq_hour_ptp,zhdh_jdbj1_jyrq_minu_mean,zhdh_jdbj1_jyrq_minu_min,zhdh_jdbj1_jyrq_minu_max,zhdh_jdbj1_jyrq_minu_ptp,zhdh_jdbj1_jyrq_date_mean,zhdh_jdbj1_jyrq_date_min,zhdh_jdbj1_jyrq_date_max,zhdh_jdbj1_jyrq_date_ptp,year,month,day,khjgdh,xb,年龄,zdjy_to_zhjy_sum,zdjy_to_zhjy_count
0,2029FF26D4E2CA79,0,2,15,2,2,2,212093.3,14139.55,116724.0,500.0,28937.85,116224.0,9201259.08,613417.27,735585.39,418851.49,117111.85,316733.9,4.8,6,0,2.48,6,4.27,3,5,2,13.67,1,29,28,16.8,11,22,11,112.53,71,149,78,2.93,0,6,6,0.2,0,1,1,0.07,0,1,1,0.0,0,0,0,12.6,3,21,18,29.4,2,56,54,785.4,233,1314,1081,2.0,13.0,2.0,2.0,2.0,566358.37,43566.03,120000.0,1288.14,48893.58,118711.86,7668300.93,589869.3,755477.54,365859.88,143758.73,389617.66,4.62,6.0,0.0,2.63,6.0,4.08,3.0,5.0,2.0,7.54,1.0,26.0,25.0,15.0,9.0,22.0,13.0,100.54,61.0,147.0,86.0,3.54,0.0,6.0,6.0,0.31,0.0,1.0,1.0,0.31,0.0,1.0,1.0,0.0,0.0,0.0,0.0,12.08,0.0,20.0,20.0,36.0,4.0,53.0,49.0,760.62,34.0,1204.0,1170.0,2018,3,28,42,1,40,431956.3,15.0
1,09F5B90D46FB7CCE,1,12,31,11,5,7,634809.54,20477.73,100000.0,-100000.0,38779.84,200000.0,362731.96,11701.03,116299.44,0.19,25148.07,116299.25,10.26,45,0,8.82,45,4.23,3,5,2,18.45,1,30,29,17.23,12,20,8,116.16,77,137,60,3.58,1,6,5,0.16,0,1,1,0.16,0,1,1,0.26,0,1,1,11.29,3,16,13,33.0,0,59,59,710.42,185,978,793,84.0,98.0,17.0,4.0,4.0,638908.0,6519.47,52500.0,100.0,9018.45,52400.0,4819268.57,49176.21,173292.44,340.44,45662.47,172952.0,9.7,33.0,6.0,5.55,27.0,4.27,3.0,5.0,2.0,18.06,1.0,30.0,29.0,17.38,12.0,18.0,6.0,117.01,80.0,122.0,42.0,3.37,2.0,5.0,3.0,0.18,0.0,1.0,1.0,0.28,0.0,1.0,1.0,0.2,0.0,1.0,1.0,12.1,9.0,16.0,7.0,26.93,0.0,57.0,57.0,753.05,542.0,976.0,434.0,2018,2,27,155,0,39,1028825.29,83.0
2,E0C880EB18F4EFE8,1,7,23,6,5,5,1083609.1,47113.44,210000.0,0.1,66230.8,209999.9,3705.62,161.11,658.5,0.91,245.15,657.59,25.43,30,9,7.72,21,4.87,3,5,2,11.39,9,19,10,19.3,12,20,8,128.43,78,132,54,1.3,0,6,6,0.22,0,1,1,0.0,0,0,0,0.0,0,0,0,9.13,0,18,18,29.09,1,55,54,576.91,19,1111,1092,13.0,16.0,9.0,2.0,2.0,1082972.0,67685.75,200000.0,1.0,61249.93,199999.0,1234410.56,77150.66,210062.91,22.91,70690.05,210040.0,11.44,33.0,6.0,8.5,27.0,5.0,5.0,5.0,0.0,11.06,9.0,14.0,5.0,19.94,19.0,20.0,1.0,132.06,130.0,135.0,5.0,0.5,0.0,5.0,5.0,0.06,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.88,9.0,18.0,9.0,30.06,0.0,59.0,59.0,622.56,545.0,1111.0,566.0,2019,5,25,374,0,24,,
3,ED8FFDCCF93C2F11,1,4,9,2,2,5,36105.5,4011.72,24400.0,-2.0,7913.63,24402.0,3439.75,382.19,3140.75,4.75,1035.57,3136.0,12.0,45,0,18.91,45,4.67,4,5,1,12.11,3,30,27,18.11,18,19,1,123.11,121,125,4,4.33,0,6,6,0.56,0,1,1,0.0,0,0,0,0.33,0,1,1,15.89,0,22,22,14.0,0,53,53,967.33,3,1339,1336,28.0,36.0,11.0,3.0,3.0,36237.0,1006.58,6000.0,10.0,1459.22,5990.0,347005.5,9639.04,24431.75,10.25,7753.12,24421.5,8.25,12.0,6.0,1.66,6.0,4.97,4.0,5.0,1.0,3.78,3.0,30.0,27.0,18.03,18.0,19.0,1.0,123.94,121.0,125.0,4.0,5.75,0.0,6.0,6.0,0.94,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.03,0.0,1.0,1.0,21.39,10.0,23.0,13.0,39.06,5.0,58.0,53.0,1322.39,654.0,1391.0,737.0,2017,1,17,0,0,30,,
4,8D2F48B0BFC40AB2,1,1,54,2,2,5,117023.0,2167.09,10000.0,-5000.0,2715.06,15000.0,202576.0,3751.41,20074.0,0.0,5323.44,20074.0,0.44,6,0,1.59,6,5.0,5,5,0,23.48,20,26,6,21.3,21,22,1,144.48,141,147,6,3.41,0,6,6,0.43,0,1,1,0.0,0,0,0,0.0,0,0,0,3.44,0,23,23,26.0,2,49,47,232.67,2,1429,1427,18.0,23.0,8.0,2.0,2.0,117126.0,5092.43,20000.0,10.0,5064.12,19990.0,142090.5,6177.85,20074.0,10.0,5670.51,20064.0,13.7,45.0,6.0,14.75,39.0,5.0,5.0,5.0,0.0,23.57,20.0,27.0,7.0,21.43,21.0,22.0,1.0,144.57,141.0,148.0,7.0,2.52,0.0,6.0,6.0,0.17,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.39,0.0,5.0,5.0,23.96,7.0,50.0,43.0,167.43,9.0,341.0,332.0,2020,5,13,60,0,30,118356.5,38.0


In [50]:
# Train XGBoost through cv_model (defined earlier in the notebook).
# The two results are consumed below: xgb_oof is scored against the training
# labels, xgb_pred is thresholded into the test-set submission.
xgb_oof, xgb_pred = cv_model(
    xgb,
    train_label[cols],
    train_label['black_flag'],
    test_label[cols],
    'xgb',
)

************************************ 1 ************************************
[0]	train-auc:0.87879	eval-auc:0.89954
[652]	train-auc:0.99968	eval-auc:0.96519
[0.9702777777777777]
************************************ 2 ************************************
[0]	train-auc:0.90186	eval-auc:0.85072
[751]	train-auc:0.99977	eval-auc:0.95397
[0.9702777777777777, 0.9558716926422617]
************************************ 3 ************************************
[0]	train-auc:0.89520	eval-auc:0.84762
[950]	train-auc:0.99989	eval-auc:0.95695
[0.9702777777777777, 0.9558716926422617, 0.9581326781326781]
************************************ 4 ************************************
[0]	train-auc:0.90822	eval-auc:0.91745
[596]	train-auc:0.99962	eval-auc:0.96180
[0.9702777777777777, 0.9558716926422617, 0.9581326781326781, 0.9702268854811227]
************************************ 5 ************************************
[0]	train-auc:0.93628	eval-auc:0.89014
[615]	train-auc:0.99980	eval-auc:0.95194
[0.9702777777777

In [51]:
# Pick the decision threshold that maximises macro-F1 of the XGBoost
# predictions stored in xgb_oof against the training labels.
oof = xgb_oof

# Search grid: 0.40, 0.41, ..., 0.59 (20 points). np.linspace replaces
# np.arange(0.4, 0.6, 0.01): with a float step, arange's element count and
# whether the end point is included depend on rounding error.
THRESHOLD_MIN, THRESHOLD_MAX, N_THRESHOLDS = 0.40, 0.59, 20

# Loop-invariant arrays: flatten once instead of on every iteration.
oof_flat = oof.reshape(-1)
y_true = train_label['black_flag'].values.reshape(-1)

scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.linspace(THRESHOLD_MIN, THRESHOLD_MAX, N_THRESHOLDS):
    preds = (oof_flat > threshold).astype('int')
    m = f1_score(y_true, preds, average='macro')
    scores.append(m)
    thresholds.append(threshold)
    # Strict '>' keeps the lowest threshold when several tie for the best score.
    if m > best_score:
        best_score = m
        best_threshold = threshold
    print(f'{threshold:.02f}, {m}')
print(f'{best_threshold:.02f}, {best_score}')

# History of earlier runs, in the same "best_threshold, best_score" format as
# the final print above. The trailing number is presumably the score obtained
# after submitting - TODO confirm.
# NOTE(review): several past optima sit at 0.40, the lower edge of the grid;
# consider lowering THRESHOLD_MIN so the search is not clipped.
# 0.47, 0.9150898680694286 # 0.86579572447
# 0.43, 0.9217716422203048 # 0.86697783
# 0.41, 0.9198568108353592 # 0.87674418605
# 0.40, 0.9231997065541027 # 0.87819025522
# 0.42, 0.913822737200522  # 0.87639132982 
# 0.40, 0.9148403872302214 # 0.88313184

0.40, 0.9022356953391437
0.41, 0.9032587572164699
0.42, 0.90589279429542
0.43, 0.907966637906241
0.44, 0.9077434036533613
0.45, 0.9075183710902428
0.46, 0.9070628371392393
0.47, 0.9091614906832297
0.48, 0.9091614906832297
0.49, 0.9078823731755026
0.50, 0.9063655799872032
0.51, 0.9063655799872032
0.52, 0.9071853971691546
0.53, 0.9082439299830604
0.54, 0.9056511056511056
0.55, 0.9043494433668996
0.56, 0.9054090271318441
0.57, 0.907535829865927
0.58, 0.907535829865927
0.59, 0.9049188640973631
0.47, 0.9091614906832297


In [52]:
# Turn the XGBoost test scores into 0/1 labels using the threshold selected
# in the previous cell, and store them on the test frame.
pred = xgb_pred
test_label['black_flag'] = np.greater(pred.reshape(-1), best_threshold).astype('int')

In [53]:
# Write the submission file: the zhdh column plus the predicted flag, without the index.
test_label.to_csv('submission.csv', columns=['zhdh', 'black_flag'], index=False)

In [54]:
# Sanity check: share of positive flags predicted on the test set vs. the
# share of positive labels in the training set (test first, train second).
tuple(frame['black_flag'].mean() for frame in (test_label, train_label))

(0.22145833333333334, 0.25)