In [12]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from gensim.models import Word2Vec
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, mean_squared_log_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
import tqdm
import sys
import os
import gc
import argparse
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
warnings.filterwarnings('ignore')

In [13]:
trans_info = pd.read_csv('./data/账户交易信息.csv')
static_info = pd.read_csv('./data/账户静态信息.csv')

train_label = pd.read_csv('./data/训练集标签.csv')
test_label = pd.read_csv('./data/test_dataset.csv')

In [14]:
def get_time_feature(df, col):
    
    df_copy = df.copy()
    prefix = col + "_"
    df_copy['new_'+col] = df_copy[col].astype(str)
    
    col = 'new_'+col
    df_copy[col] = pd.to_datetime(df_copy[col], format='%Y-%m-%d')
    df_copy[prefix + 'month'] = df_copy[col].dt.month
    df_copy[prefix + 'day'] = df_copy[col].dt.day
    df_copy[prefix + 'weekofyear'] = df_copy[col].dt.weekofyear
    df_copy[prefix + 'dayofyear'] = df_copy[col].dt.dayofyear
    df_copy[prefix + 'dayofweek'] = df_copy[col].dt.dayofweek
    df_copy[prefix + 'is_wknd'] = df_copy[col].dt.dayofweek // 6
    df_copy[prefix + 'is_month_start'] = df_copy[col].dt.is_month_start.astype(int)
    df_copy[prefix + 'is_month_end'] = df_copy[col].dt.is_month_end.astype(int)
    del df_copy[col]
    
    df_copy[prefix + 'hour'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[0]))
    df_copy[prefix + 'minu'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[1]))
    df_copy[prefix + 'date'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[0])*60 + int(x.split(':')[1]))
    
    return df_copy   

trans_info = get_time_feature(trans_info, "jyrq")
time_cols = [f for f in trans_info.columns if 'jyrq_' in f]
print(time_cols)
print(train_label.shape, test_label.shape)

['jyrq_month', 'jyrq_day', 'jyrq_weekofyear', 'jyrq_dayofyear', 'jyrq_dayofweek', 'jyrq_is_wknd', 'jyrq_is_month_start', 'jyrq_is_month_end', 'jyrq_hour', 'jyrq_minu', 'jyrq_date']
(1200, 2) (4800, 1)


In [15]:
%%time
def get_base_feat(df1_, df2_):
    df1 = df1_.copy() # 构建特征数据 
    df2 = df2_.copy()

    agg_func = {
        'dfzh': ['nunique','count'],
        'dfhh': ['nunique'],
        'jyqd': ['nunique'],
        'zydh': ['nunique'],
        'jyje': ['sum','mean','max','min','std',np.ptp],
        'zhye': ['sum','mean','max','min','std',np.ptp],
        'dfmccd': ['mean','max','min','std',np.ptp],
    }
    
    for col in time_cols:
        agg_func[col] = ['mean','min','max',np.ptp]
    
    agg_df = df1[df1['jdbj']==0].groupby(['zhdh']).agg(agg_func).reset_index()
    agg_df.columns = ['zhdh'] + ['zhdh_jdbj0_' + '_'.join(f).strip() for f in agg_df.columns.values if f[0] not in ['zhdh']]
    df2 = df2.merge(agg_df, on=['zhdh'], how='left')
    
    agg_df = df1[df1['jdbj']==1].groupby(['zhdh']).agg(agg_func).reset_index()
    agg_df.columns = ['zhdh'] + ['zhdh_jdbj1_' + '_'.join(f).strip() for f in agg_df.columns.values if f[0] not in ['zhdh']]
    df2 = df2.merge(agg_df, on=['zhdh'], how='left')
    
    return df2

train_label = get_base_feat(trans_info, train_label)
test_label = get_base_feat(trans_info, test_label)

Wall time: 56 s


In [16]:
%%time
# 合并账户静态信息
static_info['khrq']  = pd.to_datetime(static_info['khrq'], format='%Y-%m-%d')
static_info['year']  = static_info['khrq'].dt.year
static_info['month'] = static_info['khrq'].dt.month
static_info['day']   = static_info['khrq'].dt.day

# 自然数编码
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))

for col in ['khjgdh']:
    static_info[col] = label_encode(static_info[col])

keep_cols = ['zhdh','year','month','day','khjgdh','xb','年龄']

train_label = train_label.merge(static_info[keep_cols], on=['zhdh'], how='left')
test_label  = test_label.merge(static_info[keep_cols], on=['zhdh'], how='left')

Wall time: 33.9 ms


In [17]:
cols = [f for f in train_label.columns if f not in ['zhdh','black_flag']]
len(cols)

138

In [18]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2023
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    
    oof = np.zeros(train_x.shape[0])
    predict = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.01,
                'seed': 2020,
                'n_jobs':8
            }

            model = clf.train(params, train_matrix, 10000, valid_sets=[train_matrix, valid_matrix], 
                              categorical_feature=[], verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.05,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 8
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=10000, evals=watchlist, verbose_eval=1000, early_stopping_rounds=500)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            
            model = clf(
                        n_estimators=10000,
                        random_seed=1024,
                        eval_metric='AUC',
                        learning_rate=0.05,
                        max_depth=5,
                        early_stopping_rounds=200,
                        metric_period=500,
                    )

            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      use_best_model=True,
                      verbose=1)
            
            val_pred  = model.predict_proba(val_x)[:,1]
            test_pred = model.predict_proba(test_x)[:,1]
            
        oof[valid_index] = val_pred
        predict += test_pred / kf.n_splits
        
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
       
    return oof, predict

In [19]:
xgb_oof, xgb_pred = cv_model(xgb, train_label[cols], train_label['black_flag'], test_label[cols], 'xgb')

************************************ 1 ************************************
[0]	train-auc:0.87682	eval-auc:0.86880
[532]	train-auc:0.99962	eval-auc:0.95898
[0.9649074074074074]
************************************ 2 ************************************
[0]	train-auc:0.91329	eval-auc:0.87092
[709]	train-auc:0.99966	eval-auc:0.95632
[0.9649074074074074, 0.9591337441101848]
************************************ 3 ************************************
[0]	train-auc:0.91741	eval-auc:0.84973
[793]	train-auc:0.99981	eval-auc:0.94870
[0.9649074074074074, 0.9591337441101848, 0.9510565110565111]
************************************ 4 ************************************
[0]	train-auc:0.91198	eval-auc:0.90292
[610]	train-auc:0.99966	eval-auc:0.96458
[0.9649074074074074, 0.9591337441101848, 0.9510565110565111, 0.9724688368756165]
************************************ 5 ************************************
[0]	train-auc:0.90593	eval-auc:0.86213
[572]	train-auc:0.99970	eval-auc:0.94472
[0.9649074074074

In [20]:
oof = xgb_oof
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.6,0.01):
    preds = (oof.reshape((-1))>threshold).astype('int')
    m = f1_score(train_label['black_flag'].values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold
    print(f'{threshold:.02f}, {m}')
print(f'{best_threshold:.02f}, {best_score}')
# 0.47, 0.9150898680694286 # 0.86579572447
# 0.43, 0.9217716422203048 # 0.86697783
# 0.41, 0.9198568108353592 # 0.87674418605
# 0.40, 0.9231997065541027 # 0.87819025522
# 0.42, 0.913822737200522  # 0.87639132982 
# 0.40, 0.9148403872302214 # 0.88313184

0.40, 0.9012148491870569
0.41, 0.9030299342790133
0.42, 0.9046304958352445
0.43, 0.9054369887446954
0.44, 0.904167280053805
0.45, 0.9052063303674989
0.46, 0.9011320381630332
0.47, 0.8998447204968945
0.48, 0.902976082045607
0.49, 0.902976082045607
0.50, 0.9037825956013783
0.51, 0.904835660388627
0.52, 0.9056511056511056
0.53, 0.9043494433668996
0.54, 0.9041033316199533
0.55, 0.8988447475135146
0.56, 0.8961936684103612
0.57, 0.8961936684103612
0.58, 0.8959204609008862
0.59, 0.8969809015369012
0.52, 0.9056511056511056


In [21]:
pred = xgb_pred
test_label['black_flag'] = (pred.reshape((-1))>best_threshold).astype('int')

In [22]:
test_label[['zhdh','black_flag']].to_csv('submission.csv', index=False)

In [23]:
test_label['black_flag'].mean(), train_label['black_flag'].mean()

(0.21104166666666666, 0.25)