In [19]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from gensim.models import Word2Vec
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, mean_squared_log_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
import tqdm
import sys
import os
import gc
import argparse
import warnings
import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import pickle
#计算shap值
import shap
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif']=['SimHei']
import math

In [20]:
pd.set_option('display.max_rows', None)  # 显示所有行
pd.set_option('display.max_columns', None)  # 显示所有列
pd.set_option('expand_frame_repr', False)  # 即“禁止换行”
# pd.set_option('display.precision', 2) #展示两位小数点
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [21]:
df_data_交易=pd.read_csv('data/账户交易信息.csv')
df_data_交易.columns = ['交易流水序号','账户代号','对方账号','借贷标志','交易金额','交易余额','对方行号','交易日期','交易时间','交易渠道','摘要代号','对方名称长度']
df_data_账户=pd.read_csv('data/账户静态信息.csv')
df_data_账户.columns = ['账户代号','开户日期','开户行代号','客户性别','年龄']
df_label_train=pd.read_csv('data/训练集标签.csv')
df_label_train.columns = ['账户代号','label']
df_label_test=pd.read_csv('data/test_dataset.csv')
df_label_test.columns = ['账户代号']## 读取数据
df_label_all=pd.concat([df_label_train,df_label_test])

### 特征提取

#### 抹账处理

In [22]:
df_tz = df_label_all.copy() ;

In [23]:
list_交易_金额为负_index = list(df_data_交易[df_data_交易["交易金额"] < 0].index)
list_mz_index = [] ;
list_交易_金额为负_对应的失败交易_index = [] ;
for i in list_交易_金额为负_index :
    S_now = df_data_交易.loc[i] ;
    j = i ;
    while (True) :
        j -= 1 ;
        S_nex = df_data_交易.loc[j] ;
        if (S_now["交易金额"] == -S_nex["交易金额"]) :
            if (S_now["账户代号"] == S_nex["账户代号"]) :
                list_交易_金额为负_对应的失败交易_index.append(j) ;
                list_mz_index.append(i) ;
                break ;
list_交易_抹账需要去除_index = list_交易_金额为负_index + list_交易_金额为负_对应的失败交易_index ;
df_data_交易_去除抹账后=df_data_交易[~df_data_交易.index.isin(list_交易_抹账需要去除_index)]

In [24]:
# 取出抹账的数据
df_data_抹账_账户代号与交易金额 = df_data_交易.loc[list_交易_金额为负_对应的失败交易_index , ['账户代号' , '交易金额']] ;
df_tz_抹账 = df_data_抹账_账户代号与交易金额.groupby('账户代号')['交易金额'].agg(['sum','count','mean','max','min','std',np.ptp]).reset_index() ;
df_tz_抹账.columns = ['账户代号'] + ['抹账_'+ f for f in df_tz_抹账.columns.values if f not in ['账户代号']]

In [25]:
df_tz=df_tz.merge(df_tz_抹账,on='账户代号',how='left') ;

#### 交易渠道

In [67]:
# 获取客户的交易总数
df_tz_交易总数 = df_data_交易_去除抹账后.groupby("账户代号")[["账户代号"]].agg(["count"]).reset_index()

In [62]:
type(df_tz_交易总数)

pandas.core.frame.DataFrame

In [63]:
df_tz_交易总数.columns

MultiIndex([('账户代号', 'count')],
           )

In [71]:
df_tz_交易总数.sort_values("账户代号")

ValueError: The column label '账户代号' is not unique.
For a multi-index, the label must be a tuple with elements corresponding to each level.

In [None]:

# 每个客户针对每个交易渠道，去取有多少;
for i in df_data_交易_去除抹账后["交易渠道"].unique() :
    df_now = df_data_交易_去除抹账后[(df_data_交易_去除抹账后["交易渠道"] == i)].groupby("账户代号")["交易金额","交易余额"].agg(["count",'sum','mean','max','min','std']).reset_index() ;
    df_now.columns = ["账户代号"] + ["渠道代号_" + i + "_" + "_".join(f) for f in df_now.columns if f[0] != "账户代号"]
    df_mid = df_mid.merge(df_now , how="left") ;

In [None]:
df_mid.head(100)

### 特征筛选、训练

In [None]:
df_all = df_mid

In [None]:
train_label = df_all[:len(df_label_train)] ;
test_label = df_all[len(df_label_train):] ;

In [None]:
def corr_filter(train_data,corr_threshold):
    corr_data=train_data.corr()
    corr_data=corr_data.where(np.triu(np.ones(corr_data.shape),k=1).astype(np.bool))
    high_corr=[column for column in corr_data.columns if any(corr_data[column].abs()>corr_threshold)]
    result=[c for c in train_data.columns if c not in high_corr]
    print("筛选后特征:",len(result))
    return result

In [None]:
cols = [f for f in train_label.columns if f not in ['账户代号','label']]

In [None]:
cols

In [None]:
cols =corr_filter(train_label[cols],0.95)

In [None]:
def 对抗验证获取数据(df_train,df_test,threshold):
    # 定义新的Y
    
    
    df_train['Is_Test'] = 0
    df_test['Is_Test'] = 1
    #print(df_train.columns)
    #print(df_test.columns)
    # 将 Train 和 Test 合成一个数据集。
    df_adv = pd.concat([df_train, df_test])
    #print(df_adv.columns)
    #catecol_list_index=[list(df_train.columns).index(c) for c in df_train.select_dtypes(include=['category']).columns]
    # 通过抗验证中的模型，得到各个样本属于测试集的概率


    
    model_adv =  lgb.LGBMClassifier()
    model_adv.fit(df_adv.drop(['label','Is_Test'], axis=1), df_adv.loc[:, 'Is_Test'])
    preds_adv = model_adv.predict_proba(df_adv.drop(['label','Is_Test'], axis=1))[:, 1]
    
    df_train_copy = df_train.copy()
    df_train_copy['is_test_prob'] = preds_adv[:len(df_train)]

    # 根据概率排序
    df_train_copy = df_train_copy.sort_values('is_test_prob').reset_index(drop=True)

    # 将概率最大的20%作为验证集
    df_validation_2 = df_train_copy.iloc[int(threshold * len(df_train)):, ]
    df_train_2 = df_train_copy.iloc[:int(threshold * len(df_train)), ]
    return df_validation_2.drop(['is_test_prob','Is_Test'], axis=1)#,df_train_copy.drop(['Is_Test'],axis=1)
def 对抗验证获得新的列(df_train,df_test,threshold):
    # 定义新的Y
    df_train['Is_Test'] = 0
    df_test['Is_Test'] = 1

    # 将 Train 和 Test 合成一个数据集。
    df_adv = pd.concat([df_train, df_test])

    # 通过抗验证中的模型，得到各个样本属于测试集的概率
    other_param={'boosting_type':'gbdt','num_leaves':32,'max_depth':10,'n_estimators':200,'objective':'binary','subsample':0.7,'colsample_bytree':0.8,'subsample_freq':1,
               'min_child_weight':0.9,'learning_rate':0.08}
    model_adv =  lgb.LGBMClassifier(**other_param)
    model_adv.fit(df_adv.drop('Is_Test', axis=1), df_adv.loc[:, 'Is_Test'])
    #preds_adv = model_adv.predict_proba(df_adv.drop('Is_Test', axis=1))[:, 1]
    
    im=pd.DataFrame({'colname':df_train.drop(['Is_Test'],axis=1).columns,'importance':model_adv.feature_importances_})

    im=im.sort_values(by='importance',ascending=False)
        
    new_cat=im.reset_index().loc[round(len(im)*threshold):,'colname']
    print('对抗验证后的特征数量为:',len(new_cat))
    return new_cat

In [None]:
new_cat=对抗验证获得新的列(train_label[cols].copy(),test_label[cols].copy(),0.6)
cols=list(new_cat)
col_valid=cols.copy()
col_valid.append('label')
valid=对抗验证获取数据(train_label[col_valid].copy(),test_label[cols].copy(),0.8)

In [None]:
# new_cat=对抗验证获得新的列(train_label[cols].copy(),test_label[cols].copy(),0)
# cols=list(new_cat)
# col_valid=cols.copy()
# col_valid.append('label')
# valid=对抗验证获取数据(train_label[col_valid].copy(),test_label[cols].copy(),0)

In [None]:
cols

In [None]:
def cv_model(clf, train_x, train_y, test_x, clf_name,valid_x):
    folds = 5
    seed = 2023
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    
    oof = np.zeros(train_x.shape[0])
    predict = np.zeros(test_x.shape[0])
    predict_valid = np.zeros(valid_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.01,
                'seed': 2020,
                'n_jobs':8
            }

            model = clf.train(params, train_matrix, 10000, valid_sets=[train_matrix, valid_matrix], 
                              categorical_feature=[], verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.05,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 8
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=10000, evals=watchlist, verbose_eval=1000, early_stopping_rounds=500)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
            
            valid_dk_matrix = clf.DMatrix(valid_x)
            valid_pred= model.predict(valid_dk_matrix, ntree_limit=model.best_ntree_limit)
        if clf_name == "cat":
            model = clf(
                        n_estimators=10000,
                        random_seed=2023,
                        eval_metric='AUC',
                        learning_rate=0.05,
                        max_depth=7,
                        early_stopping_rounds=200,
                        metric_period=500,
                    )

            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      use_best_model=True,
                      verbose=1)
            
            val_pred  = model.predict_proba(val_x)[:,1]
            test_pred = model.predict_proba(test_x)[:,1]
            valid_pred= model.predict_proba(valid_x)[:,1]
            
        oof[valid_index] = val_pred
        predict += test_pred / kf.n_splits
        predict_valid +=valid_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
       
    return oof, predict,predict_valid,model

In [None]:
xgb_oof, xgb_pred,xgb_valid_pred,clf = cv_model(CatBoostClassifier, train_label[cols], train_label['label'], test_label[cols], 'cat',valid[cols])
#xgb_oof, xgb_pred,xgb_valid_pred,clf = cv_model(xgb, train_label[cols], train_label['label'], test_label[cols], 'xgb',valid[cols])


In [None]:
explainer = shap.TreeExplainer(clf)

shap_values = explainer.shap_values(train_label[cols])
#shap.summary_plot(shap_values[1], train_label[cols])
shap.summary_plot(shap_values, train_label[cols])

In [None]:
pred = xgb_pred
test_label['label'] = (pred.reshape((-1))>0.42).astype('int')
result=test_label[['账户代号','label']]
result.columns=['zhdh','black_flag']
result.to_csv('submission20230305.csv', index=False)
result['black_flag'].value_counts()