In [1]:
import re
import io 
import sys 
import warnings
import dateutil
import datetime 
import pandas as pd 
import numpy as np 
import seaborn as sns 
from pandas. io import sql 
import matplotlib. pyplot as plt 
import matplotlib. ticker as mtick
from sklearn. utils import shuffle 
from sklearn. model_selection import train_test_split 
from sklearn. linear_model import LogisticRegression 
from sklearn. metrics import roc_curve, auc 
import statsmodels. api as sm 
import toad 
from toad.metrics import KS, AUC 
from toad.plot import badrate_plot, bin_plot
# Make the parent directory importable so project-local modules can be found.
sys.path.append("..")
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Raise the pandas display limits (rows, columns, info columns) to 500 each.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_info_columns'):
    pd.set_option(_display_option, 500)
# Use the SimHei font for sans-serif text and disable the unicode minus glyph in axis labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

  import pandas.util.testing as tm


In [None]:
from SCtools import DataPreprocessing, VarsFilter, BuildModel, ValidationModel, PersistModel, PredictScore

### 1 数据预处理

In [None]:
# Load the modelling sample and the feature dictionary from the working directory.
data = pd.read_excel('./ccb smp s.xlsx')
# Fixed extension typo: the path previously ended in '.x1sx' (digit one instead of the letter l).
feat_info = pd.read_excel('./var_list_new2.xlsx')
# Print the row count, the sum of the 'target' column, and the remainder (rows minus that sum).
# NOTE(review): the message labels the sum of 'target' as the negative-sample count and the
# remainder as the positive-sample count — confirm this matches the project's label convention.
print("训练数据集样本量：{}，其中负样本量：{}，正样本量：{}".format(data.shape[0], data['target'].sum(),
                                                                    data.shape[0] - data['target'].sum()))

In [None]:
# Run toad's detector over the raw data and display the resulting report as the cell output.
data_info = toad.detector.detect(data)
data_info

In [None]:
# Compute toad's per-feature quality table against the 'target' column (result is stored, not displayed).
data_iv = toad.quality(data, target='target')

In [None]:
# Wrap the raw data, the target column name and the feature dictionary in the project preprocessing helper.
dp = DataPreprocessing(data, 'target', feat_info)
# Coarse feature filtering
drop_dict = dp.feat_filter(empty=0.5, iv=0.02, corr=1)
# Split into training and test sets (test_size=0.3, fixed random_state, stratified on dp.selected_data[dp.y])
dp.split_data(test_size=0.3, random_state=6545, stratify=dp.selected_data[dp.y])
# Binning + WOE transformation: fit a chi-method combiner on the training split, excluding the 'type' column
combiner = toad.transform.Combiner()
binned_data_tr = combiner.fit_transform(dp.data_tr, y=dp.y, method='chi', min_samples=0.05, 
                                        exclude=['type'], empty_separate=True)
# Hand the fitted combiner and the binned training data to the helper, which returns the train/test WOE frames.
data_tr_woe, data_ts_woe = dp.bins_woe(combiner, binned_data_tr)

In [None]:
# Manual bin adjustments to apply; the dict is empty here, so no overrides are supplied.
adj_bins = {}
# Recompute the train/test WOE frames through the helper, overwriting the ones from the previous cell.
data_tr_woe, data_ts_woe = dp.adj_bins_woe(adj_bins=adj_bins, empty_separate=True)

### 2 特征选择

- PSI
- IV值
- 相关性
- stepwise
- 共线性

In [None]:
# Feature selection on the WOE-encoded train/test frames using the project VarsFilter helper.
vf = VarsFilter(data_tr_woe, data_ts_woe)
# Stability filter with a PSI threshold of 0.1.
vf.vars_stability(psi=0.1)
# IV filter with a threshold of 0.01.
vf.filter_iv(iv=0.01)
# Correlation filter with a threshold of 0.8.
vf.filter_corr(corr=0.8)
# Stepwise selection via toad (estimator='lr', both directions, BIC criterion);
# return_drop=True makes it also return the list of dropped columns.
data_tr, drop_lst = toad.selection.stepwise(vf.data_tr, estimator='lr', direction='both', criterion='bic',
                                            return_drop=True)
# Feed the stepwise result back into the helper, then apply the VIF filter with a threshold of 3.
vf.filter_stepwise(data_tr, drop_lst)
vf.filter_vif(vif=3)

### 3 逻辑回归模型

- 变量系数符号筛选
- 变量p-value筛选

In [None]:
# Build the model helper on the filtered training data.
# Fixed: the class is imported from SCtools as `BuildModel`; `Build_Model` is not defined in this notebook.
bm = BuildModel(vf.data_tr)
# Coefficient screening: all coefficients positive
bm.filter_coef_pos()
# p-value screening
# NOTE(review): p_thred=0.5 is much looser than the common 0.05 level — confirm it is intended.
bm.filter_p_value(p_thred=0.5)

In [None]:
# Logistic regression with balanced class weights and C=4.
lr = LogisticRegression(class_weight='balanced', C=4)
# Train through the project helper; the returned object is kept as `clf` and used for evaluation below.
clf = bm.train(lr)

In [None]:
# Feature information table: one row per variable remaining in the model.
df_iv = toad.quality(data_tr_woe[bm.remain_vars + ['target']], target='target')
# Map each variable name to its Chinese display name via the helper's lookup.
df_iv['chn_name'] = df_iv.index.map(dp.index_name_map)
df_coef = pd.DataFrame(lr.coef_[0], bm.remain_vars, columns=['coef'])
df_pv = pd.DataFrame(bm.est.pvalues, columns=['p-value'])
df_vif = pd.DataFrame(toad.stats.VIF(data_tr_woe[bm.remain_vars]), columns=['vif'])
df_psi = pd.DataFrame(toad.metrics.PSI(data_tr_woe[bm.remain_vars], data_ts_woe[bm.remain_vars]), columns=['psi'])
# Left-join each per-variable statistic onto the IV table by index (the variable name).
df_info = df_iv
for df_stat in (df_coef, df_pv, df_vif, df_psi):
    df_info = pd.merge(df_info, df_stat, left_index=True, right_index=True, how='left')
df_info.index.name = 'eng_name'
df_info = df_info[['chn_name', 'coef', 'iv', 'p-value', 'vif', 'psi']].reset_index()
# Attach the feature category from the feature dictionary; how='right' keeps every model variable.
df_info = pd.merge(feat_info[['feat_id', 'feat_2']], df_info, left_on='feat_id', right_on='eng_name', how='right')
# Fixed: the closing parenthesis here was a full-width '）', which is a syntax error.
df_info = df_info.drop('feat_id', axis=1)
# Reassign instead of renaming in place.
df_info = df_info.rename(columns={'feat_2': 'feat_type'})
# Variables with no category in the feature dictionary get the default label.
df_info['feat_type'] = df_info['feat_type'].fillna('基本信息')

### 4 模型评估

- AUC/KS
- plot
- divergence

In [None]:
# Validation helper over the train/test WOE frames, restricted to the variables remaining in the model.
vm = ValidationModel(data_tr_woe, data_ts_woe, bm.remain_vars, y='target')
# If the model's train and test results differ too much, check the univariate analysis in section 5
# and merge bins or drop the unstable variables.
vm.ks_auc(clf)

In [None]:
# PSI between vm.EYtr_proba and vm.EYts_proba (presumably the train and test predicted probabilities — confirm).
toad.metrics.PSI(vm.EYtr_proba, vm.EYts_proba)

In [None]:
# ROC plot via the project helper: the *tr labels/probabilities are the primary series, the *ts ones are passed as co_labels/co_preds.
vm.plot_roc_curve(vm.Ytr, vm.EYtr_proba, co_labels=vm.Yts, co_preds=vm.EYts_proba)

In [None]:
# KS curve for vm.Ytr / vm.EYtr_proba, titled as the train-data curve.
vm.plot_ks_curve(vm.Ytr, vm.EYtr_proba, title='Train data KS Curve')

In [None]:
# KS curve for vm.Yts / vm.EYts_proba, titled as the test-data curve.
vm.plot_ks_curve(vm.Yts, vm.EYts_proba, title='Test data KS Curve')

In [None]:
# Divergence plot via the project helper, using vm.Ytr and vm.EYtr_proba only.
vm.plot_divergence(vm.Ytr, vm.EYtr_proba)

### 5 单变量分析

**badrate**

- train_test_badrate：查看训练集和测试集的badrate是否交叉，对于交叉的分箱可以选择合并；
- bins_badrate：查看变量分箱的badrate是否单调，不单调的变量根据实际情况判断是否删除；

In [None]:
# Set the sans-serif font to SimHei again (the first cell already sets this same value).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Train-vs-test bad-rate plots for the variables remaining in the model.
dp.train_test_badrate(bm.remain_vars)

In [None]:
# Per-bin bad-rate output for the variables remaining in the model.
dp.bins_badrate(bm.remain_vars)

In [None]:
# Print each remaining variable's name (single-quoted, followed by a colon) next to its entry in dp.bins.
for var_name in bm.remain_vars:
    print("'" + var_name + "':", dp.bins[var_name])

### 6 模型报告

In [None]:
# Make the parent directory importable (the first cell already does this; repeated here).
sys.path.append("..")
# NOTE(review): these two helpers are imported but not referenced in the visible cells, which call
# vm.plot_ks_badrate / vm.plot_badrate_count instead — confirm whether the import is still needed.
from Metrics import plot_ks_badrate, plot_ks_count
# Bucket vm.EYtr_proba against vm.Ytr into 10 groups, once with method='quantile' and once with method='step'.
tr_bucket_q = toad.metrics.KS_bucket(vm.EYtr_proba, vm.Ytr, bucket=10, method='quantile')
tr_bucket_s = toad.metrics.KS_bucket(vm.EYtr_proba, vm.Ytr, bucket=10, method='step')

In [None]:
# Plot the 'bad_rate' and 'ks' columns of the quantile-bucketed table.
vm.plot_ks_badrate(tr_bucket_q, legend='min', plot_lst=['bad_rate', 'ks'])

In [None]:
# Plot the 'bad_rate' and 'total' columns of the quantile-bucketed table.
vm.plot_badrate_count(tr_bucket_q, legend='min', plot_lst=['bad_rate', 'total'])

In [None]:
# Plot the 'bad_rate' and 'total' columns of the step-bucketed table.
# Fixed keyword typo: `plot_1st` (digit one) -> `plot_lst`, the keyword used by the other two plotting calls.
vm.plot_badrate_count(tr_bucket_s, legend='min', plot_lst=['bad_rate', 'total'])

### 7 生成评分卡

默认参数：

- base_score：600
- pdo：40
- base_odds：5

可根据生成的评分卡score_card进行评分

In [None]:
# Build a toad ScoreCard from the combiner and WOE transformer held by `dp`, with the same
# class_weight and C values as the LogisticRegression cell.
# NOTE(review): the markdown above this cell lists base_odds as 5, but this call passes base_odds=3 — confirm which is intended.
card = toad.scorecard.ScoreCard(combiner=dp.combiner, 
                                transer=dp.transer, 
                                class_weight='balanced', 
                                C=4, 
                                base_score=600, 
                                base_odds=3, 
                                pdo=40)
# NOTE(review): other cells use upper-case names such as vm.Ytr / vm.EYtr_proba; confirm `vm.xtr` (lower-case x) is the real attribute.
card.fit(vm.xtr, vm.Ytr)
# Export the fitted score card as a DataFrame.
score_card = card.export(to_frame=True)

### 8 预测评分

In [None]:
# Take the model variables plus the target from the helper's full dataset.
ss = dp.data_all[bm.remain_vars + ['target']]
# Score every row with the fitted score card.
score = card.predict(ss[bm.remain_vars])
# Put the scores and the target side by side; `.values` sidesteps index alignment when assigning the target.
score_report = pd.DataFrame(score, columns=['score'])
score_report['target'] = ss['target'].values

In [None]:
# Keep only rows with a score above 420. The explicit copy makes the filtered frame independent,
# so adding a column below does not write into a slice of the previous frame.
score_report = score_report[score_report.score > 420].copy()
# Cut the remaining scores into 10 equal-width intervals.
score_report['bins'] = pd.cut(score_report.score, 10, precision=1)
# Per-interval row count and mean of 'target'.
bins_badrate = score_report.groupby('bins')['target'].agg(['count', 'mean']).reset_index()

In [None]:
# Plot the per-score-interval count / mean-target table built in the previous cell.
vm.plot_score_badrate_count(bins_badrate)