# 一个小目标 mission three
---


## 任务
分别使用 xgboost 与 lightgbm 对客户逾期情况进行建模，预测用户是否会逾期


In [None]:
# 导入需要的包
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, f1_score, classification_report
from matplotlib import pyplot as plt

# Widen pandas' display limits so wide frames are printed without truncation:
# both the maximum number of columns and the output width are raised to 10000.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, 10000)

## 1. 导入数据

In [2]:
# The CSV is not UTF-8 encoded; per the original author's note it has to be
# read as GBK, otherwise loading fails. The first column is used as the index.
data = pd.read_csv(
    filepath_or_buffer='./data/data.csv',
    encoding='gbk',
    index_col=0,
)

# Uncomment to inspect the first rows of the raw data.
# data.head()

In [3]:
# Separate the target column (`status`) from the feature columns.
X = data.drop(columns='status')
y = data['status']

# Report the size of the feature matrix and the class balance of the target.
print('X.shape:', X.shape)
print('y 的分布\n', y.value_counts())

X.shape: (4754, 88)
y 的分布
 0    3561
1    1193
Name: status, dtype: int64


## 2. 数据探索及特征处理
本次任务的重点在于跑通主流程，所以数据探索部分做得比较粗糙，以后有需要再慢慢补充。
从上面的数据可以看出，本份数据以数值型特征为主，有少数几个字符型特征，还有两个日期特征。下面一步步进行处理

In [4]:
# Drop identifier-like columns (id_name, custid, trade_no, bank_card_no).
# They behave almost like a unique key per person; if the model ended up
# relying on them it would most likely be overfitting.
X = X.drop(columns=['id_name', 'custid', 'trade_no', 'bank_card_no'])

# Numeric features: a missing student_feature becomes 0 first, then every
# remaining gap is filled with the mean of its column.
X_num = X.select_dtypes(include='number').fillna({'student_feature': 0})
X_num = X_num.fillna(X_num.mean())

# Non-numeric features: only reg_preference_for_trad is one-hot encoded;
# the other non-numeric columns are not carried into the final matrix.
X_str = X.select_dtypes(exclude='number').copy()
X_str_dummy = pd.get_dummies(X_str.reg_preference_for_trad)

# Place the numeric and the one-hot columns side by side.
X_cl = pd.concat([X_num, X_str_dummy], axis='columns', sort=False)
X_cl.shape

(4754, 85)

### 3. 数据划分
按 7:3 划分训练集与测试集，随机种子就取今天日期吧

In [5]:
# Hold out 30% of the rows as the test set; the seed is fixed so the split
# is reproducible.
random_state = 1115
X_train, X_test, y_train, y_test = train_test_split(
    X_cl,
    y,
    test_size=0.3,
    random_state=random_state,
)
# Show the shape of each split (train first, then test).
for split in (X_train, X_test):
    print(split.shape)

(3327, 85)
(1427, 85)


In [6]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(X_train)
X_train_std = ss.transform(X_train)
X_test_std = ss.transform(X_test)

### 4. 建模
#### model1-LR

In [15]:
# Pick the hyper-parameters with a grid search first.
# No evaluation metric was specified for the task, so f1_micro is used here.
#
# The solver is pinned to 'liblinear' because the grid below searches both
# the 'l1' and 'l2' penalties: liblinear supports both, whereas the default
# solver of current scikit-learn releases ('lbfgs') rejects penalty='l1',
# which would make every 'l1' candidate fail.
lr = LogisticRegression(solver='liblinear')
param_grid = {
    'C': [0.05, 0.1, 0.5, 1, 5],
    'penalty': ['l1', 'l2']
}

grid = GridSearchCV(lr, param_grid=param_grid, scoring='f1_micro')

grid.fit(X_train_std, y_train)

# Best parameter combination and its mean cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

{'C': 0.05, 'penalty': 'l1'}
0.793808235648


In [16]:
# Refit a logistic regression with the hyper-parameters chosen by the grid
# search. The solver is given explicitly: the search covers penalty='l1',
# which the default solver of current scikit-learn releases ('lbfgs') does
# not accept, while 'liblinear' handles both 'l1' and 'l2'.
lr = LogisticRegression(solver='liblinear', **grid.best_params_)
lr.fit(X_train_std, y_train)

# Hard class predictions on both splits.
y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

# Accuracy
print('准确性：')
print('Train：{:.4f}'.format(accuracy_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(accuracy_score(y_test, y_test_pred)))
# F1 score
print('f1_score：')
print('Train：{:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(f1_score(y_test, y_test_pred)))

准确性：
Train：0.7986
Test：0.8031
f1_score：
Train：0.4444
Test：0.4565


#### model2-svm

In [11]:
from sklearn.svm import SVC, LinearSVC
# Linear SVM with default hyper-parameters, trained on the standardised data.
lsvc = LinearSVC().fit(X_train_std, y_train)

# Class predictions for the train and test splits.
y_train_pred, y_test_pred = (
    lsvc.predict(features) for features in (X_train_std, X_test_std)
)

# Print accuracy first, then F1, each for train and test.
for heading, metric in (('准确性：', accuracy_score), ('f1_score：', f1_score)):
    print(heading)
    print('Train：{:.4f}'.format(metric(y_train, y_train_pred)))
    print('Test：{:.4f}'.format(metric(y_test, y_test_pred)))

准确性：
Train：0.7977
Test：0.8031
f1_score：
Train：0.4524
Test：0.4586


In [12]:
# Non-linear SVM (SVC with its default settings).
svc = SVC().fit(X_train_std, y_train)

# Class predictions for the train and test splits.
y_train_pred, y_test_pred = (
    svc.predict(features) for features in (X_train_std, X_test_std)
)

# Print accuracy first, then F1, each for train and test.
for heading, metric in (('准确性：', accuracy_score), ('f1_score：', f1_score)):
    print(heading)
    print('Train：{:.4f}'.format(metric(y_train, y_train_pred)))
    print('Test：{:.4f}'.format(metric(y_test, y_test_pred)))

准确性：
Train：0.8323
Test：0.7996
f1_score：
Train：0.5295
Test：0.4066


#### model3-决策树

In [13]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyper-parameters.
dt = DecisionTreeClassifier().fit(X_train_std, y_train)

# Class predictions for the train and test splits.
y_train_pred, y_test_pred = (
    dt.predict(features) for features in (X_train_std, X_test_std)
)

# Print accuracy first, then F1, each for train and test.
for heading, metric in (('准确性：', accuracy_score), ('f1_score：', f1_score)):
    print(heading)
    print('Train：{:.4f}'.format(metric(y_train, y_train_pred)))
    print('Test：{:.4f}'.format(metric(y_test, y_test_pred)))

# The tree overfits: predictions on the training set are 100% correct.
# This is left as is for now and will be revisited when tuning hyper-parameters.

准确性：
Train：1.0000
Test：0.7043
f1_score：
Train：1.0000
Test：0.4139


#### model4-XGBoost

**安装**

安装包路径：https://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost

找到合适的包，使用pip安装。如 pip install xgboost-0.81-cp27-cp27m-win_amd64.whl

**使用**

下面所说仅仅是普通调用，然而 XGB 有很多超参数需要调整，而且这些超参对结果会产生很大的影响，这次先不展开阐述。

参考：[XGBoost使用教程（纯xgboost方法）](https://blog.csdn.net/u011630575/article/details/79418138)

In [25]:
# Native XGBoost API.
import xgboost as xgb

# Wrap the standardised training data and its labels in a DMatrix.
dtrain = xgb.DMatrix(X_train_std, label=y_train)
# Booster hyper-parameters.
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=112,
)
# Use cross-validation with early stopping to find the number of boosting rounds.
cvresult = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,      # upper bound on the number of rounds
    nfold=5,                   # 5-fold cross-validation
    metrics='auc',             # evaluation metric
    early_stopping_rounds=30,  # stop when 30 consecutive rounds bring no improvement
    verbose_eval=False,        # do not log every round
)
# The number of rows in the result is taken as the best number of rounds.
xgb_params['n_estimators'] = cvresult.shape[0]
cvresult

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.786727,0.006035,0.711017,0.016283
1,0.820915,0.005441,0.73316,0.021281
2,0.837428,0.003187,0.738238,0.022022
3,0.852458,0.003771,0.747465,0.018232
4,0.860994,0.004971,0.747937,0.018504
5,0.8664,0.004904,0.754532,0.017471
6,0.87166,0.004822,0.75791,0.013978
7,0.876657,0.004699,0.762,0.01411
8,0.882297,0.005793,0.76296,0.015213
9,0.886179,0.004136,0.764889,0.016537


In [35]:
# Retrain the model (the number of boosting rounds must be given explicitly).
# The training DMatrix is built here so that this cell does not rely on a
# variable defined, or possibly rebound, by another cell.
xgb_train = xgb.DMatrix(X_train_std, label=y_train)
xgb_model1 = xgb.train(xgb_params, xgb_train, num_boost_round=xgb_params['n_estimators'])
# With the binary:logistic objective the predictions are probabilities of the
# positive class. The inputs also have to be wrapped in a DMatrix, otherwise
# predict raises an error.
y_train_pred_proba = xgb_model1.predict(xgb.DMatrix(X_train_std))
y_test_pred_proba = xgb_model1.predict(xgb.DMatrix(X_test_std))
# Turn probabilities into 0/1 class labels with a 0.5 threshold.
y_train_pred = (y_train_pred_proba >= 0.5) + 0
y_test_pred = (y_test_pred_proba >= 0.5) + 0

# Evaluation
# Accuracy
print('准确性：')
print('Train：{:.4f}'.format(accuracy_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(accuracy_score(y_test, y_test_pred)))
# F1 score
print('f1_score：')
print('Train：{:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(f1_score(y_test, y_test_pred)))

准确性：
Train：0.8798
Test：0.7940
f1_score：
Train：0.7028
Test：0.4712


In [37]:
# XGBoost also ships a scikit-learn compatible wrapper, which is more convenient to call.
from xgboost.sklearn import XGBClassifier
# The number of rounds (n_estimators) is already one of the hyper-parameters here.
xgb_model2 = XGBClassifier(**xgb_params)
print(xgb_model2)

# Train, then predict class labels for both splits.
xgb_model2.fit(X_train_std, y_train)
y_train_pred, y_test_pred = (
    xgb_model2.predict(features) for features in (X_train_std, X_test_std)
)

# Evaluation: accuracy first, then F1, each for train and test.
for heading, metric in (('准确性：', accuracy_score), ('f1_score：', f1_score)):
    print(heading)
    print('Train：{:.4f}'.format(metric(y_train, y_train_pred)))
    print('Test：{:.4f}'.format(metric(y_test, y_test_pred)))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=42,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=112,
       silent=True, subsample=0.8)
准确性：
Train：0.8798
Test：0.7940
f1_score：
Train：0.7028
Test：0.4712


这里需要注意的是，两种调用方法在具体使用上略有差异，使用时需要留意

#### model5-LightGBM

**安装**

使用 pip install lightgbm 直接安装

**使用**

这里的用法与xgb的非常类似

参考：[【集成学习】lightgbm使用案例](https://www.cnblogs.com/wanglei5205/p/8654041.html)

In [31]:
# Native LightGBM API.
import lightgbm as lgb

# NOTE(review): 'gamma' is an XGBoost parameter name; LightGBM's counterpart
# appears to be min_split_gain - confirm whether this key has any effect.
lgb_params  = {
    'learning_rate': 0.1,
    'n_estimators': 42,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary', # differs from the XGBoost objective name
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 112
}

# Keep the LightGBM dataset under its own name so the XGBoost `dtrain`
# DMatrix is not overwritten.
lgb_train = lgb.Dataset(X_train_std, label=y_train)

# Train the model (the number of boosting rounds must be given explicitly);
# the round count comes from the LightGBM parameters, not the XGBoost ones.
lgb_model1 = lgb.train(lgb_params, lgb_train, num_boost_round=lgb_params['n_estimators'])
# With the binary objective the predictions are probabilities of the positive
# class. predict takes the raw feature matrix directly; per the original
# author's note, passing a Dataset raises an error.
y_train_pred_proba = lgb_model1.predict(X_train_std)
y_test_pred_proba = lgb_model1.predict(X_test_std)
# Turn probabilities into 0/1 class labels with a 0.5 threshold.
y_train_pred = (y_train_pred_proba >= 0.5) + 0
y_test_pred = (y_test_pred_proba >= 0.5) + 0

# Evaluation
# Accuracy
print('准确性：')
print('Train：{:.4f}'.format(accuracy_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(accuracy_score(y_test, y_test_pred)))
# F1 score
print('f1_score：')
print('Train：{:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(f1_score(y_test, y_test_pred)))



准确性：
Train：0.8843
Test：0.7996
f1_score：
Train：0.7154
Test：0.4743


In [38]:
# Likewise, LightGBM provides a scikit-learn compatible wrapper.
from lightgbm.sklearn import LGBMClassifier
# The number of rounds (n_estimators) is already one of the hyper-parameters here.
lgb_model2 = LGBMClassifier(**lgb_params)
print(lgb_model2)

# Train, then predict class labels for both splits.
lgb_model2.fit(X_train_std, y_train)
y_train_pred, y_test_pred = (
    lgb_model2.predict(features) for features in (X_train_std, X_test_std)
)

# Evaluation: accuracy first, then F1, each for train and test.
for heading, metric in (('准确性：', accuracy_score), ('f1_score：', f1_score)):
    print(heading)
    print('Train：{:.4f}'.format(metric(y_train, y_train_pred)))
    print('Test：{:.4f}'.format(metric(y_test, y_test_pred)))

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        gamma=0, importance_type='split', learning_rate=0.1, max_depth=5,
        min_child_samples=20, min_child_weight=1, min_split_gain=0.0,
        n_estimators=42, n_jobs=-1, nthread=4, num_leaves=31,
        objective='binary', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, scale_pos_weight=1, seed=112, silent=True,
        subsample=0.8, subsample_for_bin=200000, subsample_freq=0)
准确性：
Train：0.8843
Test：0.7996
f1_score：
Train：0.7154
Test：0.4743
