# 一个小目标 mission two
---


## 任务
分别使用 SVM 与 决策树 对客户逾期情况进行建模，预测用户是否会逾期


In [1]:
# 导入需要的包
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, f1_score, classification_report
from matplotlib import pyplot as plt

# Raise pandas' display limits so that wide frames are printed in full
# instead of being truncated or wrapped.
pd.options.display.max_columns = 10000
pd.options.display.width = 10000

## 1. 导入数据

In [2]:
# The CSV is GBK-encoded rather than UTF-8, so the codec has to be given
# explicitly; reading fails otherwise. The first column becomes the row index.
data = pd.read_csv(
    './data/data.csv',
    encoding='gbk',
    index_col=0,
)

# Uncomment to preview the first rows of the raw data:
# data.head()

In [3]:
# Separate the feature columns (X) from the target column 'status' (y)
X = data.drop(columns=['status'])
y = data['status']

# Report the feature-matrix dimensions and the class balance of the target
print('X.shape:', X.shape)
print('y 的分布\n', y.value_counts())

X.shape: (4754, 88)
y 的分布
 0    3561
1    1193
Name: status, dtype: int64


## 2. 数据探索及特征处理
本次任务目的在于主流程，所以数据探索部分做得比较粗糙，以后有需要再慢慢补充
从上面数据看出，本份数据以数值型特征为主，有少数几个字符型特征，还有两个日期特征。下面一步步进行处理。

In [4]:
# Identifier-like columns (id_name, custid, trade_no, bank_card_no) are close
# to unique per person; a model that picked them up would most likely be
# over-fitting, so they are removed first.
X.drop(columns=['id_name', 'custid', 'trade_no', 'bank_card_no'], inplace=True)

# Numeric features: a missing student_feature becomes 0, every remaining gap
# is filled with the mean of its column.
X_num = X.select_dtypes('number').fillna({'student_feature': 0})
X_num = X_num.fillna(X_num.mean())

# Non-numeric features: only reg_preference_for_trad is used, one-hot encoded.
X_str = X.select_dtypes(exclude='number').copy()
X_str_dummy = pd.get_dummies(X_str['reg_preference_for_trad'])

# Column-wise merge of the numeric block and the dummy columns
X_cl = pd.concat([X_num, X_str_dummy], axis=1, sort=False)
X_cl.shape

(4754, 85)

## 3. 数据划分
按 7:3 划分训练集与测试集，随机种子取当天日期（1115）。

In [6]:
# Fixed seed so the 70/30 train/test partition is reproducible.
random_state = 1115

# NOTE(review): the split is not stratified although the classes are
# imbalanced (3561 vs 1193 in the value counts printed earlier).
X_train, X_test, y_train, y_test = train_test_split(
    X_cl,
    y,
    test_size=0.3,
    random_state=random_state,
)

# Show the dimensions of both partitions
for _part in (X_train, X_test):
    print(_part.shape)

(3327, 85)
(1427, 85)


In [7]:
# Standardise the features with StandardScaler. The scaler is fitted on the
# training partition only and then applied to both partitions, so no
# statistics of the test partition are used by the scaler.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(X_train)
X_train_std = ss.transform(X_train)
X_test_std = ss.transform(X_test)

## 4. 建模
#### model1-LR

In [15]:
# Pick the logistic-regression hyper-parameters with a cross-validated grid
# search. No evaluation metric was specified for the task, so f1_micro is
# used as the selection criterion.
# NOTE(review): the later cells report the default (binary) f1_score, which is
# a different quantity from f1_micro -- confirm this is the intended metric.
#
# The solver is pinned to liblinear because the grid contains penalty='l1':
# liblinear supports both 'l1' and 'l2', whereas the default solver of recent
# scikit-learn releases (lbfgs) rejects 'l1' and would make those candidates fail.
lr = LogisticRegression(solver='liblinear')
param_grid = {
    'C': [0.05, 0.1, 0.5, 1, 5],
    'penalty': ['l1', 'l2']
}

grid = GridSearchCV(lr, param_grid=param_grid, scoring='f1_micro')

grid.fit(X_train_std, y_train)

# Best parameter combination and its mean cross-validated score
print(grid.best_params_)
print(grid.best_score_)

{'C': 0.05, 'penalty': 'l1'}
0.793808235648


In [16]:
# Refit logistic regression on the full training partition with the
# hyper-parameters chosen by the grid search.
# The solver is pinned to liblinear: the searched grid includes penalty='l1',
# which the default solver of recent scikit-learn releases (lbfgs) does not
# support, so the refit would raise whenever 'l1' is selected.
lr = LogisticRegression(solver='liblinear', **grid.best_params_)
lr.fit(X_train_std, y_train)

# Predictions on both partitions, reused by every metric below
y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

# Accuracy
print('准确性：')
print('Train：{:.4f}'.format(accuracy_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(accuracy_score(y_test, y_test_pred)))
# F1 score with f1_score's default averaging
print('f1_score：')
print('Train：{:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(f1_score(y_test, y_test_pred)))

准确性：
Train：0.7986
Test：0.8031
f1_score：
Train：0.4444
Test：0.4565


#### model2-svm

In [11]:
from sklearn.svm import SVC, LinearSVC
# Linear SVM with default settings, trained on the standardised features
lsvc = LinearSVC().fit(X_train_std, y_train)

# Predictions on both partitions, reused by every metric below
y_train_pred = lsvc.predict(X_train_std)
y_test_pred = lsvc.predict(X_test_std)

# Report accuracy first, then F1, each for train and test
for _title, _metric in (('准确性：', accuracy_score), ('f1_score：', f1_score)):
    print(_title)
    print('Train：{:.4f}'.format(_metric(y_train, y_train_pred)))
    print('Test：{:.4f}'.format(_metric(y_test, y_test_pred)))

准确性：
Train：0.7977
Test：0.8031
f1_score：
Train：0.4524
Test：0.4586


In [12]:
# Non-linear SVM: SVC with its default kernel, trained on the standardised features
svc = SVC().fit(X_train_std, y_train)

# Predictions on both partitions, reused by every metric below
y_train_pred = svc.predict(X_train_std)
y_test_pred = svc.predict(X_test_std)

# Report accuracy first, then F1, each for train and test
for _title, _metric in (('准确性：', accuracy_score), ('f1_score：', f1_score)):
    print(_title)
    print('Train：{:.4f}'.format(_metric(y_train, y_train_pred)))
    print('Test：{:.4f}'.format(_metric(y_test, y_test_pred)))

准确性：
Train：0.8323
Test：0.7996
f1_score：
Train：0.5295
Test：0.4066


#### model3-决策树

In [13]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyper-parameters.
# The tree is seeded with the notebook-wide random_state: an unseeded
# DecisionTreeClassifier can produce a different tree on every run, which
# makes the scores below non-reproducible.
dt = DecisionTreeClassifier(random_state=random_state)
dt.fit(X_train_std, y_train)

# Predictions on both partitions, reused by every metric below
y_train_pred = dt.predict(X_train_std)
y_test_pred = dt.predict(X_test_std)

# Accuracy
print('准确性：')
print('Train：{:.4f}'.format(accuracy_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(accuracy_score(y_test, y_test_pred)))
# F1 score with f1_score's default averaging
print('f1_score：')
print('Train：{:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test：{:.4f}'.format(f1_score(y_test, y_test_pred)))

# The tree over-fits: it scores 100% on the training partition. This is left
# as is for now and will be revisited during hyper-parameter tuning.

准确性：
Train：1.0000
Test：0.7043
f1_score：
Train：1.0000
Test：0.4139
