In [None]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

import tools.evaluations as ev
import tools.feature_selection as fs
import tools.preprocessing as ps

## 读取数据

In [2]:
# Load the raw application table that the rest of the notebook works on.
train_csv_path = '../project/home-credit-default-risk/application_train.csv'
df_train = pd.read_csv(train_csv_path)

## 数据处理

In [3]:
# Variable classification.
# A column is treated as continuous only when it is stored as float64/int64 AND
# has more than 3 distinct values; every other column is treated as categorical.
# The ID column and the label column are left out of both lists.
non_feature_cols = ['SK_ID_CURR','TARGET']
cate_vars = []
cont_vars = []
for name in df_train.columns:
    if name in non_feature_cols:
        continue
    series = df_train[name]
    # `and` short-circuits, so nunique() is only computed for numeric columns.
    is_continuous = series.dtype in ['float64','int64'] and series.nunique() > 3
    if is_continuous:
        cont_vars.append(name)
    else:
        cate_vars.append(name)

In [4]:
# Encode the categorical variables.
# NOTE(review): ps.cate_enc is a project helper not shown in this notebook; it is
# given the frame and the list of categorical column names and its return value
# replaces df_train. The exact encoding scheme should be confirmed in
# tools/preprocessing.
df_train = ps.cate_enc(df_train, cate_vars)

## 特征筛选

In [None]:
# Build the frame handed to the IV computation: missing values are replaced
# with -1 and the label is carried under the lower-case name 'target' (appended
# as the last column) instead of 'TARGET'. df_train itself is left untouched.
df_train_woe = (
    df_train
    .fillna(-1)
    .pipe(lambda filled: filled.drop('TARGET', axis=1).assign(target=filled['TARGET']))
)

In [None]:
# Compute the per-variable table used for feature screening.
# NOTE(review): fs.cal_iv is a project helper not shown here; presumably it
# computes the information value (IV) of each variable against the 'target'
# label. A later cell relies on the result having 'var_name' and 'iv' columns.
df_iv = fs.cal_iv(df_train_woe,cate_vars,cont_vars,'target')

In [None]:
# Save the table to an Excel file (index not written) so that it can be
# reloaded from disk later instead of being recomputed.
df_iv.to_excel('./features/df_train_iv.xlsx',index=False)

In [5]:
# Reload the saved table and keep the names of the variables whose 'iv'
# value is strictly greater than 0.02.
df_iv = pd.read_excel('./features/df_train_iv.xlsx')
strong_iv_mask = df_iv['iv'] > 0.02
iv_features_train = df_iv.loc[strong_iv_mask, 'var_name'].values

## 建模

In [6]:
# Whether missing values have to be filled depends on the chosen model;
# here every missing value is replaced with -1.
df_train = df_train.fillna(-1)

In [7]:
# Store TARGET as a pandas categorical column.
df_train = df_train.astype({'TARGET': 'category'})

In [8]:
# Split the dataset.
# NOTE(review): ps.split_data is a project helper not shown here. It is called
# with the frame, the selected feature names and the label column name, and its
# four return values are unpacked as train/test features and train/test labels.
# Split ratio, stratification and seeding should be confirmed in
# tools/preprocessing.
X_train, X_test, y_train, y_test = ps.split_data(df_train,iv_features_train,'TARGET')

In [None]:
# Train a random forest on the selected features and draw ROC curves for the
# training split and the test split.

# Fixed seed: a random forest draws bootstrap samples and random feature subsets,
# so without it the fitted model (and the ROC curves below) change on every run.
RANDOM_STATE = 42

# Train the model.
model = RandomForestClassifier(max_depth=9, n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Output probabilities: column 1 of predict_proba is the score of the second
# class in model.classes_ (TARGET == 1 when the label takes the values 0 and 1).
y_hat = model.predict_proba(X_train)[:,1]
y_pred = model.predict_proba(X_test)[:,1]

# Model evaluation: one ROC plot per split, labelled 'Train' and 'Test'.
ev.plot_ROC(y_train, y_hat, 'Train')
ev.plot_ROC(y_test, y_pred, 'Test')