# <center>***感受树模型的压迫感***</center>

In [None]:
# Install the three tree-based machine-learning libraries used in this notebook
%pip install xgboost
%pip install catboost
%pip install lightgbm

## <font color="#FF0000">**1、导入需要的模块与数据**</font>

In [None]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split

import xgboost as xgb
import catboost as cat
import lightgbm as lgb

### **简单的数据读入**

In [None]:
def load_data(path=None):
    """Load the loan training CSV and impute its missing values.

    The ``Loan_ID`` column is dropped. Missing values in ``Gender``,
    ``Married``, ``Dependents``, ``Self_Employed``, ``Loan_Amount_Term`` and
    ``Credit_History`` are replaced by the column mode; missing values in
    ``LoanAmount`` are replaced by the column mean.

    Args:
        path: Location of the CSV file. Defaults to
            ``data/train_u6lujuX_CVtuZ9i.csv`` relative to the working
            directory.

    Returns:
        pandas.DataFrame: The cleaned frame without ``Loan_ID``.
    """
    from pathlib import Path

    if path is None:
        # Built with pathlib so the default resolves on every OS, not only
        # where backslash is the path separator.
        path = Path("data") / "train_u6lujuX_CVtuZ9i.csv"

    df1 = pd.read_csv(path)
    df1 = df1.drop("Loan_ID", axis=1)

    mode_filled_columns = (
        "Gender",
        "Married",
        "Dependents",
        "Self_Employed",
        "Loan_Amount_Term",
        "Credit_History",
    )
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the chained in-place form acts on
    # a temporary object and does not update the frame under pandas
    # Copy-on-Write.
    for column in mode_filled_columns:
        df1[column] = df1[column].fillna(df1[column].mode()[0])

    df1["LoanAmount"] = df1["LoanAmount"].fillna(df1["LoanAmount"].mean())

    return df1

# Module-level cleaned dataset; later cells read it as a global
df1 = load_data()

### **简单的数据清洗与划分**

In [None]:
def process(ratio=0.8, data=None):
    """Encode the features and target, then split them into train and test sets.

    Args:
        ratio: Fraction of rows placed in the training split (passed to
            ``train_test_split`` as ``train_size``).
        data: Frame holding the features plus a ``Loan_Status`` column whose
            values are ``"Y"``/``"N"``. When omitted, the module-level ``df1``
            is used, which matches the original behaviour.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    if data is None:
        data = df1

    X = data.drop("Loan_Status", axis=1)
    y = data["Loan_Status"]

    # Expand categorical feature columns into indicator columns and turn the
    # Y/N target into 1/0.
    X = pd.get_dummies(X)
    y = y.map({"Y": 1, "N": 0})

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=ratio, random_state=1)

    return X_train, X_test, y_train, y_test

# Build the train/test split with the default ratio; later cells use these globals
X_train, X_test, y_train, y_test = process()

## <font color="#FF0000">**2、初始化模型与剪枝**</font>

### **XGBoost**

In [None]:
# Keep the learning rate fairly high so that the iterations take less time
model1 = xgb.XGBClassifier(learning_rate=0.1)
# Candidate values explored for each XGBoost hyper-parameter
param1 = {
        "max_depth":[4, 7, 10],
        'min_child_weight': [1, 3, 5],
        "n_estimators":[10, 50, 100],
        'gamma': [0.1, 0.3, 0.5],
        'reg_alpha': [0.05, 0.1, 1], 
        'reg_lambda': [0.05, 0.1, 1]
        }

# Exhaustive search over param1 with 5-fold cross-validation, scored by ROC AUC
grid_search1 = GridSearchCV(model1, n_jobs=-1, param_grid=param1, cv=5, scoring="roc_auc")
grid_search1.fit(X_train, y_train)
# Bare expression: shown as the cell output when run in a notebook
grid_search1.best_estimator_, grid_search1.best_score_

### **Catboost**

In [None]:
# grow_policy="Lossguide" is set because the grid below tunes max_leaves,
# which CatBoost only accepts under the Lossguide growing policy.
model2 = cat.CatBoostClassifier(learning_rate=0.1, grow_policy="Lossguide")
# Candidate values explored for each CatBoost hyper-parameter
param2 = {
        "depth":[4, 6, 8, 10],
        "iterations":[100, 200, 400, 700, 1000],
        'max_leaves':[5, 15, 25, 35],
        'l2_leaf_reg': [1, 3, 5, 7, 9]
        }

# Search the CatBoost model with its own grid (model2/param2); the earlier
# version passed model1/param1 here and therefore re-ran the XGBoost search.
grid_search2 = GridSearchCV(model2, n_jobs=-1, param_grid=param2, cv=5, scoring="roc_auc")
grid_search2.fit(X_train, y_train)
# Bare expression: shown as the cell output when run in a notebook
grid_search2.best_estimator_, grid_search2.best_score_

### **LightGBM**

In [None]:
# LightGBM classifier with the same fixed learning rate as the other models
model3 = lgb.LGBMClassifier(learning_rate=0.1)
# Candidate values explored for each LightGBM hyper-parameter
# NOTE(review): the num_leaves candidates look very large next to the
# max_depth candidates — confirm that they actually change the fitted trees.
param3 = {
        "max_depth":[4, 7, 10],
        "num_leaves":[300, 600, 900],
        "n_estimators":[10, 70, 130],
        'min_child_samples': [18, 20, 22],
        'min_child_weight':[0.001, 0.002]
        }

# Exhaustive search over param3 with 5-fold cross-validation, scored by ROC AUC
grid_search3 = GridSearchCV(model3, n_jobs=-1, param_grid=param3, cv=5, scoring="roc_auc")
grid_search3.fit(X_train, y_train)
# Bare expression: shown as the cell output when run in a notebook
grid_search3.best_estimator_, grid_search3.best_score_

## <font color="#FF0000">**3、模型建立（Scikit-learn接口）**</font>

In [None]:
# grid_search1.best_estimator_ already holds the model built with the best
# parameter combination found by the search
model1_plus = grid_search1.best_estimator_
# Fit on the training split and predict class labels for the test split.
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# extra fit is probably redundant — confirm before removing.
y_pred = model1_plus.fit(X_train, y_train).predict(X_test)

## <font color="#FF0000">**4、模型评价**</font>

### <center>**分类指标评价**</center>

In [None]:
from sklearn.metrics import classification_report

# Print the classification report comparing the test labels with the predictions
print("评估数据结果打印:\n", classification_report(y_test, y_pred))

### <center>**可视化评价**</center>

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import seaborn as sns

# Confusion matrix, ordered with the positive class (1) first so that it
# lines up with the 'Yes'/'No' tick labels below
mat = confusion_matrix(y_test, y_pred, labels=[1, 0])

# ROC and AUC must be computed from continuous scores: with hard 0/1
# predictions the curve collapses to a single operating point and the AUC is
# understated. Use the predicted probability of the positive class instead.
y_score = model1_plus.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
AUC_ROC = roc_auc_score(y_test, y_score)

plt.figure(figsize=(15, 7))

# Left panel: confusion matrix as a heatmap (rows = true labels,
# columns = predicted labels)
plt.subplot(1, 2, 1)
sns.heatmap(mat, annot=True, square=True, cmap="OrRd", fmt="d",
    xticklabels=['Yes', 'No'], yticklabels=['Yes', 'No'])
plt.xlabel("Predict Labels")
plt.ylabel("Labels")
plt.title("Confusion Matrix")

# Right panel: ROC curve with the chance diagonal for reference
plt.subplot(1, 2, 2)
plt.plot(fpr,tpr,'-',label='Area Under the Curve (AUC = %0.4f)' % AUC_ROC)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.title('ROC curve')
plt.xlabel("FPR (False Positive Rate)")
plt.ylabel("TPR (True Positive Rate)")
plt.legend(loc="lower right")

plt.show()