# 相关设置

In [1]:
from IPython.core.interactiveshell import InteractiveShell

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

InteractiveShell.ast_node_interactivity = "all"  # 一个cell显示多输出

pd.set_option('display.max_rows', 100)  # 设置最大显示100行
pd.set_option('display.max_columns', 100)  # 设置最大显示100列

In [6]:
EDA_data = pd.read_csv('data/after_EDA/EDA_data.csv')
print(EDA_data.shape)

(26956, 18)


In [7]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(EDA_data, test_size=0.2, random_state=412)

print(train_set.shape)
print(test_set.shape)

(21564, 18)
(5392, 18)


In [8]:
X_train = train_set[train_set.columns.difference(['is_pass'])].values
y_train = train_set[['is_pass']].values
X_test = test_set[test_set.columns.difference(['is_pass'])].values
y_test = test_set[['is_pass']].values

# 评估方法

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# 随机森林

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rf_model.fit(X_train, y_train.ravel())
y_predict = rf_model.predict(X=X_test)

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))

RandomForestClassifier(n_estimators=10, n_jobs=1)

accuracy_score:  0.6958456973293768
confusion_matrix:  [[ 735  930]
 [ 710 3017]]
roc_auc_score:  0.6254698487056953
classification_report:                precision    recall  f1-score   support

           0       0.51      0.44      0.47      1665
           1       0.76      0.81      0.79      3727

    accuracy                           0.70      5392
   macro avg       0.64      0.63      0.63      5392
weighted avg       0.69      0.70      0.69      5392



# LightGBM

In [13]:
import lightgbm as lgb

trn_data = lgb.Dataset(X_train, y_train)
val_data = lgb.Dataset(X_test, y_test)

params = {  
    'boosting_type': 'gbdt',  
    'objective': 'multiclass',  
    'num_class': 7,  
    'metric': 'multi_error',  
    'num_leaves': 120,  
    'min_data_in_leaf': 100,  
    'learning_rate': 0.06,  
    'feature_fraction': 0.8,  
    'bagging_fraction': 0.8,  
    'bagging_freq': 5,  
    'lambda_l1': 0.4,  
    'lambda_l2': 0.5,  
    'min_gain_to_split': 0.2,  
    'verbose': -1, 
}

lgb_model = lgb.train(params, 
                      trn_data, 
                      num_boost_round = 1000,
                      valid_sets = [trn_data,val_data], 
                      verbose_eval = 100, 
                      early_stopping_rounds = 100)

y_prob = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
y_predict = [list(x).index(max(x)) for x in y_prob]

print("accuracy_score: ", accuracy_score(y_test, y_predict))
print('confusion_matrix: ', confusion_matrix(y_test, y_predict))
print('roc_auc_score: ', roc_auc_score(y_test, y_predict))
print('classification_report: ', classification_report(y_test, y_predict))



Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.251438	valid_1's multi_error: 0.283012
Early stopping, best iteration is:
[62]	training's multi_error: 0.256539	valid_1's multi_error: 0.281157
accuracy_score:  0.7188427299703264
confusion_matrix:  [[ 505 1160]
 [ 356 3371]]
roc_auc_score:  0.6038920594863713
classification_report:                precision    recall  f1-score   support

           0       0.59      0.30      0.40      1665
           1       0.74      0.90      0.82      3727

    accuracy                           0.72      5392
   macro avg       0.67      0.60      0.61      5392
weighted avg       0.70      0.72      0.69      5392

