In [None]:
#gradient boosting regression tree : 여러개의 decision tree를 묶어 강력한 model을 만드는 앙상블 기법
#random forest와 달리 이전 tree의 오차를 보완하는 방식으로 tree생성
#1~5 정도 깊이의 tree를 사용하므로 메모리를 적게 사용하고 예측도 빠름
#얕은 트리들을 계속 연결, parameter설정에서 random forest보다 민감, 잘 조정하면 높은 정확도 제공
#learnig_rate를 높이면 보정을 강하게 하기 때문에 복잡한 모델을 생성
#n_estimator값을 키우면 ensemble에 트리가 더 많이 추가되어 모델의 복잡도가 커지고 train세트를 더 정확하게 figure
# https://woolulu.tistory.com/30

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.model_selection import *
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree

In [3]:
plt.rcParams['figure.figsize'] = [7, 7]
sns.set(style='darkgrid')
plt.rcParams['scatter.edgecolors'] = 'black'
pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)
pd.set_option('display.unicode.east_asian_width', True)

In [4]:
iris_dataset = load_iris()
iris = pd.DataFrame(iris_dataset.data,
        columns=iris_dataset.feature_names)
labels = iris_dataset.target_names
iris.info()
print(iris.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [5]:
label = iris_dataset.target
print(label)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [6]:
scaler = StandardScaler()
iris = scaler.fit_transform(iris)
Features = pd.DataFrame(iris, columns=['SL', 'SW', 'PL', 'PW'])
print(Features.shape)

(150, 4)


In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(
            Features, label, test_size=0.2)

In [None]:
n_estimators = [50, 100, 150, 200, 250, 300, 350, 400]
learning_rate = [0.1, 0.2, .3, .4, .5, .6, .7, .8, .9, 1.0]
max_depth = [2, 3, 4, 5, 6, 7, 8]
param = {'n_estimators':n_estimators, 'learning_rate':learning_rate,
         'max_depth':max_depth}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, 
                            random_state=868)
iris_GBC = GridSearchCV(estimator=GradientBoostingClassifier(),
            param_grid=param, scoring='accuracy',
            n_jobs=-1, cv=cv)
iris_GBC.fit(X_train, Y_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=868, test_size=0.2,
            train_size=None),
             estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                                           0.8, 0.9, 1.0],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8],
                         'n_estimators': [50, 100, 150, 200, 250, 300, 350,
                                          400]},
             scoring='accuracy')

In [None]:
print(iris_GBC.best_score_)
print(iris_GBC.best_params_)
print(iris_GBC.best_estimator_)

0.95
{'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 50}
GradientBoostingClassifier(learning_rate=0.5, max_depth=4, n_estimators=50)


In [None]:
for i in range(1, 100):
    X_train, X_test, Y_train, Y_test = train_test_split(
            Features, label, test_size=0.2, random_state=i)
    iris_GBC_best = iris_GBC.best_estimator_
    iris_GBC_best.fit(X_train, Y_train)

    train_score = iris_GBC_best.score(X_train, Y_train)
    test_score = iris_GBC_best.score(X_test, Y_test)
    if test_score >= train_score:
        print('test:{} train:{} random_state:{}'.format(
            test_score, train_score, i))

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(
            Features, label, test_size=0.2, random_state=42)
iris_GBC_best = iris_GBC.best_estimator_
iris_GBC_best.fit(X_train, Y_train)

train_score = iris_GBC_best.score(X_train, Y_train)
test_score = iris_GBC_best.score(X_test, Y_test)

print('test:{} train:{}'.format(
        test_score, train_score))

test:1.0 train:1.0


In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(
            Features, label, test_size=0.2, random_state=42)
iris_GBC_best = GradientBoostingClassifier(learning_rate=0.4, max_depth=3, n_estimators=30)
iris_GBC_best.fit(X_train, Y_train)

train_score = iris_GBC_best.score(X_train, Y_train)
test_score = iris_GBC_best.score(X_test, Y_test)

print('test:{} train:{}'.format(
        test_score, train_score))

test:1.0 train:1.0


In [None]:
pd.DataFrame(confusion_matrix(Y_test, iris_GBC_best.predict(X_test)),
        columns=['P_setosa', 'P_versicolor', 'P_virginica'],
        index=['A_setosa', 'A_versicolor', 'A_virginica'])

Unnamed: 0,P_setosa,P_versicolor,P_virginica
A_setosa,10,0,0
A_versicolor,0,9,0
A_virginica,0,0,11


In [None]:
print(classification_report(Y_test, iris_GBC_best.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

