In [11]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine

# Load the bundled wine dataset, then unpack its feature matrix and labels.
wine_all = load_wine()
data, target = wine_all['data'], wine_all['target']

In [12]:
# Wrap the feature matrix in a DataFrame with one readable label per column.
wine = pd.DataFrame(
    data,
    columns=[
        'Alcohol',
        'Malic acid',
        'Ash',
        'Alcalinity of ash',
        'Magnesium',
        'Total phenols',
        'Flavanoids',
        'Nonflavanoid phenols',
        'Proanthocyanins',
        'Color intensity',
        'Hue',
        'OD280/OD315 of diluted wines',
        'proline',
    ],
)

# Attach the labels as an extra `class` column and display the frame.
wine['class'] = target
wine

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [14]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test parts; the fixed random_state
# keeps the split identical from run to run.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    random_state=701,
)


# 트리의 앙상블 모델 사용하기

## 랜덤포레스트 사용

In [15]:
# Random forest classifier; n_jobs=-1 asks for all available CPU cores.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)

# Cross-validate on the training split.
# return_train_score=True adds the per-fold training scores ('train_score')
# to the result, next to the validation scores ('test_score').
from sklearn.model_selection import cross_validate
scores = cross_validate(
    rfc,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Raw per-fold results first, then mean training score and mean validation score.
print(scores)
print(scores['train_score'].mean(), scores['test_score'].mean())

{'fit_time': array([0.1511445 , 0.1345787 , 0.13896871, 0.15133238, 0.13495827]), 'score_time': array([0.01598167, 0.01498318, 0.01597929, 0.01598048, 0.01495981]), 'test_score': array([0.96296296, 1.        , 1.        , 0.92307692, 1.        ]), 'train_score': array([1., 1., 1., 1., 1.])}
1.0 0.9772079772079773


In [16]:
# Fit the random forest on the training split and print its feature importances.
# fit() hands back the estimator, so the attribute is read straight off the call.
# Author's note: the random forest draws on many features fairly evenly.
print(rfc.fit(train_input, train_target).feature_importances_)

[0.13954952 0.02999022 0.00760619 0.02769645 0.04618161 0.04289784
 0.17073268 0.01780178 0.01667227 0.15710057 0.07913279 0.10218396
 0.16245412]


## 엑스트라 트리 사용

In [17]:
# Extra-trees classifier using all CPU cores (n_jobs=-1).
# Cross-validate it and report both the training and the validation scores.

from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=42, n_jobs=-1)

from sklearn.model_selection import cross_validate
scores = cross_validate(
    etc,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training score, then mean validation score.
print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.9772079772079773


In [18]:
# Fit the extra-trees model on the training split and print its feature
# importances (fit() returns the estimator, so the call can be chained).
print(etc.fit(train_input, train_target).feature_importances_)

[0.13636261 0.04148243 0.02934597 0.03726208 0.04263609 0.05686422
 0.13930295 0.03405592 0.02661763 0.10974141 0.07779011 0.12114703
 0.14739155]


## 그래디언트 부스팅 사용

In [19]:
# Gradient boosting classifier with a fixed seed and otherwise default settings.
from sklearn.ensemble import GradientBoostingClassifier
gdc = GradientBoostingClassifier(random_state=42)

# Cross-validate on the training split, keeping the training scores as well.
from sklearn.model_selection import cross_validate
scores = cross_validate(
    gdc,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training score, then mean validation score.
print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.9695156695156694


In [20]:
# Fit the gradient boosting model on the training split and print its feature
# importances (fit() returns the estimator, so the call can be chained).
print(gdc.fit(train_input, train_target).feature_importances_)

[1.52607378e-02 2.92473225e-02 2.98859073e-03 1.99402964e-03
 4.33720956e-02 4.46848353e-03 5.92250483e-02 9.79692351e-06
 1.16575660e-04 2.95115879e-01 1.88429759e-03 2.50977522e-01
 2.95339620e-01]


### 학습률 적용

In [21]:
# Gradient boosting with the number of trees and the learning rate spelled out.
# NOTE: n_estimators=100 and learning_rate=0.1 are scikit-learn's documented
# defaults, so this cell scores exactly like the default-constructed model.
# Change these two values to actually explore the effect of the learning rate.
gdc = GradientBoostingClassifier(n_estimators=100,
                                 learning_rate=0.1,
                                 random_state=42)

# Cross-validate on the training split, keeping the training scores as well.
from sklearn.model_selection import cross_validate
scores = cross_validate(gdc, train_input, train_target, return_train_score=True, n_jobs=-1)

# Mean training score, then mean validation score.
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

1.0 0.9695156695156694


## 히스토그램 기반 그래디언트 부스팅

In [22]:
# Histogram-based gradient boosting classifier with a fixed seed.
from sklearn.ensemble import HistGradientBoostingClassifier
hgbc = HistGradientBoostingClassifier(random_state=42)

# Cross-validate on the training split, keeping the training scores as well.
scores = cross_validate(
    hgbc,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training score, then mean validation score.
print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.9695156695156696


In [24]:
# Fit on the full training split, then display the score on the held-out test
# split (fit() returns the estimator, so score() is chained onto it).
hgbc.fit(train_input, train_target).score(test_input, test_target)

0.9777777777777777