# 2023-02-22

## Ensemble Model

### Voting

In [1]:
from sklearn.datasets import load_iris
import pandas as pd

iris_data = load_iris()
iris_input_data = pd.DataFrame(iris_data.data, columns = iris_data.feature_names)
iris_target_data = pd.DataFrame(iris_data.target, columns = ['target'])
iris = pd.concat([iris_input_data, iris_target_data], axis = 1)
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [2]:
# input, target 분리

X = iris.iloc[:, :-1]
y = iris.iloc[:, -1]

In [3]:
# train, test 분리

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [4]:
# 스케일 변환

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# 앙상블 모델(voting) 모델링

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(multi_class='multinomial', 
                          random_state=1)
clf2 = svm.SVC(kernel='linear', 
               random_state=1) 
clf3 = GaussianNB()

clf_voting = VotingClassifier(
                estimators=[
                    ('lr', clf1), 
                    ('svm', clf2), 
                    ('gnb', clf3)
                ],
                voting='hard',
                weights=[1,1,1])
clf_voting.fit(X_train_scaled, y_train)

In [6]:
# 예측

pred_voting = clf_voting.predict(X_test_scaled)
pred_voting

array([0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 1, 2, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       2, 1, 0, 2, 2, 1, 0, 0])

In [7]:
# X_train_scaled, y_train간의 score 확인하기

clf_voting.score(X_train_scaled, y_train)

0.975

In [9]:
clf_voting.score(X_test_scaled, y_test)

0.9333333333333333

In [10]:
# confusion matrix 확인

from sklearn.metrics import classification_report
cf_matrix = classification_report(y_test, pred_voting)
print(cf_matrix)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.86      0.86      0.86         7
           2       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.93      0.93      0.93        30



### Bagging

In [11]:
# input, target data 분리된 상태에서 불러오기

# 지도학습 -> classification

from sklearn import datasets
raw_wine = datasets.load_wine()

X = raw_wine.data    # input
y = raw_wine.target  # target

In [15]:
print(X.shape)
print(y.shape)

(178, 13)
(178,)


In [17]:
pd.DataFrame(X, columns = raw_wine.feature_names).head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [30]:
# 대부분 실무에서 데이터를 처음 받았을 때의 형태 (y의 컬럼명을 설정하기 위해 함수 생성)

def decoding(data):
    if data == 0:
        return 'class_0'
    elif data == 1:
        return 'class_1'
    else:
        return 'class_2'

wine_class = pd.DataFrame(y, columns = ['target'])['target'].apply(decoding)

wine_input = pd.DataFrame(X, columns = raw_wine.feature_names)

pd.concat([wine_input, wine_class], axis = 1)

In [35]:
# train / test data split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [36]:
# scaling

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
# modeling

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier

clf_bagging = BaggingClassifier(estimator = GaussianNB(),
                               n_estimators = 10)

clf_bagging.fit(X_train_scaled, y_train)

In [49]:
# predict

pred_bagging = clf_bagging.predict(X_test_scaled)
print(pred_bagging)

[1 1 0 2 0 2 1 2 2 0 0 1 0 0 2 2 0 0 0 0 0 2 1 1 1 1 1 1 1 2 2 1 0 2 1 1]


In [50]:
clf_bagging.score(X_train_scaled, y_train)

0.9859154929577465

In [51]:
clf_bagging.score(X_test_scaled, y_test)

0.9722222222222222

In [52]:
# confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_bagging)

array([[12,  1,  0],
       [ 0, 13,  0],
       [ 0,  0, 10]], dtype=int64)

In [53]:
# confusion matrix report

from sklearn.metrics import classification_report
cf_matrix = classification_report(y_test, pred_bagging)
print(cf_matrix)

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        13
           1       0.93      1.00      0.96        13
           2       1.00      1.00      1.00        10

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36



### Random Forest

### [실습] random forest vs gradient boosting

In [76]:
# Data 불러오기

from sklearn.datasets import load_breast_cancer
import pandas as pd

datasets = load_breast_cancer()
input_data = pd.DataFrame(datasets.data, columns = datasets.feature_names)
target_data = pd.DataFrame(datasets.target, columns = ['target'])
df = pd.concat([input_data, target_data], axis = 1)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [91]:
# feature, target 데이터 지정

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [104]:
# train / test 데이터 분할

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [105]:
# 데이터 표준화

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [168]:
# random forest

from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(max_depth=7,
                                n_estimators=90,                       
                               random_state=0)
clf_rf.fit(X_train_scaled, y_train)

print(clf_rf.score(X_train_scaled, y_train))
print(clf_rf.score(X_test_scaled, y_test))

0.9976525821596244
0.986013986013986


In [169]:
# random forest model의 feature importance

rf_importance = clf_rf.feature_importances_
rf_importance

array([0.01300478, 0.01572286, 0.05117274, 0.05260802, 0.00780476,
       0.00843108, 0.06173698, 0.10318878, 0.00609875, 0.00396386,
       0.03244088, 0.00675716, 0.01477069, 0.02866161, 0.0033202 ,
       0.00388306, 0.00620135, 0.00686739, 0.00474037, 0.00459202,
       0.11393903, 0.01324051, 0.16950155, 0.06669623, 0.01434514,
       0.0125794 , 0.03203008, 0.11856039, 0.01279335, 0.01034696])

In [170]:
# prediction

pred_randomforest = clf_rf.predict(X_test_scaled)
pred_randomforest

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0])

In [174]:
# random forest model의 accuracy

from sklearn.metrics import accuracy_score

rf_accuracy = accuracy_score(y_test, pred_randomforest)
rf_accuracy

0.986013986013986

In [171]:
# random forest confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_randomforest)

array([[52,  1],
       [ 1, 89]], dtype=int64)

In [172]:
# random forest confusion matrix report

from sklearn.metrics import classification_report
cf_matrix = classification_report(y_test, pred_randomforest)
print(cf_matrix)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        53
           1       0.99      0.99      0.99        90

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143



In [None]:
# random forest의 parameter를 max_depth = 7, n_estimators=90 이 훈련 데이터와 테스트 데이터 간의 score격차가 가장 작으며 confusion matrix의 정밀도, 재현율의 결과가 가장 좋다.
# 양성을 양성으로 제대로 예측한 것이 89개, 음성을 음성으로 예측한 것이 52개가 있다. 이외에 양성으로 예측한 것 중에 음성인 클래스가 1개 있으며, 양성 클래스 이지만 음성으로 예측된 것이 1개 있다. 

In [220]:
# gradient boosting

from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(random_state = 0, max_depth=3, learning_rate=0.04)
gbrt.fit(X_train_scaled, y_train)

print(gbrt.score(X_train_scaled, y_train))
print(gbrt.score(X_test_scaled, y_test))

0.9976525821596244
0.965034965034965


In [221]:
# gradient boosting model의 feature importance

gbrt_importance = gbrt.feature_importances_
gbrt_importance

array([1.63928361e-03, 1.84492860e-02, 4.95510619e-04, 1.33948313e-03,
       2.17136109e-03, 1.74427178e-03, 3.29591557e-03, 5.44551054e-01,
       2.25389685e-03, 3.53080334e-04, 2.13195896e-03, 1.86743712e-03,
       5.99120930e-03, 2.17062218e-02, 1.18983903e-03, 1.13938980e-03,
       1.30765223e-03, 0.00000000e+00, 1.88795973e-04, 6.96006784e-03,
       3.40160020e-02, 4.13050608e-02, 1.18168956e-01, 6.85934154e-02,
       3.25436189e-03, 6.96472307e-03, 2.42341726e-02, 7.91439171e-02,
       3.97642663e-03, 1.56724999e-03])

In [222]:
# gradient boosting prediction

pred_gradient_boosting = gbrt.predict(X_test_scaled)
pred_gradient_boosting

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0])

In [223]:
# gradient boosting model의 accuracy

from sklearn.metrics import accuracy_score

gb_accuracy = accuracy_score(y_test, pred_gradient_boosting)
gb_accuracy

0.965034965034965

In [224]:
# gradient boosting confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_gradient_boosting)

array([[51,  2],
       [ 3, 87]], dtype=int64)

In [225]:
# gradient boosting confusion matrix report

from sklearn.metrics import classification_report
cf_matrix = classification_report(y_test, pred_gradient_boosting)
print(cf_matrix)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95        53
           1       0.98      0.97      0.97        90

    accuracy                           0.97       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.97      0.97      0.97       143



In [None]:
# Gradient boosting의 파라미터를 random_state = 0, max_depth=3, learning_rate=0.04로 지정 (최적의 조건인지는 모르겠으나), 훈련데이터와 테스트데이터 간의 격차가 가장 작은 것으로 판단
# precision(정밀도) : tp / tp+fp, 실제 양성을 음성으로 예측한 것이 3개정도 있다.

### xgboost
### LightRGBM

### Stacking