# 使用sklearn解決分類問題：wine資料集
資料切分 --> 資料集成 --> 模型集成

### 載入package及資料切分

In [1]:
# 載入所需package
from sklearn import datasets, metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import numpy as np
# 讀取資料集
wine = datasets.load_wine()
# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.20, random_state=4)
x_train = np.array(x_train, dtype = int) # sklearn默認整數型，故須調整
x_test = np.array(x_test, dtype = int)
y_train = np.array(y_train, dtype = int)
y_test = np.array(y_test, dtype = int)

## 一、資料面的集成
決策樹(Decision Tree)、隨機森林(Random Forest)、梯度提升機(Gradient Boosting Machine)

### 決策樹(Decision Tree)

In [2]:
# 建立模型
dt = DecisionTreeClassifier()

# 訓練模型
dt.fit(x_train, y_train)

# 預測測試集
dt_pred = dt.predict(x_test)

In [3]:
acc = metrics.accuracy_score(y_test, dt_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.8888888888888888


In [4]:
print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [5]:
print("Feature importance: ", dt.feature_importances_)

Feature importance:  [0.01724609 0.         0.         0.07577828 0.05572716 0.07753082
 0.02569828 0.         0.         0.02874348 0.         0.30250884
 0.41676703]


### 隨機森林(Random Forest)－Bagging

In [6]:
# 建立模型 (使用 20 顆樹，每棵樹的最大深度為 4)
rf = RandomForestClassifier(criterion='gini', n_estimators=20, max_features="auto", max_depth=10, min_samples_split=2, min_samples_leaf=1)

# 訓練模型
rf.fit(x_train, y_train)

# 預測測試集
rf_pred = rf.predict(x_test)

In [7]:
acc = metrics.accuracy_score(y_test, rf_pred)
print("Accuracy: ", acc)

Accuracy:  0.9722222222222222


In [8]:
print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [9]:
print("Feature importance: ", rf.feature_importances_)

Feature importance:  [0.16360144 0.0246827  0.00548946 0.05750955 0.05368809 0.03545283
 0.1315707  0.         0.02676569 0.15854896 0.03485629 0.14089888
 0.1669354 ]


### 梯度提升機(Gradient Boosting Machine)－Boosting

In [10]:
# 建立模型
gdbt = GradientBoostingClassifier(loss="deviance", learning_rate=0.5, n_estimators=100)

# 訓練模型
gdbt.fit(x_train, y_train)

# 預測測試集
gdbt_pred = gdbt.predict(x_test)

In [11]:
acc = metrics.accuracy_score(y_test, gdbt_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.9166666666666666


In [12]:
print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [13]:
print("Feature importance: ", gdbt.feature_importances_)

Feature importance:  [5.05513742e-03 5.14337708e-03 3.71102534e-03 3.57947534e-02
 4.33739974e-02 1.53649578e-02 8.15489482e-02 0.00000000e+00
 4.78700699e-03 2.20813696e-01 1.37850761e-04 1.90539027e-01
 3.93730223e-01]


## 二、 模型面的集成
混合泛化(Blending)、堆疊泛化(Stacking)

### 混合泛化(Blending)

In [14]:
blending_pred = dt_pred*0.01 + rf_pred*0.8 + gdbt_pred*0.19

def correct_pred(pred):
    c =[]
    for i in pred:
        if i < 0.5:
            c.append(0)
        elif i < 1.5:
            c.append(1)
        else:
            c.append(2)
    return c

correct_blending_pred = correct_pred(blending_pred)

In [15]:
acc = metrics.accuracy_score(y_test, correct_blending_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.9722222222222222


### 堆疊泛化(Stacking)

In [16]:
from mlxtend.classifier import StackingClassifier # 堆疊泛化所需套件

meta_estimator = GradientBoostingClassifier(n_estimators=100, max_depth=4, learning_rate=0.1)
stacking = StackingClassifier(classifiers=[dt, rf, gdbt], meta_classifier=meta_estimator)
stacking.fit(x_train, y_train)
stacking_pred = stacking.predict(x_test)

In [17]:
acc = metrics.accuracy_score(y_test, stacking_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.9166666666666666
