# Application of Ensemble Classifier

## VotingClassifier with Forest CoverType dataset

### Setup

In [None]:
# common lib
import sklearn
import numpy as np

### Datasets


#### Forest CoverType dataset
* Characteristic data of forest covertype
* Predict which forest cover type each sample belongs to
* https://archive.ics.uci.edu/ml/datasets/Covertype 
* $Y$: discrete, 
  * $X_{0 ∼ 9}$: continuous
  * $X_{10 ∼ 53}$: discrete

In [None]:
from sklearn.datasets import fetch_covtype
import pandas as pd

# Fetch the Forest CoverType bunch and keep only the first ten columns,
# which the section text above lists as the continuous features.
covtype = fetch_covtype()
covtype_X = covtype.data[:, :10]
covtype_feature_names = covtype.feature_names[:10]
covtype_y = covtype.target
# NOTE(review): target_names may hold the target column name(s) rather than
# the class labels, so the count printed below may not be the number of
# cover-type classes — confirm.
covtype_class_names = covtype.target_names

print('Number of targets: ', len(covtype_class_names))

# Attach the label as one extra trailing column so features and class can be
# inspected together in a single DataFrame.
covtype_np = np.concatenate((covtype_X, covtype_y.reshape(-1, 1)), axis=1)
covtype_col_names = covtype_feature_names + ['class']

covtype_pd = pd.DataFrame(data=covtype_np, columns=covtype_col_names)
covtype_pd.head(3)

#### Min, Max, Var, Std of features

In [None]:
# Column-wise maximum (axis=0 reduces over the rows).
covtype_pd.max(axis=0)

In [None]:
# Column-wise minimum (axis=0 reduces over the rows).
covtype_pd.min(axis=0)

In [None]:
# Column-wise variance (axis=0 reduces over the rows).
covtype_pd.var(axis=0)

### Preprocess

#### train_test_split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a test split; no test_size is passed, so the library default ratio
# applies, and the fixed seed keeps the split reproducible.
(
    covtype_X_train,
    covtype_X_test,
    covtype_y_train,
    covtype_y_test,
) = train_test_split(covtype_X, covtype_y, random_state=42)

### VotingClassifier

**-sklearn.ensemble.[VotingClassifier(estimators, *, voting='hard', weights=None, n_jobs=None, flatten_transform=True, verbose=False)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html) : Returns the instance itself.**

Soft Voting/Majority Rule classifier for unfitted estimators.

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Three heterogeneous base learners for the voting ensemble.
clf1, clf2, clf3 = (
    DecisionTreeClassifier(random_state=1),
    KNeighborsClassifier(),
    GaussianNB(),
)

# (name, estimator) pairs as expected by VotingClassifier.
estimators = list(zip(('clf1', 'clf2', 'clf3'), (clf1, clf2, clf3)))
vclf = VotingClassifier(estimators=estimators)

#### Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Score every base learner on its own first, as a baseline for the ensemble.
clfs = [clf1, clf2, clf3]
for clf in clfs:
    clf.fit(covtype_X_train, covtype_y_train)
    pred = clf.predict(covtype_X_test)
    class_name = type(clf).__name__
    score = accuracy_score(covtype_y_test, pred)
    print('{0} 정확도: {1:.4f}'.format(class_name, score))

# Then fit and score the voting ensemble on the same split.
vclf.fit(covtype_X_train, covtype_y_train)
pred_vot = vclf.predict(covtype_X_test)
vot_score = accuracy_score(covtype_y_test, pred_vot)
print('VotingClassifier의 정확도: {0:.4f}'.format(vot_score))

#### Set parameters

**voting**
* default = 'hard'
* ['hard', 'soft']
* If 'hard', uses predicted class labels for majority rule voting. Else if 'soft', predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers.

**weights**
* default=None
* array-like of shape
* Sequence of weights (float or int) to weight the occurrences of predicted class labels (hard voting) or class probabilities before averaging (soft voting). Uses uniform weights if None.

voting 'soft' VS 'hard'

In [None]:
# Compare voting="soft" against voting="hard" on the same train/test split.
# Start from a newly constructed VotingClassifier over clf1-clf3 (which are
# defined in an earlier cell).
from sklearn.ensemble import VotingClassifier
estimators=[('clf1', clf1),('clf2', clf2),('clf3', clf3)]
vclf = VotingClassifier(estimators=estimators)

# voting="soft": change the parameter, then refit before predicting.
vclf.set_params(voting="soft")
vclf.fit(covtype_X_train, covtype_y_train)

from sklearn import metrics

# Accuracy on the training split, then on the held-out test split.
predict = vclf.predict(covtype_X_train)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(voting="soft"): {}'.format(acc))
predict = vclf.predict(covtype_X_test)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(voting="soft"): {}'.format(acc))

# voting="hard": same estimator object, switched back and refitted.
vclf.set_params(voting="hard")
vclf.fit(covtype_X_train, covtype_y_train)

# Same two measurements for the hard-voting model.
predict = vclf.predict(covtype_X_train)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(voting="hard"): {}'.format(acc))
predict = vclf.predict(covtype_X_test)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(voting="hard"): {}'.format(acc))

weights none VS [2,3,1]

In [None]:
# Compare unweighted voting against weights=[2,3,1] on the same split.
# Start from a newly constructed VotingClassifier over clf1-clf3 (which are
# defined in an earlier cell); voting is not passed, so it keeps its default.
from sklearn.ensemble import VotingClassifier
estimators=[('clf1', clf1),('clf2', clf2),('clf3', clf3)]
vclf = VotingClassifier(estimators=estimators)

# weights=None: set_params() is called with no arguments, so nothing changes
# and weights stays at the constructor default.
vclf.set_params()
vclf.fit(covtype_X_train, covtype_y_train)

from sklearn import metrics

# Accuracy on the training split, then on the held-out test split.
predict = vclf.predict(covtype_X_train)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(): {}'.format(acc))
predict = vclf.predict(covtype_X_test)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(): {}'.format(acc))

# weights=[2,3,1]: one weight per base estimator, presumably in the order of
# `estimators` (clf1, clf2, clf3) — confirm. Refit so the change takes effect.
vclf.set_params(weights=[2,3,1])
vclf.fit(covtype_X_train, covtype_y_train)

# Same two measurements for the weighted model.
predict = vclf.predict(covtype_X_train)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(weights=[2,3,1]): {}'.format(acc))
predict = vclf.predict(covtype_X_test)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(weights=[2,3,1]): {}'.format(acc))

## BaggingClassifier (Bootstrap aggregating classifier) with Forest CoverType dataset

### Setup

In [None]:
# Common imports
import sklearn
import numpy as np
import os

### Visualization with virtual data

#### Generate virtual data

In [None]:
from sklearn.datasets import make_moons
# 500 synthetic "moons" samples with noise=0.30; the fixed seed makes the
# data reproducible.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)

#### Splitting

In [None]:
from sklearn.model_selection import train_test_split
# Split the synthetic data; no test_size is passed, so the library default
# ratio applies. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#### BaggingClassifier with virtual data

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Single DecisionTreeClassifier, used as the baseline for the comparison.
vir_tree_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Bagging ensemble: 10 trees, each trained on a bootstrap draw of 10 samples
# (max_samples is an int here, i.e. an absolute sample count).
vir_bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=10,
    max_samples=10,
    bootstrap=True,
    random_state=42,
)
vir_bag_clf.fit(X_train, y_train)


In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.5, contour=True):
    """Shade a classifier's predicted regions on the current axes and overlay the samples.

    Relies on the module-level ``np``, ``plt`` and ``ListedColormap`` names.

    Parameters
    ----------
    clf : object with a ``predict`` method; it is called on two-column points.
    X : array indexed as ``X[:, 0]`` and ``X[:, 1]`` (two feature columns).
    y : labels aligned with ``X``; only rows labelled 0 or 1 are drawn.
    axes : ``[x1_min, x1_max, x2_min, x2_max]`` plotting window (only read).
    alpha : opacity of the sample markers.
    contour : when true, also draw contour lines over the filled regions.
    """
    # Evaluate the classifier on a 100 x 100 grid spanning the window.
    grid_x1, grid_x2 = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    grid_points = np.column_stack((grid_x1.ravel(), grid_x2.ravel()))
    region = clf.predict(grid_points).reshape(grid_x1.shape)

    fill_colors = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(grid_x1, grid_x2, region, alpha=0.3, cmap=fill_colors)
    if contour:
        line_colors = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(grid_x1, grid_x2, region, cmap=line_colors, alpha=0.8)

    # Label 0 is drawn with the "yo" format, label 1 with "bs".
    for label, fmt in ((0, "yo"), (1, "bs")):
        members = y == label
        plt.plot(X[:, 0][members], X[:, 1][members], fmt, alpha=alpha)

    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Side-by-side panels sharing the y axis: single tree on the left, bagging
# ensemble on the right, both drawn over the test samples.
fig, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)
panels = (
    (vir_tree_clf, "Decision Tree"),
    (vir_bag_clf, "Decision Trees with Bagging"),
)
for ax, (model, title) in zip(axes, panels):
    plt.sca(ax)
    plot_decision_boundary(model, X_test, y_test)
    plt.title(title, fontsize=14)
# The right-hand panel is still current here; blank its y label.
plt.ylabel("")
plt.show()

### Datasets



#### Forest CoverType dataset
* Characteristic data of forest covertype
* Predict which forest cover type each sample belongs to
* https://archive.ics.uci.edu/ml/datasets/Covertype 
* $Y$: discrete, 
  * $X_{0 ∼ 9}$: continuous
  * $X_{10 ∼ 53}$: discrete

In [None]:
from sklearn.datasets import fetch_covtype
import pandas as pd

# Fetch the Forest CoverType bunch and keep only the first ten columns,
# which the section text above lists as the continuous features.
covtype = fetch_covtype()
covtype_X = covtype.data[:, :10]
covtype_feature_names = covtype.feature_names[:10]
covtype_y = covtype.target
# NOTE(review): target_names may hold the target column name(s) rather than
# the class labels, so the count printed below may not be the number of
# cover-type classes — confirm.
covtype_class_names = covtype.target_names

print('Number of targets: ', len(covtype_class_names))

# Attach the label as one extra trailing column so features and class can be
# inspected together in a single DataFrame.
covtype_np = np.concatenate((covtype_X, covtype_y.reshape(-1, 1)), axis=1)
covtype_col_names = covtype_feature_names + ['class']

covtype_pd = pd.DataFrame(data=covtype_np, columns=covtype_col_names)
covtype_pd.head(3)

### Preprocess

#### train_test_split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a test split; no test_size is passed, so the library default ratio
# applies, and the fixed seed keeps the split reproducible.
(
    covtype_X_train,
    covtype_X_test,
    covtype_y_train,
    covtype_y_test,
) = train_test_split(covtype_X, covtype_y, random_state=42)

#### Scaling 
##### StandardScaler - 3장 참조
The standard score of a sample x is calculated as:

$z = \frac{(x - u)}{s}$

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(covtype_X_train)
covtype_X_train_scale = scaler.transform(covtype_X_train)
covtype_X_test_scale = scaler.transform(covtype_X_test)

### BaggingClassifier

**-sklearn.ensemble.[BaggingClassifier(estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html) : Returns the instance itself.**

A Bagging classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Single DecisionTreeClassifier, used as the baseline for the comparison.
tree_clf = DecisionTreeClassifier(random_state=42).fit(
    covtype_X_train_scale, covtype_y_train
)

# Bagging ensemble of decision trees; every setting other than bootstrap and
# the seed is left at the constructor default.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    bootstrap=True,
    random_state=42,
)
bag_clf.fit(covtype_X_train_scale, covtype_y_train)


#### Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Predict with both models on both splits first...
y_pred_train_tree = tree_clf.predict(covtype_X_train_scale)
y_pred_test_tree = tree_clf.predict(covtype_X_test_scale)
y_pred_train_bag = bag_clf.predict(covtype_X_train_scale)
y_pred_test_bag = bag_clf.predict(covtype_X_test_scale)

# ...then report: single tree first, bagging ensemble second.
print('Train Accuracy(one tree): {}'.format(accuracy_score(covtype_y_train, y_pred_train_tree)))
print('Test Accuracy(one tree): {}'.format(accuracy_score(covtype_y_test, y_pred_test_tree)))

print('Train Accuracy(bag): {}'.format(accuracy_score(covtype_y_train, y_pred_train_bag)))
print('Test Accuracy(bag): {}'.format(accuracy_score(covtype_y_test, y_pred_test_bag)))

#### Set parameters

**max_samples**
* float, default=1.0
* The number of samples to draw from X to train each base estimator.
  * If int, then draw max_samples samples.
  * If float, then draw max_samples * X.shape[0] samples.

**n_estimators**
* int, default=10
* The number of base estimators in the ensemble.

**bootstrap_features**
* bool, default=False
* Whether features are drawn with replacement.

max_samples $0.1$ VS $0.4$

In [None]:
# Compare max_samples=0.1 against max_samples=0.4 on the scaled covtype split.
# Start from a newly constructed bagging ensemble of decision trees.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), bootstrap=True, random_state=42)

# max_samples=0.1: a float, i.e. a fraction of the training rows per estimator.
bag_clf.set_params(max_samples=0.1)
bag_clf.fit(covtype_X_train_scale, covtype_y_train)

from sklearn import metrics

# Accuracy on the training split, then on the held-out test split.
predict = bag_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(max_samples=0.1): {}'.format(acc))
predict = bag_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(max_samples=0.1): {}'.format(acc))

# max_samples=0.4: same estimator object, refitted after the change.
bag_clf.set_params(max_samples=0.4)
bag_clf.fit(covtype_X_train_scale, covtype_y_train)

# Same two measurements; the label matches the format of the other three.
predict = bag_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(max_samples=0.4): {}'.format(acc))
predict = bag_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(max_samples=0.4): {}'.format(acc))

n_estimators 1 VS 10

In [None]:
# Compare n_estimators=1 against n_estimators=10 on the scaled covtype split.
# Start from a newly constructed bagging ensemble of decision trees.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), bootstrap=True, random_state=42)

# n_estimators=1: an "ensemble" made of a single base estimator.
bag_clf.set_params(n_estimators=1)
bag_clf.fit(covtype_X_train_scale, covtype_y_train)

from sklearn import metrics

# Accuracy on the training split, then on the held-out test split.
predict = bag_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(n_estimators=1): {}'.format(acc))
predict = bag_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(n_estimators=1): {}'.format(acc))

# n_estimators=10: same estimator object, refitted after the change.
bag_clf.set_params(n_estimators=10)
bag_clf.fit(covtype_X_train_scale, covtype_y_train)

# Same two measurements for the 10-estimator model.
predict = bag_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(n_estimators=10): {}'.format(acc))
predict = bag_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(n_estimators=10): {}'.format(acc))

bootstrap_features True VS False

In [None]:
# Compare bootstrap_features=True against False on the scaled covtype split.
# Start from a newly constructed bagging ensemble of decision trees.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), bootstrap=True, random_state=42)

# bootstrap_features=True: features are drawn with replacement.
bag_clf.set_params(bootstrap_features=True)
bag_clf.fit(covtype_X_train_scale, covtype_y_train)

from sklearn import metrics

# Accuracy on the training split, then on the held-out test split.
predict = bag_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(bootstrap_features=True): {}'.format(acc))
predict = bag_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(bootstrap_features=True): {}'.format(acc))

# bootstrap_features=False: same estimator object, refitted after the change.
bag_clf.set_params(bootstrap_features=False)
bag_clf.fit(covtype_X_train_scale, covtype_y_train)

# Same two measurements for the bootstrap_features=False model.
predict = bag_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(bootstrap_features=False): {}'.format(acc))
predict = bag_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(bootstrap_features=False): {}'.format(acc))

## RandomForestClassifier with Forest CoverType dataset

### Setup

In [None]:
# common lib
import sklearn
import numpy as np
import os

### Visualization with virtual data

#### Create virtual data

In [None]:
from sklearn.datasets import make_moons
# 500 synthetic "moons" samples with noise=0.30; the fixed seed makes the
# data reproducible.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)

#### train_test_split

In [None]:
from sklearn.model_selection import train_test_split
# Split the synthetic data; no test_size is passed, so the library default
# ratio applies. The fixed seed keeps the split reproducible.
vir_X_train, vir_X_test, vir_y_train, vir_y_test = train_test_split(X, y, random_state=42)

#### Model for virtual data

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Single DecisionTreeClassifier, used as the baseline for the comparison.
vir_tree_clf = DecisionTreeClassifier(random_state=42)
vir_tree_clf.fit(vir_X_train, vir_y_train)

# RandomForestClassifier: 500 trees capped at 16 leaf nodes each. The forest
# is built and fitted exactly once; with a fixed seed a second identical fit
# would only repeat the same training work.
vir_rnd_clf = RandomForestClassifier(
    n_estimators=500, max_leaf_nodes=16, random_state=42)
vir_rnd_clf.fit(vir_X_train, vir_y_train)

# Test-set predictions from that same fitted forest.
y_pred_rf = vir_rnd_clf.predict(vir_X_test)

# Keep the fitted forest as the cell's final expression so the notebook
# still displays it.
vir_rnd_clf


In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.5, contour=True):
    """Shade a classifier's predicted regions on the current axes and overlay the samples.

    Relies on the module-level ``np``, ``plt`` and ``ListedColormap`` names.

    Parameters
    ----------
    clf : object with a ``predict`` method; it is called on two-column points.
    X : array indexed as ``X[:, 0]`` and ``X[:, 1]`` (two feature columns).
    y : labels aligned with ``X``; only rows labelled 0 or 1 are drawn.
    axes : ``[x1_min, x1_max, x2_min, x2_max]`` plotting window (only read).
    alpha : opacity of the sample markers.
    contour : when true, also draw contour lines over the filled regions.
    """
    # Evaluate the classifier on a 100 x 100 grid spanning the window.
    grid_x1, grid_x2 = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    grid_points = np.column_stack((grid_x1.ravel(), grid_x2.ravel()))
    region = clf.predict(grid_points).reshape(grid_x1.shape)

    fill_colors = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(grid_x1, grid_x2, region, alpha=0.3, cmap=fill_colors)
    if contour:
        line_colors = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(grid_x1, grid_x2, region, cmap=line_colors, alpha=0.8)

    # Label 0 is drawn with the "yo" format, label 1 with "bs".
    for label, fmt in ((0, "yo"), (1, "bs")):
        members = y == label
        plt.plot(X[:, 0][members], X[:, 1][members], fmt, alpha=alpha)

    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Side-by-side panels sharing the y axis: single tree on the left, random
# forest on the right, both drawn over the test samples.
fig, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)
panels = (
    (vir_tree_clf, "Decision Tree"),
    (vir_rnd_clf, "Decision Trees with Randomforest"),
)
for ax, (model, title) in zip(axes, panels):
    plt.sca(ax)
    plot_decision_boundary(model, vir_X_test, vir_y_test)
    plt.title(title, fontsize=14)
# The right-hand panel is still current here; blank its y label.
plt.ylabel("")
plt.show()

### Datasets



#### Forest CoverType dataset
* Characteristic data of forest covertype
* Predict which forest cover type each sample belongs to
* https://archive.ics.uci.edu/ml/datasets/Covertype 
* $Y$: discrete, 
  * $X_{0 ∼ 9}$: continuous
  * $X_{10 ∼ 53}$: discrete

In [None]:
from sklearn.datasets import fetch_covtype
import pandas as pd

# Fetch the Forest CoverType bunch and keep only the first ten columns,
# which the section text above lists as the continuous features.
covtype = fetch_covtype()
covtype_X = covtype.data[:, :10]
covtype_feature_names = covtype.feature_names[:10]
covtype_y = covtype.target
# NOTE(review): target_names may hold the target column name(s) rather than
# the class labels, so the count printed below may not be the number of
# cover-type classes — confirm.
covtype_class_names = covtype.target_names

print('Number of targets: ', len(covtype_class_names))

# Attach the label as one extra trailing column so features and class can be
# inspected together in a single DataFrame.
covtype_np = np.concatenate((covtype_X, covtype_y.reshape(-1, 1)), axis=1)
covtype_col_names = covtype_feature_names + ['class']

covtype_pd = pd.DataFrame(data=covtype_np, columns=covtype_col_names)
covtype_pd.head(3)

### Preprocess

#### train_test_split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a test split; no test_size is passed, so the library default ratio
# applies, and the fixed seed keeps the split reproducible.
(
    covtype_X_train,
    covtype_X_test,
    covtype_y_train,
    covtype_y_test,
) = train_test_split(covtype_X, covtype_y, random_state=42)

#### Scaling 
##### StandardScaler - 3장 참조
The standard score of a sample x is calculated as:

$z = \frac{(x - u)}{s}$

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(covtype_X_train)
covtype_X_train_scale = scaler.transform(covtype_X_train)
covtype_X_test_scale = scaler.transform(covtype_X_test)

### RandomForestClassifier

**-sklearn.ensemble.[RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) : Returns the instance itself.**

A random forest classifier.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A 10-tree forest with a fixed seed, trained on the scaled training split.
rnd_clf = RandomForestClassifier(
    n_estimators=10,
    random_state=42,
)
rnd_clf.fit(covtype_X_train_scale, covtype_y_train)

#### Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the training split.
predict = rnd_clf.predict(covtype_X_train_scale)
train_acc = accuracy_score(covtype_y_train, predict)
print('Train Accuracy(covtype): {}'.format(train_acc))

# Accuracy on the held-out test split.
predict = rnd_clf.predict(covtype_X_test_scale)
test_acc = accuracy_score(covtype_y_test, predict)
print('Test Accuracy(covtype): {}'.format(test_acc))

#### Set parameters

**max_features**
* default= "sqrt"
* {"sqrt", "log2", None}
* If "sqrt", then max_features=sqrt(n_features).
* If "log2", then max_features=log2(n_features).
* If None, then max_features=n_features.

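**max_depth**
* int, default=None
* The maximum depth of each tree. If None, nodes are expanded until all leaves are pure or until all leaves contain fewer than min_samples_split samples.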

max_features $log2$ VS $None$

* Train Accuracy(bag): 0.9979484072618121
* Test Accuracy(bag): 0.9492334065389355

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Compare max_features="log2" against max_features=None on the scaled
# covtype split, starting from a newly constructed 10-tree forest.
rnd_clf = RandomForestClassifier(random_state=42, n_estimators=10)

# max_features="log2"
rnd_clf.set_params(max_features="log2")
rnd_clf.fit(covtype_X_train_scale, covtype_y_train)

from sklearn import metrics

# Accuracy on the training split, then on the held-out test split.
predict = rnd_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(max_features="log2"): {}'.format(acc))
predict = rnd_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(max_features="log2"): {}'.format(acc))

# max_features=None: same estimator object, refitted after the change.
rnd_clf.set_params(max_features=None)
rnd_clf.fit(covtype_X_train_scale, covtype_y_train)

# Same two measurements for the max_features=None model.
predict = rnd_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(max_features=None): {}'.format(acc))
predict = rnd_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(max_features=None): {}'.format(acc))

max_depth $16$ VS $32$

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Compare max_depth=16 against max_depth=32 on the scaled covtype split,
# starting from a newly constructed 10-tree forest with max_features="log2".
rnd_clf = RandomForestClassifier(random_state=42, n_estimators=10, max_features="log2")

# max_depth=16
rnd_clf.set_params(max_depth=16)
rnd_clf.fit(covtype_X_train_scale, covtype_y_train)

from sklearn import metrics

# Accuracy on the training split, then on the held-out test split.
predict = rnd_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(max_depth=16): {}'.format(acc))
predict = rnd_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(max_depth=16): {}'.format(acc))

# max_depth=32: same estimator object, refitted after the change.
rnd_clf.set_params(max_depth=32)
rnd_clf.fit(covtype_X_train_scale, covtype_y_train)

# Same two measurements for the max_depth=32 model.
predict = rnd_clf.predict(covtype_X_train_scale)
acc = metrics.accuracy_score(covtype_y_train, predict)
print('train Accuracy(max_depth=32): {}'.format(acc))
predict = rnd_clf.predict(covtype_X_test_scale)
acc = metrics.accuracy_score(covtype_y_test, predict)
print('Test Accuracy(max_depth=32): {}'.format(acc))

## StackingClassifier with Iris dataset

### Setup

In [None]:
# common lib
import sklearn
import numpy as np

### Datasets


#### Iris dataset

In [None]:
from sklearn import datasets
import pandas as pd

# Load the Iris bunch and pull out the feature matrix, labels and column names.
iris = datasets.load_iris()
iris_X, iris_y = iris["data"], iris["target"]
iris_feature_name = iris.feature_names

# Preview the first five rows with readable column headers.
iris_preview = pd.DataFrame(iris_X, columns=iris_feature_name)
iris_preview.head(5)

### Preprocess

#### Splitting

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a test split; no test_size is passed, so the library default ratio
# applies, and the fixed seed keeps the split reproducible.
(
    iris_X_train,
    iris_X_test,
    iris_y_train,
    iris_y_test,
) = train_test_split(iris_X, iris_y, random_state=42)

### StackingClassifier

**-sklearn.ensemble.[StackingClassifier(estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html) : Returns the instance itself.**

Stack of estimators with a final classifier.

In [None]:
from sklearn import model_selection
from sklearn.ensemble import (
    AdaBoostClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Five base learners; their outputs become the inputs of the final estimator.
clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = RandomForestClassifier(random_state=1)
clf3 = HistGradientBoostingClassifier(random_state=1)
clf4 = AdaBoostClassifier(random_state=1)
clf5 = DecisionTreeClassifier(random_state=1, max_depth=None)

# Final (meta) estimator stacked on top of the base learners.
lr = LogisticRegression(random_state=1)

# (name, estimator) pairs as expected by StackingClassifier.
estimators = list(zip(
    ('clf1', 'clf2', 'clf3', 'clf4', 'clf5'),
    (clf1, clf2, clf3, clf4, clf5),
))

sclf = StackingClassifier(
    estimators=estimators,
    final_estimator=lr,
    cv=10,
)

sclf.fit(iris_X_train, iris_y_train)

#### Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Score the stacked model on both splits, then report to four decimals.
train_score = sclf.score(iris_X_train, iris_y_train)
test_score = sclf.score(iris_X_test, iris_y_test)
print('StackingClassifier Train set 정확도: {0:.4f}'.format(train_score))
print('StackingClassifier Test set 정확도: {0:.4f}'.format(test_score))

## Exercise

## 1번 문제 (VotingClassifier)

VotingClassifier를 사용하여 breast_cancer 데이터를 분류 하시오.
  * LogisticRegression, KNeighborsClassifier, DecisionTreeClassifier를 모두 VotingClassifier의 base estimator로 사용하시오.
  * parameter voting은 'soft'로 변경 해 accuracy를 측정하시오.


  ```python
  import pandas as pd

  df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                  'machine-learning-databases'
                  '/breast-cancer-wisconsin/wdbc.data', header=None)

  from sklearn.preprocessing import LabelEncoder

  breast_X = df.loc[:, 2:].values
  breast_y = df.loc[:, 1].values
  le = LabelEncoder()
  breast_y = le.fit_transform(breast_y)
  le.classes_

  from sklearn.model_selection import train_test_split
  breast_X_train, breast_X_test, breast_y_train, breast_y_test = train_test_split(breast_X, breast_y, test_size=0.20, stratify=breast_y, random_state=1)
  ```






## 1번 문제 답안

## 2번 문제 (BaggingClassifier)

BaggingClassifier를 사용하여 wine 데이터를 분류 하시오.
  * DecisionTreeClassifier를 base estimator로 사용하시오.
  * validation_curve함수를 활용하여 parameter **max_features**를 조정하여 결과를 시각화하고, accuracy가 가장 높게 측정되는 max_features 값을 도출하시오.
  * wine dataset
  
  ```python
  from sklearn.datasets import load_wine

  wine = load_wine()
  ```

## 2번 문제 답안

## 3번 문제(RandomForestClassifier)

RandomForestClassifier를 사용하여 breast_cancer 데이터를 분류 하시오.

  * Validation_curve 함수를 사용하여 아래 Hyperparameters의 변화에 따른 결과를 그래프로 표현하시오.
    * criterion
    * n_estimators
    * min_samples_leaf
  * 가장 높은 accuracy를 기록하는 파라미터 조합을 도출하시오.

  ```python
  import pandas as pd

  df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                  'machine-learning-databases'
                  '/breast-cancer-wisconsin/wdbc.data', header=None)

  from sklearn.preprocessing import LabelEncoder

  breast_X = df.loc[:, 2:].values
  breast_y = df.loc[:, 1].values
  le = LabelEncoder()
  breast_y = le.fit_transform(breast_y)
  le.classes_

  from sklearn.model_selection import train_test_split
  breast_X_train, breast_X_test, breast_y_train, breast_y_test = train_test_split(breast_X, breast_y, test_size=0.20, stratify=breast_y, random_state=1)
  ```

## 3번 문제 답안