# 8번 문제


In [1]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the "mnist_784" dataset from OpenML; as_frame=False asks
# for array data rather than a pandas DataFrame. The last line lists the
# fields available on the returned object.
mnist = fetch_openml(name="mnist_784", version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [2]:
# A train / validation / test split takes two passes of train_test_split:
# first hold out the test set, then carve the validation set out of the rest.
from sklearn.model_selection import train_test_split

holdout_size = 10000  # used for both the test and the validation split
split_seed = 1110

X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data,
    mnist.target,
    test_size=holdout_size,
    random_state=split_seed,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=holdout_size,
    random_state=split_seed,
)

In [4]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC

# Three base learners for the ensemble exercises, all sharing one seed so the
# run is reproducible.
shared_seed = 1110

random_forest_clf = RandomForestClassifier(random_state=shared_seed, n_estimators=100)
extra_trees_clf = ExtraTreesClassifier(random_state=shared_seed, n_estimators=100)
# NOTE(review): tol=20 with max_iter=100 is a very loose stopping criterion,
# presumably chosen to keep training fast. It may contribute to the linear
# SVM's low validation score -- confirm this trade-off is intended.
svm_clf = LinearSVC(random_state=shared_seed, tol=20, max_iter=100)

In [5]:
# Fit every base classifier on the training split, announcing each one first
# so progress is visible while the slower models train.
estimators = [random_forest_clf, extra_trees_clf, svm_clf]
for clf in estimators:
    print("Training estimator : ", clf)
    clf.fit(X_train, y_train)

Training estimator :  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1110,
                       verbose=0, warm_start=False)
Training estimator :  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10

In [6]:
# Validation-set score of each fitted base classifier, in the same order as
# the `estimators` list.
scores = [clf.score(X_val, y_val) for clf in estimators]

print(scores)

[0.9681, 0.9726, 0.843]


ExtraTreesClassifier의 성능이 가장 좋고 linear SVM의 성능이 눈에 띄게 떨어짐.

In [8]:
from sklearn.ensemble import VotingClassifier

# Pair each base classifier with the name it will be addressed by inside the
# voting ensemble (e.g. when calling set_params on it later).
estimator_names = ("random_forest_clf", "extra_trees_clf", "svm_clf")
base_classifiers = (random_forest_clf, extra_trees_clf, svm_clf)
named_estimators = list(zip(estimator_names, base_classifiers))

In [9]:
# Build the voting ensemble from the named base classifiers, train it on the
# training split, then report its score on the validation split.
voting_clf = VotingClassifier(estimators=named_estimators).fit(X_train, y_train)
voting_clf.score(X_val, y_val)

0.9691

VotingClassifier의 점수(0.9691)는 linear SVM(0.843)과 차이가 크고 가장 좋은 개별 분류기(0.9726)에도 미치지 못하므로, linear SVM을 제거한 후 성능을 다시 측정

In [19]:
# Exclude the linear SVM from the ensemble configuration. The string "drop" is
# the supported sentinel for disabling a named estimator; passing None for
# this purpose has been deprecated since scikit-learn 0.22.
voting_clf.set_params(svm_clf="drop")
# Show the configured (name, estimator) pairs after the change.
voting_clf.estimators

[('random_forest_clf',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=None, oob_score=False, random_state=1110,
                         verbose=0, warm_start=False)),
 ('extra_trees_clf',
  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_frac

In [20]:
voting_clf.estimators_
# `estimators_` is the list of estimators that were fitted by the earlier
# fit() call. set_params() only edits the `estimators` configuration, so the
# fitted LinearSVC is still in this list until it is removed explicitly or
# the ensemble is refit.

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=1110,
                        verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jo

In [21]:
# Remove the fitted LinearSVC from the trained estimator list so that
# predictions use only the two tree ensembles. Filtering by type, rather than
# deleting by position (`del voting_clf.estimators_[2]`), keeps this cell
# idempotent: running it again cannot delete a second estimator or raise
# IndexError.
voting_clf.estimators_ = [
    fitted for fitted in voting_clf.estimators_
    if not isinstance(fitted, LinearSVC)
]

In [22]:
# Validation score of the ensemble after the fitted LinearSVC was removed
# from `estimators_`.
voting_clf.score(X_val, y_val)

0.9689

In [23]:
# Switch the ensemble's voting mode to "soft" and score it on the validation
# split; set_params returns the estimator itself, so the call can be chained.
voting_clf.set_params(voting="soft").score(X_val, y_val)

0.9727

soft voting의 결과가 더 좋음. (개별 classifier 중에 성능이 가장 좋았던 ExtraTreesClassifier와 비슷한 결과)

In [26]:
# Measure test-set performance using soft voting. The mode is set again here
# so the cell does not depend on the previous one having been run.
voting_clf.set_params(voting="soft").score(X_test, y_test)

0.9704

In [28]:
# Measure test-set performance of the ExtraTreesClassifier on its own, since
# its validation score was close to that of the soft-voting ensemble.
extra_trees_clf.score(X=X_test, y=y_test)

0.9716

테스트셋 결과를 비교해보면 ExtraTreesClassifier의 성능이 더 좋음

# 9번 문제

In [30]:
import numpy as np
# Feature matrix for the blender: one row per validation sample and one column
# per base estimator, holding that estimator's predicted label as float32.
X_val_predictions = np.column_stack(
    [clf.predict(X_val) for clf in estimators]
).astype(np.float32)

In [31]:
# Display the stacked validation predictions (rows: samples, columns: one per
# base estimator, in the order of the `estimators` list).
X_val_predictions

array([[4., 4., 4.],
       [0., 0., 0.],
       [3., 3., 3.],
       ...,
       [0., 0., 0.],
       [2., 2., 8.],
       [3., 3., 3.]], dtype=float32)

In [37]:
# Blender: a random forest trained on the base estimators' validation
# predictions. oob_score=True makes an out-of-bag score available after
# fitting.
rnd_forest_blender = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=1110,
)
rnd_forest_blender.fit(X_val_predictions, y_val)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=1110,
                       verbose=0, warm_start=False)

In [38]:
# Out-of-bag score of the blender, available because it was constructed with
# oob_score=True.
rnd_forest_blender.oob_score_

0.9699

In [39]:
# Same stacking step as for the validation set, applied to the test set: one
# float32 column of predicted labels per base estimator.
X_test_predictions = np.column_stack(
    [clf.predict(X_test) for clf in estimators]
).astype(np.float32)

In [40]:
# Final blended predictions for the test set, computed from the base
# estimators' stacked test predictions.
y_pred = rnd_forest_blender.predict(X_test_predictions)

In [41]:
from sklearn.metrics import accuracy_score

# Accuracy of the blender's predictions against the true test labels.
accuracy_score(y_test, y_pred)

0.9686

여전히 ExtraTreesClassifier의 성능이 가장 좋음