In [1]:
# Shared seed, passed as random_state to estimators in later cells so runs are repeatable.
random_seed = 0

- Using the wisdom of the crowd


# 7.1 Voting Classifier

- Ensemble works well when models are independent and prediction errors are uncorrelated
- **VotingClassifier** clones every original estimator and trains the clones
- The fitted clones can be accessed through the **estimators\_** attribute (the original, unfitted estimators remain available through **estimators**)


### 7.1.1 Hard Voting


In [None]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Toy two-class "moons" dataset, split into train and test partitions.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Three different model families combined in one voting ensemble.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=random_seed)),
        ('rf', RandomForestClassifier(random_state=random_seed)),
        ('svc', SVC(random_state=random_seed)),
    ]
).fit(X_train, y_train)

# Test accuracy of each fitted member, followed by the ensemble itself.
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))
print('Voting Clasifier =', voting_clf.score(X_test, y_test))

# Compare the ensemble's vote with the individual votes on the first test sample.
first_sample = X_test[:1]
print("Voting Classifier Prediction: ", voting_clf.predict(first_sample))

member_votes = [member.predict(first_sample) for member in voting_clf.estimators_]
print("Prediction for each model in Voting Classifier", member_votes)

lr = 0.864
rf = 0.896
svc = 0.896
Voting Clasifier = 0.904
Voting Classifier Prediction:  [1]
Prediction for each model in Voting Classifier [array([1]), array([1]), array([0])]


### 7.1.2 Soft Voting

- If every estimator can output class probabilities, we can use soft voting, which predicts the class with the highest probability averaged over all estimators


In [7]:
# Switch the ensemble to soft voting. SVC does not produce class probabilities
# by default, so probability=True is enabled on it as well; it then runs an
# internal cross-validation to estimate them.
voting_clf.set_params(voting="soft", svc__probability=True)

# Refit so the new settings take effect, then report test accuracy.
voting_clf.fit(X_train, y_train)
soft_voting_accuracy = voting_clf.score(X_test, y_test)
print('Voting Clasifier with soft voting', soft_voting_accuracy)

Voting Clasifier with soft voting 0.912


# 7.2 Bagging and Pasting

**Method to perform ensemble**

- Use multiple models
- Use multiple subset of the data with a single model
  - **Bagging(Draw w/ replacement)**: Usually preferred since it lowers the variance
  - **Pasting(Draw w/o replacement)**


### 7.2.1 Bagging

- n_jobs: number of CPU cores to use (-1 uses all available cores)
- bootstrap: set to False to use pasting instead of bagging
- max_samples: number of training instances sampled for each estimator


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


# Bagging ensemble of 500 decision trees; each tree is trained on 100
# instances drawn with replacement (bootstrap=True), using all CPU cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)

### 7.2.2 OOB

- With bagging, when each bootstrap sample is as large as the training set, about 37% of the training instances are never drawn for a given estimator. These out-of-bag (OOB) instances can be used to evaluate the model without a separate validation set


In [None]:
# Bagging ensemble with out-of-bag (OOB) evaluation turned on.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            oob_score=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)

# OOB accuracy. A bare expression in the middle of a cell is evaluated but
# never displayed, so print it explicitly.
print("OOB accuracy:", bag_clf.oob_score_)

# OOB decision function for the first 3 training instances; each row holds
# the class probabilities estimated from the trees that did not see that instance.
bag_clf.oob_decision_function_[:3]

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ]])

# 7.4 Random Forest

- Decision tree with bagging or pasting


In [None]:
from sklearn.ensemble import RandomForestClassifier


# Random forest of 500 trees, each capped at 16 leaf nodes, trained on all cores.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

# Class predictions for the held-out test set.
y_pred_rf = rnd_clf.predict(X_test)

### 7.4.2 Feature Importance

- A random forest measures a feature's importance as the average impurity reduction achieved by the nodes that split on that feature
- scikit-learn computes this score for every feature and scales the results so that the importances sum to 1


In [11]:
from sklearn.datasets import load_iris

# Fit a 500-tree forest on the full iris dataset (features loaded as a DataFrame).
iris = load_iris(as_frame=True)
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42).fit(iris.data, iris.target)

# One line per feature: importance rounded to 2 decimals, then the column name.
for name, score in zip(iris.data.columns, rnd_clf.feature_importances_):
    print(round(score, 2), name)

0.11 sepal length (cm)
0.02 sepal width (cm)
0.44 petal length (cm)
0.42 petal width (cm)


# 7.5 Boosting

| **특징**               | **AdaBoost**              | **Gradient Boosting (GBM)** | **XGBoost**           | **LightGBM**           |
| ---------------------- | ------------------------- | --------------------------- | --------------------- | ---------------------- |
| **기본 아이디어**      | 틀린 샘플에 가중치 증가   | 잔차(Residual) 최소화       | GBM + 최적화          | GBM + 속도/메모리 개선 |
| **속도**               | 느림                      | 중간                        | 빠름 (병렬 처리 지원) | 매우 빠름              |
| **과적합 방지**        | 규제 제한적 (노이즈 민감) | 기본적으로 규제 약함        | L1, L2 정규화 지원    | 리프 제한 및 정규화    |
| **결측값 처리**        | 별도 전처리 필요          | 별도 전처리 필요            | 자동 처리             | 자동 처리              |
| **메모리 효율**        | 적음                      | 중간                        | 중간                  | 높음                   |
| **병렬 처리**          | 지원 안 함                | 지원 안 함                  | 지원                  | 지원                   |
| **데이터 크기 적합성** | 소규모 데이터 적합        | 소~중규모 데이터 적합       | 대규모 데이터 적합    | 대규모 데이터 적합     |


### 7.5.1 AdaBoost


In [12]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 30 depth-1 decision trees (decision stumps) with learning rate 0.5.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=30,
    learning_rate=0.5,
    random_state=42,
)
ada_clf.fit(X_train, y_train)



### 7.5.2 Gradient Boosting

- Reducing the learning rate requires more trees, but the performance can be better -> this regularization technique is called **shrinkage**
- **n_iter_no_change**: early stopping parameter(limits the n_estimators)
- **subsample**: fraction of the training data used to train each estimator; if 1.0, the whole dataset is used


In [16]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Synthetic 1-D regression data: y = 3x² + Gaussian noise, with x in [-0.5, 0.5).
# Drawn from a generator seeded with random_seed so that the data (and therefore
# the fitted model) are identical on every Restart & Run All.
# Note: X and y are rebound here and no longer refer to the moons dataset.
rng = np.random.default_rng(random_seed)
X = rng.random((100, 1)) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * rng.standard_normal(100)

# NOTE(review): with only 3 estimators, early stopping with n_iter_no_change=10
# does not look able to trigger — consider a larger n_estimators if early
# stopping is meant to be demonstrated.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3,
                                 learning_rate=1.0, n_iter_no_change=10,
                                 random_state=random_seed)
gbrt.fit(X, y)