# ENSEMBLE LEARNING AND RANDOM FORESTS

### VOTING CLASSIFIERS

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
# Three diverse base classifiers that are later combined into a voting ensemble.
log_clf = LogisticRegression()
# The forest draws bootstrap samples and random feature subsets, so it is seeded
# to keep the accuracy comparison below reproducible from run to run.
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

In [3]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
# Generate a noisy two-moons toy dataset and hold out 20% of it for testing.
# The data generation is seeded (like the split) so that every
# "Restart Kernel -> Run All" produces the same samples and comparable scores.
X, y = make_moons(n_samples=1000, noise=0.15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)



In [4]:
# Hard-voting ensemble: the predicted class is chosen by majority rule over the
# class labels predicted by the three base classifiers.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rnd', rnd_clf),
        ('svm', svm_clf),
    ],
    voting='hard',
)

In [5]:
# Train the ensemble on the training split; the trailing semicolon suppresses
# the estimator repr in the cell output.
voting_clf.fit(X=X_train, y=y_train);



In [6]:
from sklearn.metrics import accuracy_score
# Fit every base model and the ensemble on the same training split, then
# print each model's class name next to its accuracy on the test split.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.89
RandomForestClassifier 0.99
SVC 0.98
VotingClassifier 0.975




### BAGGING AND PASTING

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble: 500 decision trees, each trained on 100 training instances
# drawn with replacement (bootstrap=True), using all available CPU cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
y_pred2 = bag_clf.fit(X_train, y_train).predict(X_test)
# Last expression of the cell: displays the test accuracy.
accuracy_score(y_test, y_pred2)

0.97

In [8]:
# Same bagging configuration as before, but with oob_score=True so that an
# out-of-bag evaluation is requested when the ensemble is fitted.
bag_clf2 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)

In [9]:
# Fit the out-of-bag-enabled bagging ensemble; the semicolon hides the repr.
bag_clf2.fit(X=X_train, y=y_train);

In [10]:
# Display the out-of-bag score of the fitted ensemble (requested via oob_score=True).
bag_clf2.oob_score_

0.97625

In [11]:
# Compare the out-of-bag estimate with the accuracy measured on the held-out test split.
y_pred2 = bag_clf2.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred2)

0.965

In [12]:
# Display the out-of-bag decision function of the fitted ensemble
# (the output below shows one row per instance with two class-probability columns).
bag_clf2.oob_decision_function_

array([[0.        , 1.        ],
       [0.025     , 0.975     ],
       [0.        , 1.        ],
       ...,
       [0.        , 1.        ],
       [0.04597701, 0.95402299],
       [0.99308756, 0.00691244]])

### RANDOM FORESTS

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 500 trees, each limited to at most 16 leaf nodes.
# Note: this rebinds rnd_clf, which earlier held the default forest used in the voting ensemble.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
y_pred_rf = rnd_clf.fit(X_train, y_train).predict(X_test)
# Last expression of the cell: displays the test accuracy.
accuracy_score(y_test, y_pred_rf)

0.985

In [14]:
from sklearn.datasets import load_iris
# Train a forest on the full iris dataset and print how much each feature
# contributes according to the fitted model's feature_importances_.
iris = load_iris()
rnd_clf2 = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf2.fit(iris.data, iris.target)
for name, score in zip(iris.feature_names, rnd_clf2.feature_importances_):
    print(name, score)

sepal length (cm) 0.0985914634518347
sepal width (cm) 0.0251349089061179
petal length (cm) 0.43665898802045694
petal width (cm) 0.4396146396215905


### BOOSTING

#### ADABOOST

In [15]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over 200 decision stumps (depth-1 trees) with a learning rate of 0.5.
# The explicit algorithm="SAMME.R" argument is intentionally omitted: that option
# was deprecated in scikit-learn 1.4 and subsequently removed, so passing it fails
# on current releases, while older releases already use SAMME.R as their default.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)
y_pred3 = ada_clf.predict(X_test)
# Last expression of the cell: displays the test accuracy.
accuracy_score(y_test, y_pred3)

0.99

#### GRADIENT BOOSTING

In [16]:
from sklearn.tree import DecisionTreeRegressor
# Manual gradient boosting, stage 1: fit a shallow regression tree to the targets.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X=X_train, y=y_train);

In [17]:
# Stage 2: the new target is the residual error left by the first tree,
# and a second shallow tree is fitted to that residual.
y2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X=X_train, y=y2);

In [18]:
# Stage 3: fit a third shallow tree to whatever residual the second tree left.
y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X=X_train, y=y3);

In [19]:
# The ensemble prediction is the sum of the three stage-wise tree predictions.
y_pred = (
    tree_reg1.predict(X_test)
    + tree_reg2.predict(X_test)
    + tree_reg3.predict(X_test)
)

In [20]:
from sklearn.ensemble import GradientBoostingRegressor
# Library counterpart of the three hand-built stages: three depth-2 trees with
# a learning rate of 1.0.
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0,
)
gbrt.fit(X=X_train, y=y_train);

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Re-split the full dataset into training and validation sets for the
# early-stopping experiments.
# NOTE: this rebinds X_train / y_train, replacing the 80/20 train/test split
# created earlier in the notebook; re-running earlier cells after this one would
# use this split instead of the original one.
# Both the split and the model are seeded so the validation errors computed from
# them are reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train);

In [22]:
# Validation MSE after each boosting stage of the 120-tree model.
errors = [
    mean_squared_error(y_val, staged_pred)
    for staged_pred in gbrt.staged_predict(X_val)
]
# argmin is a 0-based stage index, so add 1 to turn it into a number of trees.
bst_n_estimators = np.argmin(errors) + 1
# Retrain from scratch with exactly that many trees.
gbrt_best = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=bst_n_estimators,
)
gbrt_best.fit(X=X_train, y=y_train);

In [23]:
# Early stopping implemented by hand: grow the ensemble one tree at a time and
# stop once the validation error has failed to improve 5 times in a row.
# warm_start=True is set so that repeated fit() calls can build on the existing
# ensemble instead of starting over (per scikit-learn's warm_start semantics).
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float("inf")  # best (lowest) validation MSE seen so far
error_going_up = 0  # consecutive iterations without a new best validation MSE
for n_estimators in range(1, 120):  # tries ensemble sizes 1 through 119
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        # New best: remember it and reset the patience counter.
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping: gbrt keeps the trees trained up to this point

In [24]:
import xgboost
# Baseline XGBoost regressor with default hyperparameters, evaluated on the validation split.
xgb_reg = xgboost.XGBRegressor()
y_pred = xgb_reg.fit(X_train, y_train).predict(X_val)

In [25]:
# Refit with early stopping: monitor the validation set and stop once its
# metric has not improved for 2 consecutive rounds.
# NOTE(review): newer xgboost releases appear to expect early_stopping_rounds on
# the XGBRegressor constructor rather than in fit() — confirm against the
# installed xgboost version.
xgb_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=2,
)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.464458
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.433027
[2]	validation_0-rmse:0.406363
[3]	validation_0-rmse:0.383298
[4]	validation_0-rmse:0.363734
[5]	validation_0-rmse:0.347213
[6]	validation_0-rmse:0.32864
[7]	validation_0-rmse:0.316453
[8]	validation_0-rmse:0.301971
[9]	validation_0-rmse:0.292926
[10]	validation_0-rmse:0.280816
[11]	validation_0-rmse:0.269926
[12]	validation_0-rmse:0.260442
[13]	validation_0-rmse:0.251584
[14]	validation_0-rmse:0.243945
[15]	validation_0-rmse:0.235011
[16]	validation_0-rmse:0.228301
[17]	validation_0-rmse:0.219819
[18]	validation_0-rmse:0.214114
[19]	validation_0-rmse:0.206985
[20]	validation_0-rmse:0.201547
[21]	validation_0-rmse:0.194875
[22]	validation_0-rmse:0.190788
[23]	validation_0-rmse:0.18572
[24]	validation_0-rmse:0.182551
[25]	validation_0-rmse:0.178323
[26]	validation_0-rmse:0.175231
[27]	validation_0-rmse:0.171135
[28]	validation_0-rmse:0.168195
[29]	validation_0-rm