In [7]:
import pandas as pd
import numpy as np

In [37]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_moons, load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [54]:
# Build the two-moons toy dataset used by the classifier cells below.
# make_moons is seeded as well as the split: seeding only the split still
# produced a different point cloud (and different scores) on every kernel restart.
x, y = make_moons(n_samples=100000, noise=0.15, random_state=2019)
# Hold out 20% of the points as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2019)

In [10]:
# Three base learners, all with library-default hyperparameters.
log_clf, rnd_clf, svm_clf = (
    LogisticRegression(),
    RandomForestClassifier(),
    SVC(),
)

In [11]:
# Majority-vote ("hard") ensemble over the three base classifiers.
vot_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='hard',
)

# Kept as the last expression of the cell so the fitted estimator's repr is displayed.
vot_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                             

In [12]:
# Fit every model on the training split and print its accuracy on the test split.
for clf in (log_clf, rnd_clf, svm_clf, vot_clf):
    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.915
RandomForestClassifier 0.995
SVC 0.995
VotingClassifier 0.995


In [13]:
# voting='soft' relies on predict_proba, which SVC only offers when it is
# constructed with probability=True, so the SVC is rebuilt before the ensemble.
svm_clf = SVC(probability=True)
vot_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='soft',
)


In [14]:
# Same comparison as for hard voting, now with the probability-enabled SVC
# and the soft-voting ensemble: train on the training split, score on the test split.
for clf in (log_clf, rnd_clf, svm_clf, vot_clf):
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.915
RandomForestClassifier 0.995
SVC 0.995
VotingClassifier 0.99


In [15]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble: 500 decision trees, each trained on 100 samples drawn
# with replacement (bootstrap=True).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,  # use every available core
)

In [16]:
# Train the bagging ensemble, then predict labels for the held-out test points.
y_pred = bag_clf.fit(x_train, y_train).predict(x_test)

In [17]:
# Display the predicted class labels for the test split.
y_pred

array([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1], dtype=int64)

In [18]:
# Fraction of test labels matched by the current y_pred.
accuracy_score(y_test, y_pred)

0.985

In [19]:
# Per-class probability estimates from the bagging ensemble, one row per test point.
y_pred_proba = bag_clf.predict_proba(x_test)

In [20]:
# Display the probability matrix (one column per class).
y_pred_proba

array([[0.972, 0.028],
       [0.956, 0.044],
       [0.   , 1.   ],
       [0.916, 0.084],
       [0.992, 0.008],
       [0.956, 0.044],
       [0.06 , 0.94 ],
       [1.   , 0.   ],
       [0.998, 0.002],
       [0.988, 0.012],
       [0.   , 1.   ],
       [0.992, 0.008],
       [0.   , 1.   ],
       [0.064, 0.936],
       [1.   , 0.   ],
       [0.946, 0.054],
       [0.96 , 0.04 ],
       [0.634, 0.366],
       [0.034, 0.966],
       [0.012, 0.988],
       [0.008, 0.992],
       [0.   , 1.   ],
       [0.032, 0.968],
       [0.976, 0.024],
       [0.09 , 0.91 ],
       [0.884, 0.116],
       [0.002, 0.998],
       [0.002, 0.998],
       [0.978, 0.022],
       [0.102, 0.898],
       [0.996, 0.004],
       [0.938, 0.062],
       [0.006, 0.994],
       [0.894, 0.106],
       [0.056, 0.944],
       [0.   , 1.   ],
       [0.964, 0.036],
       [0.016, 0.984],
       [0.544, 0.456],
       [0.988, 0.012],
       [1.   , 0.   ],
       [0.014, 0.986],
       [1.   , 0.   ],
       [0.9

In [25]:
# Rebuild the bagging ensemble without a max_samples cap and with out-of-bag
# scoring enabled, so an accuracy estimate is available without a separate split.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)


In [26]:
# Train the OOB-enabled ensemble; as the last expression, the fitted estimator's repr is displayed.
bag_clf.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [27]:
# Out-of-bag accuracy estimate; only populated because oob_score=True was passed at construction.
bag_clf.oob_score_

0.98375

In [28]:
# Test-set accuracy of the same ensemble, for comparison with the out-of-bag estimate.
y_pred = bag_clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.985

In [29]:
# Random forest of 500 trees, each limited to at most 16 leaf nodes.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,  # train trees on all available cores
)

In [30]:
# Train the forest on the moons training split; the fitted estimator's repr is the cell output.
rnd_clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=16, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [31]:
# Random-forest predictions for the test split.
# NOTE(review): y_pred_rf is not read by any later cell visible here — confirm it is still needed.
y_pred_rf = rnd_clf.predict(x_test)

In [33]:
# Load the iris dataset; it is used below to inspect feature importances.
iris = load_iris()

In [34]:
# Fresh 500-tree forest (no leaf-node limit this time) for the iris data.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
)

In [35]:
# Train on the complete iris dataset (no hold-out); the fitted forest's repr is the cell output.
rnd_clf.fit(iris.data, iris.target)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [36]:
# Print each iris feature next to the importance the forest assigned to it.
for name, score in zip(iris.feature_names, rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10516054712783673
sepal width (cm) 0.025421940979784244
petal length (cm) 0.41526866380811184
petal width (cm) 0.4541488480842672


In [57]:
# AdaBoost over 50 decision stumps (depth-1 trees) with a reduced learning rate.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    algorithm='SAMME.R',
    learning_rate=0.5,
)
# Last expression of the cell, so the fitted estimator's repr is displayed.
ada_clf.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [58]:
# Test-set accuracy of the AdaBoost ensemble.
y_pred = ada_clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.98995

In [59]:
from sklearn.tree import DecisionTreeRegressor

In [61]:
# Stage 1 of a hand-built gradient-boosting ensemble: a depth-2 regression tree
# fit directly to the labels.
# NOTE(review): this trains on the full x, which includes the rows of x_test that
# the three-tree ensemble is later asked to predict.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(x, y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [62]:
# Residual left by the first tree; this becomes the target for the second tree.
y2 = y - tree_reg1.predict(x)

In [65]:
# Display the original labels for comparison with the residuals.
y

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [64]:
# Display the first-stage residuals.
y2

array([ 0.08859537,  0.08859537,  0.08859537, ..., -0.09221285,
       -0.09221285,  0.08859537])

In [63]:
# Stage 2: a second depth-2 tree fit to the residuals of the first tree.
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(x, y2)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [66]:
# Stage 3: compute what the second tree still gets wrong and fit a third tree to it.
y3 = y2 - tree_reg2.predict(x)
tree_reg3 = DecisionTreeRegressor(max_depth = 2)
tree_reg3.fit(x, y3)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [70]:
# Ensemble prediction: each tree models the residual of the ones before it,
# so the three predictions are added together.
# NOTE(review): `y_pted` looks like a typo for `y_pred`; left unchanged because
# the following cell displays the result under this name.
y_pted = sum(tree.predict(x_test) for tree in (tree_reg1, tree_reg2 ,tree_reg3))

In [71]:
y_pted

array([-0.05845481,  0.10962335, -0.00167434, ...,  0.86474599,
        0.10962335,  1.04541772])

In [72]:
from sklearn.ensemble import GradientBoostingRegressor

In [73]:
# Library counterpart of the manual ensemble: three depth-2 trees with
# learning_rate=1.0, fit on the full dataset.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(x, y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [74]:
from sklearn.metrics import mean_squared_error

In [75]:
# Re-split the full dataset into training and validation parts for the
# boosting experiments that follow. This rebinds x_train / y_train, replacing
# the earlier train/test split.
# A fixed random_state (the seed used by the first split) keeps the validation
# errors and the selected number of estimators reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=2019)

In [76]:
# Train a 120-tree ensemble once; its staged predictions are then used to find
# the best ensemble size on the validation split.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=120,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [77]:
# Validation MSE after each boosting stage, in stage order
# (staged_predict yields the ensemble's prediction as trees are added one by one).
errors = [
    mean_squared_error(y_val, stage_pred)
    for stage_pred in gbrt.staged_predict(x_val)
]

In [78]:
# errors[i] is the validation error with i + 1 trees, hence the +1 to turn the
# index of the minimum into an estimator count.
bst_n_estimators = np.argmin(errors) +1

In [79]:
# New ensemble sized to the number of trees that minimised the validation error.
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)

In [80]:
# Train the right-sized ensemble; the fitted estimator's repr is the cell output.
gbrt_best.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=120,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [82]:
# Validation MSE of the right-sized ensemble.
y_pred = gbrt_best.predict(x_val)
mean_squared_error(y_val, y_pred)

0.02255864493617247

In [87]:
# Manual early stopping: grow the ensemble one tree at a time and stop as soon
# as the validation MSE has failed to improve for five consecutive sizes.
# Note that when the loop breaks, gbrt still holds the trees added during those
# five non-improving rounds.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error, error_going_up = float('inf'), 0
for n_estimators in range(1, 120):
    # With warm_start=True, fit() keeps the existing trees and only adds new ones.
    gbrt.n_estimators = n_estimators
    y_pred = gbrt.fit(x_train, y_train).predict(x_val)
    val_error = mean_squared_error(y_val, y_pred)

    if val_error < min_val_error:
        # New best score: record it and reset the patience counter.
        min_val_error, error_going_up = val_error, 0
        continue

    # No improvement at this ensemble size.
    error_going_up += 1
    if error_going_up == 5:
        break


In [88]:
# Lowest validation MSE seen by the early-stopping loop.
min_val_error

0.02264392131473066

In [89]:
# Inspect the subsample setting of the regressor (not set explicitly above, so this is the library default).
gbrt.subsample

1.0

In [90]:
import xgboost

# Baseline XGBoost regressor with default hyperparameters, trained on the
# training split and used to predict the validation split.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(x_train, y_train)
y_pred = xgb_reg.predict(x_val)



In [91]:
# Refit with built-in early stopping: the validation RMSE is monitored on
# (x_val, y_val) and training halts once it has not improved for 2 rounds.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# constructor rather than on fit() — confirm against the installed version.
xgb_reg.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(x_val)

[0]	validation_0-rmse:0.463541
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.431659
[2]	validation_0-rmse:0.403852
[3]	validation_0-rmse:0.379565
[4]	validation_0-rmse:0.358365
[5]	validation_0-rmse:0.339048
[6]	validation_0-rmse:0.322055
[7]	validation_0-rmse:0.306998
[8]	validation_0-rmse:0.293372
[9]	validation_0-rmse:0.280935
[10]	validation_0-rmse:0.269957
[11]	validation_0-rmse:0.259773
[12]	validation_0-rmse:0.251027
[13]	validation_0-rmse:0.242756
[14]	validation_0-rmse:0.235084
[15]	validation_0-rmse:0.227314
[16]	validation_0-rmse:0.220215
[17]	validation_0-rmse:0.213216
[18]	validation_0-rmse:0.20695
[19]	validation_0-rmse:0.201008
[20]	validation_0-rmse:0.195652
[21]	validation_0-rmse:0.190494
[22]	validation_0-rmse:0.185923
[23]	validation_0-rmse:0.181639
[24]	validation_0-rmse:0.177797
[25]	validation_0-rmse:0.174072
[26]	validation_0-rmse:0.170861
[27]	validation_0-rmse:0.167671
[28]	validation_0-rmse:0.164679
[29]	validation_0-r