In [1]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

rf_clf = RandomForestClassifier()
log_reg = LogisticRegression()
svc_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_reg), ('svc', svc_clf), ('rf', rf_clf)],
    voting='hard'
)

# voting_clf.fit()

In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1
)

# bagging_clf.fit()

In [3]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True
)

# bag_clf.fit(X_train, y_train)

# bag_clf.oob_score_

In [4]:
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris['data'], iris['target'])
for feature_name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(feature_name, score)

sepal length (cm) 0.09663604721081537
sepal width (cm) 0.024196703696314114
petal length (cm) 0.437443685866201
petal width (cm) 0.44172356322666945


In [5]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5
)
ada_clf.fit(iris['data'], iris['target'])

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None)

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.)
gbr.fit(iris['data'], iris['target'])

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [7]:
# early stopping grbt (version one)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_val, y_train, y_val = train_test_split(iris['data'], iris['target'])
grbt = GradientBoostingRegressor(n_estimators=120, max_depth=2)
grbt.fit(X_train, y_train)

errors = [mean_squared_error(y_pred, y_val) for y_pred in grbt.staged_predict(X_val)]

bst_n_estimators = np.argmin(errors)

bst_n_estimators

117

In [8]:
gbrt_best = GradientBoostingRegressor(n_estimators=bst_n_estimators, max_depth=2)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=117, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
errors

[0.5632032550160624,
 0.46184241364052525,
 0.3862319554112242,
 0.3191288477387752,
 0.26872091113646723,
 0.2227785534880258,
 0.18917047092860612,
 0.15855840845275893,
 0.13608280337371625,
 0.1149324249226932,
 0.09779456269696617,
 0.08605211529838076,
 0.07455956372344147,
 0.065052347978884,
 0.05836210953255422,
 0.05142243491611492,
 0.046354286761749464,
 0.041569542077169896,
 0.03855227923308373,
 0.03343105214120294,
 0.03092240137595436,
 0.028861242459896453,
 0.026783445477729227,
 0.025051820795507055,
 0.021950989606536134,
 0.021206081890592445,
 0.01874919030360085,
 0.018293151886651527,
 0.01774024734067138,
 0.017344387286743784,
 0.016716882404269485,
 0.016493851602678945,
 0.016361914951853043,
 0.01605371054197404,
 0.01604874059300435,
 0.015141723921870144,
 0.014616706237978245,
 0.014133061724196193,
 0.013753233360149962,
 0.01361472055959721,
 0.013579781041174141,
 0.013500652140225819,
 0.013432857531999921,
 0.013203917363713019,
 0.0131304709059417

In [10]:
import matplotlib.pyplot as plt

def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    x1 = np.linspace(axes[0], axes[1], 500)
    y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(x1, y_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)

min_error = np.min(errors)
plt.figure(figsize=(11, 4))

plt.subplot(111)
plt.plot(errors, "b.-")
plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], "k--")
plt.plot([0, 120], [min_error, min_error], "k--")
plt.plot(bst_n_estimators, min_error, "ko")
plt.text(bst_n_estimators, min_error*1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, 120, 0, 1])
plt.xlabel("Number of trees")
plt.title("Validation error", fontsize=14)

plt.show()

<Figure size 1100x400 with 1 Axes>

In [11]:
# early stopping gbrt v2
gbrt = GradientBoostingRegressor(warm_start=True, max_depth=2)

min_val_error = float('inf')
error_going_up = 0

for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping

In [12]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=5)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.899578
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:0.815086
[2]	validation_0-rmse:0.742584
[3]	validation_0-rmse:0.674698
[4]	validation_0-rmse:0.615569
[5]	validation_0-rmse:0.559435
[6]	validation_0-rmse:0.508837
[7]	validation_0-rmse:0.465294
[8]	validation_0-rmse:0.425826
[9]	validation_0-rmse:0.390379
[10]	validation_0-rmse:0.358455
[11]	validation_0-rmse:0.329723
[12]	validation_0-rmse:0.303566
[13]	validation_0-rmse:0.28031
[14]	validation_0-rmse:0.261198
[15]	validation_0-rmse:0.242348
[16]	validation_0-rmse:0.226185
[17]	validation_0-rmse:0.210195
[18]	validation_0-rmse:0.197414
[19]	validation_0-rmse:0.184128
[20]	validation_0-rmse:0.174961
[21]	validation_0-rmse:0.167517
[22]	validation_0-rmse:0.159222
[23]	validation_0-rmse:0.154001
[24]	validation_0-rmse:0.147558
[25]	validation_0-rmse:0.143702
[26]	validation_0-rmse:0.138723
[27]	validation_0-rmse:0.135699
[28]	validation_0-rmse:0.13311
[29]	validation_0-rm

In [13]:
# implementing singular value decompositon
if False:
    X_centered = X - X.mean(axis=0)
    U, s, Vt = np.linalg.svd(X_centered)
    c1 = Vt.T[:, 0]
    c2 = Vt.T[:, 1]

    W2 = Vt.T[:, :2]
    X2D = X_centered.dot(W2)

In [15]:
from sklearn.decomposition import IncrementalPCA

n_batches = 10
inc_pca = IncrementalPCA(n_components=2)
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)
    
X_reduced = inc_pca.transform(X_train)

In [16]:
X_reduced

array([[ 1.84842608e+00, -1.78117933e-01],
       [ 1.80507047e+00,  1.32927623e-01],
       [-1.58314757e-02, -7.14252204e-01],
       [ 1.95905172e+00,  4.69410795e-01],
       [-2.46190301e+00, -1.93076378e-01],
       [ 5.15678775e-01, -6.55181741e-01],
       [-1.28277634e-01, -2.30578834e-01],
       [ 7.55280816e-01,  1.38649064e-01],
       [ 1.43509392e+00, -2.13471155e-01],
       [-2.36616045e+00,  1.98336199e-01],
       [ 2.36682621e+00, -2.34591192e-01],
       [ 1.25727906e+00,  4.11475113e-01],
       [ 8.57360417e-01, -1.34656196e-01],
       [ 1.48790499e+00, -1.29016187e-01],
       [-2.93152159e+00, -5.06497879e-01],
       [ 1.02835125e+00, -1.07019158e-01],
       [ 2.31188526e+00,  4.06986464e-01],
       [ 3.15907719e-01, -6.01361715e-01],
       [ 1.93995981e+00,  2.32515028e-01],
       [-3.17599112e+00,  1.31283391e-01],
       [ 1.46460762e+00, -5.07761912e-01],
       [ 2.18290885e+00,  2.60684387e-01],
       [ 3.83458526e+00,  2.69843494e-01],
       [-2.

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import KernelPCA

clf = Pipeline([
    ('kpca', KernelPCA(n_components=2)),
    ('log_reg', LogisticRegression())
])

param_grid = [{
    'kpca__gamma': np.linspace(0.03, 0.05, 10),
    'kpca__kernel': ['rbf', 'sigmoid']
}]

grid_search = GridSearchCV(clf, param_grid, cv=3, return_train_score=True)
grid_search.fit(X_train, y_train)





GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('kpca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=2, n_jobs=None,
     random_state=None, remove_zero_eig=False, tol=0)), ('log_reg', LogisticRe...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kpca__gamma': array([0.03   , 0.03222, 0.03444, 0.03667, 0.03889, 0.04111, 0.04333,
       0.04556, 0.04778, 0.05   ]), 'kpca__kernel': ['rbf', 'sigmoid']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [23]:
import pandas as pd

pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kpca__gamma,param_kpca__kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.004407,0.001501,0.001144,0.000141,0.03,rbf,"{'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}",0.868421,0.868421,0.916667,0.883929,0.022532,10,0.905405,0.878378,0.934211,0.905998,0.022797
1,0.002707,0.000582,0.002329,0.002266,0.03,sigmoid,"{'kpca__gamma': 0.03, 'kpca__kernel': 'sigmoid'}",0.657895,0.657895,0.638889,0.651786,0.008876,11,0.648649,0.648649,0.631579,0.642959,0.008047
2,0.002269,0.000137,0.00081,3.7e-05,0.0322222,rbf,"{'kpca__gamma': 0.03222222222222222, 'kpca__ke...",0.868421,0.894737,0.916667,0.892857,0.019647,8,0.918919,0.878378,0.934211,0.910503,0.023557
3,0.002097,9.5e-05,0.000648,7e-06,0.0322222,sigmoid,"{'kpca__gamma': 0.03222222222222222, 'kpca__ke...",0.657895,0.657895,0.638889,0.651786,0.008876,11,0.648649,0.635135,0.618421,0.634068,0.012363
4,0.00222,9.4e-05,0.000812,2.1e-05,0.0344444,rbf,"{'kpca__gamma': 0.034444444444444444, 'kpca__k...",0.868421,0.894737,0.916667,0.892857,0.019647,8,0.918919,0.891892,0.921053,0.910621,0.013272
5,0.00198,7.2e-05,0.000688,6.3e-05,0.0344444,sigmoid,"{'kpca__gamma': 0.034444444444444444, 'kpca__k...",0.631579,0.631579,0.638889,0.633929,0.003414,13,0.648649,0.635135,0.605263,0.629682,0.018127
6,0.002341,0.000241,0.000813,1.9e-05,0.0366667,rbf,"{'kpca__gamma': 0.03666666666666667, 'kpca__ke...",0.868421,0.947368,0.944444,0.919643,0.036725,7,0.932432,0.891892,0.921053,0.915126,0.017073
7,0.002087,0.000107,0.000667,1.6e-05,0.0366667,sigmoid,"{'kpca__gamma': 0.03666666666666667, 'kpca__ke...",0.631579,0.631579,0.638889,0.633929,0.003414,13,0.635135,0.635135,0.592105,0.620792,0.020284
8,0.002203,4.2e-05,0.000799,1.4e-05,0.0388889,rbf,"{'kpca__gamma': 0.03888888888888889, 'kpca__ke...",0.868421,0.973684,0.944444,0.928571,0.044711,1,0.945946,0.891892,0.907895,0.915244,0.022671
9,0.002059,8e-05,0.000655,2e-06,0.0388889,sigmoid,"{'kpca__gamma': 0.03888888888888889, 'kpca__ke...",0.631579,0.631579,0.638889,0.633929,0.003414,13,0.635135,0.621622,0.592105,0.616287,0.017967


In [24]:
print(grid_search.best_params_)

{'kpca__gamma': 0.03888888888888889, 'kpca__kernel': 'rbf'}


In [None]:
# Locally Linear Embedding
