In [2]:
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from IPython.display import Image
import pydotplus 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
def data(dataset, is_regression, title=None):
    if title is not None:
        print(title.upper())
        print()
    print('data shape:', dataset.data.shape)
    print('target shape:', dataset.target.shape)
    
    alpha=0.3
    
    X = dataset.data
    x_train, x_test, y_train, y_test = train_test_split(X, dataset.target, test_size=0.1, random_state=4)
    print('x_train', x_train[0])
    print('y_train', y_train[0])
    print('x_test', x_test[0])
    print('y_test', y_test[0])
    print()
    
    n_estimators = [100, 200, 300]
    max_depth = [1, 3, 5]
    param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)
    
    if is_regression:
        model = RandomForestRegressor(n_estimators=20, criterion='mse', max_depth=30, min_samples_split=2, min_samples_leaf=1, random_state=0)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('RandomForestRegressor:')
        print('tree score:', model.score(x_test,y_test))
        print("Mean squared error: %.2f"% mean_squared_error(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance:\n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
        
        model = GradientBoostingRegressor()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('GradientBoostingRegressor:')
        print('tree score:', model.score(x_test,y_test))
        print("Mean squared error: %.2f"% mean_squared_error(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance: \n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
    else:
        model = RandomForestClassifier(n_estimators=20, criterion='entropy', max_depth=30, min_samples_split=2, min_samples_leaf=1, random_state=0)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('RandomForestClassifier:')
        print('tree score:', model.score(x_test,y_test))
        print("r2_score: %.2f"% r2_score(y_test, y_pred))
        print('accuracy_score: %.2f'% accuracy_score(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance:\n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
        
        model = GradientBoostingClassifier()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('GradientBoostingClassifier:')
        print('tree score:', model.score(x_test,y_test))
        print("r2_score: %.2f"% r2_score(y_test, y_pred))
        print('accuracy_score: %.2f'% accuracy_score(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance: \n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
        
    grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)
    grid_result = grid_search.fit(x_train, y_train)
    print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    
    if is_regression:
        model = GradientBoostingRegressor(max_depth=grid_result.best_params_['max_depth'],
                                                   n_estimators=grid_result.best_params_['n_estimators'])
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('Best GradientBoostingRegressor:')
        print('tree score:', model.score(x_test,y_test))
        print("Mean squared error: %.2f"% mean_squared_error(y_test, y_pred))
        print("Feature importance: \n", model.feature_importances_)
        print()
    else:
        model = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'],
                                                   n_estimators=grid_result.best_params_['n_estimators'])
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('Best GradientBoostingClassifier:')
        print('tree score:', model.score(x_test,y_test))
        print("r2_score: %.2f"% r2_score(y_test, y_pred))
        print('accuracy_score: %.2f'% accuracy_score(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance: \n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
    print('\n-----------------------------\n')

In [4]:
diabetes = datasets.load_diabetes()
data(diabetes, True, 'diabetes')

breast_cancer = datasets.load_breast_cancer()
data(breast_cancer, True, 'breast_cancer')

boston = datasets.load_boston()
data(boston, True, 'boston')

iris = datasets.load_iris()
data(iris, False, 'iris')

wine = datasets.load_wine()
data(wine, False, 'wine')

digits = datasets.load_digits()
data(digits, False, 'digits')

DIABETES

data shape: (442, 10)
target shape: (442,)
x_train [-0.04547248 -0.04464164 -0.04824063 -0.01944209 -0.00019301 -0.01603186
  0.06704829 -0.03949338 -0.02479119  0.01963284]
y_train 111.0
x_test [-0.04183994 -0.04464164 -0.04931844 -0.03665645 -0.00707277 -0.02260797
  0.08545648 -0.03949338 -0.06648815  0.00720652]
y_test 128.0

RandomForestRegressor:
tree score: 0.30181463546752285
Mean squared error: 3733.72
Feature importance: 
 [0.06135018 0.01153203 0.26149704 0.10501156 0.04692596 0.05251321
 0.04787047 0.02029483 0.32057301 0.0724317 ]

GradientBoostingRegressor:
tree score: 0.4240911282497296
Mean squared error: 3079.81
Feature importance: 
 [0.04455432 0.01708734 0.23697335 0.10959025 0.0302271  0.04312663
 0.04200541 0.02995365 0.38309154 0.06339041]

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.4s finished


Best Accuracy: -3053.300736 using {'max_depth': 1, 'n_estimators': 200}
Best GradientBoostingRegressor:
tree score: 0.4183577988784326
Mean squared error: 3110.47
Feature importance: 
 [0.01900377 0.01885497 0.33536195 0.12393189 0.00474867 0.00863322
 0.04789281 0.01110491 0.37543859 0.05502922]


-----------------------------

BREAST_CANCER

data shape: (569, 30)
target shape: (569,)
x_train [1.026e+01 1.471e+01 6.620e+01 3.216e+02 9.882e-02 9.159e-02 3.581e-02
 2.037e-02 1.633e-01 7.005e-02 3.380e-01 2.509e+00 2.394e+00 1.933e+01
 1.736e-02 4.671e-02 2.611e-02 1.296e-02 3.675e-02 6.758e-03 1.088e+01
 1.948e+01 7.089e+01 3.571e+02 1.360e-01 1.636e-01 7.162e-02 4.074e-02
 2.434e-01 8.488e-02]
y_train 1
x_test [1.442e+01 1.654e+01 9.415e+01 6.412e+02 9.751e-02 1.139e-01 8.007e-02
 4.223e-02 1.912e-01 6.412e-02 3.491e-01 7.706e-01 2.677e+00 3.214e+01
 4.577e-03 3.053e-02 3.840e-02 1.243e-02 1.873e-02 3.373e-03 1.667e+01
 2.151e+01 1.114e+02 8.621e+02 1.294e-01 3.371e-01 3.755e-01 1.414e

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.3s finished


Best Accuracy: -0.041768 using {'max_depth': 3, 'n_estimators': 300}
Best GradientBoostingRegressor:
tree score: 0.5689235582841411
Mean squared error: 0.09
Feature importance: 
 [9.21133850e-04 1.25683013e-02 8.17468009e-04 1.61589133e-03
 6.28339767e-04 1.00021798e-03 6.18666121e-03 2.55631312e-02
 2.53668034e-04 7.81583479e-04 3.87505855e-03 1.62929945e-02
 6.38669759e-03 1.85964737e-02 1.39265855e-03 1.67121561e-03
 7.24476759e-04 2.18185323e-04 8.99808595e-04 2.82960564e-03
 4.87571326e-01 2.71960646e-02 2.03934833e-01 5.78968862e-02
 4.11740185e-03 1.94764164e-03 6.27419422e-03 1.05028746e-01
 2.34618795e-03 4.63147755e-04]


-----------------------------

BOSTON

data shape: (506, 13)
target shape: (506,)
x_train [  2.44953   0.       19.58      0.        0.605     6.402    95.2
   2.2625    5.      403.       14.7     330.04     11.32   ]
y_train 22.3
x_test [2.1124e-01 1.2500e+01 7.8700e+00 0.0000e+00 5.2400e-01 5.6310e+00
 1.0000e+02 6.0821e+00 5.0000e+00 3.1100e+02 1.5200e+0

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.2s finished


Best Accuracy: -10.502883 using {'max_depth': 3, 'n_estimators': 200}
Best GradientBoostingRegressor:
tree score: 0.8806923344638872
Mean squared error: 9.80
Feature importance: 
 [0.03709937 0.00061264 0.00452532 0.00051468 0.02368249 0.39008894
 0.01207443 0.08886996 0.00185529 0.01369968 0.03386394 0.0108056
 0.38230767]


-----------------------------

IRIS

data shape: (150, 4)
target shape: (150,)
x_train [4.9 3.1 1.5 0.2]
y_train 0
x_test [6.4 2.8 5.6 2.1]
y_test 2

RandomForestClassifier:
tree score: 0.9333333333333333
r2_score: 0.92
accuracy_score: 0.93
Feature importance: 
 [0.15747511 0.0236137  0.38716535 0.43174583]

GradientBoostingClassifier:
tree score: 0.9333333333333333
r2_score: 0.92
accuracy_score: 0.93
Feature importance: 
 [0.00559435 0.01241956 0.33461218 0.64737392]

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.5s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.1s finished


Best Accuracy: -0.029630 using {'max_depth': 3, 'n_estimators': 100}
Best GradientBoostingClassifier:
tree score: 0.9333333333333333
r2_score: 0.92
accuracy_score: 0.93
Feature importance: 
 [0.00672485 0.01068803 0.32566154 0.65692559]


-----------------------------

WINE

data shape: (178, 13)
target shape: (178,)
x_train [1.229e+01 2.830e+00 2.220e+00 1.800e+01 8.800e+01 2.450e+00 2.250e+00
 2.500e-01 1.990e+00 2.150e+00 1.150e+00 3.300e+00 2.900e+02]
y_train 1
x_test [1.296e+01 3.450e+00 2.350e+00 1.850e+01 1.060e+02 1.390e+00 7.000e-01
 4.000e-01 9.400e-01 5.280e+00 6.800e-01 1.750e+00 6.750e+02]
y_test 2

RandomForestClassifier:
tree score: 1.0
r2_score: 1.00
accuracy_score: 1.00
Feature importance: 
 [0.13266505 0.03108169 0.01650162 0.01432874 0.03607685 0.03370329
 0.26265153 0.02781129 0.03232458 0.1715406  0.03054127 0.09990714
 0.11086635]

GradientBoostingClassifier:
tree score: 1.0
r2_score: 1.00
accuracy_score: 1.00
Feature importance: 
 [1.66876165e-02 3.95174257e-02 4

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.5s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.2s finished


Best Accuracy: -0.062500 using {'max_depth': 1, 'n_estimators': 100}
Best GradientBoostingClassifier:
tree score: 0.9444444444444444
r2_score: 0.92
accuracy_score: 0.94
Feature importance: 
 [8.08213083e-02 5.67652352e-03 5.61963941e-05 5.46589162e-03
 1.72473616e-03 0.00000000e+00 2.14348165e-01 0.00000000e+00
 2.62028084e-04 2.56295302e-01 7.52035487e-02 5.45975833e-02
 3.05548717e-01]


-----------------------------

DIGITS

data shape: (1797, 64)
target shape: (1797,)
x_train [ 0.  1.  7. 14. 16. 12.  1.  0.  0.  7. 16.  9.  6. 11.  1.  0.  0. 11.
 12.  4.  1.  0.  0.  0.  0. 12. 16. 16. 15.  6.  0.  0.  0.  3.  9.  4.
 11. 12.  0.  0.  0.  0.  0.  0.  8. 16.  0.  0.  0.  0.  0.  0. 14. 13.
  0.  0.  0.  0.  6. 16. 15.  3.  0.  0.]
y_train 5
x_test [ 0.  0.  0. 11. 16. 12.  1.  0.  0.  0.  5. 16. 10. 16.  4.  0.  0.  2.
 15. 10.  0.  8.  1.  0.  0.  5. 16.  9.  1.  0.  0.  0.  0.  8. 16. 16.
  9.  0.  0.  0.  0.  2. 16. 10. 16.  6.  0.  0.  0.  0. 11. 16. 16.  7.
  0.  0.  0.  0.  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   15.2s finished


Best Accuracy: -0.868275 using {'max_depth': 3, 'n_estimators': 200}
Best GradientBoostingClassifier:
tree score: 0.9611111111111111
r2_score: 0.98
accuracy_score: 0.96
Feature importance: 
 [0.00000000e+00 4.94422103e-04 1.12905263e-02 5.97111121e-03
 2.30630166e-03 5.85343399e-02 3.43951187e-03 2.73495678e-03
 3.01184720e-04 1.08747602e-03 1.59793654e-02 3.95022912e-04
 7.48924369e-03 1.27849176e-02 3.26179028e-03 8.30094990e-04
 1.39591762e-04 2.13729109e-03 1.13032670e-02 3.02499949e-02
 2.66267112e-02 8.96790798e-02 4.58373843e-03 0.00000000e+00
 2.14549591e-04 1.70361112e-03 4.92674462e-02 1.83385499e-02
 3.43007151e-02 2.35333529e-02 8.78787166e-03 5.80072431e-04
 0.00000000e+00 6.86390614e-02 1.98075807e-03 6.21287263e-03
 7.24718210e-02 1.11127950e-02 1.78132176e-02 0.00000000e+00
 0.00000000e+00 8.44776747e-03 8.30048195e-02 7.02037338e-02
 8.80881022e-03 1.90602409e-02 2.25387975e-02 0.00000000e+00
 0.00000000e+00 7.14893616e-04 3.99853424e-03 1.89328222e-02
 1.04762555e-02 