In [2]:
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from IPython.display import Image
import pydotplus 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
def _report_model(label, model, x_test, y_test, is_regression):
    """Print hold-out metrics and feature importances of an already fitted model.

    Parameters
    ----------
    label : str
        Heading printed (followed by ':') before the metrics.
    model : fitted estimator
        Must provide ``predict``, ``score`` and ``feature_importances_``.
    x_test, y_test : array-like
        Hold-out features and targets the model is evaluated on.
    is_regression : bool
        True  -> additionally report the mean squared error.
        False -> additionally report the accuracy.
    """
    y_pred = model.predict(x_test)
    print(label + ':')
    print('tree score:', model.score(x_test, y_test))
    if is_regression:
        print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    else:
        # R^2 is a regression metric and is meaningless on class labels,
        # so only accuracy is reported for classifiers.
        print('accuracy_score: %.2f' % accuracy_score(y_test, y_pred))
    print("Feature importance: \n", model.feature_importances_)
    print()


def data(dataset, is_regression, title=None):
    """Fit and compare tree ensembles on a scikit-learn style dataset.

    The dataset is split 90/10 (fixed ``random_state=4``). A random forest and
    a default gradient-boosting model are fitted and evaluated on the hold-out
    part, then a grid search over ``n_estimators`` and ``max_depth`` is run for
    gradient boosting on the training part and the best configuration is
    evaluated on the same hold-out part. Everything is reported via ``print``.

    Parameters
    ----------
    dataset : object with ``data`` and ``target`` array attributes
        E.g. the result of ``sklearn.datasets.load_*()``.
    is_regression : bool
        True  -> use the *Regressor estimators and score the grid search
                 with ``neg_mean_squared_error``.
        False -> use the *Classifier estimators and score the grid search
                 with ``accuracy``.
    title : str, optional
        If given, printed in upper case as a heading.

    Returns
    -------
    None
    """
    if title is not None:
        print(title.upper())
        print()
    print('data shape:', dataset.data.shape)
    print('target shape:', dataset.target.shape)

    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.1, random_state=4)
    print('x_train', x_train[0])
    print('y_train', y_train[0])
    print('x_test', x_test[0])
    print('y_test', y_test[0])
    print()

    # Pick the estimator family and a CV metric that matches the task.
    if is_regression:
        forest_cls, boosting_cls = RandomForestRegressor, GradientBoostingRegressor
        # The split criterion is left at its default (mean squared error):
        # the explicit name 'mse' was renamed/removed in newer scikit-learn.
        forest_kwargs = {}
        scoring = "neg_mean_squared_error"
    else:
        forest_cls, boosting_cls = RandomForestClassifier, GradientBoostingClassifier
        forest_kwargs = {'criterion': 'entropy'}
        scoring = "accuracy"

    forest = forest_cls(n_estimators=20, max_depth=30, min_samples_split=2,
                        min_samples_leaf=1, random_state=0, **forest_kwargs)
    forest.fit(x_train, y_train)
    _report_model(forest_cls.__name__, forest, x_test, y_test, is_regression)

    boosting = boosting_cls()
    boosting.fit(x_train, y_train)
    _report_model(boosting_cls.__name__, boosting, x_test, y_test, is_regression)

    # Tune gradient boosting; a fresh estimator is passed explicitly instead of
    # relying on whichever model happened to be assigned last.
    param_grid = dict(n_estimators=[100, 200, 300], max_depth=[1, 3, 5])
    grid_search = GridSearchCV(boosting_cls(), param_grid, scoring=scoring,
                               n_jobs=-1, verbose=1)
    grid_search.fit(x_train, y_train)
    print("Best CV score (%s): %f using %s"
          % (scoring, grid_search.best_score_, grid_search.best_params_))

    # With the default refit=True the best configuration has already been
    # refitted on the whole training set, so it is evaluated directly.
    _report_model('Best ' + boosting_cls.__name__, grid_search.best_estimator_,
                  x_test, y_test, is_regression)
    print('\n-----------------------------\n')

In [None]:
# Run the comparison on several bundled scikit-learn datasets.
# The second argument of data() selects regression (True) or classification (False).

diabetes = datasets.load_diabetes()
data(diabetes, True, 'diabetes')

# breast_cancer has a binary class target, so it is a classification problem.
breast_cancer = datasets.load_breast_cancer()
data(breast_cancer, False, 'breast_cancer')

# load_boston was deprecated and later removed from scikit-learn; skip it
# there instead of aborting the remaining datasets.
try:
    boston = datasets.load_boston()
except (AttributeError, ImportError):
    boston = None
    print('BOSTON')
    print()
    print('skipped: datasets.load_boston is not available in this scikit-learn version')
    print('\n-----------------------------\n')
else:
    data(boston, True, 'boston')

iris = datasets.load_iris()
data(iris, False, 'iris')

wine = datasets.load_wine()
data(wine, False, 'wine')

digits = datasets.load_digits()
data(digits, False, 'digits')

DIABETES

data shape: (442, 10)
target shape: (442,)
x_train [-0.04547248 -0.04464164 -0.04824063 -0.01944209 -0.00019301 -0.01603186
  0.06704829 -0.03949338 -0.02479119  0.01963284]
y_train 111.0
x_test [-0.04183994 -0.04464164 -0.04931844 -0.03665645 -0.00707277 -0.02260797
  0.08545648 -0.03949338 -0.06648815  0.00720652]
y_test 128.0

RandomForestRegressor:
tree score: 0.30181463546752285
Mean squared error: 3733.72
Feature importance: 
 [0.06135018 0.01153203 0.26149704 0.10501156 0.04692596 0.05251321
 0.04787047 0.02029483 0.32057301 0.0724317 ]

GradientBoostingRegressor:
tree score: 0.4320447381973416
Mean squared error: 3037.28
Feature importance: 
 [0.04478689 0.01708734 0.23620013 0.1125117  0.03084412 0.04193646
 0.04248093 0.02993788 0.3814462  0.06276835]

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.5s finished


Best Accuracy: -3053.300736 using {'max_depth': 1, 'n_estimators': 200}
Best GradientBoostingRegressor:
tree score: 0.4183577988784327
Mean squared error: 3110.47
Feature importance: 
 [0.01900377 0.01885497 0.33536195 0.12393189 0.00474867 0.00863322
 0.04789281 0.01110491 0.37543859 0.05502922]


-----------------------------

BREAST_CANCER

data shape: (569, 30)
target shape: (569,)
x_train [1.026e+01 1.471e+01 6.620e+01 3.216e+02 9.882e-02 9.159e-02 3.581e-02
 2.037e-02 1.633e-01 7.005e-02 3.380e-01 2.509e+00 2.394e+00 1.933e+01
 1.736e-02 4.671e-02 2.611e-02 1.296e-02 3.675e-02 6.758e-03 1.088e+01
 1.948e+01 7.089e+01 3.571e+02 1.360e-01 1.636e-01 7.162e-02 4.074e-02
 2.434e-01 8.488e-02]
y_train 1
x_test [1.442e+01 1.654e+01 9.415e+01 6.412e+02 9.751e-02 1.139e-01 8.007e-02
 4.223e-02 1.912e-01 6.412e-02 3.491e-01 7.706e-01 2.677e+00 3.214e+01
 4.577e-03 3.053e-02 3.840e-02 1.243e-02 1.873e-02 3.373e-03 1.667e+01
 2.151e+01 1.114e+02 8.621e+02 1.294e-01 3.371e-01 3.755e-01 1.414e

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.3s remaining:    0.3s
