2019 AI讀書會 Ian Fan ianfan0704@gamil.com

In [1]:
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from IPython.display import Image
import pydotplus 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
def data(dataset, is_regression, title=None):
    if title is not None:
        print(title.upper())
        print()
    print('data shape:', dataset.data.shape)
    print('target shape:', dataset.target.shape)
    
    alpha=0.3
    
    X = dataset.data
    x_train, x_test, y_train, y_test = train_test_split(X, dataset.target, test_size=0.1, random_state=4)
    print('x_train', x_train[0])
    print('y_train', y_train[0])
    print('x_test', x_test[0])
    print('y_test', y_test[0])
    print()
    
    n_estimators = [100, 200, 300]
    max_depth = [1, 3, 5]
    param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)
    
    if is_regression:
        model = RandomForestRegressor(n_estimators=20, criterion='mse', max_depth=30, min_samples_split=2, min_samples_leaf=1, random_state=0)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('RandomForestRegressor:')
        print('tree score:', model.score(x_test,y_test))
        print("Mean squared error: %.2f"% mean_squared_error(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance:\n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
        
        model = GradientBoostingRegressor()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('GradientBoostingRegressor:')
        print('tree score:', model.score(x_test,y_test))
        print("Mean squared error: %.2f"% mean_squared_error(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance: \n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
    else:
        model = RandomForestClassifier(n_estimators=20, criterion='entropy', max_depth=30, min_samples_split=2, min_samples_leaf=1, random_state=0)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('RandomForestClassifier:')
        print('tree score:', model.score(x_test,y_test))
        print("r2_score: %.2f"% r2_score(y_test, y_pred))
        print('accuracy_score: %.2f'% accuracy_score(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance:\n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
        
        model = GradientBoostingClassifier()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('GradientBoostingClassifier:')
        print('tree score:', model.score(x_test,y_test))
        print("r2_score: %.2f"% r2_score(y_test, y_pred))
        print('accuracy_score: %.2f'% accuracy_score(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance: \n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
        
    grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)
    grid_result = grid_search.fit(x_train, y_train)
    print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    
    if is_regression:
        model = GradientBoostingRegressor(max_depth=grid_result.best_params_['max_depth'],
                                                   n_estimators=grid_result.best_params_['n_estimators'])
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('Best GradientBoostingRegressor:')
        print('tree score:', model.score(x_test,y_test))
        print("Mean squared error: %.2f"% mean_squared_error(y_test, y_pred))
        print("Feature importance: \n", model.feature_importances_)
        print()
    else:
        model = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'],
                                                   n_estimators=grid_result.best_params_['n_estimators'])
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print('Best GradientBoostingClassifier:')
        print('tree score:', model.score(x_test,y_test))
        print("r2_score: %.2f"% r2_score(y_test, y_pred))
        print('accuracy_score: %.2f'% accuracy_score(y_test, y_pred))
#         df = pd.DataFrame(model.feature_importances_, index=dataset.feature_names, columns=['importance'])
#         print("Feature importance: \n", df.sort_values('importance', ascending=False))
        print("Feature importance: \n", model.feature_importances_)
        print()
    print('\n-----------------------------\n')

In [3]:
diabetes = datasets.load_diabetes()
data(diabetes, True, 'diabetes')

breast_cancer = datasets.load_breast_cancer()
data(breast_cancer, True, 'breast_cancer')

boston = datasets.load_boston()
data(boston, True, 'boston')

iris = datasets.load_iris()
data(iris, False, 'iris')

wine = datasets.load_wine()
data(wine, False, 'wine')

digits = datasets.load_digits()
data(digits, False, 'digits')

DIABETES

data shape: (442, 10)
target shape: (442,)
x_train [-0.04547248 -0.04464164 -0.04824063 -0.01944209 -0.00019301 -0.01603186
  0.06704829 -0.03949338 -0.02479119  0.01963284]
y_train 111.0
x_test [-0.04183994 -0.04464164 -0.04931844 -0.03665645 -0.00707277 -0.02260797
  0.08545648 -0.03949338 -0.06648815  0.00720652]
y_test 128.0

RandomForestRegressor:
tree score: 0.30181463546752285
Mean squared error: 3733.72
Feature importance: 
 [0.06135018 0.01153203 0.26149704 0.10501156 0.04692596 0.05251321
 0.04787047 0.02029483 0.32057301 0.0724317 ]

GradientBoostingRegressor:
tree score: 0.43238224651926027
Mean squared error: 3035.48
Feature importance: 
 [0.04464849 0.01712984 0.23499887 0.10958183 0.0305749  0.04291715
 0.04282445 0.02964702 0.38404296 0.06363449]

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.8s finished


Best Accuracy: -3053.300736 using {'max_depth': 1, 'n_estimators': 200}
Best GradientBoostingRegressor:
tree score: 0.4183577988784326
Mean squared error: 3110.47
Feature importance: 
 [0.01900377 0.01885497 0.33536195 0.12393189 0.00474867 0.00863322
 0.04789281 0.01110491 0.37543859 0.05502922]


-----------------------------

BREAST_CANCER

data shape: (569, 30)
target shape: (569,)
x_train [1.026e+01 1.471e+01 6.620e+01 3.216e+02 9.882e-02 9.159e-02 3.581e-02
 2.037e-02 1.633e-01 7.005e-02 3.380e-01 2.509e+00 2.394e+00 1.933e+01
 1.736e-02 4.671e-02 2.611e-02 1.296e-02 3.675e-02 6.758e-03 1.088e+01
 1.948e+01 7.089e+01 3.571e+02 1.360e-01 1.636e-01 7.162e-02 4.074e-02
 2.434e-01 8.488e-02]
y_train 1
x_test [1.442e+01 1.654e+01 9.415e+01 6.412e+02 9.751e-02 1.139e-01 8.007e-02
 4.223e-02 1.912e-01 6.412e-02 3.491e-01 7.706e-01 2.677e+00 3.214e+01
 4.577e-03 3.053e-02 3.840e-02 1.243e-02 1.873e-02 3.373e-03 1.667e+01
 2.151e+01 1.114e+02 8.621e+02 1.294e-01 3.371e-01 3.755e-01 1.414e

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    2.2s finished


Best Accuracy: -0.041894 using {'max_depth': 3, 'n_estimators': 300}
Best GradientBoostingRegressor:
tree score: 0.5934306204405688
Mean squared error: 0.08
Feature importance: 
 [1.12589696e-03 1.32235236e-02 8.09257884e-04 1.71860045e-03
 5.62098895e-04 1.54166971e-03 6.18191717e-03 2.63797178e-02
 3.60004790e-04 7.36618518e-04 1.84273058e-03 1.64099530e-02
 7.00185148e-03 1.73143086e-02 1.49519653e-03 1.78933053e-03
 9.12522632e-04 2.43515147e-04 3.18408295e-04 2.84912802e-03
 4.84966597e-01 2.78711225e-02 2.00961943e-01 6.27549069e-02
 4.04414058e-03 1.36244040e-03 6.27106562e-03 1.05279204e-01
 3.10098022e-03 5.71349533e-04]


-----------------------------

BOSTON

data shape: (506, 13)
target shape: (506,)
x_train [  2.44953   0.       19.58      0.        0.605     6.402    95.2
   2.2625    5.      403.       14.7     330.04     11.32   ]
y_train 22.3
x_test [2.1124e-01 1.2500e+01 7.8700e+00 0.0000e+00 5.2400e-01 5.6310e+00
 1.0000e+02 6.0821e+00 5.0000e+00 3.1100e+02 1.5200e+0

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    2.6s finished


Best Accuracy: -10.525892 using {'max_depth': 3, 'n_estimators': 200}
Best GradientBoostingRegressor:
tree score: 0.8804457518969605
Mean squared error: 9.82
Feature importance: 
 [0.03366558 0.00055499 0.00488386 0.00051485 0.02732962 0.39005954
 0.01209982 0.08815945 0.00187218 0.01353749 0.03090166 0.01409584
 0.38232511]


-----------------------------

IRIS

data shape: (150, 4)
target shape: (150,)
x_train [4.9 3.1 1.5 0.2]
y_train 0
x_test [6.4 2.8 5.6 2.1]
y_test 2

RandomForestClassifier:
tree score: 0.9333333333333333
r2_score: 0.92
accuracy_score: 0.93
Feature importance: 
 [0.15747511 0.0236137  0.38716535 0.43174583]

GradientBoostingClassifier:
tree score: 0.9333333333333333
r2_score: 0.92
accuracy_score: 0.93
Feature importance: 
 [0.0071185  0.0070159  0.29589237 0.68997324]

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.4s finished


Best Accuracy: -0.029630 using {'max_depth': 3, 'n_estimators': 100}
Best GradientBoostingClassifier:
tree score: 0.9333333333333333
r2_score: 0.92
accuracy_score: 0.93
Feature importance: 
 [0.0048078  0.00799151 0.16203432 0.82516637]


-----------------------------

WINE

data shape: (178, 13)
target shape: (178,)
x_train [1.229e+01 2.830e+00 2.220e+00 1.800e+01 8.800e+01 2.450e+00 2.250e+00
 2.500e-01 1.990e+00 2.150e+00 1.150e+00 3.300e+00 2.900e+02]
y_train 1
x_test [1.296e+01 3.450e+00 2.350e+00 1.850e+01 1.060e+02 1.390e+00 7.000e-01
 4.000e-01 9.400e-01 5.280e+00 6.800e-01 1.750e+00 6.750e+02]
y_test 2

RandomForestClassifier:
tree score: 1.0
r2_score: 1.00
accuracy_score: 1.00
Feature importance: 
 [0.13266505 0.03108169 0.01650162 0.01432874 0.03607685 0.03370329
 0.26265153 0.02781129 0.03232458 0.1715406  0.03054127 0.09990714
 0.11086635]

GradientBoostingClassifier:
tree score: 1.0
r2_score: 1.00
accuracy_score: 1.00
Feature importance: 
 [0.00830588 0.03928192 0.0070934

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.8s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.9s finished


Best Accuracy: -0.056250 using {'max_depth': 1, 'n_estimators': 100}
Best GradientBoostingClassifier:
tree score: 0.9444444444444444
r2_score: 0.92
accuracy_score: 0.94
Feature importance: 
 [8.34856114e-02 5.50484974e-03 4.76309653e-05 5.24945732e-03
 1.43486242e-03 0.00000000e+00 2.17025850e-01 0.00000000e+00
 1.48037606e-04 2.53767668e-01 7.99843016e-02 5.62609991e-02
 2.97090732e-01]


-----------------------------

DIGITS

data shape: (1797, 64)
target shape: (1797,)
x_train [ 0.  1.  7. 14. 16. 12.  1.  0.  0.  7. 16.  9.  6. 11.  1.  0.  0. 11.
 12.  4.  1.  0.  0.  0.  0. 12. 16. 16. 15.  6.  0.  0.  0.  3.  9.  4.
 11. 12.  0.  0.  0.  0.  0.  0.  8. 16.  0.  0.  0.  0.  0.  0. 14. 13.
  0.  0.  0.  0.  6. 16. 15.  3.  0.  0.]
y_train 5
x_test [ 0.  0.  0. 11. 16. 12.  1.  0.  0.  0.  5. 16. 10. 16.  4.  0.  0.  2.
 15. 10.  0.  8.  1.  0.  0.  5. 16.  9.  1.  0.  0.  0.  0.  8. 16. 16.
  9.  0.  0.  0.  0.  2. 16. 10. 16.  6.  0.  0.  0.  0. 11. 16. 16.  7.
  0.  0.  0.  0.  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   31.0s finished


Best Accuracy: -0.855906 using {'max_depth': 3, 'n_estimators': 100}
Best GradientBoostingClassifier:
tree score: 0.9666666666666667
r2_score: 0.96
accuracy_score: 0.97
Feature importance: 
 [0.00000000e+00 5.47935212e-04 1.32185418e-02 5.85420823e-03
 2.12887625e-03 5.82977964e-02 4.19636918e-03 1.87503103e-03
 3.38658926e-04 1.51138863e-03 1.60257858e-02 2.41551417e-04
 7.66206396e-03 1.28924446e-02 3.42771446e-03 9.04319748e-04
 8.59586179e-05 2.14086313e-03 1.11095153e-02 3.08262111e-02
 2.63679785e-02 9.05025386e-02 4.64810694e-03 3.74485654e-09
 3.21166637e-04 1.90115153e-03 4.87507387e-02 1.78785395e-02
 3.37362015e-02 2.47147351e-02 9.17101902e-03 4.83643975e-04
 0.00000000e+00 6.78342886e-02 2.00635136e-03 6.62463997e-03
 7.17561254e-02 1.10855694e-02 1.71283391e-02 0.00000000e+00
 0.00000000e+00 8.59655895e-03 8.33118371e-02 7.10608071e-02
 7.69320050e-03 1.86863774e-02 2.21038796e-02 1.24763716e-09
 0.00000000e+00 6.81359141e-04 3.97128311e-03 1.86681940e-02
 1.04461751e-02 