In [None]:
%matplotlib inline


# Comparing randomized search and grid search for hyperparameter estimation


Compare randomized search and grid search for optimizing hyperparameters of a
random forest.
All parameters that influence the learning are searched simultaneously
(except for the number of estimators, which poses a time / quality tradeoff).

The randomized search and the grid search explore exactly the same space of
parameters. The result in parameter settings is quite similar, while the run
time for randomized search is drastically lower.

The performance is slightly worse for the randomized search, though this
is most likely a noise effect and would not carry over to a held-out test set.

Note that in practice, one would not search over this many different parameters
simultaneously using grid search, but pick only the ones deemed most important.


In [1]:
from data_preprocessing import loadData,getScaledAndOneHotEncoderedX,getLineFromFile,simpleScale,testModelOnData,scaleWithFeaturesAndKeepLocation
from data_preprocessing import checkNegative
from data_preprocessing import decisionTreeDemo


X, y, TX, Ty = loadData()

scaledAndOneHotX = getScaledAndOneHotEncoderedX(X) 
scaledAndOneHotTX = getScaledAndOneHotEncoderedX(TX) 
print(scaledAndOneHotX.shape)

continuous_features = list(map(lambda x: x - 1, [1, 3, 5, 11, 12, 13]))
scaledX = scaleWithFeaturesAndKeepLocation(X, continuous_features)
scaledTX = scaleWithFeaturesAndKeepLocation(TX, continuous_features)
print(X.shape, scaledX.shape)

X12 = X[:3]
scaledX12 = scaledX[:3]
print(X12[:,continuous_features])
print(scaledX12[:,continuous_features])

32561 15
16281 15
(32561, 105)
(32561, 14) (32561, 14)
[[  3.90000000e+01   7.75160000e+04   1.30000000e+01   2.17400000e+03
    0.00000000e+00   4.00000000e+01]
 [  5.00000000e+01   8.33110000e+04   1.30000000e+01   0.00000000e+00
    0.00000000e+00   1.30000000e+01]
 [  3.80000000e+01   2.15646000e+05   9.00000000e+00   0.00000000e+00
    0.00000000e+00   4.00000000e+01]]
[[ 0.03067056 -1.06361075  1.13473876  0.1484529  -0.21665953 -0.03542945]
 [ 0.83710898 -1.008707    1.13473876 -0.14592048 -0.21665953 -2.22215312]
 [-0.04264203  0.2450785  -0.42005962 -0.14592048 -0.21665953 -0.03542945]]


In [4]:
print(__doc__)

import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# get some data
# digits = load_digits()
# X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_grid = {
                'n_estimators':sp_randint(50,110),
                "max_depth": sp_randint(5,12),
                "max_features": ['sqrt','log2'],
                "min_samples_split": sp_randint(2, 10),
                "min_samples_leaf": sp_randint(2, 10),
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"],
                'oob_score':[True],
             }

# run randomized search
n_iter_search = 1000
# random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
#                                    n_iter=n_iter_search)
random_search = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, cv=5,
                    scoring='f1', #see above
                    error_score=0, # to avoid crash
                    n_iter=n_iter_search,
                   n_jobs=5)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
param_grid = {
                'n_estimators':[50,110],
                "max_depth": [5,9,11,None],
                "max_features": ['sqrt','log2'],
                "min_samples_split": [2, 3, 10],
                "min_samples_leaf": [2, 3, 10],
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"],
                'oob_score':[True],
             }


# run grid search
# grid_search = GridSearchCV(clf, param_grid=param_grid)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5,
                    scoring='f1', #see above
                    error_score=0, # to avoid crash
                   n_jobs=5)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

Automatically created module for IPython interactive environment
RandomizedSearchCV took 1345.55 seconds for 1000 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.913 (std: 0.002)
Parameters: {'max_depth': 11, 'min_samples_split': 6, 'n_estimators': 53, 'criterion': 'gini', 'min_samples_leaf': 4, 'bootstrap': True, 'max_features': 'sqrt', 'oob_score': True}

Model with rank: 2
Mean validation score: 0.913 (std: 0.003)
Parameters: {'criterion': 'gini', 'min_samples_split': 7, 'max_depth': 11, 'n_estimators': 77, 'min_samples_leaf': 3, 'bootstrap': True, 'max_features': 'log2', 'oob_score': True}

Model with rank: 3
Mean validation score: 0.912 (std: 0.003)
Parameters: {'criterion': 'gini', 'max_depth': 11, 'n_estimators': 74, 'min_samples_split': 6, 'bootstrap': True, 'max_features': 'log2', 'min_samples_leaf': 3, 'oob_score': True}

GridSearchCV took 899.73 seconds for 576 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.914 (std: 0.00

In [13]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(oob_score=True, criterion= 'gini', n_estimators= 73, bootstrap= True, min_samples_split= 6, min_samples_leaf= 4, max_depth=10, max_features='sqrt')
testModelOnData(clf, X, y, TX, Ty)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5)
print(scores)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=4,
            min_samples_split=6, min_weight_fraction_leaf=0.0,
            n_estimators=73, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.82      0.56      0.67      7841
        1.0       0.87      0.96      0.92     24720

avg / total       0.86      0.87      0.86     32561

[[ 4418  3423]
 [  953 23767]]


             precision    recall  f1-score   support

        0.0       0.80      0.54      0.65      3846
        1.0       0.87      0.96      0.91     12435

avg / total       0.85      0.86      0.85     16281

[[ 2087  1759]
 [  533 11902]]


[ 0.85424728  0.86032799  0.86114438]
[ 0.85337018  0.85657248  0.85964373  0.86179361  0.86056511]


In [3]:
# 'oob_score': True, 'criterion': 'gini', 'n_estimators': 75, 'bootstrap': True, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_depth': 11, 'max_features': 'log2'
from sklearn.ensemble import RandomForestClassifier                              
clf = RandomForestClassifier(oob_score=True, criterion= 'gini', n_estimators= 75, bootstrap= True, min_samples_split= 9, min_samples_leaf= 2, max_depth=11, max_features='log2')
testModelOnData(clf, X, y, TX, Ty)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5)
print(scores)                                

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=9, min_weight_fraction_leaf=0.0,
            n_estimators=75, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.84      0.58      0.68      7841
        1.0       0.88      0.96      0.92     24720

avg / total       0.87      0.87      0.86     32561

[[ 4528  3313]
 [  889 23831]]


             precision    recall  f1-score   support

        0.0       0.80      0.55      0.65      3846
        1.0       0.87      0.96      0.91     12435

avg / total       0.86      0.86      0.85     16281

[[ 2110  1736]
 [  528 11907]]


[ 0.85590566  0.8615257   0.86317147]
[ 0.85413788  0.8585688   0.86210074  0.86486486  0.86210074]


In [5]:
feature_names = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country']
class_names = ["gt 50K","le 50K"]
print(len(feature_names))


14


In [6]:
'''
Usage:
    saveTree2DotAndPdf(clf, feature_names=feature_names, class_names=class_names)
'''
def saveTree2DotAndPdf(clf, feature_names, class_names, file_name='tmp', showImage=False):
    from sklearn import tree
    dotFile = file_name+".dot"
    pdfFile = file_name+".pdf"
    with open(dotFile, 'w') as f:
        f = tree.export_graphviz(clf, out_file=f,
                            feature_names=feature_names,
                            class_names=class_names,
                            filled=True, 
                            rounded=True,  
                            special_characters=True
                            )
    print('save to dot file done\n')
    import pydotplus 
    # dot_data = tree.export_graphviz(clf, out_file=None) 
    # graph = pydotplus.graph_from_dot_data(dot_data) 
    graph = pydotplus.graph_from_dot_file(dotFile)
    graph.write_pdf(pdfFile)
    print('save to pdf file done\n')

    
    if showImage:
        print('prepring to show the image, may take a long time...\n')
        from IPython.display import Image
        Image(graph.create_png())
        

In [18]:
'''
ATTENTION: IT IS TOTALLY WRONG
please research the decisiontree for results.
'''

from sklearn import tree


model = tree.DecisionTreeClassifier(criterion= 'gini', min_samples_split= 9, min_samples_leaf= 2, max_depth=11, max_features='log2')
testModelOnData(model, X, y, TX, Ty)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=9, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
             precision    recall  f1-score   support

        0.0       0.75      0.63      0.69      7841
        1.0       0.89      0.93      0.91     24720

avg / total       0.86      0.86      0.86     32561

[[ 4955  2886]
 [ 1636 23084]]


             precision    recall  f1-score   support

        0.0       0.72      0.61      0.66      3846
        1.0       0.88      0.93      0.91     12435

avg / total       0.85      0.85      0.85     16281

[[ 2337  1509]
 [  891 11544]]


[ 0.84512622  0.84236226  0.84999539]


In [17]:
saveTree2DotAndPdf(model, feature_names=feature_names, class_names=class_names, file_name='final_results')

save to dot file done

save to pdf file done

