In [1]:
%matplotlib inline

# Tweak decision tree with GridSearchCV and RandomizedSearchCV

In [2]:
from data_preprocessing import loadData,getScaledAndOneHotEncoderedX,getLineFromFile,simpleScale,testModelOnData,scaleWithFeaturesAndKeepLocation
from data_preprocessing import checkNegative
from data_preprocessing import decisionTreeDemo


# Load the feature matrices / label vectors. Judging by how they are passed to
# testModelOnData later in the notebook, (X, y) is presumably the training
# split and (TX, Ty) the held-out test split -- TODO confirm in data_preprocessing.
X, y, TX, Ty = loadData()

# Variants produced by getScaledAndOneHotEncoderedX (presumably scaled and
# one-hot encoded, going by the helper's name).
scaledAndOneHotX = getScaledAndOneHotEncoderedX(X)
scaledAndOneHotTX = getScaledAndOneHotEncoderedX(TX)
print(scaledAndOneHotX.shape)

# Column numbers are written 1-based and shifted to 0-based indices here.
continuous_features = [column - 1 for column in (1, 3, 5, 11, 12, 13)]
scaledX = scaleWithFeaturesAndKeepLocation(X, continuous_features)
scaledTX = scaleWithFeaturesAndKeepLocation(TX, continuous_features)
print(X.shape, scaledX.shape)

# Preview the first three rows of the selected columns, before and after scaling.
X12, scaledX12 = X[:3], scaledX[:3]
for preview in (X12, scaledX12):
    print(preview[:, continuous_features])

32561 15
16281 15
(32561, 105)
(32561, 14) (32561, 14)
[[  3.90000000e+01   7.75160000e+04   1.30000000e+01   2.17400000e+03
    0.00000000e+00   4.00000000e+01]
 [  5.00000000e+01   8.33110000e+04   1.30000000e+01   0.00000000e+00
    0.00000000e+00   1.30000000e+01]
 [  3.80000000e+01   2.15646000e+05   9.00000000e+00   0.00000000e+00
    0.00000000e+00   4.00000000e+01]]
[[ 0.03067056 -1.06361075  1.13473876  0.1484529  -0.21665953 -0.03542945]
 [ 0.83710898 -1.008707    1.13473876 -0.14592048 -0.21665953 -2.22215312]
 [-0.04264203  0.2450785  -0.42005962 -0.14592048 -0.21665953 -0.03542945]]


In [3]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier


# Build the base classifier that the hyper-parameter searches clone.
# A fixed random_state makes tree construction repeatable: feature
# sub-sampling (max_features) and splitter="random" both draw random numbers.
clf = DecisionTreeClassifier(random_state=0)


# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexable by candidate position (the notebook passes a
        search object's ``cv_results_``).
    n_top : int, default 3
        Ranks 1..n_top are printed; tied candidates are all listed.

    Returns
    -------
    None -- the summary is written to stdout.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate that holds this rank (ties included).
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            print("Model with rank: {0}".format(rank))
            score_line = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][pos],
                results['std_test_score'][pos])
            print(score_line)
            print("Parameters: {0}".format(results['params'][pos]))
            print("")


# Search space for the randomized search. Lists are sampled uniformly;
# scipy's randint(low, high) draws integers from [low, high), i.e. the upper
# bound is exclusive.
param_dist = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": sp_randint(5, 15),          # 5 .. 14
    "min_samples_split": sp_randint(2, 10),  # 2 .. 9
    "min_samples_leaf": sp_randint(1, 10),   # 1 .. 9
    "class_weight": [None, "balanced"],
    # NOTE(review): `presort` exists only in older scikit-learn releases
    # (it was later deprecated and removed) -- drop this entry when upgrading.
    "presort": [True, False],
}

# Number of parameter settings sampled from the space above.
n_iter_search = 1000
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=5,
    scoring='f1',     # binary F1 of the positive (label 1) class
    error_score=0,    # a candidate whose fit fails scores 0 instead of aborting the search
    n_jobs=6,
    random_state=0,   # fixed seed so the same candidates are sampled on every run
)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 126.07 seconds for 1000 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.910 (std: 0.002)
Parameters: {'splitter': 'best', 'max_depth': 8, 'max_features': None, 'criterion': 'entropy', 'min_samples_leaf': 5, 'min_samples_split': 4, 'presort': True, 'class_weight': None}

Model with rank: 2
Mean validation score: 0.909 (std: 0.002)
Parameters: {'splitter': 'best', 'max_depth': 8, 'max_features': None, 'criterion': 'entropy', 'min_samples_leaf': 4, 'min_samples_split': 3, 'presort': False, 'class_weight': None}

Model with rank: 3
Mean validation score: 0.909 (std: 0.002)
Parameters: {'splitter': 'best', 'max_depth': 8, 'min_samples_leaf': 7, 'criterion': 'entropy', 'max_features': None, 'min_samples_split': 4, 'presort': True, 'class_weight': None}



In [None]:
# Exhaustive search: every combination of the values listed below is scored.
param_grid = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    # "max_features" is intentionally not part of this grid.
    "max_depth": list(range(5, 13, 2)),          # 5, 7, 9, 11
    "min_samples_split": list(range(2, 13, 2)),  # 2, 4, 6, 8, 10, 12
    "min_samples_leaf": list(range(1, 11, 2)),   # 1, 3, 5, 7, 9
    "class_weight": [None, "balanced"],
    "presort": [True, False],
}

# Same scoring / cross-validation setup as the randomized search, single process.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    error_score=0,  # a candidate whose fit fails scores 0 instead of aborting the search
    n_jobs=1,
)

start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

In [5]:
# Human-readable labels used when exporting the fitted tree.
# feature_names: one entry per input column, in column order.
feature_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
]
# class_names: presumably index 0 labels class 0 and index 1 labels class 1
# -- verify against the label encoding in data_preprocessing.
class_names = ["gt 50K", "le 50K"]
print(len(feature_names))

14


In [6]:
'''
Usage:
    saveTree2DotAndPdf(clf, feature_names=feature_names, class_names=class_names)
'''
def saveTree2DotAndPdf(clf, feature_names, class_names, file_name='tmp', showImage=False):
    """Export a decision tree to ``<file_name>.dot`` and ``<file_name>.pdf``.

    Parameters
    ----------
    clf : tree estimator handed to ``sklearn.tree.export_graphviz``
        (presumably it must already be fitted -- the notebook calls this
        after testModelOnData).
    feature_names : list of str
        Labels for the input columns, forwarded to ``export_graphviz``.
    class_names : list of str
        Labels for the target classes, forwarded to ``export_graphviz``.
    file_name : str, default 'tmp'
        Output path without extension; '.dot' and '.pdf' are appended.
    showImage : bool, default False
        When True, additionally render the graph to PNG and display it inline.

    Returns
    -------
    None -- the function works through its side effects (two files written,
    progress messages printed, optional inline image).
    """
    from sklearn import tree
    dotFile = file_name + ".dot"
    pdfFile = file_name + ".pdf"
    with open(dotFile, 'w') as f:
        # export_graphviz writes straight into the open handle; its return
        # value is not needed, so the handle name is no longer rebound.
        tree.export_graphviz(clf, out_file=f,
                             feature_names=feature_names,
                             class_names=class_names,
                             filled=True,
                             rounded=True,
                             special_characters=True)
    print('save to dot file done\n')

    # Imported only after the .dot file is on disk, so a missing pydotplus
    # still leaves the .dot export behind.
    import pydotplus
    graph = pydotplus.graph_from_dot_file(dotFile)
    graph.write_pdf(pdfFile)
    print('save to pdf file done\n')

    if showImage:
        print('prepring to show the image, may take a long time...\n')
        from IPython.display import Image, display
        # A bare Image(...) expression inside a function is discarded; it has
        # to be passed to display() explicitly to show up in the notebook.
        display(Image(graph.create_png()))

In [8]:
'''
Plot the answers.
'''

from sklearn import tree

# Rank-1 setting from the randomized search output:
# 'splitter': 'best', 'max_depth': 8, 'max_features': None, 'criterion': 'entropy',
# 'min_samples_leaf': 5, 'min_samples_split': 4, 'presort': True, 'class_weight': None
# (splitter and class_weight are not passed, so the estimator's defaults apply).
model = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=8,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=4,
    presort=True,
)
testModelOnData(model, X, y, TX, Ty)
saveTree2DotAndPdf(
    model,
    feature_names=feature_names,
    class_names=class_names,
    file_name='final_results_tree',
)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            presort=True, random_state=None, splitter='best')
             precision    recall  f1-score   support

        0.0       0.82      0.54      0.65      7841
        1.0       0.87      0.96      0.91     24720

avg / total       0.86      0.86      0.85     32561

[[ 4210  3631]
 [  916 23804]]


             precision    recall  f1-score   support

        0.0       0.81      0.52      0.63      3846
        1.0       0.87      0.96      0.91     12435

avg / total       0.85      0.86      0.85     16281

[[ 2011  1835]
 [  481 11954]]


[ 0.85203612  0.8531417   0.85773519]
save to dot file done

save to pdf file done



In [9]:
# Rank-2 setting from the randomized search: {'splitter': 'best', 'max_depth': 8, 'max_features': None, 'criterion': 'entropy', 'min_samples_leaf': 4, 'min_samples_split': 3, 'presort': False, 'class_weight': None}
from sklearn import tree

# Tree configured with the second-ranked hyper-parameters; evaluated and
# exported under its own file name so the first export is not overwritten.
model = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=8,
    min_samples_leaf=4,
    min_samples_split=3,
    presort=False,
)
testModelOnData(model, X, y, TX, Ty)
saveTree2DotAndPdf(
    model,
    feature_names=feature_names,
    class_names=class_names,
    file_name='final_results_tree2',
)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=4,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
             precision    recall  f1-score   support

        0.0       0.82      0.54      0.65      7841
        1.0       0.87      0.96      0.91     24720

avg / total       0.86      0.86      0.85     32561

[[ 4216  3625]
 [  916 23804]]


             precision    recall  f1-score   support

        0.0       0.81      0.52      0.63      3846
        1.0       0.87      0.96      0.91     12435

avg / total       0.85      0.86      0.85     16281

[[ 2011  1835]
 [  484 11951]]


[ 0.85295744  0.85351023  0.85745877]
save to dot file done

save to pdf file done

