In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Suppress all warnings from here on.
warnings.filterwarnings('ignore')

# Load the census data: every column but the last is a feature,
# the last column is the label.
df_census = pd.read_csv('./census_cleaned.csv')
X = df_census.iloc[:, :-1]
y = df_census.iloc[:, -1]

# Seeded train/test split so the reported score is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Fit a seeded decision tree and display its accuracy on the test split.
clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)




0.813782090652254

**Decision Tree regressor**

In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Load the bike-rental data: every column but the last is a feature,
# the last column is the regression target.
df_bikes = pd.read_csv('./bike_rentals_cleaned.csv')
X_bikes = df_bikes.iloc[:, :-1]
y_bikes = df_bikes.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X_bikes, y_bikes, random_state=2)

# 5-fold cross-validation on the full data. The scorer returns negated MSE,
# so flip the sign before taking the square root to get a per-fold RMSE.
reg = DecisionTreeRegressor(random_state=2)
scores = cross_val_score(
    reg,
    X_bikes,
    y_bikes,
    scoring='neg_mean_squared_error',
    cv=5,
)
rmse = np.sqrt(-scores)
print(f"RMSE mean: {rmse.mean():.2f}")


RMSE mean: 1233.36


In [8]:
from sklearn.metrics import mean_squared_error

# Fit an unconstrained tree and score it on the very data it was trained on.
# Seeded with random_state=2 like every other estimator in this notebook so
# that re-running the cell builds the same tree.
reg = DecisionTreeRegressor(random_state=2)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)

# NOTE: despite its name, `reg_mse` holds the RMSE (square root of the MSE).
reg_mse = np.sqrt(mean_squared_error(y_train, y_pred))

reg_mse


0.0

In [9]:
from sklearn.model_selection import GridSearchCV
# Candidate tree depths; None leaves the depth unrestricted.
params = {'max_depth': [None, 2, 3, 4, 6, 8, 10, 20]}

reg = DecisionTreeRegressor(random_state=2)
grid_reg = GridSearchCV(
    estimator=reg,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_reg.fit(X_train, y_train)

best_params = grid_reg.best_params_
print("Best params:", best_params)


Best params: {'max_depth': 6}


In [10]:
# The search was scored with 'neg_mean_squared_error', so best_score_ is a
# negated MSE: flip the sign back before taking the square root.
best_score = np.sqrt(-grid_reg.best_score_)
print(f"Training score: {best_score:.3f}")


Training score: 951.398


In [11]:
# Evaluate the best estimator found by the grid search on the test split.
best_model = grid_reg.best_estimator_
y_pred = best_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Test score: {rmse_test:.3f}")

Test score: 864.670


In [12]:
def grid_search(params, reg=DecisionTreeRegressor(random_state=2)):
  """Grid-search `reg` over `params` and print the results.

  Runs a 5-fold GridSearchCV scored with negated MSE on the notebook-level
  X_train / y_train, then prints the best parameters, the square root of
  the negated best CV score, and the RMSE on the notebook-level
  X_test / y_test.

  Args:
    params: parameter grid handed to GridSearchCV.
    reg: estimator to tune; the default is a seeded DecisionTreeRegressor
      created once, when this function is defined.

  Returns:
    None; everything is reported via print.
  """
  search = GridSearchCV(
      estimator=reg,
      param_grid=params,
      scoring='neg_mean_squared_error',
      cv=5,
      n_jobs=-1,
  )
  search.fit(X_train, y_train)

  print(f"Best params: {search.best_params_}")

  # best_score_ is a negated MSE; negate it back before the square root.
  cv_rmse = np.sqrt(-search.best_score_)
  print(f"Training score: {cv_rmse:.3f}")

  # Predicting through the search object uses its refitted best estimator.
  predictions = search.predict(X_test)
  test_rmse = mean_squared_error(y_test, predictions) ** 0.5
  print(f"Test score: {test_rmse:.3f}")

# Tune the minimum number of samples required in a leaf.
grid_search(params={'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30]})

Best params: {'min_samples_leaf': 8}
Training score: 896.083
Test score: 855.620


In [13]:
# Search tree depth and minimum leaf size jointly.
grid_search(params={
    'max_depth': [None, 2, 3, 4, 6, 8, 10, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30],
})

Best params: {'max_depth': 6, 'min_samples_leaf': 2}
Training score: 870.396
Test score: 913.000


In [14]:
# Second joint search over a narrower range of depths and leaf sizes.
grid_search(params={
    'max_depth': [6, 7, 8, 9, 10],
    'min_samples_leaf': [3, 5, 7, 9],
})

Best params: {'max_depth': 9, 'min_samples_leaf': 7}
Training score: 888.905
Test score: 878.538


**Predicting heart disease -- a case study**

In [15]:
# Load the heart-disease data and preview its first five rows.
df_heart = pd.read_csv('./heart_disease.csv')
df_heart.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [16]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column is the label.
X = df_heart.iloc[:, :-1]
y = df_heart.iloc[:, -1]

# Seeded split. This rebinds the notebook-level X_train / X_test /
# y_train / y_test names to the heart-disease data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2)


In [17]:
# Baseline: 5-fold cross-validated scores of a seeded decision tree with
# otherwise default settings, evaluated on the full heart-disease data.
model = DecisionTreeClassifier(random_state=2)
scores = cross_val_score(
    model,
    X,
    y,
    cv=5,
)
print(f"Accuracy: {np.round(scores, 2)}")
print(f"Accuracy mean: {scores.mean():.2f}")


Accuracy: [0.74 0.85 0.77 0.73 0.7 ]
Accuracy mean: 0.76


In [18]:
from sklearn.model_selection import RandomizedSearchCV

def randomized_search_clf(params, runs=20,
                          clf=DecisionTreeClassifier(random_state=2)):
  """Randomized hyper-parameter search for a classifier.

  Runs a seeded, 5-fold RandomizedSearchCV on the notebook-level
  X_train / y_train, prints the best cross-validation score and the accuracy
  of the best estimator on the notebook-level X_test / y_test.

  Args:
    params: parameter lists/distributions handed to RandomizedSearchCV.
    runs: number of parameter settings to sample (n_iter).
    clf: estimator to tune; the default is a seeded DecisionTreeClassifier
      created once, when this function is defined.

  Returns:
    The best estimator found by the search.
  """
  search = RandomizedSearchCV(
      estimator=clf,
      param_distributions=params,
      n_iter=runs,
      cv=5,
      n_jobs=-1,
      random_state=2,
  )
  search.fit(X_train, y_train)

  print(f"Training score: {search.best_score_:.3f}")

  best_model = search.best_estimator_
  test_accuracy = accuracy_score(y_test, best_model.predict(X_test))
  print(f"Test score: {test_accuracy:.3f}")
  return best_model

# Broad randomized search over the main DecisionTreeClassifier settings.
# - 'min_weight_fraction_leaf' used to appear twice in this dict literal; only
#   the last occurrence took effect, so that single list is kept here.
# - 'sqrt' replaces 'auto' for max_features: for a classifier the two mean the
#   same thing, and 'auto' is rejected by newer scikit-learn releases.
randomized_search_clf(params={
    'criterion': ['entropy', 'gini'],
    'splitter': ['random', 'best'],
    'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
    'min_samples_split': [2, 3, 4, 5, 6, 8, 10],
    'min_samples_leaf': [1, 0.01, 0.02, 0.03, 0.04],
    'min_impurity_decrease': [0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
    'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45, 50, None],
    'max_features': ['sqrt', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
    'max_depth': [None, 2, 4, 6, 8],
})
  

Training score: 0.798
Test score: 0.855


In [19]:
# Narrower randomized search with more sampled settings (runs=100).
# 'sqrt' replaces 'auto' for max_features: for a classifier the two mean the
# same thing, and 'auto' is rejected by newer scikit-learn releases.
randomized_search_clf(params={
    'max_depth': [None, 6, 7],
    'max_features': ['sqrt', 0.78],
    'max_leaf_nodes': [45, None],
    'min_samples_leaf': [1, 0.035, 0.04, 0.045, 0.05],
    'min_samples_split': [2, 9, 10],
    'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],
}, runs=100)

Training score: 0.802
Test score: 0.868


In [22]:
# Cross-validate a tree with hand-specified hyper-parameters on the full data.
model = DecisionTreeClassifier(
    class_weight=None,
    criterion='gini',
    max_depth=7,
    max_features=0.78,
    max_leaf_nodes=45,
    min_impurity_decrease=0.0,
    min_samples_leaf=0.045,
    min_samples_split=9,
    min_weight_fraction_leaf=0.06,
    random_state=2,
)
scores = cross_val_score(
    model,
    X,
    y,
    cv=5,
)
print(f"Accuracy: {np.round(scores, 2)}")
print(f"Accuracy mean: {scores.mean():.2f}")

Accuracy: [0.82 0.9  0.8  0.8  0.78]
Accuracy mean: 0.82


In [23]:
# Build the final classifier and fit it on the full dataset (X, y).
best_clf = DecisionTreeClassifier(
    class_weight=None,
    criterion='gini',
    max_depth=9,
    max_features=0.8,
    max_leaf_nodes=47,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=8,
    min_weight_fraction_leaf=0.05,
    random_state=2,
    splitter='best',
)
best_clf.fit(X, y)

In [24]:
# Per-feature importance scores of the fitted tree, in the column order of X.
best_clf.feature_importances_

array([0.04830121, 0.04008887, 0.47546568, 0.        , 0.        ,
       0.        , 0.        , 0.00976578, 0.        , 0.02445397,
       0.02316427, 0.1774694 , 0.20129082])

In [25]:
import operator
# Pair each feature name with its importance, then show the three features
# with the largest importance, highest first.
feature_dict = dict(zip(X.columns, best_clf.feature_importances_))
sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)[:3]


[('cp', 0.47546567857183675),
 ('thal', 0.20129082387838435),
 ('ca', 0.1774694042213901)]