## Decision Tree (CART)

### Required Library

In [2]:
import numpy as np
import pandas as pd
import os

#For test-train split
from sklearn.cross_validation import train_test_split

#For fitting classification tree
from sklearn.tree import DecisionTreeClassifier

#For fitting regression tree
from sklearn.tree import DecisionTreeRegressor

#To create a confusion matrix
from sklearn.metrics import confusion_matrix

#from sklearn import tree



## Classification Tree

### Data

In [3]:
#DATA
#Setting the working directory
os.chdir("C:/Users/Gourab/Documents")
#Reading the data
iris = pd.read_csv("iris.csv")

In [4]:
#The first few observtions of the data
iris.head()

Unnamed: 0.1,Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,1,5.1,3.5,1.4,0.2,setosa
1,2,4.9,3.0,1.4,0.2,setosa
2,3,4.7,3.2,1.3,0.2,setosa
3,4,4.6,3.1,1.5,0.2,setosa
4,5,5.0,3.6,1.4,0.2,setosa


### Separating the Target and the Predictors

In [5]:
X = iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']]
y = iris[['Species']]

### Train-Test Split (Stratified Sampling of Y)

In [14]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3, stratify=y, random_state = 100)

In [15]:
#Checks
#Proportion in training data
y_train.Species.value_counts()/len(y_train)

virginica     0.333333
versicolor    0.333333
setosa        0.333333
Name: Species, dtype: float64

In [16]:
#Proportion in test data
y_test.Species.value_counts()/len(y_test)

versicolor    0.333333
virginica     0.333333
setosa        0.333333
Name: Species, dtype: float64

### Decision tree classifier with criterion gini index

In [20]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=3, min_samples_leaf=5)

clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best')

In [None]:
#Decision Tree
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best')

### Classifying new Observations

In [21]:
#Making Prediction
y_pred = clf_gini.predict(X_test)
y_pred

array(['setosa', 'versicolor', 'virginica', 'versicolor', 'virginica',
       'setosa', 'versicolor', 'setosa', 'setosa', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'virginica',
       'virginica', 'virginica', 'versicolor', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'virginica', 'versicolor', 'setosa',
       'versicolor', 'versicolor', 'setosa', 'virginica', 'virginica',
       'virginica', 'versicolor', 'setosa', 'versicolor', 'virginica',
       'setosa', 'setosa', 'setosa', 'versicolor', 'setosa', 'setosa',
       'setosa', 'versicolor', 'virginica'], dtype=object)

### Confusion Matrix

In [22]:
c = confusion_matrix(y_test, y_pred)
c

array([[15,  0,  0],
       [ 0, 15,  0],
       [ 0,  3, 12]])

### Accuracy

In [None]:
#Overall Accuracy
(c[0,0]+c[1,1]+c[2,2])/np.sum(c)*100

In [None]:
#Sensitivity (considering setosa as positive and rest as negatives)
15/(15+0+0)*100


In [None]:
#Specificity
(15+12)/(0+15+0+0+3+12)*100


In [None]:
#Recall
15/(15+0+0)*100

## Regression Tree

In [23]:
#Setting the working directory
os.chdir("C:/Users/Gourab/Desktop/R")
#Reading the data
cars = pd.read_csv("cars.csv")

In [24]:
X = cars[['Cylinders','Weight','Horsepower']]
y = cars[['MPG']]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [26]:
model = DecisionTreeRegressor(max_depth=3, min_samples_leaf=5)
model.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [None]:
#help(DecisionTreeRegressor)

In [27]:
y_pred = model.predict(X_test)
y_pred

array([19.2483871 , 32.89833333, 25.4452381 , 25.4452381 , 25.74545455,
       25.4452381 , 19.2483871 , 25.4452381 , 19.2483871 , 25.4452381 ,
       32.89833333, 11.7       , 11.7       , 32.89833333, 25.4452381 ,
       25.4452381 , 28.51071429, 19.2483871 , 25.74545455, 25.4452381 ,
       25.4452381 , 25.4452381 , 25.4452381 , 28.51071429, 19.2483871 ,
       15.12564103, 19.2483871 , 19.2483871 , 25.4452381 , 32.89833333,
       19.2483871 , 19.2483871 , 25.74545455, 32.89833333, 15.12564103,
       32.89833333, 19.2483871 , 25.4452381 , 19.2483871 , 11.7       ,
       25.74545455, 32.89833333, 19.2483871 , 19.2483871 , 32.89833333,
       25.4452381 , 15.12564103, 32.89833333, 25.4452381 , 19.2483871 ,
       32.89833333, 28.91111111, 25.74545455, 11.7       , 11.7       ,
       19.2483871 , 11.7       , 32.89833333, 19.2483871 , 15.12564103,
       19.2483871 , 19.2483871 , 32.89833333, 19.2483871 , 19.2483871 ,
       25.4452381 , 32.89833333, 32.89833333, 32.89833333, 32.89

In [None]:
type(y_test)

In [None]:
type(y_pred)

In [None]:
np.sqrt(np.mean((np.array(y_test)-y_pred)**2)) #RMSE