# Decision Trees in scikit-learn
Using the `DecisionTreeClassifier` in scikit-learn.  

In [5]:
import pandas as pd
from graphviz import Source    # Note that you need this package
from sklearn.tree import DecisionTreeClassifier, export_graphviz   
apears = pd.read_csv('../data/ApplesPears.csv')
apears.head()

FileNotFoundError: [Errno 2] File ../data/ApplesPears.csv does not exist: '../data/ApplesPears.csv'

scikit-learn can deal with a category class label but it cannot deal with category features.  
So we drop the `Taste` feature. 

In [2]:
y = apears.pop('Class').values   # extract out the labels
apears.pop('Taste')    # Can't deal with category features
ap_features = apears.columns
X = apears.values
X[0]

NameError: name 'apears' is not defined

In [None]:
y

In [None]:
tree = DecisionTreeClassifier(criterion='entropy')
ap_tree = tree.fit(X, y)

In [None]:
tree_ap = export_graphviz(ap_tree, out_file=None, 
                      feature_names=ap_features,
                      class_names=['Apple','Pear'],  
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = Source(tree_ap)  
graph

In [None]:
apears.pop('H/W')    # Delete this feature to make it harder
X = apears.values
ap_features = apears.columns

In [None]:
ap2_tree = tree.fit(X, y)
tree_ap = export_graphviz(ap2_tree, out_file=None, 
                      feature_names=ap_features,
                      class_names=['Apple','Pear'],  
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = Source(tree_ap)  
graph

---
## Iris Data
Sklearn contains a few built-in datasets - datasets are loaded into an object (iris here).  
Changing the `min_samples_leaf` attribute will change the *bushiness* of the tree. 
Two key methods:
1. `fit` method will train the tree from the data.
2. `predict` method will produce class predictions for an array of test data. 

In [None]:
from sklearn.datasets import load_iris
from graphviz import Source
from IPython.display import SVG

iris = load_iris()
tree = DecisionTreeClassifier(criterion='entropy',
                              min_samples_leaf=10
                            )
i_tree = tree.fit(iris.data, iris.target)

In [None]:
iris.feature_names, iris.target_names

In [None]:
iris.data.shape

In [None]:
i_tree # Have a look at the model attributes

In [None]:
tree_im = export_graphviz(i_tree, out_file=None, 
                      feature_names=iris.feature_names,  
                      class_names=iris.target_names,  
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = Source(tree_im)  
graph

Run a test example.

In [None]:
tn = 55  # index of example for testing
for i,j in zip(iris.data[tn],iris.feature_names):
    print(i,j)
y_pred = i_tree.predict([iris.data[tn]])
print('Prediced class No:',y_pred[0])
print('Prediced class label:',iris.target_names[y_pred[0]])

***
   ## Athlete Data

In [None]:
import pandas as pd
athlete = pd.read_csv('data/AthleteSelection.csv',index_col = 'Athlete')
athlete.head()

In [None]:
y = athlete.pop('Selected').values
X = athlete.values

In [None]:
atree = DecisionTreeClassifier(criterion='entropy')
atree = tree.fit(X,y)

In [None]:
dot_data = export_graphviz(atree, out_file=None, 
                      feature_names=['Speed','Agility'],  
                      class_names=['Selected','No'],  
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = Source(dot_data)  
graph

## Aside: Dealing with category data
Convert to numeric - two options:  
1. `get_dummies` method for pandas.
2. `OneHotEncoding` for sklearn. 

In [None]:
df = pd.DataFrame({'Pet': ['cat', 'dog', 'cat','ferret'], 
                   'Transport': ['bike', 'car', 'car','bike'],
                   'Gender': ['Female','Female','Male','Female']})
df

### Pandas `get_dummies`
The Pandas `get_dummies` method is the easiest way to do One-Hot encoding.  
But if you want to apply the encoding to a test file later, this gets awkward. 

In [None]:
pd.get_dummies(df)

In [None]:
pd.get_dummies(df,drop_first=True)

### Using `OneHotEncoder` to convert category features to numbers


In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
onehot_encoder = OneHotEncoder(sparse=False)
dfOH = onehot_encoder.fit_transform(df)
dfOH

In [None]:
onehot_encoder.get_feature_names()

In [None]:
onehot_encoder.categories_

### `LabelEncoder` also converts category features to numbers
This is more compact.  
But it is not exactly what we want as the numbers are misleading.  
Ferrets are not more like dogs than cats. (Well maybe they are!)

In [None]:
# LabelEncoder only works on single columns so we must 'apply' it to the dataframe. 
label_encoder = LabelEncoder()
labelE = df.apply(label_encoder.fit_transform)
labelE

---
# Restaurant Data 
Predictive features are categories(rather than numeric).

## Using OneHotEncoding
`OneHotEncoder` class has two key methods:   
1. `fit` to 'learn' the transform from the data,
2. `transform` to apply the OneHot transform to the data, the transform can be applied to other (e.g. test) datasets.


In [None]:
import pandas as pd
restaurant = pd.read_csv('data/restaurant.csv',index_col = 'No')
restaurant.head()

In [None]:
y = restaurant.pop('WillWait?').values
X = restaurant.values
X[:3,]

### OneHotEncoder without dropping the first column

In [None]:
onehot_encoder = OneHotEncoder(sparse=False)  # Without drop='first'
restOH = onehot_encoder.fit(restaurant)
restOH_data = restOH.transform(restaurant)

In [None]:
restaurant.columns

In [None]:
restOH.get_feature_names(restaurant.columns)

In [None]:
rtree = DecisionTreeClassifier(criterion='entropy')
rtreeOH = rtree.fit(restOH_data,y)

dot_data = export_graphviz(rtreeOH, 
                      feature_names=restOH.get_feature_names(restaurant.columns),
                      class_names=['Yes','No'], 
                      filled=True, rounded=True,  
                      out_file=None) 
graph = Source(dot_data) 
graph

### OneHotEncoder dropping the first column - so reduced dimensionality

In [None]:
onehot_encoder_wf = OneHotEncoder(sparse=False, drop='first')  # Include drop='first'
restOH = onehot_encoder_wf.fit(restaurant)
restOH_data = restOH.transform(restaurant)

In [None]:
restOH.get_feature_names(restaurant.columns)

In [None]:
rtree = DecisionTreeClassifier(criterion='entropy')
rtreeOH = rtree.fit(restOH_data,y)

dot_data = export_graphviz(rtreeOH, 
                      feature_names=restOH.get_feature_names(restaurant.columns),
                      class_names=['Yes','No'], 
                      filled=True, rounded=True,  
                      out_file=None) 
graph = Source(dot_data) 
graph

---
<h1><span style="color:red">Bonus Material</span></h1>



## Encoding Restaurant data using `get_dummies`

In [None]:
rest1 = pd.get_dummies(restaurant,drop_first=False)
rest2 = pd.get_dummies(restaurant,drop_first=True)

In [None]:
rest1.head()

In [None]:
rest2.head()

In [None]:
X = rest1.values

In [None]:
rtree1 = DecisionTreeClassifier(criterion='entropy')
rtree1.fit(rest1,y) # fit method can be called directly on the data frame
rtree1.fit(X,y)

In [None]:
import graphviz 
dot_data = export_graphviz(rtree1, 
                      feature_names=rest1.columns,
                      class_names=['Yes','No'], 
                      filled=True, rounded=True,  
                      out_file=None) 
graph = graphviz.Source(dot_data) 
graph

In [None]:
rtree2 = DecisionTreeClassifier(criterion='entropy')
X2 = rest2.values
rtree2.fit(X2,y)

dot_data = export_graphviz(rtree2, 
                      feature_names=rest2.columns,
                      class_names=['Yes','No'], 
                      filled=True, rounded=True,  
                      out_file=None) 
graph = graphviz.Source(dot_data) 
graph