In [1]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.impute import KNNImputer, SimpleImputer

import altair as alt
alt.renderers.enable("default");

In [2]:
# Load the raw Titanic training data, derive Family_Size and a readable Class
# label, round Age to whole years, then drop the columns this tutorial ignores.
titanic_df = (
    pd.read_csv("data/titanic_train.csv")
    .assign(
        Family_Size=lambda frame: frame["SibSp"] + frame["Parch"],
        Class=lambda frame: frame["Pclass"].replace({1: "First", 2: "Second", 3: "Third"}),
        Age=lambda frame: frame["Age"].round(0),
    )
    .drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "Fare", "SibSp", "Parch", "Pclass"])
)

# Impute missing Age values.
# KNNImputer requires numeric values, so the dataset is dummy-encoded first
# and only the imputed Age column is copied back into titanic_df.
# NOTE(review): the encoded frame still contains the Survived column, so the
# label takes part in imputing Age — confirm this is intended.
titanic_dummies = pd.get_dummies(titanic_df, drop_first=True)
age_imputer = KNNImputer()
dummies_imputed = pd.DataFrame(
    age_imputer.fit_transform(titanic_dummies),
    columns=titanic_dummies.columns,
)
titanic_df["Age"] = dummies_imputed["Age"]

# Impute missing Embarked values.
# Only 2 are missing, so a most-frequent SimpleImputer is enough.
embark_imputer = SimpleImputer(strategy="most_frequent")
embarked_as_column = titanic_df["Embarked"].to_numpy().reshape(-1, 1)
titanic_df["Embarked"] = embark_imputer.fit_transform(embarked_as_column)

In [3]:
# Feature columns that get_operator() and build_IG_Table() treat as categorical
# (split with == / !=); every other feature column is handled as numeric (<= / >).
categorical_features = ["Class", "Sex", "Embarked"]
# Name of the label column; the entropy helpers sum it to count positives,
# so it is presumably 0/1 encoded — verify against the dataset.
target = "Survived"

In [4]:
def get_operator(feature, side):
    """Return the comparison operator, as a string, for one side of a split.

    Features listed in the module-level ``categorical_features`` use
    equality tests; all other features use threshold tests.

    Parameters
    ----------
    feature : str
        Name of the feature being split on.
    side : str
        ``"left"`` selects the left branch; any other value selects the right.

    Returns
    -------
    str
        ``"=="`` / ``"!="`` for categorical features, ``"<="`` / ``">"`` otherwise.
    """
    is_left = side == "left"
    if feature in categorical_features:
        return "==" if is_left else "!="
    return "<=" if is_left else ">"

In [5]:
def get_split_entropy(data, feature, split_val, side):
    """Compute the entropy and size of one side of a candidate split.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of the node being split; must contain ``feature`` and the
        module-level ``target`` column.
    feature : str
        Column to split on.
    split_val : scalar
        Category (categorical features) or threshold (numeric features).
    side : str
        ``"left"`` or anything else for the right side; see ``get_operator``.

    Returns
    -------
    tuple
        ``(entropy, subset_size)`` where entropy is in bits (base 2). An empty
        subset yields ``(0.0, 0)`` so it adds nothing to a size-weighted
        average instead of turning it into NaN.
    """
    operator = get_operator(feature, side)
    # "@split_val" makes DataFrame.query read the local variable of that name
    side_data = data.query(feature + operator + "@split_val")
    subset_size = len(side_data)
    if subset_size == 0:
        # 0/0 below would produce NaN; return a NumPy float so callers that
        # use its .round() method keep working
        return np.float64(0.0), 0

    side_percent_target = np.array(side_data[target]).sum()/subset_size
    side_entropy = entropy([side_percent_target, 1.-side_percent_target], base=2)
    return side_entropy, subset_size

In [6]:
def build_IG_Table(data,feature):
    percent_target = np.array(data[target]).sum()/len(np.array(data[target]))
    dataset_entropy = entropy([percent_target, 1.-percent_target], base=2)
    entropies = []
    if feature in categorical_features:
        split_val_range = np.unique(data[feature])
    else:
        split_val_range = np.arange(np.min(data[feature]).astype(int), np.max(data[feature]).astype(int))
    for split_val in split_val_range:
        
        left_entropy, left_size = get_split_entropy(data, feature, split_val, "left")
        right_entropy, right_size = get_split_entropy(data, feature, split_val, "right")
        
        information_gain = dataset_entropy - ((left_size*left_entropy) + (right_size*right_entropy))/len(data)
        entropies.append((split_val, (100.*left_entropy).round(0), (100.*right_entropy).round(0), (100.*information_gain).round(2)))

    entropy_df = pd.DataFrame(entropies, columns = ["Split_Val", "Left_Entropy", "Right_Entropy", "Information_Gain"])
    entropy_df["Feature"] = feature
    return entropy_df

In [7]:
def build_interactive_decision_tree(train_dataset, show_information_gain=False, show_splits=False, subquery=None):
    """Build a horizontally concatenated Altair chart with one interactive panel per feature.

    Every column other than the module-level ``target`` that has more than one
    unique value gets a panel. Each panel stacks up to three components:

    * top: the feature's distribution coloured by the target (stacked bars for
      features in ``categorical_features``, density lines otherwise), carrying
      the selection used to pick a split value;
    * middle (only if ``show_information_gain``): a line of information gain
      per candidate split, on a y-scale shared across all features;
    * bottom (only if ``show_splits``): Left/Right bar counts for the selected
      split plus text describing the split and its information gain.

    Parameters
    ----------
    train_dataset : pandas.DataFrame
        Must contain the ``target`` column and the feature columns.
    show_information_gain : bool, default False
        Add the information-gain line chart to each panel.
    show_splits : bool, default False
        Add the split summary to each panel.
    subquery : str or None, default None
        If given, passed to ``DataFrame.query`` to restrict the rows used.

    Returns
    -------
    The concatenated Altair chart with ``configure_title(anchor="middle")`` applied.
    """
    
    chart = alt.hconcat()
    # work on a copy so the caller's frame is never modified
    if subquery:
        data = train_dataset.query(subquery).copy()
    else:
        data = train_dataset.copy()
    
    ig_df_list = []
    # get all information gain calculations into a single table
    # (features with a single unique value cannot be split, so they are skipped)
    features_to_use = [x for x in data.columns if x != target and np.unique(data[x]).shape[0] > 1]
    for feature in features_to_use:
        ig_df_list.append(build_IG_Table(data, feature))
    
    master_ig_df = pd.concat(ig_df_list)
    
    # layout constants, passed as width/height to the chart properties below
    CATEGORICAL_WIDTH = 250
    QUANTITATIVE_WIDTH = 250
    SPLIT_WIDTH = 100
    TEXT_WIDTH = 120
    MAIN_CHART_HEIGHT = 200
    IG_LINE_HEIGHT = 75
    for feature in features_to_use:
        if feature in categorical_features:
            
            component_stack = []

            chart_data = data.loc[:,[feature, target]]
            # single selection on the feature's value; the selected category is the split value
            selector = alt.selection_single(fields=[feature])

            # build IG table
            ig_df = master_ig_df.query("Feature == @feature")
            
            # Top Component

            feature_component = alt.Chart(chart_data).mark_bar().encode(
                x = alt.X(feature, axis=alt.Axis(labelAngle=0), title=None),
                y = "count()",
                color = alt.Color(target, type="nominal", sort=[1,0], legend=alt.Legend(orient="left")),
                opacity = alt.condition(selector, alt.value(1), alt.value(0.3))
            ).properties(
                width = CATEGORICAL_WIDTH,
                height = MAIN_CHART_HEIGHT
            ).add_selection(
                selector
            )
            
            component_stack.append(feature_component)
            
            # End Top Component
            
            # Middle Component
            
            if show_information_gain:
            
                # y domain uses the maximum over ALL features so panels are comparable
                # NOTE(review): width uses QUANTITATIVE_WIDTH in this categorical branch;
                # both constants are currently 250, so the output is unaffected
                ig_line = alt.Chart(ig_df).mark_line().encode(
                    x = alt.X("Split_Val", title=None),
                    y = alt.Y("Information_Gain", scale = alt.Scale(domain = [0, np.max(master_ig_df["Information_Gain"])]))
                ).properties(
                    width = QUANTITATIVE_WIDTH,
                    height = IG_LINE_HEIGHT
                )
                
                component_stack.append(ig_line)
            
            # End Middle Component
            
            # Bottom Component
            
            if show_splits:
                # rows equal to the selected category go 'Left', all others 'Right'
                split_component = alt.Chart(chart_data).mark_bar().encode(
                    y = "Split:N",
                    x = alt.X("count()", scale=alt.Scale(domain = [0, len(data)])),
                    color = alt.Color(target, type="nominal", sort=[1,0])
                ).transform_calculate(
                    split_val = selector[feature]
                ).transform_calculate(
                    Split = "datum." + feature + " == datum.split_val?'Left':'Right'"
                ).properties(
                    width = SPLIT_WIDTH
                )

                # text labels: look up the precomputed entropies for the selected split in ig_df
                split_entropies = alt.Chart(chart_data).mark_text(dx=10).encode(
                    y = "Split:N",
                    x = alt.X("count()", scale=alt.Scale(domain = [0, len(data)])),
                    text = "Entropy:Q"
                ).transform_calculate(
                    split_val = selector[feature]
                ).transform_calculate(
                    Split = "datum." + feature + " == datum.split_val?'Left':'Right'"
                ).transform_lookup(
                    lookup = "split_val",
                    from_ = alt.LookupData(
                        data = ig_df,
                        key = "Split_Val",
                        fields = ["Left_Entropy", "Right_Entropy"]
                    )
                ).transform_calculate(
                    Entropy = "datum.Split == 'Left'?datum.Left_Entropy:datum.Right_Entropy"
                ) 

                # the filter is written as (<=) & (>=), i.e. it keeps the ig_df row
                # whose Split_Val equals the selected value
                split_text = alt.Chart(ig_df).mark_text().encode(
                    text = "chart_text:N",
                ).transform_calculate(
                    selected_val = selector[feature]
                ).transform_filter(
                    (alt.datum.Split_Val <= alt.datum.selected_val) & (alt.datum.Split_Val >= alt.datum.selected_val)
                ).transform_calculate(
                    chart_text = "For " + feature + "=" + alt.datum.selected_val
                ).properties(
                    width = TEXT_WIDTH
                )

                ig_text = alt.Chart(ig_df).mark_text().encode(
                    text = "chart_text:N",
                ).transform_calculate(
                    selected_val = selector[feature]
                ).transform_filter(
                    (alt.datum.Split_Val <= alt.datum.selected_val) & (alt.datum.Split_Val >= alt.datum.selected_val)
                ).transform_calculate(
                    chart_text = "Information Gain: " + alt.datum.Information_Gain
                ).properties(
                    width = TEXT_WIDTH
                )
                
                component_stack.append((split_component + split_entropies) | (split_text & ig_text))
            
            # End Bottom Component

            chart |= alt.vconcat(*component_stack).properties(title=feature)
        else:
            
            component_stack = []

            # long format: columns are target, 'variable' (the feature name) and 'value'
            chart_data = data.loc[:,[feature, target]].melt(id_vars=target)
            # hovering selects the nearest bar; its feature value is the split threshold
            selector = alt.selection_single(fields = [feature], on="mouseover", nearest=True)

            # build IG table
            ig_df = master_ig_df.query("Feature == @feature")

            # Top Component
            
            # NOTE(review): max_density from the joinaggregate is not referenced by
            # any encoding below — confirm whether it is still needed
            feature_component = alt.Chart(chart_data).transform_density(
                density='value',
                bandwidth=0.5,
                groupby=['variable', target],
                steps=20
            ).transform_joinaggregate(
                max_density = 'max(density)'
            ).mark_line().encode(
                alt.X('value:Q', title=None),
                alt.Y('density:Q', axis=alt.Axis(labelAngle=0, titleAngle=0)),
                alt.Color(target, type="nominal")
            ).properties(
                width=QUANTITATIVE_WIDTH,
                height = MAIN_CHART_HEIGHT
            )

            # one bar per integer candidate split (same range build_IG_Table uses);
            # the bars carry the selection and shade everything <= the hovered value
            selection_bar_df = pd.DataFrame(
                np.arange(np.min(data[feature]).astype(int), np.max(data[feature]).astype(int)),
                columns = [feature]
            )
            selection_bar_df["Height"] = 1

            selection_bars = alt.Chart(selection_bar_df).mark_bar(
                size=.8*QUANTITATIVE_WIDTH/len(selection_bar_df),
                binSpacing=0,
                align="left"
            ).transform_calculate(
                key=selector[feature]
            ).transform_calculate(
                val='datum.'+feature
            ).encode(
                x = alt.X(feature, title = None),
                y = alt.Y("Height", axis=None),
                color = alt.value("lightgrey"),
                opacity = alt.condition(alt.datum.val <= alt.datum.key, alt.value(0.3), alt.value(0))
            ).properties(
                height = MAIN_CHART_HEIGHT
            ).add_selection(
                selector
            )    
            
            # independent y scales: bar 'Height' and density are on unrelated scales
            component_stack.append(
                (selection_bars + feature_component).resolve_scale(y="independent")
            )
            
            # End Top Component

            # Middle Component
            if show_information_gain:
                ig_line = alt.Chart(ig_df).mark_line().encode(
                    x = alt.X("Split_Val", title=None),
                    y = alt.Y("Information_Gain", scale = alt.Scale(domain = [0, np.max(master_ig_df["Information_Gain"])]))
                ).properties(
                    width = QUANTITATIVE_WIDTH,
                    height = IG_LINE_HEIGHT
                )
                component_stack.append(ig_line)
                
            # End Middle Component
            
            # Bottom Component
            
            if show_splits:
                # rows with value <= the selected threshold go 'Left', the rest 'Right'
                split_component = alt.Chart(chart_data).mark_bar().encode(
                    y = "Split:N",
                    x = alt.X("count()", scale=alt.Scale(domain = [0, len(data)])),
                    color = alt.Color(target, type="nominal", sort=[1,0])
                ).transform_calculate(
                    split_val = selector[feature]
                ).transform_calculate(
                    Split = "datum.value <= datum.split_val?'Left':'Right'"
                ).properties(
                    width = SPLIT_WIDTH
                )   

                # NOTE(review): split_entropies is built here but, unlike the categorical
                # branch, it is not layered into the component appended below
                split_entropies = alt.Chart(chart_data).mark_text(dx=10).encode(
                    y = "Split:N",
                    x = alt.X("count()", scale=alt.Scale(domain = [0, len(data)])),
                    text = "Entropy:Q"
                ).transform_calculate(
                    split_val = selector[feature]
                ).transform_calculate(
                    Split = "datum.value <= datum.split_val?'Left':'Right'"
                ).transform_lookup(
                    lookup = "split_val",
                    from_ = alt.LookupData(
                        data = ig_df,
                        key = "Split_Val",
                        fields = ["Left_Entropy", "Right_Entropy"]
                    )
                ).transform_calculate(
                    Entropy = "datum.Split == 'Left'?datum.Left_Entropy:datum.Right_Entropy"
                )           

                split_text = alt.Chart(ig_df).mark_text().encode(
                    text = "chart_text:N",
                ).transform_calculate(
                    selected_val = selector[feature]
                ).transform_filter(
                    (alt.datum.Split_Val <= alt.datum.selected_val) & (alt.datum.Split_Val >= alt.datum.selected_val)
                ).transform_calculate(
                    chart_text = "For " + feature + "<=" + alt.datum.selected_val
                ).properties(
                    width = TEXT_WIDTH
                )

                ig_text = alt.Chart(ig_df).mark_text().encode(
                    text = "chart_text:N",
                ).transform_calculate(
                    selected_val = selector[feature]
                ).transform_filter(
                    (alt.datum.Split_Val <= alt.datum.selected_val) & (alt.datum.Split_Val >= alt.datum.selected_val)
                ).transform_calculate(
                    chart_text = "Information Gain: " + alt.datum.Information_Gain
                ).properties(
                    width = TEXT_WIDTH
                )
                
                component_stack.append((split_component) | (split_text & ig_text))
                
                #End Bottom Component

            chart |= alt.vconcat(*component_stack).resolve_scale(x="shared").properties(title=feature)

    return chart.configure_title(anchor="middle")

<img align="right" src="notebook_resources/decision_tree_header_image.png" width=300 height=300>

# How Do Decision Trees Work?

### by Matt Britton

#### The goal of this notebook is to teach you about Decision Trees, a common model type in machine learning.
#### We'll do this through a combination of thought experiments, data visualizations, and a "take home" exercise. 

#### By the end of this tutorial, you should be able to:

- Describe the structure and function of a decision tree.
- Construct a small decision tree by hand using the concepts of _Entropy_ and _Information Gain_.
- Reason about how algorithmic implementations (like in scikit-learn) build decision trees at scale.

## Accessing This Notebook

You can access this notebook in three different formats. The code is available in a GitHub repository, where you can download it to run on your own machine. The NBViewer version lets you interact with the charts and loads quickly. Finally, Binder gives you access to the live notebook running in the cloud, if you are interested in exploring and modifying the code without setting up a local environment.

|  |Source        | Link          | QR Code|
| -------------| ------------- | --------------- | ----- |
| <img src="notebook_resources/GitHub_logo.png" width=100 height=100> |  GitHub |  https://github.com/MattJBritton/InteractiveDecisionTrees | <img src="notebook_resources/GitHub_QR_code.png" width=100 height=100> |
| <img src="notebook_resources/jupyter_logo.png" width=100 height=100> | NBViewer | https://nbviewer.jupyter.org/github/MattJBritton/InteractiveDecisionTrees/blob/master/InteractiveDecisionTrees.ipynb      |   <img src="notebook_resources/NBViewer_QR_code.png" width=100 height=100> |
| <img src="notebook_resources/binder_logo.svg" width=100 height=100> | Binder | https://mybinder.org/v2/gh/MattJBritton/InteractiveDecisionTrees/master?filepath=InteractiveDecisionTrees.ipynb | <img src="notebook_resources/binder_QR_code.png" width=100 height=100> |

## Background Knowledge

This notebook builds on basic knowledge of machine learning, such as:
- Supervised Learning 
- Classification
- Model Evaluation

It's also helpful to understand how at least one other classification model type works, such as:
- Logistic Regression
- K-Nearest Neighbors

Lastly, this tutorial leverages several techniques used in EDA for understanding feature distributions, including bar charts, line charts, and kernel density estimate (KDE) plots. Knowledge of how to read these charts would be helpful.

## What is a Decision Tree?

Think back to other supervised classification algorithms. A decision tree is another type of predictive model, and so it works like Logistic Regression does. As a data scientist, you will:
-  __Train__ it based on data.
-  __Interpret__ the mathematical model of the relationships of predictors to the target
- __Predict__ what will happen to a new data point
- __Evaluate__ its accuracy on known data.

Each type of model represents a way of thinking about the world. Whereas Logistic Regression is focused on how fast Y changes as X increases, decision trees are best at capturing a different kind of phenomenon: if/then choices.

## Everyday Examples of Decision Trees

You’ve undoubtedly made a decision tree at some point in your life. Let’s take the example of packing for a vacation. Should you bring a bathing suit?
Your decision process might look a little like this:

#### A simple model for making a decision
<img align="left" src="notebook_resources/swimsuit_decision_tree.png">

__Things to Note__
- This tree is asymmetrical. That is, some branches require more info to make a decision than others.
- The purpose of the model is to help us make an educated guess. If we make the decision provided by a given leaf node, we'll be right a large majority of the time, but not all. 
- For example, a destination without a place to swim might occasionally have a water park we didn't know about!

## Building Good Decision Trees

This model makes intuitive sense to us. But remember, in machine learning, we evaluate models based on how well they predict the future, or how useful they are to us. 

__How might we evaluate this model? What characteristics differentiate a good decision tree from a bad one?__

A couple examples will help us.

| <img src="notebook_resources/thumbs_up.png" width=150 height=150>        | <img src="notebook_resources/thumbs_down.png" width=150 height=150>       |
| ------------- | --------------- |
| <img src="notebook_resources/swimsuit_good_split.png"> | <img src="notebook_resources/swimsuit_bad_split_before.png">  |

| <img src="notebook_resources/thumbs_up.png" width=150 height=150>        | <img src="notebook_resources/thumbs_down.png" width=150 height=150>       |
| ------------- | --------------- |
| <img src="notebook_resources/swimsuit_good_split.png"> | <img src="notebook_resources/swimsuit_bad_split_after.png">  |

#### Our goal in making a decision tree is to produce "pure" child nodes with as few questions as possible.

Hold on to this concept of "purity" because it will be useful again in a second. 

For now, let's switch gears to a real data science problem.

## Building a Decision Tree by Hand on the Titanic Dataset

So, if you are building a decision tree for this dataset, how would you do it? What’s a good first split?

In [9]:
# Preview the first rows of the prepared dataset (engineered features, imputed Age/Embarked)
titanic_df.head()

Unnamed: 0,Survived,Sex,Age,Embarked,Family_Size,Class
0,0,male,22.0,S,1,Third
1,1,female,38.0,C,1,First
2,1,female,26.0,S,0,Third
3,1,female,35.0,S,1,First
4,0,male,35.0,S,0,Third


Without any statistics or visual instruments, all we can do is leverage experience with this data (if we have it) or general domain knowledge about shipwrecks and social structures to build a model. 

Consider, though, that as a data scientist, you will frequently be asked to build predictive models for datasets from new domains, or that are too big to know everything about. We need a more data-driven way to choose good splits (and validate our intuitions!)

Below you'll find a tool I've built to probe this dataset. It shows the distributions for the 5 features in our subset, with stacked bar charts for the categorical features and KDE plots for the quantitative features.

#### Use the Distributions Below to Investigate the Splits we Proposed Above.

Based on the details of the dataset, which one do you think is best?

In [10]:
# Distributions only: show_information_gain and show_splits keep their defaults (False)
build_interactive_decision_tree(titanic_df).properties(title = "Distributions for the 5 Features in the Titanic Dataset by Survived Y/N")

#### Evaluating A Split

Sex and Class seem to have pretty big disparities across categories. Also, children under 10 seem to have a higher survival rate, and individuals travelling alone (Family Size < 1) seem to have fared more poorly.

So now that we have some potential splits, how can we evaluate whether they are any good? How do we compare multiple options?

Remember the “pure child nodes” we talked about with the toy swimsuit problem? We need a quantitative measure of how much a given split increases purity.

## Entropy and Information Gain

#### Entropy

A measure of how "pure" a node is.
- Based on the ratio of the two classes, e.g. survivors to non-survivors.
- Ranges from 0 to 1. 0 means a pure node (all survivors or vice versa) and 1 means a 50/50 split (totally random). 

#### Information Gain

A measure of how much a split decreases entropy (a bigger decrease is better, as 0 is the goal). 
- Equal to the entropy of the parent node minus the weighted average of the entropies of the child nodes.
- Algorithms that build decision trees find the split at each node with the highest information gain.

#### Formulas
$Entropy = -p(A)\log_2(p(A)) - p(B)\log_2(p(B))$

where $p(A)$ is the probability of the target being True (e.g. Survived), and $p(B)$ is the probability of the target being False.

$Information Gain = Entropy(parent) - WeightedAverage(Entropy(children))$

$Information Gain = Entropy(S) - \sum_{children} \frac{|S_{child}|}{|S|} Entropy(S_{child})$

where $S$ is the dataset and $S_{child}$ denotes a subset

In [22]:
# The calculations used in the image below

def summarize_node(node_df):
    """Return the three figures shown for one node of the example split.

    Parameters
    ----------
    node_df : pandas.DataFrame
        Rows belonging to the node; must contain a ``Survived`` column.

    Returns
    -------
    dict
        ``Entropy`` (base 2, computed from the per-class row counts),
        ``Survival Rate`` (share of rows with ``Survived == 1``) and
        ``People`` (number of rows).
    """
    return {
        "Entropy": entropy(node_df.groupby("Survived").size(), base=2),
        "Survival Rate": len(node_df.query("Survived == 1"))/len(node_df),
        "People": len(node_df),
    }

# The two children of the "Class == Third" split
third_class_subset = titanic_df.query("Class == 'Third'")
upper_class_subset = titanic_df.query("Class != 'Third'")

# One row per node; as the last expression of the cell it is rendered as a
# table, so the numbers in the image can be checked against the data.
pd.DataFrame.from_dict(
    {
        "Full Data Set": summarize_node(titanic_df),
        "Third Class": summarize_node(third_class_subset),
        "Upper Classes": summarize_node(upper_class_subset),
    },
    orient="index",
)

#### A Visual Will Help Make this Clearer

<img src = "notebook_resources/titanic_entropy_calculation.png">

## Let's Build Our Own Decision Tree Using an Interactive Visualization

With our Information Gain score in hand, we’re now ready to pick the best possible first split.

In [31]:
# Full dataset with the Left/Right split summary enabled (information-gain line stays off)
build_interactive_decision_tree(
    titanic_df, show_splits=True
).properties(title = "Interact and Explore the Entropy and Information Gain for Different Splits")

#### With Male/Female as our first split, build the rest of a Depth-2 Decision Tree

<img src="notebook_resources/titanic_depth_2_template.png">

In [23]:
# Second-level split, female branch: rows are restricted via subquery, so Sex has a
# single value in this subset and is dropped from the panels
build_interactive_decision_tree(
    titanic_df, show_splits=True, show_information_gain=True, subquery = "Sex == 'female'"
).properties(title = "Select the Best Split for the Female Subset")

In [24]:
# Second-level split, male branch: same view as above restricted to male passengers
build_interactive_decision_tree(
    titanic_df, show_splits=True, show_information_gain=True, subquery = "Sex == 'male'"
).properties(title = "Select the Best Split for the Male Subset")

## Building a basic decision tree in Scikit-Learn

In [25]:
# dataset from https://www.kaggle.com/c/titanic/data
target = "Survived"

# Load the dataset, then do some feature engineering and cleaning in one chain:
#   - Family_Size: siblings/spouses plus parents/children aboard
#   - Class: readable label for Pclass
#   - Age: rounded to whole years
# and finally drop the features we don't need for this example.
titanic_df = (
    pd.read_csv("data/titanic_train.csv")
    .assign(
        Family_Size=lambda frame: frame["SibSp"] + frame["Parch"],
        Class=lambda frame: frame["Pclass"].replace({1: "First", 2: "Second", 3: "Third"}),
        Age=lambda frame: frame["Age"].round(0),
    )
    .drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "Fare", "SibSp", "Parch", "Pclass"])
)

In [26]:
# Fill the missing Embarked values with the most frequent port
# (only 2 are missing, so SimpleImputer is enough)
embark_imputer = SimpleImputer(strategy="most_frequent")
embarked_as_column = titanic_df["Embarked"].to_numpy().reshape(-1, 1)
titanic_df["Embarked"] = embark_imputer.fit_transform(embarked_as_column)

# Fill the missing Age values with KNNImputer. It requires numeric values,
# so the dataset is dummy-encoded before imputing.
# NOTE(review): the encoded frame still contains the target column, so the
# label takes part in imputing Age — confirm this is intended.
data_for_ml = pd.get_dummies(titanic_df, drop_first=True)
encoded_columns = data_for_ml.columns
age_imputer = KNNImputer()
dummies_imputed = age_imputer.fit_transform(data_for_ml)
data_for_ml = pd.DataFrame(dummies_imputed, columns=encoded_columns)

In [27]:
# Build X and y for passing to classifier
X = data_for_ml.drop(target, axis=1)
y = data_for_ml[target]
# Instantiate classifier
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run and the tree printed below is reproducible
sklearn_dt = DecisionTreeClassifier(criterion="entropy", max_depth=2, random_state=0)
# Fit the model to our dataset (which builds the decision tree)
sklearn_dt.fit(X, y);

In [28]:
# Calculate the total information gain from the root to the grandchild leaf nodes
fitted_tree = sklearn_dt.tree_
# Leaf nodes are selected as the nodes whose left and right child ids are equal
is_leaf = fitted_tree.children_left == fitted_tree.children_right
leaf_nodes = fitted_tree.value[is_leaf]
# entropy() normalises its input, so this works whether tree_.value holds
# per-class counts or per-class proportions
leaf_node_entropies = np.apply_along_axis(
    lambda x: entropy(x, base=2),
    2,
    leaf_nodes
)
# Take leaf sizes from the tree's own sample counts rather than summing
# tree_.value: that sum only equals the node size when value stores raw counts
# (recent scikit-learn releases appear to store proportions — verify for your version)
leaf_node_sizes = fitted_tree.weighted_n_node_samples[is_leaf].reshape(-1, 1)
root_entropy = entropy(fitted_tree.value[0][0], base=2)
total_information_gain = root_entropy - np.average(leaf_node_entropies, weights = leaf_node_sizes)
print(
    f"Total Information Gain: "
    f"{round(total_information_gain, 3)}"
)

Total Information Gain: 0.312


In [29]:
# Use the model's score function to calculate the accuracy
# The built-in round() is used instead of the .round() method so this works
# whether score() returns a NumPy scalar or a plain Python float
print(f"Model Accuracy: {round(sklearn_dt.score(X, y), 3)}")

Model Accuracy: 0.796


In [30]:
# Print the structure of the fitted decision tree.
# export_text() from scikit-learn produces the raw text; the substitutions
# below clean it up a bit to improve readability.
# NOTE(review): the "<= 0.50" / ">  0.50" substitutions are applied to the whole
# text, so a numeric feature split at 0.50 would also be rewritten.
readable_substitutions = {
    "class: 1.0": "Survived",
    "class: 0.0": "Perished",
    "<= 0.50": "is False",
    ">  0.50": "is True",
}

print("Structure of Decision Tree")
tree_text = export_text(sklearn_dt, feature_names = list(X.columns))
for raw_fragment, readable_fragment in readable_substitutions.items():
    tree_text = tree_text.replace(raw_fragment, readable_fragment)
print(tree_text)

Structure of Decision Tree
|--- Sex_male is False
|   |--- Class_Third is False
|   |   |--- Survived
|   |--- Class_Third is True
|   |   |--- Perished
|--- Sex_male is True
|   |--- Age <= 6.50
|   |   |--- Survived
|   |--- Age >  6.50
|   |   |--- Perished



## A "Take-Home" Exercise

Hopefully these tools are useful for building a solid foundational understanding of decision trees. Practice will help you develop a better sense of what decision trees built on real data look like. In that spirit, consider building another decision tree by hand, using a common data science dataset. Examples include [Credit Scores](https://archive.ics.uci.edu/ml/datasets/Credit+Approval), [Bike Sharing](http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset), and [Breast Cancer](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer).

As you are doing this, consider the following questions.

- Are there any difficult choices to make when choosing splits? When the Information Gain of two options is close, how do you make a decision? Are there other factors?
- Does your "algorithm" for choosing splits work better for quantitative or categorical features?
- Experiment with building a deeper decision tree. It need not be symmetrical (i.e. feel free to improve just one branch). Does this make a better tree? Why or why not? Hint: think about this in the context of overfitting.
- In our Titanic example above, we managed to choose the same splits as scikit-learn did. Is this possible for all datasets? What, if anything, would cause an algorithm to do much better than humans at building a decision tree? Is there anything that humans would do better?