# Preliminaries

In [50]:
# import modules
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from sklearn import model_selection

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

In [51]:
# base classes

class Node:
    """Plain attribute container; the tree builders attach all fields to it dynamically."""

class Tree:
    """Common base of the trees below: owns the root node and routes samples to leaves."""

    def __init__(self):
        # a fresh tree consists of a single root node without any attributes yet
        self.root = Node()

    def find_leaf(self, x):
        '''
        x: a single feature vector (indexable by feature index)

        returns: the node reached by descending from the root. A node is treated
                 as a split node as long as it has a 'feature' attribute; at each
                 split, 'x' goes left if x[feature] <= threshold, right otherwise.
        '''
        current = self.root
        while hasattr(current, "feature"):
            goes_left = x[current.feature] <= current.threshold
            current = current.left if goes_left else current.right
        return current

# Density Tree

In [52]:
class DensityTree(Tree):
    """Density estimation tree for a single class, built with randomly sampled split features."""

    def __init__(self):
        super(DensityTree, self).__init__()
        
    def train(self, data, prior, n_min=20):
        '''
        Build the tree from the instances of one class.

        data: the feature matrix for the digit under consideration
        prior: the prior probability of this digit
        n_min: termination criterion (don't split if a node contains fewer instances)
        '''
        self.prior = prior
        N, D = data.shape
        D_try = int(np.sqrt(D)) # number of features to consider for each split decision

        # find and remember the tree's bounding box, 
        # i.e. the lower and upper limits of the training feature set
        m, M = np.min(data, axis=0), np.max(data, axis=0)
        self.box = m.copy(), M.copy()
        
        # identify invalid features and adjust the bounding box
        # (If m[j] == M[j] for some j, the bounding box has zero volume, 
        #  causing divide-by-zero errors later on. We must exclude these
        #  features from splitting and adjust the bounding box limits 
        #  such that invalid features have no effect on the volume.)
        valid_features   = np.where(m != M)[0]
        invalid_features = np.where(m == M)[0]
        M[invalid_features] = m[invalid_features] + 1
        
        # Sampling without replacement fails if more features are requested
        # than are valid, so never ask for more than are available.
        n_try = min(D_try, len(valid_features))
        
        # initialize the root node
        self.root.data = data
        self.root.box = m.copy(), M.copy()

        # build the tree
        stack = [self.root]
        while len(stack):
            node = stack.pop()
            n = node.data.shape[0] # number of instances in present node
            if n >= n_min and n_try > 0:
                # Call 'make_density_split_node()' with 'n_try' randomly selected 
                # indices from 'valid_features'. This turns 'node' into a split node
                # and returns the children, which must be placed on the 'stack'.
                selected_indices = valid_features[np.random.choice(len(valid_features), size=n_try, replace=False)]
                children = make_density_split_node(node, N, selected_indices)
                stack.extend(children)
            else:
                # Too few instances (or no splittable feature at all):
                # call 'make_density_leaf_node()' to turn 'node' into a leaf node.
                make_density_leaf_node(node, N)
                
    def predict(self, x):
        '''
        x: a single feature vector

        returns: p(x | y) * p(y) if x is within the tree's bounding box, 0 otherwise
        '''
        # check the box first; points outside never need a tree traversal
        inside = (x >= self.box[0]).all() and (x <= self.box[1]).all()
        if not inside:
            return 0
        return self.find_leaf(x).response * self.prior

In [53]:
def rolling_mean(x, n):
    """
    Mean over every window of 'n' consecutive entries of 'x' (only full windows are kept).

    see https://stackoverflow.com/questions/14313510/how-to-calculate-rolling-moving-average-using-numpy-scipy
    """
    window = np.ones(n)
    window_sums = np.convolve(x, window, mode="valid")
    return window_sums / n

def _best_density_split(data, box, N, feature_indices):
    """Return the (feature, threshold) pair with minimal leave-one-out error, or (None, None)."""
    m, M = box
    n = data.shape[0]
    
    # volume of node
    V = np.prod(M - m)
    
    e_min = float("inf")
    j_min, t_min = None, None
    
    for j in feature_indices:
        column_sorted = np.sort(data[:, j])
        
        # Candidate thresholds are the midpoints between neighbouring feature values.
        # Duplicates must be removed first ('np.unique()' also returns the values
        # sorted): two equal neighbours would yield a "midpoint" lying exactly on
        # data points, so it would be ambiguous on which side those points belong.
        data_unique = np.unique(column_sorted)
        
        # all points have the same j-th value => can't split
        if len(data_unique) == 1:
            continue
        
        # Compute candidate thresholds
        tj = rolling_mean(data_unique, 2)
        
        # number of instances strictly below each threshold, found by
        # binary search in the sorted column instead of one scan per threshold
        Ns_left = np.searchsorted(column_sorted, tj, side="left")
        Ns_right = n - Ns_left
        
        # child volumes: only the extent along feature j changes
        Vs_left = V / (M[j] - m[j]) * (tj - m[j])
        Vs_right = V / (M[j] - m[j]) * (M[j] - tj)

        loo_errors_left = Ns_left/(N*Vs_left) * (Ns_left/N - 2*((Ns_left - 1)/(N - 1)))
        loo_errors_right = Ns_right/(N*Vs_right) * (Ns_right/N - 2*((Ns_right - 1)/(N - 1)))
        loo_errors = loo_errors_left + loo_errors_right
        
        best = np.argmin(loo_errors)
        if loo_errors[best] < e_min:
            e_min = loo_errors[best]
            j_min = j
            t_min = tj[best]
    
    return j_min, t_min


def make_density_split_node(node, N, feature_indices):
    '''
    Turn 'node' into a split node, choosing the split by leave-one-out error.

    node: the node to be split (must carry 'data' and 'box')
    N:    the total number of training instances for the current class
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split

    returns: the children (left, right), to be placed on the stack. If no
             feature at all separates the node's instances, 'node' becomes a
             leaf instead and an empty tuple is returned.
    '''
    D = node.data.shape[1]
    m, M = node.box
    
    # find best feature j (among 'feature_indices') and best threshold t for the split
    j_min, t_min = _best_density_split(node.data, node.box, N, feature_indices)
    if j_min is None:
        # every sampled feature is constant within this node: retry with all features
        j_min, t_min = _best_density_split(node.data, node.box, N, np.arange(D))
    if j_min is None:
        # all instances are identical, so the node cannot be split any further
        make_density_leaf_node(node, N)
        return ()

    # create children
    left = Node()
    right = Node()
    
    # initialize 'left' and 'right' with the data subsets and bounding boxes
    # according to the optimal split found above
    mask = node.data[:, j_min] < t_min
    
    # The boxes are float copies: thresholds are midpoints and would be
    # truncated when written into an integer-typed box.
    left.data = node.data[mask] # store data in left node -- for subsequent splits
    left.box = m.astype(float), M.astype(float) # store bounding box in left node
    left.box[1][j_min] = t_min
    
    right.data = node.data[~mask]
    right.box = m.astype(float), M.astype(float)
    right.box[0][j_min] = t_min

    # turn the current 'node' into a split node
    # (store children and split condition)
    node.left = left
    node.right = right
    node.feature = j_min
    node.threshold = t_min

    # return the children (to be placed on the stack)
    return left, right

In [54]:
def make_density_leaf_node(node, N):
    '''
    Store the density estimate of a leaf in 'node.response'.

    node: the node to become a leaf (must carry 'data' and 'box')
    N:    the total number of training instances for the current class
    '''
    lower, upper = node.box
    volume = np.prod(upper - lower)
    count = node.data.shape[0]
    # fraction of the class's instances in this leaf, per unit volume
    node.response = count / (N * volume)

# Decision Tree

In [55]:
class DecisionTree(Tree):
    """Classification tree grown with randomly sampled candidate features per split."""

    def __init__(self):
        super().__init__()

    def train(self, data, labels, n_min=20):
        '''
        Grow the tree on the given training set.

        data: the feature matrix for all digits
        labels: the corresponding ground-truth responses
        n_min: termination criterion (don't split if a node contains fewer instances)
        '''
        _, n_features = data.shape
        n_try = int(np.sqrt(n_features))  # how many features to consider for each split decision

        # the root starts out holding the complete training set
        root = self.root
        root.data, root.labels = data, labels

        pending = [root]
        while pending:
            current = pending.pop()
            too_small = current.data.shape[0] < n_min
            if too_small or node_is_pure(current):
                # nothing left to separate here: finalize the node as a leaf
                make_decision_leaf_node(current)
                continue
            # draw the candidate features for this split and queue both children
            candidates = np.random.choice(n_features, size=n_try, replace=False)
            pending.extend(make_decision_split_node(current, candidates))

    def predict(self, x):
        '''
        x: a single feature vector

        returns: the response stored in the leaf that 'x' falls into
        '''
        return self.find_leaf(x).response

In [56]:
def _weighted_gini(labels):
    """Return n * (1 - sum_k p_k^2) for a label subset, i.e. its Gini impurity times its size."""
    _, label_counts = np.unique(labels, return_counts=True)
    n = len(labels)
    return n - np.sum(label_counts * label_counts) / n


def _best_decision_split(data, labels, feature_indices):
    """Return the (feature, threshold) pair with minimal summed Gini impurity, or (None, None)."""
    e_min = float("inf")
    j_min, t_min = None, None
    
    for j in feature_indices:
        column = data[:, j]
        
        # Candidate thresholds are midpoints between neighbouring feature values.
        # Duplicates are removed first ('np.unique()' also returns the values sorted),
        # since two equal neighbours would give a threshold lying exactly on data points.
        data_unique = np.unique(column)
        
        # all points have the same j-th value => can't split
        if len(data_unique) == 1:
            continue
        
        for t in rolling_mean(data_unique, 2):
            mask = column < t
            # only the labels are needed to score a split, so the data is not copied
            gini = _weighted_gini(labels[mask]) + _weighted_gini(labels[~mask])
            
            # keep the split with the lowest impurity seen so far
            if gini < e_min:
                e_min = gini
                j_min = j
                t_min = t
    
    return j_min, t_min


def make_decision_split_node(node, feature_indices):
    '''
    Turn 'node' into a split node, choosing the split by Gini impurity.

    node: the node to be split (must carry 'data' and 'labels')
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split

    returns: the children (left, right), to be placed on the stack. If no
             feature at all separates the node's instances, 'node' becomes a
             leaf instead and an empty tuple is returned.
    '''
    D = node.data.shape[1]

    # find best feature j (among 'feature_indices') and best threshold t for the split
    j_min, t_min = _best_decision_split(node.data, node.labels, feature_indices)
    if j_min is None:
        # every sampled feature is constant within this node: retry with all features
        j_min, t_min = _best_decision_split(node.data, node.labels, np.arange(D))
    if j_min is None:
        # identical instances with different labels cannot be separated
        make_decision_leaf_node(node)
        return ()

    # create children
    left = Node()
    right = Node()
    
    # initialize 'left' and 'right' with the data subsets and labels
    # according to the optimal split found above
    mask = node.data[:, j_min] < t_min
    
    left.data = node.data[mask] # data in left node
    left.labels = node.labels[mask] # corresponding labels
    right.data = node.data[~mask]
    right.labels = node.labels[~mask]

    # turn the current 'node' into a split node
    # (store children and split condition)
    node.left = left
    node.right = right
    node.feature = j_min
    node.threshold = t_min

    # return the children (to be placed on the stack)
    return left, right

In [57]:
def make_decision_leaf_node(node):
    '''
    Store the instance count and the majority label of a leaf.

    node: the node to become a leaf (must carry 'data' and 'labels')
    '''
    node.N = len(node.data)
    # the response is the most frequent label among the node's instances
    label_histogram = np.bincount(node.labels)
    node.response = np.argmax(label_histogram)

In [58]:
def node_is_pure(node):
    '''
    returns: True if all instances in 'node' carry the same label, False otherwise
    '''
    distinct_labels = np.unique(node.labels)
    return distinct_labels.size == 1

# Evaluation of Density and Decision Tree

In [59]:
# read and prepare the digits data
digits = load_digits()

# pull the four entries used below out of the loaded data set, by key
data, images, target, target_names = (
    digits[key] for key in ("data", "images", "target", "target_names")
)

In [60]:
# train trees, plot training error confusion matrices, and comment on your results

# one density tree per digit, weighted by the relative frequency of that digit
trees = []
for digit_class in range(10):
    data_digits = data[target == digit_class]
    density_tree = DensityTree()
    density_tree.train(data_digits, data_digits.shape[0] / len(target), n_min=10)
    trees.append(density_tree)

# classify every training instance by the tree with the highest response
predicted = []
for digit in data:
    scores = [tree.predict(digit) for tree in trees]
    predicted.append(np.argmax(scores))

# count every (predicted, true) combination that occurs
pairs, counts = np.unique(np.column_stack((predicted, target)), axis=0, return_counts=True)

# rows: predicted digit, columns: true digit
confusion_matrix = np.zeros((10, 10))
confusion_matrix[pairs[:, 0], pairs[:, 1]] = counts

# normalize each column by the number of instances of that true digit
confusion_matrix /= np.bincount(target, minlength=10)


# display as Pandas DataFrame
def fade_zeros(s):
    styles = []
    for v in s:
        styles.append("color: lightgray" if v == 0 else "color: black")
    return styles

print("confusion matrix for density tree\n(percentages in a column sum up to 100%)")
percent_table = (
    pd.DataFrame(data=confusion_matrix * 100, index=target_names, columns=target_names)
    .rename_axis("true", axis="columns")
    .rename_axis("predicted", axis="rows")
)
display(percent_table.style.apply(fade_zeros).format("{0:.2f}%"))

confusion matrix for density tree
(percentages in a column sum up to 100%)


true,0,1,2,3,4,5,6,7,8,9
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,99.44%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
1,0.00%,61.54%,13.56%,2.19%,0.00%,0.00%,0.00%,0.00%,5.75%,3.89%
2,0.00%,2.20%,57.63%,2.19%,0.00%,0.00%,0.00%,0.00%,2.30%,0.00%
3,0.00%,0.00%,5.65%,59.56%,0.00%,7.14%,0.00%,0.00%,0.57%,10.56%
4,0.56%,7.14%,0.00%,0.00%,87.85%,0.55%,0.00%,1.12%,0.57%,1.11%
5,0.00%,3.30%,0.00%,3.83%,0.00%,86.81%,0.55%,0.56%,4.02%,1.11%
6,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,99.45%,0.00%,0.00%,0.00%
7,0.00%,0.55%,0.00%,2.73%,11.60%,0.55%,0.00%,96.09%,2.87%,5.56%
8,0.00%,25.27%,23.16%,26.23%,0.55%,3.85%,0.00%,2.23%,83.91%,12.22%
9,0.00%,0.00%,0.00%,3.28%,0.00%,1.10%,0.00%,0.00%,0.00%,65.56%


The error rate is significant for some digits, with the highest error
rate being 42.37% in the classification of the digit 2. Often, 2s are
classified as 8s, but true 8s are not often classified as 2s. The
next highest error rates are in the classification of 3s and 1s
respectively, which are also both often predicted as 8s.

The lowest error rates can be seen for the digits 6 and 0, which are only
rarely mistaken for a 5 or a 4, respectively.

The total error rate in a balanced set would be 20.22%.

In [61]:
decision_tree = DecisionTree()
decision_tree.train(data, target)

# classify every training instance with the single tree
predicted = list(map(decision_tree.predict, data))

# count every (predicted, true) combination that occurs
pairs, counts = np.unique(np.column_stack((predicted, target)), axis=0, return_counts=True)

# rows: predicted digit, columns: true digit
confusion_matrix = np.zeros((10, 10))
confusion_matrix[pairs[:, 0], pairs[:, 1]] = counts

# normalize each column by the number of instances of that true digit
confusion_matrix /= np.bincount(target, minlength=10)


# display as Pandas DataFrame
def fade_zeros(s):
    styles = []
    for v in s:
        styles.append("color: lightgray" if v == 0 else "color: black")
    return styles


print("confusion matrix for decision tree\n(percentages in a column sum up to 100%)")
percent_table = (
    pd.DataFrame(data=confusion_matrix * 100, index=target_names, columns=target_names)
    .rename_axis("true", axis="columns")
    .rename_axis("predicted", axis="rows")
)
display(percent_table.style.apply(fade_zeros).format("{0:.2f}%"))

confusion matrix for decision tree
(percentages in a column sum up to 100%)


true,0,1,2,3,4,5,6,7,8,9
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,95.51%,0.00%,0.00%,0.00%,0.00%,0.55%,0.00%,0.00%,1.15%,0.56%
1,1.69%,89.56%,5.65%,0.55%,2.21%,1.10%,2.21%,5.03%,8.62%,1.11%
2,0.00%,2.75%,89.83%,4.37%,0.00%,0.55%,0.00%,0.00%,5.17%,0.56%
3,0.00%,1.10%,1.13%,85.25%,0.00%,4.40%,1.10%,0.56%,5.75%,7.22%
4,0.56%,2.75%,0.56%,1.09%,95.03%,1.10%,2.76%,5.59%,0.57%,1.67%
5,0.00%,1.10%,1.69%,2.19%,0.55%,90.11%,1.10%,2.23%,0.00%,4.44%
6,0.00%,0.00%,0.00%,2.73%,1.10%,0.00%,92.27%,0.56%,0.00%,0.00%
7,0.00%,0.00%,0.00%,0.55%,0.55%,0.00%,0.00%,81.01%,1.72%,0.00%
8,1.12%,2.75%,1.13%,2.19%,0.55%,0.55%,0.55%,2.23%,74.71%,4.44%
9,1.12%,0.00%,0.00%,1.09%,0.00%,1.65%,0.00%,2.79%,2.30%,80.00%


This method has a much better overall error rate: for a balanced set it
would be 12.67%. It is not better for every digit, however: 0, 6, 7 and 8
are classified correctly less often than by the density trees.

Most often incorrectly classified is now the digit 8, which is most often
mistaken for a 1. The digit most often correctly predicted is 0.

Overall, the decision tree seems to be a better choice for this dataset,
judging by its training errors. The incorrect guesses for the density
tree tend to fall onto a single digit in each column, whereas the
decision tree is much more spread out.

As such, the density tree seems to mistake digits for others at times,
whereas the decision tree tends to just guess a seemingly random digit
when it does not know the correct answer.


# Density and Decision Forest

In [62]:
class DensityForest:
    """Ensemble of density trees for one class, each trained on a bootstrap sample."""

    def __init__(self, n_trees):
        '''
        n_trees: number of density trees in the ensemble
        '''
        # create ensemble
        self.trees = [DensityTree() for i in range(n_trees)]
    
    def train(self, data, prior, n_min=20):
        '''
        data: the feature matrix for the digit under consideration
        prior: the prior probability of this digit
        n_min: termination criterion, forwarded to every tree
        '''
        N = data.shape[0]
        for tree in self.trees:
            # train each tree, using a bootstrap sample of the data
            # (N instances drawn with replacement)
            training_set = data[np.random.choice(N, size=N, replace=True)]
            # forward 'n_min' so that the caller's termination criterion takes effect
            tree.train(training_set, prior, n_min=n_min)

    def predict(self, x):
        '''
        x: a single feature vector

        returns: the mean of the individual tree predictions
        '''
        # compute the ensemble prediction
        return np.mean([tree.predict(x) for tree in self.trees])

In [63]:
class DecisionForest:
    """Ensemble of decision trees, each trained on a bootstrap sample, combined by majority vote."""

    def __init__(self, n_trees):
        '''
        n_trees: number of decision trees in the ensemble
        '''
        # create ensemble
        self.trees = [DecisionTree() for i in range(n_trees)]
    
    def train(self, data, labels, n_min=0):
        '''
        data: the feature matrix for all digits
        labels: the corresponding ground-truth responses
        n_min: termination criterion, forwarded to every tree when positive.
               The default 0 keeps each tree's own default, so existing calls
               behave as before; pass n_min=1 to split nodes of any size.
        '''
        N = data.shape[0]
        # only override the trees' default when the caller asked for a specific value
        tree_options = {"n_min": n_min} if n_min > 0 else {}
        for tree in self.trees:
            # train each tree, using a bootstrap sample of the data
            # (the same random indices select both instances and labels)
            training_set_indices = np.random.choice(N, size=N, replace=True)
            tree.train(data[training_set_indices], labels[training_set_indices], **tree_options)

    def predict(self, x):
        '''
        x: a single feature vector

        returns: the label predicted by most trees
        '''
        # compute the ensemble prediction
        return np.bincount([tree.predict(x) for tree in self.trees]).argmax()

# Evaluation of Density and Decision Forest

In [64]:
# train forests (with 20 trees per forest), plot training error confusion matrices, and comment on your results

# one density forest per digit, weighted by the relative frequency of that digit
forests = []
for digit_class in range(10):
    data_digits = data[target == digit_class]
    density_forest = DensityForest(20)
    density_forest.train(data_digits, data_digits.shape[0] / len(target))
    forests.append(density_forest)

# classify every training instance by the forest with the highest response
predicted = []
for digit in data:
    scores = [forest.predict(digit) for forest in forests]
    predicted.append(np.argmax(scores))

# count every (predicted, true) combination that occurs
pairs, counts = np.unique(np.column_stack((predicted, target)), axis=0, return_counts=True)

# rows: predicted digit, columns: true digit
confusion_matrix = np.zeros((10, 10))
confusion_matrix[pairs[:, 0], pairs[:, 1]] = counts

# normalize each column by the number of instances of that true digit
confusion_matrix /= np.bincount(target, minlength=10)


# display as Pandas DataFrame
def fade_zeros(s):
    styles = []
    for v in s:
        styles.append("color: lightgray" if v == 0 else "color: black")
    return styles

print("confusion matrix for density forest\n(percentages in a column sum up to 100%)")
percent_table = (
    pd.DataFrame(data=confusion_matrix * 100, index=target_names, columns=target_names)
    .rename_axis("true", axis="columns")
    .rename_axis("predicted", axis="rows")
)
display(percent_table.style.apply(fade_zeros).format("{0:.2f}%"))

confusion matrix for density forest
(percentages in a column sum up to 100%)


true,0,1,2,3,4,5,6,7,8,9
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,100.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
1,0.00%,89.56%,1.69%,0.00%,0.55%,0.00%,0.00%,0.56%,5.17%,0.00%
2,0.00%,3.85%,77.97%,1.09%,0.00%,0.00%,0.00%,0.00%,0.57%,0.00%
3,0.00%,0.00%,9.60%,85.25%,0.00%,7.69%,0.00%,0.00%,2.87%,21.67%
4,0.00%,1.10%,0.00%,0.00%,94.48%,0.55%,0.00%,0.56%,0.57%,3.89%
5,0.00%,0.55%,0.00%,0.55%,0.00%,79.67%,0.00%,0.00%,0.00%,0.56%
6,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%,0.00%,0.00%
7,0.00%,0.00%,0.00%,1.09%,4.97%,0.55%,0.00%,97.77%,0.57%,6.11%
8,0.00%,4.95%,10.73%,11.48%,0.00%,9.34%,0.00%,1.12%,90.23%,11.11%
9,0.00%,0.00%,0.00%,0.55%,0.00%,2.20%,0.00%,0.00%,0.00%,56.67%


Naturally, the forests have much better overall error rates than the
individual trees. The Density Forest seems to have the same problem
as the Density Tree, however, in that it tends to mistake a digit
for another.

The highest error rate is now seen in the classification of 9s, which
are often mistaken for 3s or 8s.

The overall error rate for a balanced set is 12.84%, which is a decent
improvement over the individual density tree.

In [65]:
decision_forest = DecisionForest(20)
decision_forest.train(data, target)

# classify every training instance by majority vote of the forest
predicted = list(map(decision_forest.predict, data))

# count every (predicted, true) combination that occurs
pairs, counts = np.unique(np.column_stack((predicted, target)), axis=0, return_counts=True)

# rows: predicted digit, columns: true digit
confusion_matrix = np.zeros((10, 10))
confusion_matrix[pairs[:, 0], pairs[:, 1]] = counts

# normalize each column by the number of instances of that true digit
confusion_matrix /= np.bincount(target, minlength=10)


# display as Pandas DataFrame
def fade_zeros(s):
    styles = []
    for v in s:
        styles.append("color: lightgray" if v == 0 else "color: black")
    return styles


print("confusion matrix for decision forest\n(percentages in a column sum up to 100%)")
percent_table = (
    pd.DataFrame(data=confusion_matrix * 100, index=target_names, columns=target_names)
    .rename_axis("true", axis="columns")
    .rename_axis("predicted", axis="rows")
)
display(percent_table.style.apply(fade_zeros).format("{0:.2f}%"))

confusion matrix for decision forest
(percentages in a column sum up to 100%)


true,0,1,2,3,4,5,6,7,8,9
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,99.44%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
1,0.00%,100.00%,0.00%,0.00%,0.00%,0.00%,0.55%,0.00%,1.72%,0.56%
2,0.00%,0.00%,100.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
3,0.00%,0.00%,0.00%,100.00%,0.00%,0.55%,0.00%,0.00%,0.57%,0.56%
4,0.56%,0.00%,0.00%,0.00%,99.45%,0.00%,0.00%,0.00%,0.00%,0.00%
5,0.00%,0.00%,0.00%,0.00%,0.00%,99.45%,0.00%,0.00%,0.57%,0.56%
6,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,99.45%,0.00%,0.00%,0.00%
7,0.00%,0.00%,0.00%,0.00%,0.55%,0.00%,0.00%,100.00%,0.57%,1.11%
8,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,96.55%,0.56%
9,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,96.67%


The Decision Forest has an excellent overall error rate for a balanced set
of 0.899%, which is a massive improvement over the individual Decision Tree.

As an additional effect, it no longer shows the individual decision tree's
problem of guessing a seemingly random digit when it is unsure. This is
probably because, on our training set, there are almost no digits left
that it misclassifies at all.

In [66]:
sklearn_forest = RandomForestClassifier(n_estimators=20)
sklearn_forest.fit(data, target)

# classify the whole training set in one call
# (instead of one 'predict' call per instance)
predicted = sklearn_forest.predict(data)

# count every (predicted, true) combination that occurs
pairs, counts = np.unique(np.stack((predicted, target)).T, axis=0, return_counts=True)

# rows: predicted digit, columns: true digit
confusion_matrix = np.zeros((10,10))
for pair, count in zip(pairs, counts):
    confusion_matrix[pair[0], pair[1]] = count
    
# normalize each column by the number of instances of that true digit
for i in range(10):
    confusion_matrix[:,i] /= np.sum(target == i)


# display as Pandas DataFrame
def fade_zeros(s):
    return ["color: lightgray" if (v == 0) else "color: black" for v in s]


print("confusion matrix for sklearn\n(percentages in a column sum up to 100%)")
display(
    pd.DataFrame(data=confusion_matrix[:, :]*100, index=target_names, columns=target_names)
    .rename_axis("true", axis="columns")
    .rename_axis("predicted", axis="rows")
    .style.apply(fade_zeros)
    .format("{0:.2f}%")
)

confusion matrix for sklearn
(percentages in a column sum up to 100%)


true,0,1,2,3,4,5,6,7,8,9
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,100.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
1,0.00%,100.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
2,0.00%,0.00%,100.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
3,0.00%,0.00%,0.00%,100.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
4,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%,0.00%,0.00%,0.00%,0.00%
5,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%,0.00%,0.00%,0.00%
6,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%,0.00%,0.00%
7,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%,0.00%
8,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%
9,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%


`sklearn`'s Forest has a 0% training error rate.
This may, however, hint at overfitting. Since we do not have a test set to
evaluate this on, we cannot make a definite conclusion.

