# Introduction

A basic decision tree.

# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
# Load data

df = pd.read_csv("../../data/Iris.csv")
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Drop Id

df = df.drop(columns=["Id"])

In [54]:
# Seperate features and labels

features = df.drop(columns=["Species"]).values
labels = df["Species"].unique()
labels_encoded = np.argmax(pd.get_dummies(df["Species"]).astype(int).values, axis=1)

features[:5], labels_encoded[:5], labels

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]),
 array([0, 0, 0, 0, 0], dtype=int64),
 array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object))

In [55]:
# Train and test splits

X_train, X_test, y_train, y_test = train_test_split(features, labels_encoded, test_size=0.2)

In [56]:
print(f'Training dataset size: {len(X_train)}')
print(f'Test dataset size: {len(X_test)}')

Training dataset size: 120
Test dataset size: 30


# Model

In [46]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

In [15]:
def gini(y):
    classes, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return 1 - np.sum(probs ** 2)

In [None]:
def split_data(X, y, feature_index, threshold):
    left_mask = X[:, feature_index] <= threshold
    right_mask = X[:, feature_index] > threshold
    return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

In [None]:
def best_split(X, y):
    n_samples, n_features = X.shape
    best_gini = float("inf")
    best_index = None
    best_threshold = None

    for feature_index in range(n_features):
        thresholds = np.unique(X[:, feature_index])
        for threshold in thresholds:
            X_left, y_left, X_right, y_right = split_data(X, y, feature_index, threshold)
            
            if len(y_left) == 0 or len(y_right) == 0:
                continue
            
            gini_left = gini(y_left)
            gini_right = gini(y_right)
            weighted_gini = (len(y_left) * gini_left + len(y_right) * gini_right) / len(y)

            if weighted_gini < best_gini:
                best_gini = weighted_gini
                best_index = feature_index
                best_threshold = threshold

    return best_index, best_threshold


In [47]:
def build_tree(X, y, max_depth=5, min_samples_split=2, depth=0):
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))

    # Stop criteria
    if depth >= max_depth or n_labels <= 1 or n_samples < min_samples_split:
        leaf_value = np.bincount(y).argmax() # Most common label
        return Node(value=leaf_value)
    
    feature_index, threshold = best_split(X, y)
    if feature_index == None:
        leaf_value = np.bincount(y).argmax() # Most common label
        return Node(value=leaf_value)
    
    X_left, y_left, X_right, y_right = split_data(X, y, feature_index, threshold)
    left_subtree = build_tree(X_left, y_left, max_depth=max_depth, min_samples_split=min_samples_split, depth=depth+1)
    right_subtree = build_tree(X_right, y_right, max_depth=max_depth, min_samples_split=min_samples_split, depth=depth+1)

    return Node(feature_index=feature_index, threshold=threshold, left=left_subtree, right=right_subtree)


In [48]:
def predict(sample, tree: Node):
    # This is a leaf
    if tree.value != None:
        return tree.value
    
    if sample[tree.feature_index] <= tree.threshold:
        return predict(sample, tree.left)
    else:
        return predict(sample, tree.right) 

In [49]:
def predict_dataset(X, tree):
    return np.array([predict(sample, tree) for sample in X])

In [67]:
def calculate_accuracy(X, y, tree):
    return (predict_dataset(X, tree) == y).astype(int).sum() / len(y)

In [73]:
for i in range(1, 25):
    tree = build_tree(X_train, y_train, max_depth=i, min_samples_split=2)
    train_acc = calculate_accuracy(X_train, y_train, tree)
    test_acc = calculate_accuracy(X_test, y_test, tree)

    print(f"Depth: {i} | Train Accuracy: {train_acc*100:.2f}% | Test Accuracy: {test_acc*100:.2f}%")

Depth: 1 | Train Accuracy: 65.83% | Test Accuracy: 70.00%
Depth: 2 | Train Accuracy: 97.50% | Test Accuracy: 90.00%
Depth: 3 | Train Accuracy: 99.17% | Test Accuracy: 86.67%
Depth: 4 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 5 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 6 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 7 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 8 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 9 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 10 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 11 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 12 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 13 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 14 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 15 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 16 | Train Accuracy: 100.00% | Test Accuracy: 90.00%
Depth: 17 | Train Accuracy: 100.00% | Test Accuracy:

# Conclusion

The decision tree algorithm works steadily on Iris classification. It can achieve full accuracy on train dataset and 90% accuracy on test dataset with a depth of 4. Increased depth doesn't seem to improve performance.

Metrics:
- Highest accuracy achieved on test dataset: 90%