1. scratch

In [3]:
import numpy as np
from collections import Counter

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(np.array(X), np.array(y), depth=0)

    def _build_tree(self, X, y, depth):
        num_samples, num_features = X.shape
        print(X.shape)
        unique_labels = np.unique(y)

        # Stopping conditions
        if len(unique_labels) == 1:
            return unique_labels[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return Counter(y).most_common(1)[0][0]
        if num_samples == 0:
            return None

        # Find best split
        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            return Counter(y).most_common(1)[0][0]

        # Split dataset
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = X[:, best_feature] > best_threshold

        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            "feature": best_feature,
            "threshold": best_threshold,
            "left": left_subtree,
            "right": right_subtree
        }

    def _best_split(self, X, y):
        best_gain = 0
        best_feature = None
        best_threshold = None
        current_entropy = self._entropy(y)

        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                left_mask = X[:, feature_index] <= threshold
                right_mask = X[:, feature_index] > threshold
                if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
                    continue

                gain = self._information_gain(y, y[left_mask], y[right_mask], current_entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_index
                    best_threshold = threshold

        return best_feature, best_threshold

    def _entropy(self, y):
        counts = np.bincount(y)
        probs = counts / len(y)
        return -np.sum([p * np.log2(p) for p in probs if p > 0])

    def _information_gain(self, parent, left, right, parent_entropy):
        weight_left = len(left) / len(parent)
        weight_right = len(right) / len(parent)
        return parent_entropy - (weight_left * self._entropy(left) + weight_right * self._entropy(right))

    def predict(self, X):
        return [self._predict_point(x, self.tree) for x in np.array(X)]

    def _predict_point(self, x, tree):
        if not isinstance(tree, dict):
            return tree

        feature = tree["feature"]
        print('f', feature)
        threshold = tree["threshold"]
        print('t', threshold)

        if x[feature] <= threshold:
            return self._predict_point(x, tree["left"])
        else:
            return self._predict_point(x, tree["right"])

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train & Predict
clf = DecisionTree(max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))


(105, 4)
(36, 4)
(69, 4)
(34, 4)
(30, 4)
(4, 4)
(35, 4)
(4, 4)
(31, 4)
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 3
t 1.6
f 2
t 4.8
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 2
t 1.9
f 3
t 1.6
f 2
t 4.9
f 2
t 1.9
f 3
t 1.6

2. sklearn

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train model
clf = DecisionTreeClassifier(max_depth=3,criterion="entropy")
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9555555555555556
