In [3]:
import pandas as pd
import numpy as np

# Read the credit-card approval table from disk
df = pd.read_csv('synthetic_credit_card_approval.csv')

# The 'Target' column is the label; every other column is a feature
y = df['Target'].values
X = df.drop(columns='Target').values

# Reproducible 70/30 split: shuffle the row positions with a fixed seed,
# then cut the shuffled order at the 70% mark
np.random.seed(42)
indices = np.arange(len(X))
np.random.shuffle(indices)
train_size = int(0.7 * indices.size)
train_idx = indices[:train_size]
test_idx = indices[train_size:]
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

class DecisionTreeNode:
    """One node of a binary decision tree.

    An internal node stores the split it applies (``feature_index`` and
    ``threshold``) together with its ``left`` and ``right`` children.
    A leaf stores only ``value``, which must be passed by keyword.
    Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
        # Split description
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees
        self.left, self.right = left, right
        # Label stored on the node (keyword-only argument)
        self.value = value

class DecisionTreeClassifierScratch:
    """Decision tree classifier built from scratch on top of numpy.

    The tree is grown greedily: at every node each feature is tried at its
    25th, 50th and 75th percentile and the split with the lowest weighted
    Gini impurity is kept. Samples with ``feature < threshold`` go to the
    left child, all others to the right child.

    Parameters
    ----------
    max_depth : int
        A node at this depth (root is depth 0) always becomes a leaf.
    min_samples_split : int
        A node holding fewer samples than this always becomes a leaf.
    verbose : bool, keyword-only
        When true (the default) progress messages are printed while fitting.
    """

    # Nodes deeper than this are built silently to keep the log short.
    _MAX_LOG_DEPTH = 2

    def __init__(self, max_depth=3, min_samples_split=10, *, verbose=True):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.verbose = verbose
        self.root = None

    def _log(self, message, depth=0):
        """Print ``message`` indented by ``depth`` if logging applies."""
        if self.verbose and depth <= self._MAX_LOG_DEPTH:
            print(f"{'  ' * depth}{message}")

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like, two-dimensional (samples x features)
        y : array-like, one-dimensional, one label per row of ``X``

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ,
            or there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                f"y must be 1-dimensional with one label per row of X "
                f"(X has {X.shape[0]} rows, y has shape {y.shape})"
            )
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty dataset")

        self._log("Starting tree construction...")
        self.root = self._build_tree(X, y)
        self._log("Tree construction completed.")

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``."""
        n_samples = X.shape[0]
        is_pure = np.unique(y).size == 1

        if depth >= self.max_depth or n_samples < self.min_samples_split or is_pure:
            leaf_value = self._most_common_label(y)
            self._log(f"Creating leaf node with class {leaf_value}", depth)
            return DecisionTreeNode(value=leaf_value)

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            leaf_value = self._most_common_label(y)
            self._log(f"No good split found, creating leaf node with class {leaf_value}", depth)
            return DecisionTreeNode(value=leaf_value)

        self._log(f"Splitting on feature {best_feature} at threshold {best_threshold:.4f}", depth)

        # _best_split only returns splits with samples on both sides, and the
        # right side (>= threshold) is exactly the complement of the left.
        left_mask = X[:, best_feature] < best_threshold
        right_mask = ~left_mask

        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return DecisionTreeNode(feature_index=best_feature, threshold=best_threshold, left=left, right=right)

    def _best_split(self, X, y):
        """Return ``(feature_index, threshold)`` of the lowest-Gini split.

        Returns ``(None, None)`` when no candidate threshold puts at least
        one sample on each side.
        """
        best_gini = float('inf')
        best_feature, best_threshold = None, None

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            # Only the quartiles are tried to keep the search cheap. Repeated
            # quartiles (common for binary features) would score identically,
            # so they are dropped; np.unique keeps the ascending order.
            thresholds = np.unique(np.percentile(column, [25, 50, 75]))
            for threshold in thresholds:
                left_mask = column < threshold
                n_left = np.count_nonzero(left_mask)
                # Skip degenerate splits that leave one side empty.
                if n_left == 0 or n_left == column.size:
                    continue

                gini = self._gini(y[left_mask], y[~left_mask])

                # Strict comparison: the first of several equal splits wins.
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_index
                    best_threshold = threshold

        return best_feature, best_threshold

    @staticmethod
    def _impurity(labels):
        """Return the Gini impurity ``1 - sum(p_c ** 2)`` of ``labels``."""
        _, counts = np.unique(labels, return_counts=True)
        proportions = counts / counts.sum()
        return 1.0 - np.sum(proportions ** 2)

    def _gini(self, left_labels, right_labels):
        """Return the size-weighted Gini impurity of a two-way split."""
        n_left, n_right = len(left_labels), len(right_labels)
        m = n_left + n_right
        return (n_left / m) * self._impurity(left_labels) + (n_right / m) * self._impurity(right_labels)

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.root is None:
            raise RuntimeError("this tree has not been fitted yet; call fit() first")
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its label."""
        # A node is a leaf exactly when it carries a value.
        while node.value is None:
            if x[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def evaluate(self, X, y_true):
        """Print accuracy, precision, recall and the confusion matrix.

        Precision, recall and the confusion matrix treat label ``1`` as the
        positive class and label ``0`` as the negative class.
        """
        y_true = np.asarray(y_true)
        y_pred = self.predict(X)
        accuracy = np.mean(y_pred == y_true)
        tp = np.sum((y_true == 1) & (y_pred == 1))
        tn = np.sum((y_true == 0) & (y_pred == 0))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))
        # The tiny epsilon avoids division by zero when a denominator is 0.
        precision = tp / (tp + fp + 1e-15)
        recall = tp / (tp + fn + 1e-15)
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print("Confusion Matrix:")
        print(f"TP: {tp}, FP: {fp}")
        print(f"FN: {fn}, TN: {tn}")


# ---------------------------------------------------
# Train the scratch decision tree, then score it on
# the held-out test split
# ---------------------------------------------------
tree_params = dict(max_depth=3, min_samples_split=10)
tree_model = DecisionTreeClassifierScratch(**tree_params)
tree_model.fit(X_train, y_train)

print("\nEvaluating on test data:")
tree_model.evaluate(X_test, y_test)


Starting tree construction...
Splitting on feature 2 at threshold 72101.0000
  Splitting on feature 2 at threshold 62753.0000
    Creating leaf node with class 0
    Splitting on feature 4 at threshold 1.0000
  Splitting on feature 2 at threshold 81439.0000
    Splitting on feature 1 at threshold 1.0000
    Splitting on feature 2 at threshold 88387.0000
Tree construction completed.

Evaluating on test data:
Accuracy: 0.9364
Precision: 0.8734
Recall: 0.9765
Confusion Matrix:
TP: 56444, FP: 8184
FN: 1360, TN: 84012
