In [10]:
from ucimlrepo import fetch_ucirepo 
  
# Download the UCI ML Repository dataset registered under id 14.
breast_cancer = fetch_ucirepo(id=14)

# Feature table and target table (as pandas dataframes).
X, y = breast_cancer.data.features, breast_cancer.data.targets

# Uncomment to inspect the dataset metadata and the per-variable information.
# print(breast_cancer.metadata)
# print(breast_cancer.variables)

In [11]:
# Show (rows, columns) for the features and then for the targets, one per line.
print(X.shape, y.shape, sep="\n")

(286, 9)
(286, 1)


In [73]:
import numpy as np
import pandas as pd

class DecisionTree:
    """Classification tree grown by minimising weighted Gini impurity.

    Every internal node tests ``feature < threshold``.  Rows for which the
    test is true go to the ``left`` child; all other rows -- including rows
    whose feature value is missing -- go to the ``right`` child.  The same
    rule is used while fitting and while predicting.

    The fitted tree is stored in ``self.tree`` as nested dicts:

    * leaf: ``{"leaf": True, "value": label}``
    * internal node: ``{"leaf": False, "feature": ..., "threshold": ...,
      "left": node, "right": node}``
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which growth stops (the root is depth 0).
                ``None`` means the depth is not limited.
        """
        self.max_depth = max_depth
        self.tree = None

    def build(self, X, y):
        """Fit the tree and store it in ``self.tree``.

        Args:
            X: Feature DataFrame.
            y: Class labels, either a Series or a DataFrame whose first
                column holds the labels.  Row masks computed from ``X`` are
                applied to ``y``, so both must share the same index.
        """
        self.tree = self._build_tree(X, y, depth=0)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: Feature DataFrame with the columns used during ``build``.

        Returns:
            ``np.ndarray`` with one predicted label per row.

        Raises:
            RuntimeError: If ``build`` has not been called yet.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before build()")
        return np.array([self._traverse_tree(row, self.tree) for _, row in X.iterrows()])

    @staticmethod
    def _labels(y):
        """Return the label column: first column of a DataFrame, else ``y`` itself."""
        return y.iloc[:, 0] if isinstance(y, pd.DataFrame) else y

    @staticmethod
    def _majority(labels):
        """Return the most frequent label, or ``None`` when there is none."""
        modes = labels.mode()
        # ``mode()`` is empty for empty input (and when every label is missing).
        return modes.iloc[0] if len(modes) else None

    def _gini(self, y):
        """Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``."""
        proportions = y.value_counts(normalize=True)
        return 1 - np.sum(proportions**2)

    def _split(self, X, y, feature_index, threshold):
        """Partition ``X`` and ``y`` on ``X[feature_index] < threshold``.

        Returns:
            ``(X_left, y_left, X_right, y_right)``.  Rows where the comparison
            is false -- which includes missing values -- land on the right.
        """
        left_mask = X[feature_index] < threshold
        right_mask = ~left_mask
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

    def _best_split(self, X, y):
        """Find the ``(feature, threshold)`` pair with the lowest weighted Gini.

        Every distinct non-missing value of every column is tried as a
        threshold.  Candidates that leave one side empty are ignored.

        Returns:
            ``(feature, threshold)``, or ``None`` if no candidate separates
            the rows into two non-empty groups.
        """
        best_gini = float("inf")
        best_split = None
        n_rows = len(y)

        for feature in X.columns:
            # A missing threshold can never send a row left, so drop it up front.
            for threshold in X[feature].dropna().unique():
                _, y_left, _, y_right = self._split(X, y, feature, threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                weighted_gini = (
                    len(y_left) * self._gini(y_left) + len(y_right) * self._gini(y_right)
                ) / n_rows

                # Strict comparison: on ties the first candidate found is kept.
                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_split = (feature, threshold)

        return best_split

    def _build_tree(self, X, y, depth):
        """Recursively grow the subtree for the rows in ``X`` / ``y``.

        Growth stops with a leaf when there are no rows, when all labels are
        identical, when ``depth`` reaches ``max_depth`` or when no useful
        split exists.
        """
        labels = self._labels(y)

        # No rows: there is no majority class to report.
        if len(labels) == 0:
            return {"leaf": True, "value": None}

        if len(labels.unique()) == 1 or depth == self.max_depth:
            return {"leaf": True, "value": self._majority(labels)}

        split = self._best_split(X, y)
        if split is None:
            return {"leaf": True, "value": self._majority(labels)}

        feature, threshold = split
        X_left, y_left, X_right, y_right = self._split(X, y, feature, threshold)

        return {
            "leaf": False,
            "feature": feature,
            "threshold": threshold,
            "left": self._build_tree(X_left, y_left, depth + 1),
            "right": self._build_tree(X_right, y_right, depth + 1),
        }

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for the single row ``x``.

        Args:
            x: One row of features, indexable by feature name.
            node: Node dict to start from.

        Returns:
            The ``"value"`` stored in the leaf that is reached.
        """
        while not node["leaf"]:
            value = x[node["feature"]]
            # Missing values go right, exactly as in ``_split`` where the
            # ``<`` comparison is false for them.  Checking first also avoids
            # comparing a float NaN with a non-numeric threshold.
            if not pd.isna(value) and value < node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node["value"]

In [78]:
from sklearn.model_selection import KFold

# 10-fold cross-validation of DecisionTree with the depth capped at 5.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_scores = []  # one accuracy value per fold, in fold order

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Positional selection keeps the feature rows and target rows paired.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    tree = DecisionTree(max_depth=5)
    tree.build(X_train, y_train)

    predictions = tree.predict(X_test)
    # Count the test rows whose prediction equals the "Class" column.
    guessed = (predictions == y_test["Class"].to_numpy()).sum()
    fold_accuracy = guessed / len(y_test)
    fold_scores.append(fold_accuracy)

    print(f"Fold {fold}: ", fold_accuracy)

# Per-fold accuracies as an array, then their mean.
accuracy = np.array(fold_scores)
print(accuracy.mean())

Fold 1:  0.5517241379310345
Fold 2:  0.6896551724137931
Fold 3:  0.7586206896551724
Fold 4:  0.5862068965517241
Fold 5:  0.5862068965517241
Fold 6:  0.6206896551724138
Fold 7:  0.6785714285714286
Fold 8:  0.7142857142857143
Fold 9:  0.75
Fold 10:  0.8571428571428571
[0.55172414 0.68965517 0.75862069 0.5862069  0.5862069  0.62068966
 0.67857143 0.71428571 0.75       0.85714286]
0.6793103448275862
