In [2]:
import numpy as np

## Decision Tree

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report


# Load scikit-learn's bundled breast-cancer dataset; the feature matrix is
# read from ds["data"] and the labels from ds["target"].
ds = load_breast_cancer()

# Hold out 20% of the samples for testing. The fixed random_state keeps the
# shuffled split identical between runs.
x_tr, x_test, y_tr, y_test = train_test_split(ds["data"], ds["target"], test_size=0.2, random_state=42, shuffle=True)

### Ручная реализация на numpy

In [13]:
from typing import Optional, Literal


class Node:
    """Internal decision node of the NumPy-based tree.

    A node sends a sample to ``left`` when ``sample[feature] < threshold``
    and to ``right`` otherwise. In this notebook ``DecissionTree.split_node``
    stores leaves as plain probability lists rather than ``Node`` objects,
    so ``left``/``right`` hold either a ``Node`` or such a list.

    Attributes:
        feature: Column index of the feature tested at this node.
        threshold: Split value compared against the feature.
        probs: Class distribution of the training samples that reached the
            node (``split_node`` passes a list ordered like the tree's
            ``classes``, despite the ``dict`` annotation).
        left: Subtree for samples below the threshold.
        right: Subtree for the remaining samples.
    """

    def __init__(
        self,
        feature: int,
        threshold: np.number,
        probs: Optional[dict] = None,
        left: Optional['Node'] = None,
        right: Optional['Node'] = None,
    ):
        # The split rule itself.
        self.feature, self.threshold = feature, threshold
        # Class distribution observed at this node during training.
        self.probs = probs
        # Subtrees (or leaf payloads) on each side of the split.
        self.left, self.right = left, right


class DecissionTree:
    """Greedy binary classification tree over numeric features (NumPy only).

    Every internal node tests ``x[feature] < threshold``; samples for which
    the test holds go left, all others go right. Leaves are stored as plain
    lists of class probabilities ordered like ``self.classes``.

    A feature that was already used on the path from the root is excluded
    from the candidates further down that path (see ``split_node``), so each
    feature appears at most once per root-to-leaf path.

    Args:
        max_depth: Nodes at this depth are never split.
        min_samples_split: Minimum number of samples a node needs to be
            considered for splitting.
        min_samples_leaf: Minimum number of samples required on each side of
            a candidate split.
        criterion: Impurity measure, ``"gini"`` or ``"entropy"``.

    Raises:
        ValueError: If ``criterion`` is not one of the supported names.
    """

    def __init__(
        self,
        max_depth: int = 5,
        min_samples_split: int = 2,
        min_samples_leaf: int = 2,
        criterion: Literal["gini", "entropy"] = "gini",
    ):
        self.tree = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.classes = []
        if criterion == "gini":
            self.criterion = DecissionTree.gini
        elif criterion == "entropy":
            self.criterion = DecissionTree.entropy
        else:
            raise ValueError(criterion)

    @staticmethod
    def entropy(y: np.ndarray):
        """Return the Shannon entropy (in bits) of the labels in ``y``."""
        # np.unique only reports labels that occur, so every p is > 0 and
        # log2 never sees a zero.
        _, counts = np.unique(y, return_counts=True)
        p = counts / np.sum(counts)

        return -np.sum(p * np.log2(p))

    @staticmethod
    def gini(y: np.ndarray):
        """Return the Gini impurity of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        p = counts / np.sum(counts)

        return 1 - np.sum(p ** 2)

    def information_gain(
        self,
        x: np.ndarray,
        y: np.ndarray,
        criterion_idx: int,
        criterion_threshold: np.number,
    ):
        """Return the impurity decrease of splitting on one feature/threshold.

        Args:
            x: Feature matrix of the node, indexed as ``x[:, criterion_idx]``.
            y: Labels aligned with the rows of ``x``.
            criterion_idx: Column index of the feature to split on.
            criterion_threshold: Rows with a feature value strictly below
                this threshold form the left part, the rest the right part.

        Returns:
            Parent impurity minus the size-weighted impurities of both parts,
            measured with the criterion chosen at construction time.
        """
        parent_impurity = self.criterion(y)
        column = x[:, criterion_idx]

        left_y = y[column < criterion_threshold]
        left_impurity = self.criterion(left_y)
        left_support = len(left_y) / len(y)

        right_y = y[column >= criterion_threshold]
        right_impurity = self.criterion(right_y)
        right_support = len(right_y) / len(y)

        return parent_impurity - left_support * left_impurity - right_support * right_impurity

    def _calc_probs(self, y: np.ndarray):
        """Return class frequencies of ``y`` as a list ordered like ``self.classes``."""
        classes, counts = np.unique(y, return_counts=True)
        probs_dict = dict(zip(classes, counts / len(y)))

        # Classes missing from this node get an explicit zero probability.
        return [probs_dict.get(c, np.float64(0.)) for c in self.classes]

    def split_node(self, x: np.ndarray, y: np.ndarray, depth: int, used: set):
        """Recursively build the subtree for the samples ``(x, y)``.

        Args:
            x: Feature matrix of the samples that reached this node.
            y: Labels aligned with the rows of ``x``.
            depth: Depth of this node (the root is 0).
            used: Feature indices already split on along the path from the
                root; they are not considered again below.

        Returns:
            A ``Node`` when a useful split exists, otherwise the list of
            class probabilities that serves as the leaf.
        """
        n_samples, n_features = x.shape

        probs = self._calc_probs(y)

        if (n_samples < self.min_samples_split) or (depth >= self.max_depth):
            return probs

        best_info_gain, best_feature, best_threshold = -np.inf, None, None
        # Sorted so that ties between equally good splits are always resolved
        # in favour of the lowest feature index.
        for feature in sorted(set(range(n_features)) - used):
            column = x[:, feature]
            for threshold in np.unique(column):
                # Count the sides instead of materialising a copy of x.
                n_left = np.count_nonzero(column < threshold)
                if (n_left < self.min_samples_leaf) or (n_samples - n_left < self.min_samples_leaf):
                    continue

                gain = self.information_gain(
                    x, y, criterion_idx=feature, criterion_threshold=threshold,
                )

                if gain > best_info_gain:
                    best_info_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        # No admissible split, or none that reduces impurity. `<= 0` also
        # catches tiny negative gains caused by floating-point rounding.
        if best_feature is None or best_info_gain <= 0:
            return probs

        mask = x[:, best_feature] < best_threshold
        next_used = used | {best_feature}
        left_node = self.split_node(x[mask], y[mask], depth + 1, next_used)
        right_node = self.split_node(x[~mask], y[~mask], depth + 1, next_used)

        return Node(
            feature=best_feature,
            threshold=best_threshold,
            probs=probs,
            left=left_node,
            right=right_node,
        )

    def fit(self, x: np.ndarray, y: np.ndarray):
        """Learn the tree from feature matrix ``x`` and labels ``y``."""
        # np.unique already returns the distinct labels in sorted order.
        self.classes = list(np.unique(y))
        self.tree = self.split_node(x, y, depth=0, used=set())

    def predict_proba(self, x: np.ndarray):
        """Return class probabilities for every row of ``x``.

        Returns:
            Array of shape ``(len(x), len(self.classes))`` whose columns
            follow the order of ``self.classes``.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("DecissionTree is not fitted yet; call fit() first.")

        preds = []
        for x_obj in x:
            node = self.tree
            # Descend until a leaf (a plain probability list) is reached.
            while isinstance(node, Node):
                node = node.left if x_obj[node.feature] < node.threshold else node.right

            preds.append(node)

        # The explicit shape keeps the result 2-D even when x has no rows.
        return np.array(preds, dtype=float).reshape(len(preds), len(self.classes))

    def predict(self, x: np.ndarray):
        """Return the most probable class label for every row of ``x``."""
        probs = self.predict_proba(x)
        # argmax yields a column index; translate it back into the label
        # seen during fit so labels other than 0..k-1 are reported correctly.
        return np.asarray(self.classes)[np.argmax(probs, axis=-1)]

In [14]:
# Fit the hand-written tree on the training split. The hyper-parameters match
# the scikit-learn model configured further down so the two can be compared.
tree = DecissionTree(
    max_depth=3, 
    min_samples_split=10, 
    min_samples_leaf=10,
)
tree.fit(x_tr, y_tr)

In [25]:
# Per-class precision/recall/F1 of the hand-written tree on the held-out split.
print("Результаты на test:\n\n", classification_report(y_test, tree.predict(x_test)))
# ROC-AUC is computed from column 1 of predict_proba, used as the score of class 1.
print("ROC-AUC на test:", roc_auc_score(y_test, tree.predict_proba(x_test)[:, 1]).round(3))

Результаты на test:

               precision    recall  f1-score   support

           0       0.95      0.91      0.93        43
           1       0.95      0.97      0.96        71

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

ROC-AUC на test: 0.967


### Сравнение с sklearn.tree.DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier


# Reference model: scikit-learn's tree with the same criterion and limits as
# the hand-written implementation above.
sklearn_tree = DecisionTreeClassifier(
    criterion="gini", 
    max_depth=3, 
    min_samples_split=10, 
    min_samples_leaf=10,
)
sklearn_tree.fit(x_tr, y_tr)

In [26]:
# Same evaluation as for the hand-written tree, now for the scikit-learn model.
print("Результаты на test:\n\n", classification_report(y_test, sklearn_tree.predict(x_test)))
# ROC-AUC is computed from column 1 of predict_proba, used as the score of class 1.
print("ROC-AUC на test:", roc_auc_score(y_test, sklearn_tree.predict_proba(x_test)[:, 1]).round(3))

Результаты на test:

               precision    recall  f1-score   support

           0       0.95      0.91      0.93        43
           1       0.95      0.97      0.96        71

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

ROC-AUC на test: 0.967


### Реализация с pandas

In [1]:
import numpy as np


def enthropy(*probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        *probs: Probabilities of the individual outcomes.

    Returns:
        ``-sum(p * log2(p))`` over the given probabilities. Outcomes with
        probability exactly 0 contribute nothing (the usual ``0 * log 0 = 0``
        convention) instead of turning the whole result into NaN.
    """
    return -sum(p * np.log2(p) for p in probs if p != 0)


# Compare a uniform distribution over five outcomes with a non-uniform one;
# the uniform case gives the larger value (log2(5) ≈ 2.32 bits).
enthropy(*[0.2 for _ in range(5)]), enthropy(0.15, 0.25, 0.3, 0.1, 0.2)

(np.float64(2.321928094887362), np.float64(2.228212945841001))

In [None]:
from typing import Union
import pandas as pd
import numpy as np


def enthropy(*probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        *probs: Probabilities of the individual outcomes.

    Returns:
        ``-sum(p * log2(p))`` over the given probabilities. Outcomes with
        probability exactly 0 contribute nothing (the usual ``0 * log 0 = 0``
        convention) instead of turning the whole result into NaN.
    """
    return -sum(p * np.log2(p) for p in probs if p != 0)


class Node:
    """Node of the pandas-based decision tree.

    A node without children is a leaf: it writes the index of its most
    probable class into the ``"y_"`` column of the frame it is called on.
    A node with children evaluates ``criterion`` on the rows it received and
    forwards matching rows to ``childs[0]`` and the rest to ``childs[1]``.

    Args:
        n_classes: Number of classes; ``probs`` starts as a uniform
            distribution of this length.
        level: Depth of the node (the root is created with level 0).
    """

    def __init__(self, n_classes: int, level: int):
        self.parent = None
        self.childs = []

        self.criterion = None
        self.probs = [1/n_classes for _ in range(n_classes)]

        self.n_classes = n_classes
        self.level = level

        # Row labels that reached this leaf during the last training pass.
        self._indexes = set()

    def _set_criterion(self, criterion: callable):
        """Turn this node into an internal node with two fresh children.

        Args:
            criterion: Callable that takes a DataFrame and returns a boolean
                mask over its rows; ``True`` rows go to ``childs[0]``.
        """
        self.criterion = criterion
        self.childs = [Node(self.n_classes, level=self.level+1) for _ in range(2)]
        for child in self.childs:
            child.parent = self

    def __call__(self, data: pd.DataFrame, indexes: list = None, train: bool = False):
        """Route the selected rows of ``data`` down the tree and label them.

        Args:
            data: Frame holding the samples; leaves write into its ``"y_"``
                column in place.
            indexes: Row labels of ``data`` that reached this node. ``None``
                means every row.
            train: When true, each leaf remembers the row labels it received
                in ``_indexes``.
        """
        if indexes is None:
            indexes = data.index

        if self.childs:
            # Evaluate the criterion only on the rows that actually reached
            # this node; otherwise rows belonging to sibling branches would
            # be routed (and overwritten) further down.
            subset = data.loc[indexes]
            goes_left = np.asarray(self.criterion(subset), dtype=bool)
            self.childs[0](data, subset.index[goes_left], train)
            self.childs[1](data, subset.index[~goes_left], train)
        else:
            if train:
                self._indexes = indexes

            data.loc[indexes, "y_"] = np.argmax(self.probs)


class DecissionTree:
    """Decision tree grown best-first on top of ``Node`` objects and pandas.

    Args:
        classes: The class labels. Leaf probabilities, and therefore the
            class indexes written by the leaves, follow the iteration order
            of this collection.
        max_level: Leaves at this level are never split; the root is level 0.
    """

    def __init__(self, classes: Union[set, list], max_level: int = 1):
        self.head = Node(len(classes), level=0)
        self.leaves = [self.head]

        self.max_level = max_level
        self.classes = classes

    def _add_nodes(self, leaf_index: int, criterion: callable):
        """Split the leaf at ``leaf_index`` and register its two children as leaves."""
        parent = self.leaves.pop(leaf_index)
        parent._set_criterion(criterion)
        self.leaves.extend(parent.childs)

    def __call__(self, data: pd.DataFrame):
        """Predict a class index for every row of ``data``.

        Note that ``data`` is modified in place: a ``"y_"`` column holding
        the predictions is added (or overwritten).

        Returns:
            The contents of the ``"y_"`` column as a list.
        """
        data["y_"] = None

        self.head(data)

        return data["y_"].tolist()

    def train(self, x: np.array, y: np.array):
        """Grow the tree on samples ``x`` with labels ``y``.

        In every round the candidate split with the highest entropy
        reduction over all splittable leaves is applied. Growth stops when
        no leaf below ``max_level`` can be split into two non-empty parts,
        or when all such leaves are already pure.

        Args:
            x: Two-dimensional feature data accepted by ``pd.DataFrame``.
            y: Labels aligned with the rows of ``x``.

        Raises:
            ValueError: If ``x`` and ``y`` differ in length.
        """
        # A fresh positional index makes row labels usable as positions in y.
        data = pd.DataFrame(x).reset_index(drop=True)
        labels = np.asarray(y)
        if len(labels) != len(data):
            raise ValueError("x and y must contain the same number of samples")

        class_list = list(self.classes)
        # Captured before the first pass through the tree, because leaves
        # add a "y_" column that must not be treated as a feature.
        feature_cols = list(data.columns)

        def rows_of(leaf):
            # Positions of the training samples assigned to this leaf.
            return np.asarray(list(leaf._indexes), dtype=int)

        def class_probs(rows):
            # Share of every class among the given samples.
            picked = labels[rows]
            return [float(np.mean(picked == lab)) for lab in class_list]

        def impurity(probs):
            # Zero probabilities are dropped so that log2(0) is never taken.
            return enthropy(*(p for p in probs if p > 0))

        # Let the current tree assign every training row to a leaf.
        self.head(data, train=True)

        while True:
            best = None  # (gain, position in self.leaves, column, value)
            for leaf_pos, leaf in enumerate(self.leaves):
                rows = rows_of(leaf)
                if len(rows) == 0:
                    continue

                leaf.probs = class_probs(rows)
                if leaf.level >= self.max_level:
                    continue

                parent_entropy = impurity(leaf.probs)
                if parent_entropy == 0:
                    # A pure leaf cannot be improved by splitting.
                    continue

                for col in feature_cols:
                    values = data[col].to_numpy()[rows]
                    for val in np.unique(values):
                        goes_left = values <= val
                        n_left = int(goes_left.sum())
                        n_right = len(rows) - n_left
                        if n_left == 0 or n_right == 0:
                            continue

                        # Both sides are weighted by their sample counts.
                        children_entropy = (
                            n_left * impurity(class_probs(rows[goes_left]))
                            + n_right * impurity(class_probs(rows[~goes_left]))
                        ) / len(rows)
                        gain = parent_entropy - children_entropy

                        if best is None or gain > best[0]:
                            best = (gain, leaf_pos, col, val)

            if best is None:
                break

            _, leaf_pos, col, val = best
            leaf = self.leaves[leaf_pos]
            rows = rows_of(leaf)
            goes_left = data[col].to_numpy()[rows] <= val

            # Defaults freeze col/val now; a plain closure would see the
            # values from later loop iterations.
            self._add_nodes(leaf_pos, lambda frame, col=col, val=val: frame[col] <= val)

            left_child, right_child = leaf.childs
            left_child._indexes = rows[goes_left]
            right_child._indexes = rows[~goes_left]