## Decision Tree

In [1]:
import numpy as np


def enthropy(*probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        *probs: The outcome probabilities, either as separate positional
            arguments (``enthropy(0.5, 0.5)``) or as one iterable
            (``enthropy([0.5, 0.5])``).

    Returns:
        ``-sum(p * log2(p))`` over the non-zero probabilities.
    """
    # A lone non-scalar argument is the whole distribution, not one probability.
    if len(probs) == 1 and not np.isscalar(probs[0]):
        probs = tuple(probs[0])

    # Zero-probability outcomes contribute nothing (p * log2(p) -> 0 as p -> 0);
    # evaluating them directly would give 0 * -inf = nan.
    return -sum(p * np.log2(p) for p in probs if p != 0)


# Entropy of a uniform five-class distribution next to a non-uniform one.
enthropy(*([0.2] * 5)), enthropy(0.15, 0.25, 0.3, 0.1, 0.2)

(np.float64(2.321928094887362), np.float64(2.228212945841001))

In [None]:
from typing import Union
import pandas as pd
import numpy as np


def enthropy(*probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        *probs: The outcome probabilities, either as separate positional
            arguments (``enthropy(0.5, 0.5)``) or as one iterable
            (``enthropy([0.5, 0.5])``).

    Returns:
        ``-sum(p * log2(p))`` over the non-zero probabilities.
    """
    # A lone non-scalar argument is the whole distribution, not one probability.
    if len(probs) == 1 and not np.isscalar(probs[0]):
        probs = tuple(probs[0])

    # Zero-probability outcomes contribute nothing (p * log2(p) -> 0 as p -> 0);
    # evaluating them directly would give 0 * -inf = nan.
    return -sum(p * np.log2(p) for p in probs if p != 0)


class Node:
    """One node of a binary decision tree.

    A node without children is a leaf and predicts ``argmax(probs)``. A node
    with children holds a ``criterion`` (a callable mapping a DataFrame to a
    boolean mask) and sends rows where the mask is true to ``childs[0]`` and
    the remaining rows to ``childs[1]``.
    """

    def __init__(self, n_classes: int, level: int):
        self.parent = None
        self.childs = []

        self.criterion = None
        # Uniform class probabilities until something assigns real ones.
        self.probs = [1 / n_classes for _ in range(n_classes)]

        self.n_classes = n_classes
        self.level = level

        # Row labels that reached this leaf during the last ``train=True`` call.
        self._indexes = set()

    def _set_criterion(self, criterion: callable):
        """Turn this node into an internal node with two fresh child leaves.

        Args:
            criterion: Callable taking a DataFrame and returning a boolean
                mask with one entry per row.
        """
        self.criterion = criterion
        self.childs = [Node(self.n_classes, level=self.level + 1) for _ in range(2)]
        for child in self.childs:
            child.parent = self

    def __call__(
        self,
        data: pd.DataFrame,
        indexes: Union[pd.Index, list, None] = None,
        train: bool = False,
    ):
        """Route rows through the subtree and write predictions into ``data["y_"]``.

        Args:
            data: Frame holding the feature columns; its ``"y_"`` column is
                overwritten in place for the routed rows.
            indexes: Row labels of ``data`` that reached this node; ``None``
                means every row (used for the root).
            train: When true, each leaf remembers the row labels it received
                in ``_indexes``.
        """
        if not self.childs:
            if indexes is None:
                indexes = data.index
            if train:
                self._indexes = indexes
            data.loc[indexes, "y_"] = np.argmax(self.probs)
            return

        # Evaluate the criterion only on the rows that actually reached this
        # node; using the whole frame would pull in rows of other subtrees.
        rows = data if indexes is None else data.loc[indexes]
        goes_left = np.asarray(self.criterion(rows), dtype=bool)
        self.childs[0](data, rows.index[goes_left], train)
        self.childs[1](data, rows.index[~goes_left], train)


class DecissionTree:
    """Greedy binary decision-tree classifier built from threshold splits.

    Training grows the tree one split at a time: each round evaluates every
    ``column <= value`` split of every leaf whose level is below
    ``max_level`` and applies the one with the largest information gain.

    Predictions are class positions (``argmax`` over a leaf's ``probs``),
    i.e. indexes into the iteration order of ``classes``.
    """

    def __init__(self, classes: Union[set, list], max_level: int = 1):
        self.head = Node(len(classes), level=0)
        self.leaves = [self.head]

        self.max_level = max_level
        self.classes = classes

    def _add_nodes(self, leaf_index: int, criterion: callable):
        """Split ``self.leaves[leaf_index]`` and track its children as leaves."""
        parent = self.leaves.pop(leaf_index)
        parent._set_criterion(criterion)
        self.leaves.extend(parent.childs)

    def __call__(self, data: pd.DataFrame):
        """Predict a class position for every row of ``data``.

        ``data`` must use the same column labels as the training frame and
        is modified in place: a ``"y_"`` column with the predictions is
        (re)written.

        Returns:
            The contents of ``data["y_"]`` as a list.
        """
        data["y_"] = None

        self.head(data)

        return data["y_"].tolist()

    @staticmethod
    def _make_criterion(col, val):
        """Build the ``frame[col] <= val`` rule with ``col`` and ``val`` frozen."""
        def criterion(frame):
            return frame[col] <= val

        return criterion

    @staticmethod
    def _entropy(probs) -> float:
        """Entropy of ``probs``; zero entries are dropped to avoid ``log2(0)``."""
        return float(enthropy(*(p for p in probs if p > 0)))

    def _class_probs(self, labels: pd.Series) -> list:
        """Share of each class among ``labels`` (which must be non-empty)."""
        total = len(labels)
        return [float((labels == lab).sum()) / total for lab in self.classes]

    def _best_split(self, rows: pd.DataFrame, labels: pd.Series, probs: list):
        """Find the threshold split of one leaf with the highest information gain.

        Args:
            rows: Feature rows that reached the leaf.
            labels: Target labels of those rows, in the same order.
            probs: Class probabilities of the leaf before splitting.

        Returns:
            ``(gain, column, value)`` for the best ``column <= value`` split,
            or ``None`` when the leaf is pure or no split separates its rows.
        """
        parent_entropy = self._entropy(probs)
        if parent_entropy == 0:
            return None  # a pure leaf cannot be improved

        total = len(rows)
        best = None
        for col in rows.columns:
            values = rows[col]
            for val in values.unique():
                goes_left = (values <= val).to_numpy()
                n_left = int(goes_left.sum())
                if n_left in (0, total):
                    continue  # one side would be empty

                left_entropy = self._entropy(self._class_probs(labels[goes_left]))
                right_entropy = self._entropy(self._class_probs(labels[~goes_left]))
                # Both children are weighted by the number of rows they hold.
                children_entropy = (
                    n_left * left_entropy + (total - n_left) * right_entropy
                ) / total

                gain = parent_entropy - children_entropy
                if best is None or gain > best[0]:
                    best = (gain, col, val)
        return best

    def train(self, x: np.ndarray, y: np.ndarray):
        """Fit the tree to features ``x`` and labels ``y``.

        Args:
            x: Feature table, one row per sample (anything ``pd.DataFrame``
                accepts).
            y: Labels, one per row of ``x``, drawn from ``classes``.
        """
        # A fresh positional index keeps row labels unique and aligned with y.
        data = pd.DataFrame(x).reset_index(drop=True)
        labels = pd.Series(np.asarray(y), index=data.index)

        # Route a scratch copy so every current leaf records its rows without
        # the "y_" prediction column leaking into the feature columns.
        self.head(data.copy(), train=True)

        while True:
            best = None  # (gain, position in self.leaves, column, threshold)
            for position, leaf in enumerate(self.leaves):
                leaf_labels = labels.loc[leaf._indexes]
                if leaf_labels.empty:
                    continue  # no rows reached this leaf: keep its current probs

                leaf.probs = self._class_probs(leaf_labels)
                if leaf.level >= self.max_level:
                    continue

                found = self._best_split(
                    data.loc[leaf._indexes], leaf_labels, leaf.probs
                )
                if found is not None and (best is None or found[0] > best[0]):
                    best = (found[0], position, found[1], found[2])

            if best is None:
                break

            _, position, col, val = best
            parent = self.leaves[position]
            rows = data.loc[parent._indexes]
            goes_left = (rows[col] <= val).to_numpy()

            # _add_nodes keeps self.leaves in sync with the tree structure.
            self._add_nodes(position, self._make_criterion(col, val))
            left, right = parent.childs
            left._indexes = rows.index[goes_left]
            right._indexes = rows.index[~goes_left]