# ID3 Decision Tree

## Условие
* Реализирайте алгоритъма за класификационно дърво ID3
* Използвайте ***кросвалидация*** за изчисляване на точността на модела върху обучаващото множество. 

* За избягване на ***overfitting*** използвайте константа **K** -- минимален брой на обучаващи примери в множеството. 

? друг подход за избягване на ***overfitting*** + сравняване на резултата?  
? bonus: ***Random Forest*** ?

In [165]:
import math
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo, dotdict
from typing import Dict, List, Set, Tuple
from dataclasses import dataclass, field

In [177]:
# Download dataset id 14 from the UCI repository.
breast_cancer: dotdict = fetch_ucirepo(id=14)

X: pd.DataFrame = breast_cancer.data.features
y: pd.DataFrame = breast_cancer.data.targets

# values[i] holds the distinct values observed in feature column i.
values: List[np.ndarray] = [X[column].unique() for column in X.columns]
# attributes[i] flags whether feature i has already been used for a split.
attributes: List[bool] = [False] * len(X.columns)


# From here on X and y are raw NumPy arrays addressed by row position.
X: np.ndarray = X.values
y: np.ndarray = y.values
pos_target: str = "recurrence-events"
# print(attributes_left)
# print(values)

In [178]:

# A split maps every value of the chosen feature to the rows that carry it.
# "Data" is quoted because the class is only defined below this alias.
Split = Dict[str, "Data"]


class Data:
    """Subset of the training set, stored as row indices into the globals ``X``/``y``.

    Attributes:
        feature: index of the feature whose split produced this subset
            (-1 when the subset was not produced by a split).
        feature_val: value of that feature shared by every row in the subset.
        data: row indices belonging to the subset.
        num_pos: number of rows whose label equals the global ``pos_target``.
    """

    def __init__(self, feature: int = -1, feature_val: str = None, data: List[int] = None):
        self.num_pos: int = 0
        self.feature: int = feature
        self.feature_val: str = feature_val
        # Copy the indices so insert_idx never mutates the caller's list.
        self.data: List[int] = list(data) if data is not None else []
        self._count_pos()

    def _count_pos(self) -> None:
        """Recompute ``num_pos`` from scratch for the current rows."""
        self.num_pos = sum(1 for idx in self.data if y[idx] == pos_target)

    @property
    def prop_pos(self) -> float:
        """Fraction of positive rows; 0.0 for an empty subset."""
        if not self.data:
            return 0.0
        return self.num_pos / len(self.data)

    @property
    def zero_entropy(self) -> bool:
        """True when the subset is pure (all positive or all negative) or empty."""
        return self.num_pos in (0, len(self))

    @property
    def entropy(self) -> float:
        """Shannon entropy of the labels in bits: ``-p*log2(p) - q*log2(q)``."""
        if self.zero_entropy:
            return 0.0

        # Both proportions are strictly inside (0, 1) here, so the logs are finite.
        p: float = self.prop_pos
        q: float = 1.0 - p
        return -p * math.log2(p) - q * math.log2(q)

    @property
    def gini(self) -> float:
        """Gini impurity of the labels: ``1 - p**2 - q**2`` (0.0 when empty)."""
        if not self.data:
            return 0.0
        p: float = self.prop_pos
        q: float = 1.0 - p
        return 1.0 - p * p - q * q

    def __len__(self) -> int:
        return len(self.data)

    def insert_idx(self, idx: int) -> None:
        """Add row ``idx`` to the subset and keep ``num_pos`` up to date."""
        self.data.append(idx)
        if y[idx] == pos_target:
            self.num_pos += 1

    def split_on_attribute(self, feature: int) -> Split:
        """Partition the rows by their value of ``feature``.

        Every value listed in the global ``values[feature]`` gets an entry,
        even when no row of this subset carries it (that entry is empty).

        Returns:
            Mapping from feature value to the ``Data`` subset with that value.
        """
        spl: Split = {val: Data(feature=feature, feature_val=val) for val in values[feature]}

        for idx in self.data:
            val: str = X[idx][feature]
            subset = spl.get(val)
            if subset is None:
                # Value missing from the precomputed domain: open a new bucket
                # instead of failing with KeyError.
                subset = spl[val] = Data(feature=feature, feature_val=val)
            subset.insert_idx(idx)

        return spl

    def __repr__(self) -> str:
        return repr(self.data)

# Wrap every row index of the dataset and partition the rows on feature 0.
dt = Data(data=[*range(len(X))])
spl: Split = dt.split_on_attribute(feature=0)

# print(type(spl))
# print(dt.data)
# print(dt.num_pos)
# print(dt.prop_pos)
# print(dt.entropy)
# for key, value in spl.items():
#     print(f"{key}: {value.entropy}")


False


In [188]:
class Node:
    """Node of an ID3 decision tree; its whole subtree is grown on construction.

    Attributes:
        data: training rows that reached this node.
        used_features: indices of features already split on along the path
            from the root to this node.
        min_samples: nodes holding fewer rows than this are not split further.
        split_feature: index of the feature this node splits on, -1 for a leaf.
        children: child node for every value of ``split_feature``.
    """

    def __init__(self, data: Data = None, used_features: frozenset = None, min_samples: int = 1) -> None:
        # The old default was the Data class itself; an empty instance is meant.
        self.data: Data = data if data is not None else Data()
        if used_features is None:
            # Root call: seed from the global flags, but never write to them, so
            # sibling branches and later trees are not affected by this one.
            used_features = (i for i, used in enumerate(attributes) if used)
        self.used_features: frozenset = frozenset(used_features)
        self.min_samples: int = min_samples
        self.split_feature: int = -1
        self.children: Dict[str, Node] = {}
        self._build_children()

    def information_gain(self, spl: Split) -> float:
        """Return the entropy of this node minus the size-weighted entropy of ``spl``."""
        total: int = len(self.data)
        if total == 0:
            return 0.0
        remainder: float = sum(len(subset) / total * subset.entropy for subset in spl.values())
        return self.data.entropy - remainder

    def get_best_split(self) -> Tuple[int, Split]:
        """Find the still-unused feature with the highest information gain.

        Returns:
            ``(feature_index, split)``; ``(-1, {})`` when no feature is left.
        """
        best: Split = {}
        best_score: float = float("-inf")
        feature: int = -1
        for candidate in range(len(attributes)):
            if candidate in self.used_features:
                continue
            spl: Split = self.data.split_on_attribute(candidate)
            ig: float = self.information_gain(spl)
            if ig > best_score:
                best_score = ig
                best = spl
                feature = candidate
        return feature, best

    def predict(self, X: np.ndarray) -> bool:
        """Classify one sample given as a row of feature values.

        Walks down the tree following the sample's value of each split
        feature. The walk stops at a leaf, at a value the node has no child
        for, or at a child with no training rows; the answer is then taken
        from the last node that had rows.

        Returns:
            True when the positive class is the strict majority there.
        """
        node: Node = self
        while node.children:
            child = node.children.get(X[node.split_feature])
            if child is None or len(child.data) == 0:
                break
            node = child
        return 2 * node.data.num_pos > len(node.data)

    def _build_children(self) -> None:
        """Split on the best remaining feature and grow one child per value."""
        if (
            self.data.zero_entropy
            or len(self.data) < self.min_samples
            or len(self.used_features) >= len(attributes)
        ):
            return
        feature, best_split = self.get_best_split()
        if feature < 0:
            return
        self.split_feature = feature
        # Features are consumed per root-to-leaf path, not globally.
        child_used: frozenset = self.used_features | {feature}

        for val, subset in best_split.items():
            self.children[val] = Node(subset, child_used, self.min_samples)


# Clear the "feature already used" flags, then grow a tree over all rows.
attributes = [False] * len(attributes)
n = Node(data=Data(data=[*range(len(X))]))
# print("age", n.information_gain(spl))
# print(n.get_best_split())
print(n.children)

{'30-34': <__main__.Node object at 0x7f19fadb0820>, '20-24': <__main__.Node object at 0x7f19f8901c00>, '15-19': <__main__.Node object at 0x7f19f8903d00>, '0-4': <__main__.Node object at 0x7f19f89001f0>, '25-29': <__main__.Node object at 0x7f19f8901570>, '50-54': <__main__.Node object at 0x7f19f8901d80>, '14-Oct': <__main__.Node object at 0x7f19f8901db0>, '40-44': <__main__.Node object at 0x7f19f8903970>, '35-39': <__main__.Node object at 0x7f19f89031c0>, '9-May': <__main__.Node object at 0x7f19f8903100>, '45-49': <__main__.Node object at 0x7f19f91096c0>}


In [None]:
class DecisionTree:
    """ID3 decision tree grown over every row of the global dataset ``X``."""

    def __init__(self) -> None:
        # Node consults the global `attributes` flags to see which features are
        # still free; clear them in place so leftovers from an earlier build
        # cannot shrink this tree.
        attributes[:] = [False] * len(attributes)
        # Use the built-in list(): typing.List is an annotation alias and
        # raises TypeError when called.
        self.root: Node = Node(Data(data=list(range(len(X)))))
        
        

In [52]:

def split_on_attribute(data: List[int], feature: int) -> Dict[str, List]:
    """Group row indices by their value of ``feature``.

    Args:
        data: row indices into the global ``X``/``y`` arrays.
        feature: column index of the feature to split on.

    Returns:
        Dict keyed by every value in the global ``values[feature]``. Each
        entry is ``[indices, num_pos]``: the rows carrying that value and how
        many of them are labelled with the global ``pos_target``.
    """

    # Each bucket is a fresh [indices, positive_count] pair. The previous
    # initial value was the Data class itself, which cannot be indexed.
    lst_spl: Dict[str, List] = {val: [[], 0] for val in values[feature]}

    for idx in data:
        bucket = lst_spl[X[idx][feature]]
        bucket[0].append(idx)
        if y[idx] == pos_target:
            bucket[1] += 1

    return lst_spl

def get_best_split(data: List[int]) -> Dict[str, List[int]]:
    """Placeholder for best-split selection; not implemented.

    Does nothing with ``data`` and returns ``None``.
    """
    
# Show how the whole dataset is partitioned by the first feature.
print(split_on_attribute(data=list(range(len(X))), feature=0))

class Node:
    """Bare placeholder node that only carries a data container."""

    # NOTE(review): a fuller class with the same name appears earlier in this
    # file; if the file is run top to bottom this definition replaces it -
    # confirm that is intended.
    def __init__(self):
        # Starts out as an empty Python list, despite the ndarray annotation.
        self.data: np.ndarray = list()


{'30-39': [[0, 22, 35, 51, 74, 77, 90, 99, 107, 116, 118, 126, 127, 135, 136, 138, 148, 154, 170, 171, 193, 205, 219, 221, 224, 232, 238, 245, 246, 249, 252, 265, 268, 276, 281, 282], 15], '40-49': [[1, 2, 4, 8, 9, 10, 14, 16, 25, 28, 38, 39, 43, 46, 48, 49, 60, 61, 64, 65, 66, 71, 79, 84, 89, 95, 98, 100, 103, 104, 105, 106, 108, 112, 113, 114, 117, 129, 130, 131, 132, 137, 139, 143, 145, 159, 160, 161, 165, 166, 167, 169, 175, 176, 179, 180, 181, 182, 185, 188, 190, 191, 197, 202, 209, 210, 212, 215, 217, 220, 225, 226, 231, 236, 237, 239, 241, 244, 250, 251, 254, 255, 256, 257, 261, 266, 269, 272, 275, 284], 27], '60-69': [[3, 5, 7, 12, 15, 18, 21, 27, 29, 41, 42, 58, 59, 69, 70, 72, 81, 82, 86, 93, 96, 97, 109, 119, 120, 124, 134, 142, 144, 146, 151, 153, 155, 162, 163, 164, 172, 177, 187, 200, 214, 218, 222, 223, 228, 229, 234, 240, 248, 253, 260, 262, 267, 273, 274, 277, 283], 17], '50-59': [[6, 11, 13, 17, 19, 20, 23, 24, 26, 30, 31, 32, 33, 34, 36, 37, 40, 44, 45, 47, 50, 52, 5