# ID3 Decision Tree

## Условие
* Реализирайте алгоритъма за класификационно дърво ID3
* Използвайте ***кросвалидация*** за изчисляване на точността на модела върху обучаващото множество. 

* За избягване на ***overfitting*** използвайте константа **K** -- минимален брой на обучаващи примери в множеството. 

? друг подход за избягване на ***overfittting*** + сравняване на резултата?  
? bonus: ***Random Forest*** ?

In [6]:
import math
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo, dotdict
from typing import Dict, List, Set, Tuple
from dataclasses import dataclass, field

In [7]:
# fetch dataset 

breast_cancer: dotdict = fetch_ucirepo(id=14)

X: pd.core.frame.DataFrame = breast_cancer.data.features 
y: pd.core.frame.DataFrame = breast_cancer.data.targets

values: List[str] = [X[name].unique() for idx, name in enumerate(X.columns)]
attributes: List[bool] = [False for _ in X.columns] 


X: np.ndarray = X.values
y: np.ndarray = y.values
pos_target: str = "recurrence-events"

# print(attributes_left)
# print(values)

In [8]:

Split = Dict[str, "Data"]

class Data:
    def __init__(self, 
                 y_train,
                 feature: int = -1, 
                 feature_val: str=None, 
                 data: List[int] = None):
        self.num_pos: int = 0
        self.feature: int = feature
        self.feature_val: str = feature_val
        self.data: List[int] = data if data else []
        self._count_pos(y_train)

    def _count_pos(self, y_train: np.ndarray) -> None:
        for idx in self.data:
            if y_train[idx] == pos_target:
                self.num_pos += 1
    @property
    def prop_predict(self) -> bool:
        return self.prop_pos > 0.5

    @property
    def prop_pos(self) -> float:
        return self.num_pos / len(self.data)
    @property
    def zero_entropy(self) -> bool:
        return self.num_pos == len(self) or self.num_pos == 0 
    
    @property
    def full_entropy(self) -> bool:
        return self.num_pos == (len(self) - self.num_pos) 
    
    @property
    def entropy(self) -> float:
        if self.zero_entropy:
            return 0

        num_neg: int = (len(self) - self.num_pos) 
        prop_neg: float = num_neg / len(self)

        return -self.num_pos * math.log(self.prop_pos) - num_neg * math.log(prop_neg)

    @property
    def gini(self) -> float:
        pass

    def __len__(self) -> int:
        return len(self.data)

    def insert_idx(self, idx: int, 
                   y_train: np.ndarray) -> None:
        self.data.append(idx)
        if y_train[idx] == pos_target:
            self.num_pos += 1

    def split_on_attribute(self, feature: int, 
                           X_train: np.ndarray, 
                           y_train: np.ndarray) -> Split:
        spl: Split = {val: Data(feature=feature, 
                                feature_val=val, 
                                y_train=y_train) for val in values[feature]}
        
        for idx in self.data:
            val: str = X_train[idx][feature]
            spl[val].insert_idx(idx, y_train)

        return spl

    def __repr__(self) -> str:
        return repr(self.data)

dt = Data(data=list(range(len(X))), y_train=y)
spl: Split = dt.split_on_attribute(0, X, y_train=y)

# print(type(spl))
# print(dt.data)
# print(dt.num_pos)
# print(dt.prop_pos)
# print(dt.entropy)
# for key, value in spl.items():
#     print(f"{key}: {value.entropy}")


In [9]:
class Node:

    def __init__(self, X_train, y_train, data: Data=Data) -> None:
        self.data: Data = data
        self.children: Dict[str, Node] = {}
        self.attribute: int = -1
        self._build_children(X_train, y_train)
    
    def information_gain(self, spl: Split) -> float:
        sum_term: float = sum([(len(val) / len(self.data)) * val.entropy for _, val in spl.items()])
        return self.data.entropy - sum_term

    def get_best_split(self, X_train: np.ndarray, y_train: np.ndarray) -> Tuple[int, Split]:
        best: Split = {}
        best_score: float = float("-inf")
        feature: int = -1
        for i in range(len(attributes)):
            if not attributes[i]:
                spl: Split = self.data.split_on_attribute(i, X_train, y_train)
                ig: float = self.information_gain(spl)
                if ig > best_score:
                    best_score = ig
                    best = spl
                    feature = i
        return feature, best

    @property
    def leaf(self) -> bool:
        return False if self.children else True

    def predict(self, X: np.ndarray) -> bool:
        if not self.children:
            return self.data.prop_predict
        
        for val, child in self.children.items():
            if X[self.attribute] == val:
                if child.leaf and child.data.full_entropy:
                    return self.data.prop_predict
                return child.predict(X)



    def _build_children(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        # print(sum(attributes) == len(attributes), sum(attributes) == len(attributes))
        if self.data.zero_entropy or sum(attributes) == len(attributes):
            # print("42")
            return
        feature, best_split = self.get_best_split(X_train, y_train)
        attributes[feature] = True
        self.attribute = feature
    
        for val, data in best_split.items():
            self.children[val] = Node(data=data, X_train=X_train, y_train=y_train)


attributes = [False for _ in attributes]
n = Node(X_train=X[100:], y_train=y[100:], data=Data(data=list(range(len(X[100:]))), y_train=y[100:]))

# for i in range(len(X)):


# print("age", n.information_gain(spl))
# print(n.get_best_split(X_train=X_train))
# print(n.children)




## Measuring Accuracy

In [52]:
from sklearn.model_selection import  train_test_split, KFold

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# attributes = [False for _ in attributes]
# n = Node(X_train, y_train, data=Data(data=list(range(len(X_train))), y_train=y_train))


kf = KFold(n_splits=10, shuffle=True)

# mistakes = 0
# for i in range(len(X[:100])):
#     act = y[i] == pos_target
#     if n.predict(X[i]) != act:
#         mistakes += 1

# print(1 - mistakes / len(X))

accs = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    attributes = [False for _ in attributes]
    n = Node(X_train=X_train, y_train=y_train, data=Data(data=list(range(len(X_train))), y_train=y_train))

    mistakes = 0
    for i in range(len(X_test)):
        temp = y_test[i] == pos_target
        if temp != n.predict(X_test[i]):
            mistakes += 1
    accs.append(1 - mistakes / len(X_test)) 

sum(accs) / len(accs)


# # act = y == pos_target
# mistakes = 0
# for i in range(len(X[:30])):
#     if(n.predict(X[i]) != act[i]):
#         mistakes += 1
# print(1 - mistakes / len(X))

0.6567733990147783