# Árvore de decisão - Heurística para escolher o melhor valor de cada característica

## Definir implementações

### Funções utilitárias

In [1]:
from collections import Counter
from typing import Tuple, Any

import numpy
from numpy import ndarray


def most_frequently(y: ndarray):
    """Return the value that occurs most often in ``y``.

    The array is walked through ``y.flat``, so every element is counted
    regardless of the array's shape.
    """
    occurrences = Counter(y.flat)
    ranked = occurrences.most_common(1)
    winner, _count = ranked[0]
    return winner


def gini_impurity(y: ndarray):
    """Compute the Gini impurity of the labels in ``y``.

    The result is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries of ``y`` equal to the k-th distinct label.
    """
    distinct_labels = sorted(set(y))
    total = len(y)
    # One proportion per distinct label, in sorted label order.
    proportions = numpy.array(
        [sum(y == label) / total for label in distinct_labels]
    )
    squared = proportions ** 2
    return 1 - sum(squared)


def gini_impurity_by_value(x: ndarray, y: ndarray, value):
    """Weighted Gini impurity of splitting ``y`` on ``x == value``.

    ``y`` is partitioned into the entries where ``x`` equals ``value`` and
    the remaining ones. The impurity of each part is weighted by the
    fraction of ``len(y)`` it represents, and the two terms are added.
    """
    is_match = x == value
    is_other = ~is_match
    n_samples = len(y)

    weight_match = sum(is_match) / n_samples
    weight_other = sum(is_other) / n_samples

    return (gini_impurity(y[is_match]) * weight_match
            + gini_impurity(y[is_other]) * weight_other)


def gini_min_impurity(x: ndarray, y: ndarray) -> Tuple[float, int, Any]:
    """Find the ``(feature, value)`` equality split with the lowest impurity.

    Every distinct value of every column of ``x`` is tried as a candidate
    split ``x[:, feature] == value`` and scored with
    ``gini_impurity_by_value``.

    Args:
        x: 2-D array whose columns are the features (accessed as
            ``x[:, i]``).
        y: Labels forwarded to ``gini_impurity_by_value``; expected to hold
            one entry per row of ``x``.

    Returns:
        A tuple ``(impurity, feature, value)`` describing the best
        candidate. On ties the first candidate found wins (lowest feature
        index, then smallest value). ``value`` is returned exactly as it is
        stored in ``x``.

    Raises:
        ValueError: If ``x`` offers no candidate split (no columns or no
            rows).
    """
    # Candidates are kept as plain tuples. Packing (index, value) pairs into
    # a numpy array would force a single dtype on both, turning the index
    # into a float/string and possibly altering the value itself.
    best = None

    for feature in range(x.shape[1]):
        column = x[:, feature]

        for value in sorted(set(column)):
            impurity = gini_impurity_by_value(column, y, value)

            # Strict "<" keeps the first minimum encountered.
            if best is None or impurity < best[0]:
                best = (impurity, feature, value)

    if best is None:
        raise ValueError('no candidate split: x has no features or no rows')

    return best

### Heurística escolhida

### Implementação do professor

In [2]:
import numpy
from sklearn.base import BaseEstimator, ClassifierMixin


class Tree(BaseEstimator, ClassifierMixin):
    """Minimal decision tree that always splits on the mean of one feature.

    Each node compares column ``self.feature`` of the samples that reached
    it against the mean of that column. Samples above the mean go to the
    ``greater`` subtree, the rest go to the ``lte`` subtree. When the split
    would leave one side empty, the node becomes a leaf that answers with
    the most frequent label it saw.
    """

    def __init__(self):
        # Index of the column of ``x`` used by every split.
        self.feature = 0

    def fit(self, x: ndarray, y: ndarray):
        """Build the tree from samples ``x`` (2-D) and their labels ``y``.

        Returns:
            ``self``, so calls can be chained as scikit-learn estimators
            allow.
        """
        # ``predict`` recognises a leaf by the presence of ``answer``, so
        # state left over from a previous fit must not survive a refit.
        for stale in ('answer', 'greater', 'lte'):
            vars(self).pop(stale, None)

        column = x[:, self.feature]
        self.value = numpy.mean(column)
        greater = column > self.value

        # Split only when both sides receive at least one sample.
        if greater.any() and not greater.all():
            self.greater = Tree()
            self.greater.fit(x[greater, :], y[greater])
            self.lte = Tree()
            self.lte.fit(x[~greater, :], y[~greater])
        else:
            self.answer = most_frequently(y)

        return self

    def predict(self, x: ndarray):
        """Predict one label per row of ``x``.

        The result is allocated with ``numpy.empty`` and its default dtype,
        so labels are stored as floats.
        """
        y = numpy.empty(x.shape[0])

        # Leaf: every sample gets the stored answer.
        if hasattr(self, 'answer'):
            y[:] = self.answer
            return y

        # Internal node: route each sample to the matching subtree.
        greater = x[:, self.feature] > self.value
        y[greater] = self.greater.predict(x[greater, :])
        y[~greater] = self.lte.predict(x[~greater, :])
        return y

### Implementação do Scikit-Learn

## Datasets

### Iris Dataset

In [3]:
from sklearn.datasets import load_iris

# Load the Iris data and keep only the feature columns from index 2 onward
# (presumably the two petal measurements; confirm against the dataset docs).
iris_dataset = load_iris()
iris_x = iris_dataset.data[:, 2:]
iris_y = iris_dataset.target

## Comparação: Regiões de decisão

In [None]:
# TODO: finalizar

## Comparação

In [4]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier

# Key used below to pull the test scores out of the cross_validate result.
test_score_key = 'test_score'

# Baseline classifier configured with the "most_frequent" strategy.
zero_r = DummyClassifier(strategy="most_frequent")
zero_r.fit(iris_x, iris_y)
# Predictions on the same data the baseline was fitted on.
zero_r_predict = zero_r.predict(iris_x)
# A separate, unfitted instance is handed to cross_validate.
model = DummyClassifier(strategy="most_frequent")
zero_r_cross_val = cross_validate(model, iris_x, iris_y)[test_score_key]

# Same two evaluations for the Tree classifier defined earlier in the file.
tree = Tree()
tree.fit(iris_x, iris_y)
# Predictions on the same data the tree was fitted on.
tree_predict = tree.predict(iris_x)
# Note: ``model`` is rebound here; the baseline instance above is discarded.
model = Tree()
tree_cross_val = cross_validate(model, iris_x, iris_y)[test_score_key]

In [5]:
from pandas import DataFrame

# Rows of the summary table: one metric per key, one entry per model.
# A dedicated name is used on purpose: assigning this dict to ``iris_x``
# would overwrite the feature matrix and break any cell that is run (or
# re-run) afterwards.
comparison = {
    # Accuracy measured on the same data the models were fitted on.
    'Acurácia': [
        accuracy_score(iris_y, zero_r_predict),
        accuracy_score(iris_y, tree_predict),
    ],
    # Mean of the test scores returned by cross_validate.
    'Validação cruzada (média)': [
        numpy.mean(zero_r_cross_val),
        numpy.mean(tree_cross_val),
    ],
}

# Column labels, in the same order as the entries of each list above.
columns = ['Zero R', 'Árvore (prof.)']
DataFrame.from_dict(comparison, orient='index', columns=columns)

Unnamed: 0,Zero R,Árvore (prof.)
Acurácia,0.333333,0.953333
Validação cruzada (média),0.333333,0.92
