In [25]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt


def dataframe_entropy(d: pd.DataFrame, column: str):
    """Return the two-outcome entropy (in bits) of ``column`` within ``d``.

    The share of rows taken by the first entry of ``value_counts()`` (the most
    frequent value) is handed to ``entropy``, which treats it as the
    probability of one outcome and ``1 - share`` as the other.

    NOTE(review): because only one share is used, the result is a true entropy
    only when the column holds at most two distinct values.

    Args:
        d: Frame that contains ``column``.
        column: Name of the column whose value distribution is measured.

    Returns:
        ``0`` for a frame with no rows, otherwise the value returned by
        ``entropy`` for the most-frequent-value share.
    """
    if len(d) == 0:
        # No rows: nothing to be uncertain about, and no division by zero below.
        return 0
    # value_counts() is indexed by the column's own values, so the top count
    # has to be read by position. A plain `[0]` is a label lookup for integer
    # valued columns (KeyError when no row equals 0) and a deprecated
    # positional fallback for everything else.
    top_count = d[column].value_counts().iloc[0]
    return entropy(top_count / len(d))


def entropy(p):
    """Binary entropy, in bits, of a two-outcome distribution.

    Args:
        p: Probability of one outcome; the other outcome gets ``1 - p``.

    Returns:
        ``0`` when ``p`` is exactly 0 or 1 (avoids evaluating ``log2(0)``),
        otherwise ``-(p*log2(p) + (1-p)*log2(1-p))``.
    """
    if p != 0 and p != 1:
        q = 1 - p
        return -(p * np.log2(p) + q * np.log2(q))
    # Degenerate distribution: one outcome is certain.
    return 0


# Read the raw table once; `raw_dataset` itself is never modified below.
raw_dataset = pd.read_csv("./dogscats-categorical.csv")
# One-hot encode 'Ears shape' on a copy. The empty prefix and separator make
# each new column carry just the category value as its name.
dataset = raw_dataset.copy().pipe(
    pd.get_dummies,
    columns=['Ears shape'],
    prefix='',
    prefix_sep='',
)
dataset.tail()


Unnamed: 0,Face shape,Whiskers,Cat,Floppy,Oval,Pointy
5,Round,Absent,Yes,0,0,1
6,Not round,Absent,No,1,0,0
7,Round,Absent,Yes,0,0,1
8,Round,Absent,No,1,0,0
9,Round,Absent,No,1,0,0


In [72]:
import pprint
import typing


def dt_split(dataframe: pd.DataFrame, labels_column: str, ignore_columns: list[str] | None = None):
    """Pick the column whose split gives the largest information gain.

    Every column except ``labels_column`` and those listed in
    ``ignore_columns`` is grouped by value. The label entropy of the first two
    groups, weighted by their share of ``dataframe``'s rows, is subtracted from
    the label entropy of the whole frame; the column with the strictly largest
    difference wins (earlier columns win ties).

    NOTE(review): only the first two groups produced by ``groupby`` take part;
    any further groups are neither weighted nor returned.

    Args:
        dataframe: Rows to split.
        labels_column: Name of the target column.
        ignore_columns: Optional column names to skip as split candidates.

    Returns:
        ``(feature, subsets)``. ``subsets`` is a pair of
        ``(value, entropy, sub_frame)`` tuples for the first and second group;
        when the column has a single group the second tuple is
        ``('UNKNOWN', 0, None)``. If no column was eligible the result is
        ``("", None)``.
    """
    parent_entropy = dataframe_entropy(dataframe, labels_column)
    total_rows = len(dataframe)

    winner = ""
    winning_gain = -1
    winning_partition = None

    for candidate in dataframe.columns:
        if candidate == labels_column or (ignore_columns is not None and candidate in ignore_columns):
            continue

        groups = list(dataframe.groupby(candidate))

        # First group is always present.
        first_value, first_rows = groups[0]
        first_entropy = dataframe_entropy(first_rows, labels_column)
        first_weight = len(first_rows) / total_rows

        # Second group may be missing when the column holds a single value.
        if len(groups) > 1:
            second_value, second_rows = groups[1]
        else:
            second_value, second_rows = 'UNKNOWN', None
        if second_rows is not None:
            second_entropy = dataframe_entropy(second_rows, labels_column)
            second_weight = len(second_rows) / total_rows
        else:
            second_entropy = 0
            second_weight = 0

        gain = parent_entropy - (first_weight * first_entropy + second_weight * second_entropy)
        if gain > winning_gain:
            winning_gain = gain
            winner = candidate
            winning_partition = (
                (first_value, first_entropy, first_rows),
                (second_value, second_entropy, second_rows),
            )

    return winner, winning_partition


def most_common_value(dataframe: pd.DataFrame, column: str):
    """Return the most frequent value of ``column``.

    ``Series.mode()`` can return several values when there is a tie; the entry
    stored under label ``0`` of that result is the one returned.
    """
    modes = dataframe[column].mode()
    return modes[0]


def recursive_split(dataframe: pd.DataFrame, labels_column: str, ignore_columns: list[str] | None = None, depth=0, max_depth=3):
    """Build a decision tree for ``labels_column`` as nested dictionaries.

    The frame is split on the feature chosen by ``dt_split``. Each resulting
    subset becomes either a leaf label or, while it is still impure and
    ``depth < max_depth``, another nested tree.

    Args:
        dataframe: Training rows for this node.
        labels_column: Name of the target column.
        ignore_columns: Columns that must not be used as split features. The
            feature chosen here is appended for all deeper calls, whether or
            not the caller supplied a list.
        depth: Depth of the current node (0 at the root).
        max_depth: Impure subsets are only split further while
            ``depth < max_depth``; beyond that they become majority-label
            leaves.

    Returns:
        ``{feature: {value: subtree_or_label, ...}}``. If ``dt_split`` finds no
        usable feature, the most common label of ``dataframe`` is returned
        instead of a dictionary.
    """
    most_informative_feature, subsets = dt_split(dataframe=dataframe, labels_column=labels_column, ignore_columns=ignore_columns)
    if subsets is None:
        # Every feature is already excluded: nothing left to split on, so this
        # node collapses into a majority-label leaf.
        return most_common_value(dataframe=dataframe, column=labels_column)

    # Always grow the exclusion list. Previously it stayed None whenever the
    # caller relied on the default, so used features were never excluded.
    next_ignore_columns = [*(ignore_columns or []), most_informative_feature]

    options = {}
    for value, subset_entropy, subset in subsets:
        if subset is None:
            # Placeholder branch emitted by dt_split for single-valued features.
            options[value] = None
        elif subset_entropy > 0:
            if depth < max_depth:
                options[value] = recursive_split(dataframe=subset, labels_column=labels_column, ignore_columns=next_ignore_columns, depth=depth+1, max_depth=max_depth)
            else:
                # Depth budget used up: settle for the majority label.
                options[value] = most_common_value(dataframe=subset, column=labels_column)
        else:
            # Pure subset: every row carries the same label.
            options[value] = subset[labels_column].iloc[0]

    return {most_informative_feature: options}

class Tree():
    """Decision tree stored as nested dictionaries.

    A node has the form ``{feature: {value: child}}`` where ``child`` is either
    another node of the same form or a leaf (any non-dict object, returned as
    the prediction).
    """

    def __init__(self, schema) -> None:
        """Keep ``schema`` (the root node) without copying it."""
        self.schema = schema

    def propagate(self, dataframe: pd.DataFrame, index: int):
        """Walk the tree for one row and return the leaf that is reached.

        Args:
            dataframe: Frame holding the feature columns named in the schema.
            index: Row key, resolved through ``dataframe[feature][index]``.

        Returns:
            The first non-dict object met on the way down. A schema that is
            itself not a dict is returned unchanged.

        Raises:
            KeyError: If the row's value for a feature has no branch in the
                schema.
        """
        node = self.schema
        # isinstance (rather than an exact type comparison) keeps dict
        # subclasses such as OrderedDict recognised as inner nodes.
        while isinstance(node, dict):
            feature = next(iter(node))
            value = dataframe[feature][index]
            node = node[feature][value]
        return node


class Ensemble():
    """Majority-vote ensemble of ``Tree`` objects built from schemas."""

    def __init__(self, schemas) -> None:
        """Wrap every schema in ``schemas`` in its own ``Tree``."""
        self.schemas = schemas
        self.trees: list[Tree] = [Tree(schema=schema) for schema in schemas]

    def vote(self, dataframe: pd.DataFrame, index: int):
        """Count the predictions of all trees for one row.

        Returns:
            Dict mapping each predicted label to the number of trees that
            returned it, in order of first appearance.
        """
        tally = {}
        for tree in self.trees:
            label = tree.propagate(dataframe=dataframe, index=index)
            tally[label] = tally.get(label, 0) + 1
        return tally

    def predict(self, dataframe: pd.DataFrame, index: int):
        """Return the label with the most votes for one row.

        Ties go to the label that was voted for first. ``None`` is returned
        when the ensemble has no trees.
        """
        # Use the frame passed in, not the notebook-level `dataset` global.
        tally = self.vote(dataframe=dataframe, index=index)
        # max() keeps the first maximal key, matching a strict '>' scan.
        return max(tally, key=tally.get, default=None)

    def predict_all(self, dataframe: pd.DataFrame):
        """Predict every row of ``dataframe``, in row order.

        Rows are addressed by their index labels because ``Tree.propagate``
        looks values up by label; for a default ``RangeIndex`` this equals
        ``range(len(dataframe))``.
        """
        return [self.predict(dataframe=dataframe, index=label) for label in dataframe.index]




def construct_random_forest(dataframe: pd.DataFrame, labels_column: str, max_depth: int, population: int, random_state: int | None = None):
    """Train an ensemble of trees, each on a bootstrap resample of the data.

    Args:
        dataframe: Training rows, including ``labels_column``.
        labels_column: Name of the target column.
        max_depth: Passed through to ``recursive_split`` for every tree.
        population: Number of trees to build.
        random_state: Optional seed. When given, all resamples are drawn from
            a single seeded generator so the forest is reproducible; when
            ``None`` (default) sampling is unseeded, as before.

    Returns:
        An ``Ensemble`` holding one tree schema per resample.
    """
    # One shared generator: successive sample() calls advance it, so every
    # tree still sees a different resample under a fixed seed.
    rng = None if random_state is None else np.random.RandomState(random_state)
    schemas = []
    for _ in range(population):
        # Same size as the input, drawn with replacement.
        resampled_dataframe = dataframe.sample(frac=1.0, replace=True, random_state=rng)
        schemas.append(
            recursive_split(dataframe=resampled_dataframe, labels_column=labels_column, max_depth=max_depth)
        )
    return Ensemble(schemas=schemas)

# Train 100 trees with max_depth=1 on the encoded data, then predict the rows
# they were trained on.
forest = construct_random_forest(dataset, "Cat", 1, 100)
results = forest.predict_all(dataframe=dataset)
# Training accuracy = share of rows whose prediction equals the label. Taking
# the mean of the boolean match vector stays defined (0.0) when nothing
# matches, where value_counts(normalize=True)[True] would raise KeyError.
(dataset["Cat"] == results).mean()


0.9