In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt


def dataframe_entropy(d: pd.DataFrame, column: str):
    """Return the binary entropy (in bits) of ``column`` within ``d``.

    The share of the most frequent value is fed to ``entropy``, which treats
    the column as a two-class variable (most frequent value vs. everything
    else). For a column with exactly two distinct values this is its Shannon
    entropy.

    Args:
        d: Frame holding the rows to measure.
        column: Name of the column whose purity is measured.

    Returns:
        ``0`` when the column has no (non-missing) values or a single distinct
        value, otherwise a float in ``(0, 1]``.
    """
    counts = d[column].value_counts()
    if counts.empty:
        # Nothing to measure: treat an empty selection as perfectly pure.
        return 0
    # value_counts() sorts by frequency (descending), so position 0 holds the
    # most frequent value. Use .iloc for a positional lookup: ``counts[0]`` is a
    # *label* lookup that fails when the labels are integers without a 0 and
    # relies on a deprecated positional fallback when the labels are strings.
    return entropy(counts.iloc[0] / len(d))


def entropy(p):
    """Binary entropy, in bits, of an event that occurs with probability ``p``.

    Args:
        p: Probability of one of the two outcomes.

    Returns:
        ``0`` for the degenerate probabilities 0 and 1, otherwise
        ``-(p*log2(p) + (1-p)*log2(1-p))``.
    """
    is_pure = (p == 0) or (p == 1)
    if is_pure:
        # log2(0) is undefined; a certain outcome carries no information.
        return 0
    complement = 1 - p
    weighted_logs = p * np.log2(p) + complement * np.log2(complement)
    return -weighted_logs


# Load the raw categorical dogs-vs-cats table; the path is relative to the
# current working directory.
raw_dataset = pd.read_csv("./dogscats-categorical.csv")
# Display the last rows as a quick check of the loaded columns.
raw_dataset.tail()


Unnamed: 0,Ears shape,Face shape,Whiskers,Cat
5,Pointy,Round,Absent,Yes
6,Floppy,Not round,Absent,No
7,Pointy,Round,Absent,Yes
8,Floppy,Round,Absent,No
9,Floppy,Round,Absent,No


In [5]:
# One-hot encode the multi-valued 'Ears shape' column so that every feature is
# binary; empty prefix/separator make the new columns carry the bare category
# names. get_dummies returns a new frame, so raw_dataset is left untouched.
# dtype=int pins the indicators to 0/1: the default indicator dtype is bool on
# pandas >= 2.0, which would turn the tree's 0/1 keys into False/True.
dataset = pd.get_dummies(
    raw_dataset,
    columns=['Ears shape'],
    prefix='',
    prefix_sep='',
    dtype=int,
)
dataset.tail()


Unnamed: 0,Face shape,Whiskers,Cat,Floppy,Oval,Pointy
5,Round,Absent,Yes,0,0,1
6,Not round,Absent,No,1,0,0
7,Round,Absent,Yes,0,0,1
8,Round,Absent,No,1,0,0
9,Round,Absent,No,1,0,0


In [6]:
import pprint
import typing
import warnings


def dt_split(ds: pd.DataFrame, by: str, ignore: list[str] = []):
    """Choose the column whose two-way split most reduces the entropy of ``by``.

    Every column of ``ds`` other than ``by`` and those listed in ``ignore`` is
    grouped by value. The size-weighted mean entropy of the groups is compared
    with the entropy of ``ds`` as a whole, and the column with the largest
    reduction wins (the first such column on ties).

    Only binary features are supported. A column with more than two distinct
    values is skipped with a warning: using just its first two groups would
    overstate its gain and silently drop the rows of the remaining groups.
    One-hot encode such columns beforehand.

    Args:
        ds: Rows to split.
        by: Name of the target (label) column.
        ignore: Column names that must not be used for splitting. The list is
            only read, never mutated.

    Returns:
        A ``(winner, sufficient, subsets)`` tuple:

        * ``winner`` - name of the chosen column, or ``""`` if no column was
          eligible.
        * ``sufficient`` - true when the chosen split leaves every group with
          zero entropy.
        * ``subsets`` - ``((key1, entropy1, frame1), (key2, entropy2, frame2))``
          for the chosen column, or ``None`` if no column was eligible. When
          the column has a single value in ``ds`` the second triple is
          ``('UNKNOWN', 0, None)``.
    """
    winner: typing.Any = ""
    best_reduction = -1
    sufficient = False
    subsets: typing.Any = None
    base_entropy = dataframe_entropy(ds, by)
    for col in ds.columns:
        if col in ignore or col == by:
            continue
        splits = list(ds.groupby(col))
        if not splits:
            # groupby drops missing keys by default, so a column holding only
            # missing values yields no groups and cannot be used to split.
            continue
        if len(splits) > 2:
            warnings.warn(
                f"dt_split: skipping column {col!r} with {len(splits)} distinct "
                "values; only binary features are supported (one-hot encode it first)",
                stacklevel=2,
            )
            continue
        name1, group1 = splits[0]
        name2, group2 = splits[1] if len(splits) > 1 else ('UNKNOWN', None)
        entropy1 = dataframe_entropy(group1, by)
        entropy2 = 0
        weighted = entropy1 * len(group1)
        if group2 is not None:
            entropy2 = dataframe_entropy(group2, by)
            weighted += entropy2 * len(group2)
        avg = weighted / len(ds)
        reduction = base_entropy - avg
        # Strict comparison: on equal reductions the earlier column is kept.
        if reduction > best_reduction:
            best_reduction = reduction
            winner = col
            sufficient = avg == 0
            subsets = (name1, entropy1, group1), (name2, entropy2, group2)
    return winner, sufficient, subsets


def recursive_split(ds: pd.DataFrame, by: str, ignore: list[str] = [], depth=0):
    """Recursively build a decision tree for label column ``by`` as nested dicts.

    Each level has the form ``{split_column: {value: subtree_or_label, ...}}``.
    A branch whose rows all share one label becomes that label; an impure
    branch is split again with the already-used column added to ``ignore``.

    Args:
        ds: Rows to build the (sub)tree from.
        by: Name of the target (label) column.
        ignore: Columns already used higher up the tree. The list is only
            read; each recursive call receives a fresh extended copy.
        depth: Current recursion depth; only passed along to recursive calls.

    Returns:
        A nested dict describing the tree. If ``dt_split`` finds no column to
        split on, the most frequent label in ``ds`` is returned instead (the
        first in sorted order on ties).
    """
    winner, sufficient, subsets = dt_split(ds=ds, by=by, ignore=ignore)
    if subsets is None:
        # Features are exhausted but the rows are still mixed (for example,
        # identical feature values with different labels). Unpacking None
        # would raise TypeError, so fall back to a majority-vote leaf.
        return ds[by].mode().iloc[0]
    (option1, entropy1, subset1), (option2, entropy2, subset2) = subsets
    # Start with leaf labels; impure branches are replaced by subtrees below.
    options = {
        option1: subset1[by].iloc[0],
        option2: None if subset2 is None else subset2[by].iloc[0],
    }
    if not sufficient:
        used = [*ignore, winner]
        if entropy1 > 0:
            options[option1] = recursive_split(ds=subset1, by=by, ignore=used, depth=depth + 1)
        if entropy2 > 0:
            options[option2] = recursive_split(ds=subset2, by=by, ignore=used, depth=depth + 1)
    return {winner: options}


# Build the decision tree that predicts the "Cat" label from the encoded dataset.
tree = recursive_split(dataset, "Cat")
# Pretty-print the nested dict: each level maps a split column to
# {column value: subtree or leaf label}.
pprint.pprint(tree)


{'Floppy': {0: {'Face shape': {'Not round': {'Oval': {0: 'No', 1: 'Yes'}},
                               'Round': {'Oval': {0: 'Yes',
                                                  1: {'Whiskers': {'Absent': 'No',
                                                                   'Present': 'Yes'}}}}}},
            1: 'No'}}
