In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt


def log2(p):
    """Return the base-2 logarithm of ``p`` with undefined results set to 0.

    Values for which the logarithm is undefined (zero or negative) are
    masked by ``np.ma.log2`` and then replaced with 0, which gives the
    ``0 * log2(0) == 0`` convention used by the entropy formulas.

    Args:
        p: A scalar or array-like of numbers.

    Returns:
        A numpy array (0-d for scalar input) holding ``log2(p)`` with 0 in
        every position where the logarithm is undefined.
    """
    # np.ma.log2 returns a plain numpy scalar (which has no ``.filled``
    # method) for a valid scalar input, so the module-level np.ma.filled is
    # used: it accepts scalars, the masked constant and masked arrays alike.
    return np.ma.filled(np.ma.log2(p), 0)


def dataframe_entropy(d: pd.DataFrame, column: str):
    """Return the binary entropy (in bits) of ``column`` in ``d``.

    The share of the most frequent value is passed to ``entropy_singular``,
    so the result is the two-class entropy "most frequent value vs. the
    rest". For a two-valued column this is the column's Shannon entropy.

    Args:
        d: Frame holding the data.
        column: Name of the column whose entropy is measured.

    Returns:
        The entropy in bits; 0 when the column has no countable values
        (empty frame, or only missing values).
    """
    counts = d[column].value_counts()
    if counts.empty:
        # Nothing to measure; also avoids dividing by a zero row count.
        return 0
    # Positional access: ``counts[0]`` would be a *label* lookup whenever the
    # column holds integers, raising KeyError on subsets that contain no 0.
    return entropy_singular(counts.iloc[0] / len(d))


def entropy(p):
    """Return the two-outcome entropy, in bits, for probability ``p``.

    Uses this module's ``log2`` helper, which replaces undefined logarithms
    with 0, so no explicit special-casing of ``p == 0`` or ``p == 1`` is
    done here.
    """
    complement = 1 - p
    own_term = p * log2(p)
    complement_term = complement * log2(complement)
    return -(own_term + complement_term)


def entropy_singular(p):
    """Return the two-outcome entropy, in bits, of a single probability ``p``.

    The degenerate probabilities 0 and 1 are short-circuited to 0 so that
    ``np.log2(0)`` is never evaluated.
    """
    if p in (0, 1):
        return 0
    q = 1 - p
    weighted_p = p * np.log2(p)
    weighted_q = q * np.log2(q)
    return -(weighted_p + weighted_q)


# Load the raw data from "dogscats.csv" in the current working directory.
raw_dataset = pd.read_csv("./dogscats.csv")
# Look at the trailing rows of the frame; the result is not stored
# (presumably a notebook-cell preview).
raw_dataset.tail()


In [None]:
# Work on a copy so that the loaded raw_dataset itself is left unmodified.
dataset = raw_dataset.copy()


def dt_split(ds: pd.DataFrame, by: str, ignore: list[str] | None = None, verbose=False):
    """Choose the column whose split reduces the entropy of ``by`` the most.

    Every column of ``ds`` other than ``by`` and those listed in ``ignore``
    is tried. The frame is grouped by the column's values, the entropy of
    ``by`` is measured in each group with ``dataframe_entropy``, and the
    size-weighted average is subtracted from the entropy of the whole frame.

    Args:
        ds: Data to split.
        by: Name of the target column whose entropy should be reduced.
        ignore: Column names that must not be considered as split candidates.
        verbose: When true, print the per-column entropy figures.

    Returns:
        A tuple ``(winner, sufficient, details)``:

        * ``winner`` - label of the column with the largest reduction (the
          first such column on ties).
        * ``sufficient`` - true when the winner's weighted average entropy
          is exactly 0, i.e. every group it produces is pure.
        * ``details`` - dict mapping each candidate column to
          ``(*group_entropies, average, reduction)``; for a two-valued
          column this is ``(entropy_1, entropy_2, average, reduction)``.

    Raises:
        ValueError: If no column is left that can be used for a split.
    """
    ignored = [] if ignore is None else ignore
    uniques = ds[by].unique()
    # Reference target value whose per-group count is shown in verbose mode.
    ruler = uniques[0] if len(uniques) else None
    base_entropy = dataframe_entropy(ds, by)
    if verbose:
        print(f"Unique values in {by}: {uniques}")
        print(f"BASE ENTROPY: {base_entropy}")

    details = {}
    for col in ds.columns:
        # Exclude the target by name rather than assuming it is the last
        # column of the frame.
        if col == by or col in ignored:
            continue
        groups = list(ds.groupby(col))
        if not groups:
            # No groupable values: an empty split would look like a perfect
            # one (average entropy 0), so it is not a valid candidate.
            continue
        entropies = [dataframe_entropy(part, by) for _, part in groups]
        weighted = sum(e * len(part) for e, (_, part) in zip(entropies, groups))
        avg = weighted / len(ds)
        reduction = base_entropy - avg
        details[col] = (*entropies, avg, reduction)
        if verbose:
            print("--------------------------")
            print(f"{col}:")
            for (label, part), part_entropy in zip(groups, entropies):
                hits = (part[by] == ruler).sum()
                print(f"{hits}/{len(part)} {label} --> E {part_entropy}")
            print(f"A {avg}")
            print(f"R {reduction}")
            print("--------------------------")

    if not details:
        raise ValueError(f"no candidate columns left to split {by!r} on")

    # Compare the reductions as numbers. Packing names and floats into one
    # numpy array turns everything into strings, and argmax over those
    # compares text (e.g. "1e-05" sorts above "0.5").
    winner = max(details, key=lambda name: details[name][-1])
    sufficient = details[winner][-2] == 0
    return winner, sufficient, details

def recursive_split(ds: pd.DataFrame, by: str, attach_to: dict | None = None, ignore: list[str] | None = None, verbose=False):
    """Recursively split ``ds`` on the best columns and record them as a tree.

    At each level ``dt_split`` selects the best column. If that split is
    sufficient the column is stored with the value ``None`` (a leaf).
    Otherwise it is stored with a nested dict, and every group of the split
    is processed recursively with that column added to the ignore list.

    Args:
        ds: Data to split.
        by: Name of the target column.
        attach_to: Dict that receives this level's entry; a new dict is
            created when omitted.
        ignore: Columns already used higher up in the tree.
        verbose: Forwarded to ``dt_split``.

    Returns:
        ``attach_to`` (or the newly created dict), mapping each chosen
        column to ``None`` or to the dict built for its sub-splits.
    """
    # NOTE(review): all groups of a split write into the same nested dict,
    # so the branch value is not recorded and two branches that choose the
    # same column overwrite each other. Left unchanged to keep the returned
    # structure compatible - confirm whether this is intended.
    if attach_to is None:
        attach_to = {}
    used = [] if ignore is None else list(ignore)

    # Stop when no candidate column is left; dt_split cannot pick a winner
    # from an empty candidate set.
    if not any(col != by and col not in used for col in ds.columns):
        return attach_to

    w, ws, _ = dt_split(ds=ds, by=by, ignore=used, verbose=verbose)
    if ws:
        attach_to[w] = None
        return attach_to

    attach_to[w] = {}
    # Iterate over however many groups the column yields instead of
    # unpacking exactly two, which fails for constant or multi-valued columns.
    for _, subset in ds.groupby(w):
        recursive_split(ds=subset, by=by, attach_to=attach_to[w], ignore=[*used, w], verbose=verbose)
    return attach_to

# Build the split tree for the "Cat" target column and print the resulting
# nested dict.
tree = recursive_split(dataset,"Cat")
print(tree)