# Exam Bonus Problem

## Problem 12.6

Implement Algorithm 4 for binary attributes and run it on a training set of your choice. You need to implement the actual learning algorithm on your own, you are not allowed to use libraries such as scikit-learn4 for that part.

In [52]:
class DecisionTreeNode:
    def __init__(self, attribute=None, data=None):
        self.left = None
        self.right = None
        self.attribute = attribute
        self.data = data

    def print(self):
        if self.left:
            self.left.print()
        print(self.data)
        if self.right:
            self.right.print()

    def inorder(self):
        if self.left:
            yield from self.left.inorder()
        yield self.data
        if self.right:
            yield from self.right.inorder()

    def add_branch(self, value, subtree):
        self.data = f"A={value}"
        # True is Right subbranch
        if value:
            self.right = subtree
        else:
            self.left = subtree

In [53]:
import math
import random


def entropy(examples):
    if not examples:
        return 0

    positive_count = sum(1 for a in examples if a[-1] is True)
    q = positive_count / len(examples)

    if q == 0 or q == 1:
        return 0

    return -(q * math.log(q, 2) + (1 - q) * math.log(1 - q, 2))

In [54]:
def plurality_val(examples):
    subset_true = [line for line in examples if line[-1]]
    subset_false = [line for line in examples if not line[-1]]

    if len(subset_false) > len(subset_true):
        return False
    elif len(subset_true) > len(subset_false):
        return True
    else:
        return random.choice([True, False])

In [55]:
def importance(attribute, examples):
    subset_false = [a for a in examples if not a[attribute]]
    subset_true = [a for a in examples if a[attribute]]

    return (
        entropy(examples)
        - len(subset_false) / len(examples) * entropy(subset_false)
        - len(subset_true) / len(examples) * entropy(subset_true)
    )

In [56]:
def argmax(attributes, examples, importance):
    max_importance = float("-inf")
    best_attribute = None
    for a in attributes:
        importance_a = importance(a, examples)
        if importance_a > max_importance:
            max_importance = importance_a
            best_attribute = a
    return best_attribute

In [57]:
def dt_learning(examples, attributes, parent_examples):
    if len(examples) == 0:
        return plurality_val(parent_examples)
    elif all(example[-1] == examples[0][-1] for example in examples):
        return examples[0][-1]
    elif len(attributes) == 0:
        return plurality_val(examples)
    else:
        A = argmax(attributes, examples, importance)
        tree = DecisionTreeNode(A)
        for value in [True, False]:
            new_examples = [e for e in examples if e[A] == value]
            subtree = dt_learning(new_examples, attributes - {A}, examples)
            tree.add_branch(value, subtree)
        return tree

### Test Dataset for decision tree learning

Format: [weather_sunny, weekend, holiday, go_outside]\
Attributes:\
    0 = Weather sunny\
    1 = Weekend\
    2 = Holiday\
Goal:\
    3 = Go_Outside



In [58]:
examples = [
    [True, True, False, True],  # Sunny, Weekend, Not Holiday -> Go Outside
    [True, False, False, False],  # Sunny, Weekday, Not Holiday -> Stay Inside
    [False, True, False, False],  # Rainy, Weekend, Not Holiday -> Stay Inside
    [False, False, False, False],  # Rainy, Weekday, Not Holiday -> Stay Inside
    [True, True, True, True],  # Sunny, Weekend, Holiday -> Go Outside
    [True, False, True, True],  # Sunny, Weekday, Holiday -> Go Outside
    [False, True, True, True],  # Rainy, Weekend, Holiday -> Go Outside
    [False, False, True, False],  # Rainy, Weekday, Holiday -> Stay Inside
    [True, True, False, True],  # Sunny, Weekend, Not Holiday -> Go Outside
]

# Attribute indices
attributes = {0, 1, 2}  # Weather_Sunny, Weekend, Holiday

columns = ["Weather_Sunny", "Weekend", "Holiday", "Go_Outside"]

print("Dataset:")
print(f"  {columns[0]}  |  {columns[1]}  |  {columns[2]}  |  {columns[3]}")
print("-" * 64)
for example in examples:
    weather = str(example[0]).ljust(len(columns[0]))
    weekend = str(example[1]).ljust(len(columns[1]))
    holiday = str(example[2]).ljust(len(columns[2]))
    go_out = str(example[3]).ljust(len(columns[3]))
    print(f"      {weather}      |     {weekend}     |    {holiday}   |   {go_out}")

Dataset:
  Weather_Sunny  |  Weekend  |  Holiday  |  Go_Outside
----------------------------------------------------------------
      True               |     True        |    False     |   True      
      True               |     False       |    False     |   False     
      False              |     True        |    False     |   False     
      False              |     False       |    False     |   False     
      True               |     True        |    True      |   True      
      True               |     False       |    True      |   True      
      False              |     True        |    True      |   True      
      False              |     False       |    True      |   False     
      True               |     True        |    False     |   True      


In [64]:
tree = dt_learning(examples, attributes, examples)

tree.print()

A=False
A=False


AttributeError: 'bool' object has no attribute 'print'