In [34]:
# Imports for building and visualising Decision Trees

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

In [103]:
# Visualisation


def visualise_decision_tree(tree, tree_type):
    """Draw a fitted decision tree as a NetworkX graph in a matplotlib figure.

    Parameters
    ----------
    tree : object
        Fitted tree exposing a ``root`` node. Every node must expose ``value``
        and ``feature``; ID3 nodes additionally expose ``branches`` and CART
        nodes expose ``threshold``, ``left`` and ``right``.
    tree_type : str
        ``"id3"`` for multi-way trees or ``"cart"`` for binary threshold trees.
        The value selects the figure size, the leaf label prefix and which
        child attributes are followed.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """

    # Use the NetworkX charting package
    G = nx.Graph()
    pos = {}
    labels = {}

    # Prefix shown on leaf nodes in front of the predicted value
    leaf_prefix = "Qual" if tree_type == "cart" else "Play"

    # Recursively add nodes to the graph
    def add_nodes(node, x=0, y=0, parent=None, level=0):
        node_id = str(id(node))

        # Leaves show their prediction; decision nodes show the split feature
        # (plus the threshold for CART trees)
        if node.value is not None:
            labels[node_id] = f"{leaf_prefix}: {node.value}"
        else:
            text = f"{node.feature}"
            if tree_type == "cart":
                text += f"\n≤ {node.threshold:.2f}"
            labels[node_id] = text

        # Add the node to the graph and set the position
        G.add_node(node_id)
        pos[node_id] = (x, y)
        if parent is not None:
            G.add_edge(parent, node_id)

        # Horizontal spacing halves with every level so subtrees narrow as they deepen
        spacing = 2 ** (3 - level)
        child_y = y - 1.5

        # Recursively add the children
        if tree_type == "id3" and node.branches:
            width = len(node.branches)
            for i, child in enumerate(node.branches.values()):
                # Spread the children symmetrically around the parent, 2 * spacing apart.
                # For two children this yields x - spacing and x + spacing; for any
                # other count the fan stays centred under the parent as well.
                child_x = x + (2 * i - (width - 1)) * spacing
                add_nodes(child, child_x, child_y, node_id, level + 1)

        if tree_type == "cart":
            if node.left is not None:
                add_nodes(node.left, x - spacing, child_y, node_id, level + 1)
            if node.right is not None:
                add_nodes(node.right, x + spacing, child_y, node_id, level + 1)

        # Return the node ID
        return node_id

    # CART trees are much wider than the small ID3 example, so give them a larger canvas
    if tree_type == "id3":
        plt.figure(figsize=(8, 4))
    if tree_type == "cart":
        plt.figure(figsize=(25, 14))

    add_nodes(tree.root)
    nx.draw(G, pos, labels=labels, with_labels=True, node_size=3000, font_size=7, font_weight='bold')
    plt.show()



def plot_importance(data):
    """Show a bar chart of per-feature counts.

    Parameters
    ----------
    data : dict
        Mapping whose keys become the bar labels and whose values become the
        bar heights.
    """
    bar_labels = data.keys()
    bar_heights = data.values()

    plt.figure(figsize=(12, 4))
    # Slant the tick labels so long feature names do not overlap
    plt.xticks(rotation=45, ha='right')
    plt.bar(bar_labels, bar_heights)
    plt.title('Feature Importance in Wine Quality Prediction')
    plt.ylabel('Usage Count in Decision Tree')
    plt.show()


#### Implementation Approach

This implementation provides two complementary approaches: ID3 and CART. This dual implementation strategy was chosen specifically to handle a wider range of real-world datasets while maintaining high performance across different data types.

#### ID3 Implementation Overview

The ID3 implementation takes a top-down, recursive approach to handling categorical data, using entropy and information gain for split decisions. This version of ID3 enhances the classic algorithm with a key safeguard: whereas the basic algorithm keeps splitting on the most informative feature for as long as it can, a configurable maximum depth prevents excessive tree growth and potential overfitting. The implementation includes majority class prediction to handle real-world data challenges like missing values and edge cases. By calculating information gain at each decision point, the algorithm selects the most informative features for splitting. These enhancements allow the implementation to effectively process data with multiple categorical attributes, handle missing information, and work with various classification problems.

In [45]:
# ID3 Decision Tree

class ID3Node:
    """One node of an ID3 tree.

    A node with a non-None ``value`` acts as a leaf holding a prediction;
    otherwise it is a decision node that splits on ``feature`` and keeps one
    subtree per feature value in ``branches``.
    """

    def __init__(self, feature=None, branches=None, value=None):
        # Feature this node splits on (None for leaves)
        self.feature = feature

        # Feature value -> child subtree; a missing or empty argument becomes a new dict
        self.branches = branches if branches else {}

        # Prediction returned when this node is a leaf
        self.value = value


class DecisionTreeID3:
    """ID3 decision tree classifier for categorical features.

    Each decision node splits on the feature with the highest information gain
    and creates one branch per distinct value of that feature. A branch stops
    growing when its labels are pure, when no features remain, or when
    ``max_depth`` is reached; in the latter two cases the majority class of the
    node is stored as the prediction.
    """

    def __init__(self, max_depth=None):
        # Maximum depth of the tree; None means no limit, 0 yields a single leaf
        self.max_depth = max_depth
        # Root node, populated by fit()
        self.root = None

    def entropy(self, labels):
        """Return the Shannon entropy of a pandas Series of labels.

        Entropy = -Σ(p(x) * log2(p(x))) where p(x) is the probability of each
        class. Lower entropy means more homogeneous labels; higher entropy
        means more mixed labels.
        """
        # Get probability of each class
        probs = labels.value_counts(normalize=True)

        # value_counts can report classes with probability 0 (e.g. unused
        # categories of a categorical Series); 0 * log2(0) evaluates to NaN and
        # would poison every gain comparison, so drop them first
        probs = probs[probs > 0]

        # Apply the Shannon entropy formula
        return -(probs * np.log2(probs)).sum()

    def information_gain(self, labels, labels_split):
        """Return the information gain of splitting ``labels`` into ``labels_split``.

        Gain = Entropy(parent) - Σ((n_child / n_parent) * Entropy(child)).
        A higher gain means the split separates the classes better.

        Parameters
        ----------
        labels : pandas Series of the parent node's labels.
        labels_split : iterable of pandas Series, one per child.
        """
        parent_entropy = self.entropy(labels)

        # Calculate the weighted average entropy of children
        weighted_child_entropy = sum((len(child) / len(labels)) * self.entropy(child) for child in labels_split)

        # Take weighted entropy from parent entropy to get information gain
        return parent_entropy - weighted_child_entropy

    def find_best_feature(self, features, labels):
        """Pick the feature whose value-wise split has the highest information gain.

        Returns
        -------
        (best_feature, best_splits)
            The chosen column name and a dict mapping each distinct value of
            that column to the labels of the matching rows. Both are None when
            ``features`` has no columns. Ties go to the earliest column because
            only a strictly larger gain replaces the current best.
        """
        best_feature = None
        best_splits = None

        # Any gain will be better than -1
        best_gain = -1

        for feature in features.columns:

            # Group labels by feature values
            splits = {value: labels[features[feature] == value] for value in features[feature].unique()}
            labels_split = list(splits.values())

            # Calculate information gain and update best feature if better
            gain = self.information_gain(labels, labels_split)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_splits = splits

        # Return the feature with the highest gain and the corresponding splits
        return best_feature, best_splits

    def build_tree(self, features, labels, depth=0):
        """Recursively build the subtree for ``features``/``labels`` and return its root node.

        ``depth`` is the depth of the node being built (0 for the root).
        """
        # All labels agree: create a leaf node with that value
        if len(set(labels)) == 1:
            return ID3Node(value=labels.iloc[0])

        # If no features left or max depth reached, return majority class.
        # Compare against None explicitly so that max_depth=0 is honoured
        # instead of being mistaken for "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if features.columns.size == 0 or depth_exhausted:
            return ID3Node(value=labels.value_counts().idxmax())

        # Find the best feature and splits for this level. If none is found, return majority class.
        # Use an identity check: a legitimate column name may be falsy (e.g. the integer 0).
        best_feature, best_splits = self.find_best_feature(features, labels)
        if best_feature is None:
            return ID3Node(value=labels.value_counts().idxmax())

        # Create a decision node with branches for each unique value of the best feature
        branches = {}
        for value, subset in best_splits.items():
            # The chosen feature is removed so it is not reused further down this branch
            subset_features = features[features[best_feature] == value].drop(columns=[best_feature])
            branches[value] = self.build_tree(subset_features, subset, depth + 1)

        # Return the decision node
        return ID3Node(feature=best_feature, branches=branches)

    def fit(self, features, labels):
        """Train the tree on a DataFrame of features and a Series of labels."""
        # Build the tree from the data, starting from the root
        self.root = self.build_tree(features, labels)

    def predict_single(self, node, feature):
        """Predict the label for one row by walking the tree from ``node``.

        ``feature`` is a single row (indexable by column name). Returns the
        value stored at the leaf that is reached, or None when the row holds a
        feature value for which the tree has no branch.
        """
        # If it's a leaf node, return the stored value
        if node.value is not None:
            return node.value

        # Get the value of the feature at this node and check if there's a branch for this value
        feature_value = feature[node.feature]
        if feature_value in node.branches:
            return self.predict_single(node.branches[feature_value], feature)

        # If no branch matches (value not seen at this node during training), no prediction is made
        return None

    def predict(self, features):
        """Predict the labels for every row of ``features``; returns a list."""
        return [self.predict_single(self.root, feature) for _, feature in features.iterrows()]

In [None]:
# Run ID3 DT - Weather Data

data = pd.read_csv('weather-data.csv')

# 'Decision' is the target; 'Day' is left out of the predictors
labels = data['Decision']
features = data.drop(columns=['Decision', 'Day'])

id3_tree = DecisionTreeID3(max_depth=3)
id3_tree.fit(features, labels)

# Accuracy is measured on the same rows the tree was trained on
predictions = id3_tree.predict(features)
correct = sum(1 for pred, actual in zip(predictions, labels) if pred == actual)
accuracy = correct / len(labels)
print(f"Accuracy: {accuracy:.2%}")

visualise_decision_tree(id3_tree, tree_type="id3")

#### Performance Analysis on Weather Dataset

The ID3 implementation achieved 100% accuracy on the weather dataset. Because this accuracy is measured on the same 14 rows that were used for training, the perfect score indicates overfitting: the decision tree has effectively created a lookup table for the training data.

Each decision node can split the already small dataset into even smaller subsets, eventually leading to leaf nodes that may represent just one or two samples. This level of granularity, while producing perfect training accuracy, typically indicates that the model has learned patterns specific to this exact dataset rather than general relationships between weather conditions and decisions.

#### ----------------------------------------------------------------------

#### Wine Quality Dataset
This implementation uses the "Wine Quality" dataset from the UCI Machine Learning Repository, accessed via Kaggle:

**Dataset Source:**  
- Title: Red Wine Quality (Cortez et al., 2009)
- URL: https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009
- Original Publication: P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

**Dataset Characteristics:**
- Features:
  - Fixed acidity
  - Volatile acidity
  - Citric acid
  - Residual sugar
  - Chlorides
  - Free sulfur dioxide
  - Total sulfur dioxide
  - Density
  - pH
  - Sulphates
  - Alcohol
- Output Variable: Quality (score between 0 and 10)
- Data Count: 1,599
- Feature Types: All numeric

#### CART Implementation Overview

The CART implementation builds on ID3's capabilities by handling both numerical and categorical data through binary splits. While ID3 creates multiple branches at each node, CART makes binary decisions based on optimised thresholds - a design choice that proved effective with the wine quality dataset's numerical features. The implementation uses Gini impurity to evaluate potential splits, which tends to work well for numerical data. It includes practical safeguards like minimum sample requirements for splits and configurable tree depth to prevent overfitting. Through efficient midpoint threshold calculations, the algorithm processes both continuous and categorical variables effectively, making it versatile enough for complex, real-world datasets.

In [47]:
# CART Decision Tree

class CARTNode:
    """One node of a CART tree.

    A node with a non-None ``value`` acts as a leaf holding a prediction;
    otherwise it is a binary decision node on ``feature`` and ``threshold``
    with a ``left`` and a ``right`` subtree.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split definition: which feature is tested and the threshold of the binary split
        self.feature, self.threshold = feature, threshold

        # Subtrees for the two branches
        self.left, self.right = left, right

        # Prediction stored at leaf nodes
        self.value = value


class CARTDecisionTree:
    """CART-style classification tree with binary threshold splits.

    Each decision node tests ``feature <= threshold``; candidate thresholds are
    the midpoints between consecutive distinct feature values and the split
    with the lowest weighted Gini impurity is chosen. A branch stops growing
    when its labels are pure, when ``max_depth`` is reached, or when no
    admissible split exists; the majority class is then stored as the
    prediction.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        # Maximum depth of the tree; None means no limit, 0 yields a single leaf
        self.max_depth = max_depth
        # Minimum number of samples that EACH child of a split must receive
        # (see find_best_split); candidate splits violating this are skipped
        self.min_samples_split = min_samples_split
        # Root node, populated by fit()
        self.root = None
        # Column index of the training features, populated by fit()
        self.feature_names = None
        # Feature name -> number of decision nodes that split on that feature
        self.feature_importance = {}

    def gini(self, labels):
        """Return the Gini impurity of a collection of class labels.

        Gini = 1 - Σ(p_i²) where p_i is the probability of class i.
        Gini = 0 means perfect purity (all samples share one class); for binary
        classification the maximum impurity is 0.5. An empty collection has
        impurity 0. Labels may be of any sortable type (ints, strings, ...).
        """
        if len(labels) == 0:
            return 0

        # Get the counts and probabilities of each class
        _, counts = np.unique(labels, return_counts=True)
        proportions = counts / len(labels)

        # Calculate the Gini impurity
        return 1 - np.sum(proportions ** 2)

    def find_best_split(self, features, labels, feature):
        """Find the best binary split of one numerical feature.

        Tries the midpoints between consecutive distinct values of ``feature``
        and returns ``(best_threshold, best_gini)``, where ``best_gini`` is the
        weighted Gini impurity of the two children. Returns
        ``(None, float('inf'))`` when no threshold leaves at least
        ``min_samples_split`` samples on both sides.
        """
        # Initialise best Gini impurity and threshold
        best_gini = float('inf')
        best_threshold = None

        feature_values = sorted(features[feature].unique())

        # Use midpoints between consecutive values as potential thresholds
        thresholds = [(feature_values[i] + feature_values[i + 1]) / 2 for i in range(len(feature_values) - 1)]

        # Try each threshold and keep track of the best one
        for threshold in thresholds:
            left_mask = features[feature] <= threshold
            right_mask = ~left_mask

            n_left = np.sum(left_mask)
            n_right = np.sum(right_mask)

            # Ensure minimum samples in both children
            if n_left < self.min_samples_split or n_right < self.min_samples_split:
                continue

            # Calculate Gini impurity for both children
            left_gini = self.gini(labels[left_mask])
            right_gini = self.gini(labels[right_mask])

            # Weighted average of children's Gini impurity
            weighted_gini = (n_left * left_gini + n_right * right_gini) / len(labels)

            # Update best Gini impurity and threshold
            if weighted_gini < best_gini:
                best_gini = weighted_gini
                best_threshold = threshold

        return best_threshold, best_gini

    def find_best_feature(self, features, labels):
        """Find the feature and threshold giving the lowest weighted Gini impurity.

        Returns ``(best_feature, best_threshold)``; both are None when no
        feature offers an admissible split. When a feature is selected its
        entry in ``feature_importance`` is incremented by one.
        """
        best_feature = None
        best_threshold = None
        best_gini = float('inf')

        # Try each feature and find the best threshold
        for feature in features.columns:
            threshold, gini = self.find_best_split(features, labels, feature)
            if threshold is not None and gini < best_gini:
                best_gini = gini
                best_feature = feature
                best_threshold = threshold

        # Update feature importance once, for the feature that actually wins.
        # Counting inside the loop would also credit features that were only
        # temporarily the best candidate and never end up in the tree.
        if best_feature is not None:
            self.feature_importance[best_feature] = self.feature_importance.get(best_feature, 0) + 1

        return best_feature, best_threshold

    def build_tree(self, features, labels, depth=0):
        """Recursively build the subtree for ``features``/``labels`` and return its root node.

        ``depth`` is the depth of the node being built (0 for the root).
        """
        # All labels agree: create a leaf node with that value
        if len(set(labels)) == 1:
            return CARTNode(value=labels.iloc[0])

        # If no features left or max depth reached, return majority class.
        # Compare against None explicitly so that max_depth=0 is honoured
        # instead of being mistaken for "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if features.columns.size == 0 or depth_exhausted:
            return CARTNode(value=labels.value_counts().idxmax())

        # Find the best feature and threshold for this level
        best_feature, best_threshold = self.find_best_feature(features, labels)

        # If no split found, return majority class
        if best_feature is None:
            return CARTNode(value=labels.value_counts().idxmax())

        # Split data
        left_mask = features[best_feature] <= best_threshold
        right_mask = ~left_mask

        # Build the subtrees for both sides of the split
        left_subtree = self.build_tree(features[left_mask], labels[left_mask], depth + 1)
        right_subtree = self.build_tree(features[right_mask], labels[right_mask], depth + 1)

        # Return the decision node
        return CARTNode(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def fit(self, features, labels):
        """Train the tree on a DataFrame of features and a Series of labels."""
        self.feature_names = features.columns
        # Start from a clean slate so that refitting does not accumulate counts
        # from a previous fit
        self.feature_importance = {}
        # Build the tree from the data, starting from the root
        self.root = self.build_tree(features, labels)

    def predict_single(self, node, feature):
        """Predict the label for one row by walking the tree from ``node``.

        ``feature`` is a single row (indexable by column name).
        """
        # If leaf node, return the stored value
        if node.value is not None:
            return node.value

        # Values at or below the threshold follow the left branch
        if feature[node.feature] <= node.threshold:
            return self.predict_single(node.left, feature)

        # Everything else follows the right branch
        return self.predict_single(node.right, feature)

    def predict(self, features):
        """Predict the labels for every row of ``features``; returns a list."""
        return [self.predict_single(self.root, feature) for _, feature in features.iterrows()]

In [48]:
# Imports and logic for data handling of the Red Wine Quality dataset

import os
import kagglehub

# Function to download and move dataset to working directory
import shutil


def get_and_move_dataset():
    """Download the red wine quality dataset and copy its files into the working directory.

    Returns
    -------
    str
        The current working directory, into which the files were copied.
    """
    # Download dataset from Kaggle; the call returns the directory holding the files
    cache_path = kagglehub.dataset_download("uciml/red-wine-quality-cortez-et-al-2009")
    working_dir = os.getcwd()

    # Copy every entry of the download directory next to the notebook
    # (copy2 leaves the downloaded originals in place)
    for entry in os.listdir(cache_path):
        shutil.copy2(os.path.join(cache_path, entry), os.path.join(working_dir, entry))

    return working_dir

In [None]:
# Run CART DT - Wine Quality Data

# Make sure the CSV is available in the working directory before reading it
get_and_move_dataset()
data = pd.read_csv('winequality-red.csv')

# 'quality' is the target; every other column is a predictor
labels = data['quality']
features = data.drop(columns=['quality'])

cart_tree = CARTDecisionTree(min_samples_split=50, max_depth=5)
cart_tree.fit(features, labels)

# Accuracy is measured on the same rows the tree was trained on
predictions = cart_tree.predict(features)
correct = sum(1 for pred, actual in zip(predictions, labels) if pred == actual)
accuracy = correct / len(labels)
print(f"Accuracy: {accuracy:.2%}")
print("\n")

visualise_decision_tree(cart_tree, tree_type="cart")
print("\n")

# Order the features from most to least counted before plotting
ranked = sorted(cart_tree.feature_importance.items(), key=lambda item: item[1], reverse=True)
feature_importance = dict(ranked)
plot_importance(feature_importance)