# The following code implements Algorithm 4 BuildRandomForest Page 37 of Gerald Friedland: "Information-Driven Machine Learning", Springer-Nature, 2023.

## https://link.springer.com/book/10.1007/978-3-031-39477-5

### The code is written by Neil Patel and released into public domain for demonstration purposes only, use at your own risk.  I appreciate a citation of this repository or the book, whatever fits best.

### The algorithm builds a random forest by creating numTrees decision trees. Each tree is built using a random subset of the samples with replacement, and a random subset of the features to consider at each split. For each tree, the algorithm uses the C4.5 algorithm (see Sect. 3.3.3) to select the best attribute to split on based on information gain. The tree is built recursively by splitting the data into subsets based on the selected attribute and its values. The process continues until a stopping criterion is met, such as when all samples belong to the same class or the maximum depth of the tree is reached. The algorithm returns a list of decision trees as the random forest

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load dataset
data = pd.read_csv('car_evaluation.csv', header=None, names=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'])

In [5]:
# Display data
data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [6]:
# Split features and target
X = data.drop('class', axis=1)
y = data['class']

In [7]:
# Define parameters
num_trees = 5
tree_depth = 3
num_features = 3


In [8]:
class DecisionNode:
    def __init__(self, attribute=None, results=None):
        self.attribute = attribute  # Attribute to split on
        self.results = results  # Results for a leaf node
        self.value = None  # Sub-tree for non-leaf node
        self.children = {}  # Dictionary to store child nodes

    def add_child(self, value, node):
        self.children[value] = node

def entropy(data):
    # Calculate entropy of a dataset
    results = data['class'].value_counts()
    entropy = 0.0
    for label in results.index:
        p = float(results[label]) / len(data)
        entropy -= p * np.log2(p)
    return entropy

def split_data(data, attribute, value):
    # Split dataset based on an attribute and its value
    left = data[data[attribute] == value]
    right = data[data[attribute] != value]
    return left, right

def information_gain(data, attribute):
    # Calculate information gain for a split on an attribute
    total_entropy = entropy(data)
    values = data[attribute].unique()
    weighted_entropy = 0.0
    for value in values:
        left, right = split_data(data, attribute, value)
        p = float(len(left)) / len(data)
        weighted_entropy += p * entropy(left)
    return total_entropy - weighted_entropy

def select_random_subset(attributes, num_features):
    # Select a random subset of features
    return np.random.choice(attributes, size=num_features, replace=False)

def build_decision_tree(X, y, tree_depth, num_features):
    if len(set(y)) == 1 or tree_depth == 0:
        return DecisionNode(results=y.iloc[0])

    best_split = None
    best_gain = 0.0
    sampled_attributes = select_random_subset(X.columns, num_features)

    for attribute in sampled_attributes:
        gain = information_gain(pd.concat([X, y], axis=1), attribute)
        if gain > best_gain:
            best_gain = gain
            best_split = attribute

    if best_split is None or best_gain == 0:
        return DecisionNode(results=y.value_counts().idxmax())

    tree = DecisionNode(attribute=best_split)

    values = X[best_split].unique()
    for value in values:
        subset_X, subset_y = X[X[best_split] == value], y[X[best_split] == value]
        if len(subset_y) > 0:
            subtree = build_decision_tree(subset_X, subset_y, tree_depth - 1, num_features)
            if subtree is not None:
                if subtree.results is not None:
                    tree.value = subtree
                else:
                    tree.add_child(value, subtree)
    if tree.value is None:
        return DecisionNode(results=y.value_counts().idxmax())

    return tree



def build_random_forest(data, labels, num_trees, tree_depth, num_features):
    forest = []
    for i in range(num_trees):
        subset_indices = np.random.choice(len(data), size=len(data), replace=True)
        subset_X, subset_y = data.iloc[subset_indices], labels.iloc[subset_indices]
        tree = build_decision_tree(subset_X, subset_y, tree_depth, num_features)
        forest.append(tree)
    return forest

In [9]:
# Build random forest
random_forest = build_random_forest(X, y, num_trees, tree_depth, num_features)


In [10]:
def print_tree(node, indent=""):
    if node.results is not None:
        print(indent + str(node.results))
    else:
        print(indent + str(node.attribute) + " =?")
        for value, child_node in node.children.items():
            print(indent + "--> " + str(value) + ":")
            print_tree(child_node, indent + "  ")

# Print each decision tree in the forest
for i, tree in enumerate(random_forest):
    print(f"Tree {i+1}:")
    print_tree(tree)
    print("\n")


Tree 1:
unacc


Tree 2:
safety =?
--> med:
  persons =?
  --> 4:
    buying =?
  --> more:
    buying =?


Tree 3:
safety =?
--> med:
  persons =?
  --> 4:
    buying =?
  --> more:
    buying =?


Tree 4:
unacc


Tree 5:
persons =?
--> more:
  safety =?
  --> med:
    lug_boot =?
  --> high:
    doors =?


