Simplified ID3 decision tree

In [102]:
import pandas as pd
import numpy as np 

# Load the data from a CSV file
df = pd.read_csv('data.csv')

# One-hot encode the 'Outlook' feature.
# dtype=int forces 0/1 integer indicator columns: pandas >= 2.0 emits bool
# columns by default, which would turn the mixed int/bool feature matrix
# later taken from this frame into an object-dtype array.
df = pd.get_dummies(df, columns=['Outlook'], dtype=int)

# Map the remaining categorical features to integers.
# NOTE: Series.map with a dict yields NaN for any label not listed here.
df['Decision'] = df['Decision'].map({'No': 0, 'Yes': 1})
df['Wind'] = df['Wind'].map({'Weak': 0, 'Strong': 1})

# Function to convert temperature to a binary feature
def convert_Temp(val, threshold=74):
    """Binarize a temperature reading.

    Args:
        val: The temperature value to classify.
        threshold: Cut-off; values strictly greater than it map to 1.
            Defaults to 74, the value this notebook uses.

    Returns:
        1 if ``val > threshold``, otherwise 0 (a value equal to the
        threshold maps to 0).
    """
    return 1 if val > threshold else 0
    
# Function to convert humidity to a binary feature
def convert_Humidity(val, threshold=80):
    """Binarize a humidity reading.

    Args:
        val: The humidity value to classify.
        threshold: Cut-off; values strictly greater than it map to 1.
            Defaults to 80, the value this notebook uses.

    Returns:
        1 if ``val > threshold``, otherwise 0 (a value equal to the
        threshold maps to 0).
    """
    return 1 if val > threshold else 0

# Binarize the two numeric columns with the conversion functions
df['Temp'] = df['Temp'].apply(convert_Temp)
df['Humidity'] = df['Humidity'].apply(convert_Humidity)

# Feature matrix x: every column except the target; target vector y: 'Decision'
x = df.drop(columns=['Decision']).to_numpy()
y = df['Decision'].to_numpy()

# Function to compute entropy of a dataset
def compute_entropy(y):
    """Return the binary entropy (in bits) of a collection of labels.

    Only the fraction of labels equal to 1 is considered; every other
    value is treated as "not 1".

    Args:
        y: Sequence of labels (NumPy array, list, or tuple).

    Returns:
        0.0 when ``y`` is empty or when all labels fall on one side
        (all equal to 1, or none equal to 1); otherwise
        ``-p1*log2(p1) - (1-p1)*log2(1-p1)`` with ``p1`` the share of 1s.
    """
    # Coerce to an array so that `== 1` is element-wise. On a plain Python
    # list, `y == 1` is the scalar False and the entropy would always be 0.
    labels = np.asarray(y)
    if len(labels) == 0:
        return 0.0

    p1 = np.sum(labels == 1) / len(labels)
    if p1 in (0, 1):
        # Pure node: skip the formula to avoid evaluating log2(0).
        return 0.0
    return -p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1)

# Function to split the dataset based on a feature
def split_dataset(x, node_indices, feature):
    """Partition the rows of a node by the value of one binary feature.

    Args:
        x: Feature matrix, indexed as ``x[row][feature]``.
        node_indices: Row indices belonging to the node being split.
        feature: Column index of the feature to split on.

    Returns:
        A ``(left_indices, right_indices)`` pair of lists: rows whose
        feature equals 1, then rows whose feature equals 0, each in the
        order they appear in ``node_indices``. Rows holding any other
        value end up in neither list.
    """
    def rows_matching(target):
        # Collect, in order, the node rows whose feature equals `target`.
        selected = []
        for row in node_indices:
            if x[row][feature] == target:
                selected.append(row)
        return selected

    return rows_matching(1), rows_matching(0)

# Function to compute information gain of a split
def compute_information_gain(x, y, node_indices, feature):
    """Return the information gain of splitting a node on one feature.

    The gain is the node's entropy minus the size-weighted average of the
    entropies of the two children produced by ``split_dataset``.

    Args:
        x: Feature matrix, indexed as ``x[row][feature]``.
        y: Label array; it is indexed with lists of row indices.
        node_indices: Row indices belonging to the node being split.
        feature: Column index of the candidate feature.

    Returns:
        The information gain in bits. An empty node returns 0.0.
    """
    node_size = len(node_indices)
    if node_size == 0:
        # Nothing to split: report zero gain instead of dividing by zero
        # when computing the child weights below.
        return 0.0

    left_indices, right_indices = split_dataset(x, node_indices, feature)

    node_entropy = compute_entropy(y[node_indices])
    left_entropy = compute_entropy(y[left_indices])
    right_entropy = compute_entropy(y[right_indices])

    # Each child's entropy is weighted by its share of the node's rows.
    w_left = len(left_indices) / node_size
    w_right = len(right_indices) / node_size
    weighted_entropy = w_left * left_entropy + w_right * right_entropy

    return node_entropy - weighted_entropy

# Function to find the best feature to split on
def get_best_split(x, y, node_indices):
    """Pick the feature whose split yields the highest information gain.

    Args:
        x: Feature matrix; ``x.shape[1]`` gives the number of features.
        y: Label array passed through to ``compute_information_gain``.
        node_indices: Row indices belonging to the node being split.

    Returns:
        The column index of the best feature (the lowest index wins ties),
        or ``None`` when no feature achieves a strictly positive gain.
    """
    gains = (
        compute_information_gain(x, y, node_indices, column)
        for column in range(x.shape[1])
    )

    winner, winning_gain = None, float('-inf')
    for column, gain in enumerate(gains):
        # Strict comparison keeps the earliest feature among equal gains.
        if gain > winning_gain:
            winner, winning_gain = column, gain

    if winning_gain > 0:
        return winner
    return None

# Function to build the decision tree
def build_tree(x, y, node_indices, branch_name, max_depth, current_depth):
    """Recursively grow the tree below one node, printing splits and leaves.

    A node becomes a leaf when the depth limit is reached, when it holds
    no rows, or when ``get_best_split`` finds no feature worth splitting
    on. Nothing is stored: the tree is only reported through ``print``.

    Args:
        x: Feature matrix.
        y: Label array; it is indexed with lists of row indices and its
            values are passed to ``np.bincount``.
        node_indices: List of row indices belonging to this node.
        branch_name: Label used for this node in the printed output.
        max_depth: Depth at which nodes are forced to be leaves.
        current_depth: Depth of this node.

    Returns:
        None.
    """
    def report_leaf():
        # Majority class of the node's labels (np.argmax picks the lowest
        # label on a tie); an empty node has no majority to report.
        if node_indices:
            leaf_class = np.argmax(np.bincount(y[node_indices]))
        else:
            leaf_class = 'Undefined'
        print(f'{branch_name} Leaf node with indices: {node_indices}, Class: {leaf_class}')

    if current_depth == max_depth or not node_indices:
        report_leaf()
        return

    best_feature = get_best_split(x, y, node_indices)
    if best_feature is None:
        report_leaf()
        return

    children = split_dataset(x, node_indices, best_feature)
    print(f"Depth: {current_depth}, Branch: {branch_name}, Split on feature: {best_feature}")

    # Descend into the feature == 1 side first, then the feature == 0 side.
    for child_name, child_indices in zip(("Left", "Right"), children):
        build_tree(x, y, child_indices, child_name, max_depth, current_depth + 1)

# The root node holds every row; grow the tree from it, at most two levels deep
root_indices = [*range(len(y))]
build_tree(
    x,
    y,
    node_indices=root_indices,
    branch_name="Root",
    max_depth=2,
    current_depth=0,
)


Depth: 0, Branch: Root, Split on feature: 3
Left Leaf node with indices: [2, 6, 11, 12], Class: 1
Depth: 1, Branch: Right, Split on feature: 1
Left Leaf node with indices: [0, 1, 3, 7], Class: 0
Right Leaf node with indices: [4, 5, 8, 9, 10, 13], Class: 1
