 id3 

In [4]:
import pandas as pd 
import numpy as np 
df = pd.read_csv('data.csv')
df['Decision'] = df['Decision'].map({'No':0,'Yes':1})
df['Wind'] =  df['Wind'].map({'Weak':0,'Strong':1})
df = pd.get_dummies(df,columns=['Outlook'])

def convertHum(val):
    if val > df['Humidity'].mean():
        return 1
    else:
        return 0 

def convertTemp(val):
    if val > df['Temp'].mean():
        return 1
    else:
        return 0 

df['Humidity'] = df['Humidity'].apply(convertHum)
df['Temp'] = df['Temp'].apply(convertTemp)
x = df.drop('Decision',axis=1).values
y = df['Decision'].values

class TreeNode:
    def __init__(self,feature=None,entropy=None,samples=None,value=None,left=None,right=None) -> None:
        self.feature = feature
        self.entropy = entropy
        self.samples = samples
        self.value = value
        self.left = left
        self.right = right
    
    def is_leaf(self):
        return self.left is None and self.right is None
    
def compute_entropy(y):
    entropy = 0
    if len(y) > 0:
        p1 = len(y[y == 1])/len(y)
        if p1 not in (0,1):
            entropy = -p1*np.log2(p1) -(1-p1)*np.log2(1-p1)
    return entropy

def split_dataset(x,node_indices,feature):
    left_indices,right_indices = [],[]
    for i in node_indices:
        if x[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices,right_indices

def compute_information_gain(x,y,node_indices,feature):
    left_indices,right_indices = split_dataset(x,node_indices,feature)
    x_node,y_node = x[node_indices],y[node_indices] 
    x_left,y_left = x[left_indices],y[left_indices]
    x_right,y_right = x[right_indices],y[right_indices]
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)  
    right_entropy = compute_entropy(y_right)
    w_left = len(x_left)/len(x)
    w_right = len(x_right)/len(x)
    weighted_entropy = w_left*left_entropy + w_right*right_entropy
    information_gain = node_entropy - weighted_entropy
    return information_gain

def get_best_split(x,y,node_indices):
    num_features = x.shape[1]
    max_info_gain = -np.inf
    best_feature = None
    for feature in range(num_features):
        info_gain = compute_information_gain(x,y,node_indices,feature)
        if info_gain > max_info_gain:
            max_info_gain = info_gain 
            best_feature = feature
    return best_feature

def build_id3_tree(x,y,node_indices,current_depth,max_depth):
    entropy = compute_entropy(y[node_indices])
    value = np.bincount(y[node_indices],minlength=2)
    node = TreeNode(entropy=entropy,samples=len(node_indices),value=value)
    #Stop condition
    if current_depth == max_depth or entropy == 0 or len(np.unique(y[node_indices]))==1:
        return node
    best_feature = get_best_split(x,y,node_indices)
    if best_feature is None:
        return node
    left_indices,right_indices = split_dataset(x,node_indices,best_feature)
    node.feature = best_feature
    if left_indices:
        node.left = build_id3_tree(x,y,left_indices,current_depth+1,max_depth)
    if right_indices:
        node.right = build_id3_tree(x,y,right_indices,current_depth+1,max_depth)
    return node

def print_tree(node, depth=0, feature_names=None):
    # Determine the feature name based on whether a list of feature names was provided.
    # If the node is not a leaf and feature names are provided, use the actual feature name.
    # Otherwise, use a generic feature label.
    if feature_names is not None and node.feature is not None:
        feature_name = feature_names[node.feature]
    else:
        feature_name = f"Feature {node.feature}"

    # Check if the current node is a leaf node.
    if node.is_leaf():
        # For leaf nodes, print the class distribution of the samples.
        print(f"{'  ' * depth}Leaf node, Class distribution: {node.value}")
    else:
        # For internal nodes, print the feature name, entropy, and number of samples.
        print(f"{'  ' * depth}{feature_name} (entropy={node.entropy:.3f}, samples={node.samples})")

        # If there is a left child, print the left branch of the tree.
        if node.left is not None:
            print(f"{'  ' * depth}Left:")
            print_tree(node.left, depth + 1, feature_names)

        # If there is a right child, print the right branch of the tree.
        if node.right is not None:
            print(f"{'  ' * depth}Right:")
            print_tree(node.right, depth + 1, feature_names)

# Correct function name for building the tree
root_node = build_id3_tree(x, y, list(range(len(y))), current_depth=0, max_depth=6)

# Print the tree
# Assuming df is defined elsewhere and is a pandas DataFrame
feature_names = df.drop('Decision', axis=1).columns.tolist()
print_tree(root_node, feature_names=feature_names)


def predict(tree, sample, feature_names):
    """
    Predict the class label for a single sample based on the decision tree.
    
    :param tree: The root node of the decision tree.
    :param sample: The feature vector for the sample to predict.
    :param feature_names: The list of feature names corresponding to the indices in the sample.
    :return: The predicted class label.
    """
    # Traverse the tree until a leaf node is reached
    while not tree.is_leaf():
        # If the feature on which to split is in our sample, use it; otherwise, predict randomly.
        if tree.feature is not None and feature_names[tree.feature] in sample:
            # Determine the index of the feature to split on
            feature_index = feature_names.index(tree.feature)
            # If the feature of the sample is 1, go right; otherwise, go left.
            if sample[feature_index] == 1:
                tree = tree.right
            else:
                tree = tree.left
        else:
            # If the feature is not in our sample, we have an issue (e.g., missing feature)
            # For simplicity, return a random class label (0 or 1).
            return np.random.choice([0, 1])
    
    # Once a leaf node is reached, return the class with the highest count
    return np.argmax(tree.value)

my_sample = {
    'Outlook_Overcast': 0,
    'Outlook_Rain': 1,
    'Outlook_Sunny': 0,
    'Temp': 0,  # Assuming this is a binary feature after preprocessing
    'Humidity': 1,  # Assuming this is a binary feature after preprocessing
    'Wind': 0  # Assuming this is a binary feature after preprocessing
}

# Convert the sample dictionary to a list in the order of feature names used in the tree
sample_features = [my_sample[fn] for fn in feature_names]

# Use the predict function to get the prediction for the sample
predicted_class = predict(root_node, sample_features, feature_names)
print(f"The predicted class for the sample is: {predicted_class}")


Outlook_Overcast (entropy=0.940, samples=14)
Left:
  Leaf node, Class distribution: [0 4]
Right:
  Humidity (entropy=1.000, samples=10)
  Left:
    Outlook_Rain (entropy=0.811, samples=4)
    Left:
      Leaf node, Class distribution: [0 1]
    Right:
      Leaf node, Class distribution: [3 0]
  Right:
    Wind (entropy=0.918, samples=6)
    Left:
      Temp (entropy=0.918, samples=3)
      Left:
        Leaf node, Class distribution: [0 1]
      Right:
        Leaf node, Class distribution: [2 0]
    Right:
      Leaf node, Class distribution: [0 3]
The predicted class for the sample is: 0


CART

In [5]:
import numpy as np
import pandas as pd

df = pd.read_csv('data.csv')
df['Decision'] = df['Decision'].map({'No':0,'Yes':1})
df['Wind'] =  df['Wind'].map({'Weak':0,'Strong':1})
df = pd.get_dummies(df,columns=['Outlook'])

def convertHum(val):
    if val > df['Humidity'].mean():
        return 1
    else:
        return 0 

def convertTemp(val):
    if val > df['Temp'].mean():
        return 1
    else:
        return 0 

df['Humidity'] = df['Humidity'].apply(convertHum)
df['Temp'] = df['Temp'].apply(convertTemp)
x = df.drop('Decision',axis=1).values
y = df['Decision'].values

class TreeNode:
    def __init__(self, feature=None, value=None, gini=None, samples=None, left=None, right=None):
        self.feature = feature
        self.value = value
        self.gini = gini
        self.samples = samples
        self.left = left
        self.right = right

    def is_leaf(self):
        return self.left is None and self.right is None

def compute_gini(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    gini = 1 - np.sum(probabilities ** 2)
    return gini

def split_dataset(x, node_indices, feature):
    left_indices = [i for i in node_indices if x[i, feature] == 0]
    right_indices = [i for i in node_indices if x[i, feature] == 1]
    return left_indices, right_indices

def compute_gini_gain(x, y, node_indices, feature):
    left_indices, right_indices = split_dataset(x, node_indices, feature)
    gini_node = compute_gini(y[node_indices])
    gini_left = compute_gini(y[left_indices])
    gini_right = compute_gini(y[right_indices])
    weight_left = len(left_indices) / len(node_indices)
    weight_right = len(right_indices) / len(node_indices)
    weighted_gini = weight_left * gini_left + weight_right * gini_right
    gini_gain = gini_node - weighted_gini
    return gini_gain

def get_best_split_cart(x, y, node_indices):
    best_feature = None
    best_gini_gain = -np.inf
    for feature in range(x.shape[1]):
        gini_gain = compute_gini_gain(x, y, node_indices, feature)
        if gini_gain > best_gini_gain:
            best_gini_gain = gini_gain
            best_feature = feature
    return best_feature, best_gini_gain

def build_cart_tree(x, y, node_indices, current_depth, max_depth):
    # Calculate the Gini impurity for the current node
    gini = compute_gini(y[node_indices])
    value = np.bincount(y[node_indices], minlength=2)
    node = TreeNode(gini=gini, samples=len(node_indices), value=value)
    
    # Check for max depth, pure node, or no more features to split on
    if current_depth == max_depth or gini == 0 or len(set(y[node_indices])) == 1:
        return node
    
    best_feature, best_gini_gain = get_best_split_cart(x, y, node_indices)
    if best_feature is None or best_gini_gain <= 0:
        return node
    
    left_indices, right_indices = split_dataset(x, node_indices, best_feature)
    node.feature = best_feature
    if left_indices:
        node.left = build_cart_tree(x, y, left_indices, current_depth + 1, max_depth)
    if right_indices:
        node.right = build_cart_tree(x, y, right_indices, current_depth + 1, max_depth)
    
    return node

# Assuming you have defined your x, y, and feature_names as before
root_node = build_cart_tree(x, y, list(range(len(y))), current_depth=0, max_depth=3)


def print_tree(node, depth=0, feature_names=None):
    indent = "  " * depth
    if node.is_leaf():
        print(f"{indent}Leaf node, class distribution: {node.value}")
    else:
        feature_name = feature_names[node.feature] if feature_names else f"Feature {node.feature}"
        print(f"{indent}{feature_name} (Gini={node.gini:.3f}, samples={node.samples})")
        if node.left is not None:
            print(f"{indent}Left:")
            print_tree(node.left, depth + 1, feature_names)
        if node.right is not None:
            print(f"{indent}Right:")
            print_tree(node.right, depth + 1, feature_names)

def predict_sample(node, sample):
    while not node.is_leaf():
        if sample[node.feature] == 0:
            node = node.left
        else:
            node = node.right
    return np.argmax(node.value)

def predict(tree, X):
    predictions = [predict_sample(tree, sample) for sample in X]
    return np.array(predictions)

# Define your instance as a list or a NumPy array (ensure the order of features matches the training data)
my_instance = [0, 1, 0, 0, 1, 0, 1, 0]  # Example instance

# Convert the instance to a NumPy array if it's not already one
my_instance = np.array(my_instance)

# Predict the class for your instance using the tree
prediction = predict_sample(root_node, my_instance)
print(f"The predicted class for the instance is: {prediction}")


The predicted class for the instance is: 1


C4.5

In [10]:
import numpy as np

class TreeNode:
    def __init__(self, feature=None, value=None, info_gain=None, samples=None, left=None, right=None):
        self.feature = feature
        self.value = value
        self.info_gain = info_gain
        self.samples = samples
        self.left = left
        self.right = right

    def is_leaf(self):
        return self.left is None and self.right is None

def compute_entropy(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    return -np.sum(probabilities * np.log2(probabilities + 1e-9))  # Add a small epsilon to avoid log(0)

def split_dataset(x, y, node_indices, feature):
    left_indices = [i for i in node_indices if x[i, feature] == 0]
    right_indices = [i for i in node_indices if x[i, feature] == 1]
    return left_indices, right_indices

def compute_gain_ratio(x, y, node_indices, feature):
    # Calculate information gain
    left_indices, right_indices = split_dataset(x, y, node_indices, feature)
    entropy_before = compute_entropy(y[node_indices])
    entropy_after = (
        len(left_indices) / len(node_indices) * compute_entropy(y[left_indices]) +
        len(right_indices) / len(node_indices) * compute_entropy(y[right_indices])
    )
    info_gain = entropy_before - entropy_after

    # Calculate split information
    split_info = 0
    for indices in (left_indices, right_indices):
        proportion = len(indices) / len(node_indices)
        split_info -= proportion * np.log2(proportion + 1e-9)  # Add a small epsilon to avoid log(0)

    # Calculate gain ratio
    gain_ratio = info_gain / split_info if split_info != 0 else 0
    return gain_ratio

def get_best_split_c45(x, y, node_indices):
    best_feature = None
    best_gain_ratio = -np.inf
    for feature in range(x.shape[1]):
        gain_ratio = compute_gain_ratio(x, y, node_indices, feature)
        if gain_ratio > best_gain_ratio:
            best_gain_ratio = gain_ratio
            best_feature = feature
    return best_feature, best_gain_ratio

def build_c45_tree(x, y, node_indices, current_depth, max_depth):
    entropy = compute_entropy(y[node_indices])
    value = np.bincount(y[node_indices], minlength=2)
    node = TreeNode(info_gain=entropy, samples=len(node_indices), value=value)
    
    if current_depth == max_depth or entropy == 0 or len(np.unique(y[node_indices])) == 1:
        return node
    
    best_feature, best_gain_ratio = get_best_split_c45(x, y, node_indices)
    if best_feature is None or best_gain_ratio <= 0:
        return node
    
    left_indices, right_indices = split_dataset(x, y, node_indices, best_feature)
    node.feature = best_feature
    if left_indices:
        node.left = build_c45_tree(x, y, left_indices, current_depth + 1, max_depth)
    if right_indices:
        node.right = build_c45_tree(x, y, right_indices, current_depth + 1, max_depth)
    
    return node

# Assuming you have defined your x, y, and feature_names as before
root_node_c45 = build_c45_tree(x, y, list(range(len(y))), current_depth=0, max_depth=3)

def print_tree(node, depth=0, feature_names=None):
    # Determine the feature name based on whether a list of feature names was provided.
    # If the node is not a leaf and feature names are provided, use the actual feature name.
    # Otherwise, use a generic feature label.
    feature_name = feature_names[node.feature] if (feature_names is not None and node.feature is not None) else "N/A"

    # Check if the current node is a leaf node.
    if node.is_leaf():
        # For leaf nodes, print the class distribution of the samples.
        print(f"{'  ' * depth}Leaf node, Class distribution: {node.value}")
    else:
        # For internal nodes, print the feature name, gain ratio, and number of samples.
        print(f"{'  ' * depth}{feature_name} (info_gain={node.info_gain:.3f}, samples={node.samples})")

        # If there is a left child, print the left branch of the tree.
        if node.left is not None:
            print(f"{'  ' * depth}Left:")
            print_tree(node.left, depth + 1, feature_names)

        # If there is a right child, print the right branch of the tree.
        if node.right is not None:
            print(f"{'  ' * depth}Right:")
            print_tree(node.right, depth + 1, feature_names)

def predict(instance, node):
    # Traverse the tree based on the features in the instance until a leaf node is reached.
    while not node.is_leaf():
        # Use the feature index directly as it's stored in the tree.
        feature_index = node.feature
        
        # Traverse to the left or right child node based on the instance's feature value.
        if instance[feature_index] == 0:
            node = node.left
        else:
            node = node.right

        # If the tree is not well-formed or the instance is missing features, return None or a default value.
        if node is None:
            return None

    # Return the most common class label from the leaf node.
    return np.argmax(node.value)

# Define your instance as before
instance = np.array([1, 0, 0, 1, 1, 1])  # Example instance

# Call the predict function with this instance and the trained tree root node.
prediction = predict(instance, root_node_c45)  # Make sure you use the correct root node for the C4.5 tree.

# Output the prediction
print(f"The predicted class for the instance is: {prediction}")




The predicted class for the instance is: 1
