ID-3

In [20]:
import numpy as np
import pandas as pd 
df = pd.read_csv('data.csv')
df['Wind'] = df['Wind'].map({'Weak':0,'Strong':1})
df['Decision'] = df['Decision'].map({'No':0,'Yes':1})
df = pd.get_dummies(df,columns=['Outlook'])

def convertTemp(val):
    if val >= df['Temp'].mean():
        return 1
    else:
        return 0

def convertHumidity(val):
    if val >= df['Humidity'].mean():
        return 1
    else:
        return 0

df['Temp'] = df['Temp'].apply(convertTemp)
df['Humidity'] = df['Humidity'].apply(convertHumidity)
df['Outlook_Overcast'] = df['Outlook_Overcast'].map({False:0,True:1})
df['Outlook_Rain'] = df['Outlook_Rain'].map({False:0,True:1})
df['Outlook_Sunny'] = df['Outlook_Sunny'].map({False:0,True:1})
x = df.drop('Decision',axis=1).values
y = df['Decision'].values


class TreeNode:
    def __init__(self,feature=None,entropy=None,samples=None,value = None,left=None,right=None):
        self.entropy = entropy
        self.feature = feature
        self.samples = samples
        self.value = value
        self.left = left
        self.right = right
    
    def is_leaf(self):
        return self.left is None and self.right is None

def compute_entropy(y):
    entropy = 0
    if len(y) > 0:
        p1 = len(y[y == 1])/len(y)
        if p1 not in (0,1):
            entropy = -p1*np.log2(p1) - (1-p1)*np.log2(1-p1)
    return entropy

def split_dataset(x,node_indices,feature):
    left_indices,right_indices = [],[]
    for i in node_indices:
        if x[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices,right_indices

def compute_information_gain(x,y,node_indices,feature):
    left_indices,right_indices = split_dataset(x,node_indices,feature)
    x_node,y_node = x[node_indices],y[node_indices]
    x_left,y_left = x[left_indices],y[left_indices]
    x_right,y_right = x[right_indices],y[right_indices]
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)
    right_entropy = compute_entropy(y_right)
    w_left = len(x_left)/len(x)
    w_right = len(x_right)/len(x)
    weighted_entropy = w_left*left_entropy + w_right*right_entropy
    info_gain =  node_entropy - weighted_entropy
    return info_gain

def get_best_feature_split(x,y,node_indices):
    num_features = x.shape[1]
    best_feature = None
    max_info_gain = -np.inf 
    for feature in range(num_features):
        info_gain = compute_information_gain(x,y,node_indices,feature)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = feature
    return best_feature

#Algo for build tree 
'''1.find the node entropy
2.compute bincount of class label
3.create obj of tree class 
4. check for stop condition where entropy is 0 or curr == max_depth or only a unique class label left 
5.compute best feature split and check if best feature exists 
6.split dataset with best feature 
7.if there exists left_subtree update node.left with recursive call similarly apply to the right
8.return node'''

def build_id3_tree(x,y,node_indices,current_depth,max_depth):
    entropy = compute_entropy(y[node_indices])
    values = np.bincount(y[node_indices],minlength=2)
    node = TreeNode(entropy=entropy,samples=len(node_indices),value=values)
    if current_depth == max_depth or entropy == 0 or len(np.unique(y[node_indices])) == 1:
        return node
    best_feature = get_best_feature_split(x,y,node_indices)
    if best_feature is None:
        return node
    left_indices,right_indices = split_dataset(x,node_indices,best_feature)
    node.feature = best_feature
    if left_indices is not None:
        node.left = build_id3_tree(x,y,left_indices,current_depth+1,max_depth)
    if right_indices is not None:
        node.right = build_id3_tree(x,y,right_indices,current_depth+1,max_depth)
    return node

#Algo for print_tree 
'''1.check if feature names is not none and also node.feature is not none assign the feature name
2.check if we reached the leaf node print the depth(indent spacing) and class dist
3.else print the feature_name, node.entropy and node.samples
4.if left subtree is exists  print depth and recursively call print_tree function similarly do it for the right subtree '''


def print_tree(node, depth=0, feature_names=None):
    if feature_names is not None and node.feature is not None:
        feature_name = feature_names[node.feature]
    else:
        feature_name = f"Feature {node.feature}"

    if node.is_leaf():
        print(f"{'  ' * depth}Leaf node, class distribution: {node.value}")
    else:
        print(f"{'  ' * depth}{feature_name} (entropy = {node.entropy:.3f}), samples = {node.samples}")

    if node.left is not None:
        print(f"{'  ' * depth}Left:")
        print_tree(node.left, depth + 1, feature_names)
    if node.right is not None:
        print(f"{'  ' * depth}Right:")
        print_tree(node.right, depth + 1, feature_names)

def predict(root_node, sample):
    while not root_node.is_leaf():
        feature_index = root_node.feature
        if sample[feature_index] == 1:
            root_node = root_node.right
        else:
            return np.random.choice([0, 1])
    return np.argmax(root_node.value)


root_node = build_id3_tree(x,y,list(range(len(y))),current_depth=0,max_depth=6)
feature_names = df.drop('Decision',axis=1).columns.tolist()
print_tree(root_node,feature_names=feature_names)
sample_input = [1, 0, 1, 0, 1, 0]
y_pred = predict(root_node,sample_input)
print(f"Predicted class for input {sample_input}: {y_pred}")

Outlook_Overcast (entropy = 0.940), samples = 14
Left:
  Leaf node, class distribution: [0 4]
Right:
  Humidity (entropy = 1.000), samples = 10
  Left:
    Outlook_Rain (entropy = 0.811), samples = 4
    Left:
      Leaf node, class distribution: [0 1]
    Right:
      Leaf node, class distribution: [3 0]
  Right:
    Wind (entropy = 0.918), samples = 6
    Left:
      Temp (entropy = 0.918), samples = 3
      Left:
        Leaf node, class distribution: [0 1]
      Right:
        Leaf node, class distribution: [2 0]
    Right:
      Leaf node, class distribution: [0 3]
Predicted class for input [1, 0, 1, 0, 1, 0]: 1


In [32]:
import numpy as np
import pandas as pd 
df = pd.read_csv('data.csv')
df
df['Wind'] = df['Wind'].map({'Weak':0,'Strong':1})
df['Decision'] = df['Decision'].map({'No':0,'Yes':1})
df = pd.get_dummies(df,columns=['Outlook'])

def convertTemp(val):
    if val >= df['Temp'].mean():
        return 1
    else:
        return 0

def convertHumidity(val):
    if val >= df['Humidity'].mean():
        return 1
    else:
        return 0

df['Temp'] = df['Temp'].apply(convertTemp)
df['Humidity'] = df['Humidity'].apply(convertHumidity)
df['Outlook_Overcast'] = df['Outlook_Overcast'].map({False:0,True:1})
df['Outlook_Rain'] = df['Outlook_Rain'].map({False:0,True:1})
df['Outlook_Sunny'] = df['Outlook_Sunny'].map({False:0,True:1})
x = df.drop('Decision',axis=1).values
y = df['Decision'].values

class TreeNode:
    def __init__(self,feature=None,entropy=None,value=None,left=None,right=None,samples=None):
        self.entropy = entropy
        self.value = value
        self.feature = feature
        self.left = left
        self.right = right
        self.samples = samples

    def is_leaf(self):
        return self.left is None and self.right is None
    
def compute_entropy(y):
    entropy = 0
    if len(y) > 0:
        p1 = len(y[y == 1]) / len(y)
        if p1 not in (0, 1):
            entropy = -p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1)
    return entropy


def split_dataset(x,node_indices,feature):
    left_indices,right_indices = [],[]
    for i in node_indices:
        if x[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices,right_indices

def compute_information_gain(x,y,node_indices,feature):
    left_indices,right_indices = split_dataset(x,node_indices,feature)
    x_node,y_node = x[node_indices],y[node_indices]
    x_left,y_left = x[left_indices],y[left_indices]
    x_right,y_right = x[right_indices],y[right_indices]
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)
    right_entropy = compute_entropy(y_right)
    w_left = len(x_left)/len(x)
    w_right = len(x_right)/len(x)
    weighted_entropy = w_left*left_entropy + w_right*right_entropy
    info_gain = node_entropy - weighted_entropy
    return info_gain

def get_best_feature_split(x,y,node_indices):
    num_features = x.shape[1]
    max_info_gain = -np.inf
    best_feature = None
    for feature in range(num_features):
        info_gain = compute_information_gain(x,y,node_indices,feature)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = feature
    return best_feature



def build_id3_tree(x,y,node_indices,current_depth,max_depth):
    entropy = compute_entropy(y[node_indices])
    value = np.bincount(y[node_indices],minlength=2)
    node = TreeNode(entropy=entropy,samples=len(node_indices),value=value)
    if current_depth == max_depth or entropy == 0 or len(np.unique(y[node_indices])) == 1:
        return node 
    best_feature = get_best_feature_split(x,y,node_indices)
    if best_feature is None:
        return node
    left_indices,right_indices = split_dataset(x,node_indices,best_feature)
    if left_indices is not None:
        node.left = build_id3_tree(x,y,left_indices,current_depth+1,max_depth)
    if right_indices is not None:
        node.right = build_id3_tree(x,y,right_indices,current_depth+1,max_depth)
    return node 

def print_tree(node,depth=0,feature_names=None):
    if feature_names is not None and node.feature is not None:
        feature_name = feature_names[node.feature]
    else:
        feature_name = f"Feature {node.feature}"
    
    if node.is_leaf():
        print(f"{' '  * depth} Leaf Node, Class distrubution:{node.value}")
    else:
        print(f"{' ' * depth}{feature_name} (entropy = {node.entropy:.3f}) , (samples = {node.samples})")
    
    if node.left is not None:
        print(f"{' '*depth}Left:")
        print_tree(node.left,depth+1,feature_names)
    if node.right is not None:
        print(f"{' '*depth}Right")
        print_tree(node.right,depth+1,feature_names)

root_node = build_id3_tree(x,y,list(range(len(y))),current_depth=0,max_depth=6)
feature_names = df.drop('Decision',axis=1).columns.tolist()
print_tree(root_node,feature_names=feature_names)


Feature None (entropy = 0.940) , (samples = 14)
Left:
  Leaf Node, Class distrubution:[0 4]
Right
 Feature None (entropy = 1.000) , (samples = 10)
 Left:
  Feature None (entropy = 0.811) , (samples = 4)
  Left:
    Leaf Node, Class distrubution:[0 1]
  Right
    Leaf Node, Class distrubution:[3 0]
 Right
  Feature None (entropy = 0.918) , (samples = 6)
  Left:
   Feature None (entropy = 0.918) , (samples = 3)
   Left:
     Leaf Node, Class distrubution:[0 1]
   Right
     Leaf Node, Class distrubution:[2 0]
  Right
    Leaf Node, Class distrubution:[0 3]


In [None]:
#Algo for build tree 
'''1.find the node entropy
2.compute bincount of class label
3.create obj of tree class 
4. check for stop condition where entropy is 0 or curr == max_depth or only a unique class label left 
5.compute best feature split and check if best feature exists 
6.split dataset with best feature 
7.if there exists left_subtree update node.left with recursive call similarly apply to the right
8.return node'''

def build_id3_tree(x,y,node_indices,current_depth,max_depth):
    entropy = compute_entropy(y[node_indices])
    values = np.bincount(y[node_indices],minlength=2)
    node = TreeNode(entropy=entropy,samples=len(node_indices),value=values)
    if current_depth == max_depth or entropy == 0 or len(np.unique(y[node_indices])) == 1:
        return node
    best_feature = get_best_feature_split(x,y,node_indices)
    if best_feature is None:
        return node
    left_indices,right_indices = split_dataset(x,node_indices,best_feature)
    if left_indices is not None:
        node.left = build_id3_tree(x,y,left_indices,current_depth + 1,max_depth)
    if right_indices is not None:
        node.right = build_id3_tree(x,y,right_indices,current_depth + 1,max_depth)
    return node


#Algo for print_tree 
'''1.check if feature names is not none and also node.feature is not none assign the feature name
2.check if we reached the leaf node print the depth(indent spacing) and class dist
3.else print the feature_name, node.entropy and node.samples
4.if left subtree is exists print depth and recursively call print_tree function similarly do it for the right subtree '''


def print_tree(node,depth=0,feature_names=None):
    if feature_names is not None and node.feature is not None:
        feature_name = feature_names[node.feature]
    else:
        feature_name = 
    if node.is_leaf():
        print(f"' '*{depth} Class disturbution: {node.value} ")
    else:
        print(f"' '*{depth}{feature_name} (entropy:{node.entropy:.3f}) (samples:{node.samples:.3f})")
    
    if node.left is not None:
        print(f"' '*{depth}Left:")
        print_tree(node.left,depth+1,feature_names)
    if node.right is not None:
        print(f"' '*{depth}Right:")
        print_tree(node.right,depth+1,feature_names)


def build_id3_tree(x,y,node_indices,current_depth,max_depth):
    entropy = compute_entropy(y[node_indices])
    values = np.bincount(y[node_indices],minlength=2)
    node = TreeNode(entropy=entropy,samples=len(node_indices),value=values)
    if current_depth == max_depth or entropy == 0 or len(np.unique(y[node_indices])) == 1:
        return node
    best_feature = get_best_feature_split(x,y,node_indices)
    if best_feature is None:
        return node 
    left_indices,right_indices = split_dataset(x,node_indices,best_feature)
    if left_indices is not None:
        node.left = build_id3_tree(x,y,left_indices,current_depth+1,max_depth)
    if right_indices is not None:
        node.right = build_id3_tree(x,y,right_indices,current_depth+1,max_depth)
    return node

