ID3

In [12]:
import pandas as pd 
import numpy as np 
df = pd.read_csv('data.csv')
df['Decision'] = df['Decision'].map({'No':0,'Yes':1})
df['Wind'] = df['Wind'].map({'Weak':0,'Strong':1})
df = pd.get_dummies(df,columns=['Outlook'])
df['Outlook_Overcast'] = df['Outlook_Overcast'].map({False:0,True:1})
df['Outlook_Sunny'] = df['Outlook_Sunny'].map({False:0,True:1})
df['Outlook_Rain'] = df['Outlook_Rain'].map({False:0,True:1})

def convertHum(val):
    if val > df['Humidity'].mean():
        return 1 
    else:
        return 0
    
def convertTemp(val):
    if val > df['Temp'].mean():
        return 1 
    else:
        return 0
df['Humidity'] = df['Humidity'].apply(convertHum)
df['Temp'] = df['Temp'].apply(convertTemp)
x = df.drop('Decision',axis=1).values
y = df['Decision'].values

def compute_entropy(y):
    entropy = 0
    if len(y) > 0:
        p1 = len(y[y == 1])/len(y)
        if p1 not in (0,1):
            entropy = -p1*np.log2(p1) -(1-p1)*np.log2(1-p1)
    return entropy 

def split_dataset(x,node_indices,feature):
    left_indices,right_indices = [],[]
    for i in node_indices:
        if x[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices,right_indices

def compute_information_gain(x,y,node_indices,feature):
    left_indices,right_indices = split_dataset(x,node_indices,feature)
    x_node,y_node = x[node_indices],y[node_indices]
    x_left,y_left = x[left_indices],y[left_indices]
    x_right,y_right = x[right_indices],y[right_indices]
    w_left,w_right = len(x_left)/len(x),len(x_right)/len(x)
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)
    right_entropy = compute_entropy(y_right)
    weighted_entropy = w_left*left_entropy + w_right*right_entropy
    information_gain = node_entropy - weighted_entropy
    return information_gain

def get_best_split(x,y,node_indices):
    num_features = x.shape[1]
    best_feature = None
    max_info_gain = -np.inf
    for feature in range(num_features):
        info_gain = compute_information_gain(x,y,node_indices,feature)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = feature 
    return best_feature

class TreeNode:
    def __init__(self,entropy=None,features=None,values=None,sample=None,left=None,right=None):
        self.entropy = entropy 
        self.feature = features 
        self.values = values 
        self.left = left 
        self.right = right
        self.sample = sample
    
    def is_leaf(self):
        return self.right is None and self.left is None 
    
def build_tree(x,y,node_indices,current_depth,max_depth):
    entropy = compute_entropy(y[node_indices])
    values = np.bincount(y[node_indices],minlength=2)
    node = TreeNode(entropy=entropy,values=values,sample=len(node_indices))
    if current_depth == max_depth or entropy == 0 or len(np.unique(y[node_indices])) == 1:
        return node 
    best_feature = get_best_split(x,y,node_indices)
    if best_feature is None:
        return node 
    node.feature = best_feature
    left_indices,right_indices = split_dataset(x,node_indices,best_feature)
    if left_indices:
        node.left = build_tree(x,y,left_indices,current_depth+1,max_depth)
    if right_indices:
        node.right = build_tree(x,y,right_indices,current_depth+1,max_depth)
    return node

def print_tree(node,feature_names=None,depth=0):
    if feature_names is not None and node.feature is not None:
        feature_name = feature_names[node.feature]
    else:
        feature_name = f'Feature {node.feature}'
    if node.is_leaf():
        print(f"{' '*depth} Leaf Node,Class Distrubution:{node.values}")
    else:    
        print(f"{' '*depth}{feature_name} (entropy={node.entropy}) (samples={node.sample}) ")
    if node.left:
        print(f"{' '*depth}Left")
        print_tree(node.left,feature_names,depth+1)
    if node.right:
        print(f"{' '*depth}Right")
        print_tree(node.right,feature_names,depth+1)

def predict(node,sample_input):
    while not node.is_leaf():
        feature_index = node.feature
        if sample_input[feature_index] == 1:
            node = node.right
        else:
            node = node.left
    return np.argmax(node.values)
 

node = build_tree(x,y,list(range(len(y))),0,6)
feature_names = df.drop('Decision',axis=1).columns.to_list()
print_tree(node,feature_names,0)
sample = [1,1,0,0,1,0]
pred = predict(node,sample)
print(f"Predicition:{pred}")

Outlook_Overcast (entropy=0.9402859586706311) (samples=14) 
Left
  Leaf Node,Class Distrubution:[0 4]
Right
 Humidity (entropy=1.0) (samples=10) 
 Left
  Outlook_Rain (entropy=0.8112781244591328) (samples=4) 
  Left
    Leaf Node,Class Distrubution:[0 1]
  Right
    Leaf Node,Class Distrubution:[3 0]
 Right
  Wind (entropy=0.9182958340544896) (samples=6) 
  Left
   Temp (entropy=0.9182958340544896) (samples=3) 
   Left
     Leaf Node,Class Distrubution:[0 1]
   Right
     Leaf Node,Class Distrubution:[2 0]
  Right
    Leaf Node,Class Distrubution:[0 3]
Predicition:1


C4.5

In [19]:
import pandas as pd 
import numpy as np 
df = pd.read_csv('data.csv')
df['Decision'] = df['Decision'].map({'No':0,'Yes':1})
df['Wind'] = df['Wind'].map({'Weak':0,'Strong':1})
df = pd.get_dummies(df,columns=['Outlook'])
df['Outlook_Overcast'] = df['Outlook_Overcast'].map({False:0,True:1})
df['Outlook_Sunny'] = df['Outlook_Sunny'].map({False:0,True:1})
df['Outlook_Rain'] = df['Outlook_Rain'].map({False:0,True:1})

def convertHum(val):
    if val > df['Humidity'].mean():
        return 1 
    else:
        return 0
    
def convertTemp(val):
    if val > df['Temp'].mean():
        return 1 
    else:
        return 0
df['Humidity'] = df['Humidity'].apply(convertHum)
df['Temp'] = df['Temp'].apply(convertTemp)
x = df.drop('Decision',axis=1).values
y = df['Decision'].values

def compute_entropy(y):
    entropy = 0
    if len(y) > 0:
        p1 = len(y[y == 1])/len(y)
        if  p1 not in (0,1):
            entropy = -p1*np.log2(p1) -(1-p1)*np.log2(1-p1)
    return entropy 

def split_dataset(x,node_indices,feature):
    left_indices,right_indices = [],[]
    for i in node_indices:
        if x[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices,right_indices

def compute_gain_ratio(x, y, node_indices, feature):
    left_indices, right_indices = split_dataset(x, node_indices, feature)
    x_node, y_node = x[node_indices], y[node_indices]
    x_left, y_left = x[left_indices], y[left_indices]
    x_right, y_right = x[right_indices], y[right_indices]
    w_left = len(x_left) / len(x_node)
    w_right = len(x_right) / len(x_node)
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)
    right_entropy = compute_entropy(y_right)
    weighted_entropy = w_left * left_entropy + w_right * right_entropy
    info_gain = node_entropy - weighted_entropy
    split_info = 0
    for indices in (left_indices, right_indices):
        proportion = len(indices) / len(x_node)
        if proportion > 0:
            split_info -= proportion * np.log2(proportion)
    if split_info != 0:
        gain_ratio = info_gain / split_info
    else:
        gain_ratio = 0
    return gain_ratio

def get_best_split(x,y,node_indices):
    best_feature = None 
    max_gain_ratio = -np.inf
    num_features = x.shape[1]
    for feature in range(num_features):
        gain_ratio = compute_gain_ratio(x,y,node_indices,feature)
        if gain_ratio > max_gain_ratio:
            max_gain_ratio = gain_ratio
            best_feature = feature
    return best_feature


class TreeNode:
    def __init__(self,entropy=None,feature=None,samples=None,values=None,left=None,right=None) -> None:
        self.entropy = entropy
        self.samples = samples
        self.left = left
        self.right = right
        self.values = values
        self.feature = feature
    
    def is_leaf(self):
        return self.right is None and self.left is None  

def build_tree(x,y,node_indices,current_depth,max_depth):
    entropy = compute_entropy(y[node_indices])
    values = np.bincount(y[node_indices],minlength=2)
    node = TreeNode(entropy=entropy,values=values,samples=len(node_indices))
    if current_depth == max_depth or entropy == 0 or len(np.unique(y[node_indices])) == 1:
        return node 
    best_feature = get_best_split(x,y,node_indices)
    if best_feature is None:
        return node
    node.feature = best_feature
    left_indices,right_indices = split_dataset(x,node_indices,best_feature)
    if left_indices:
        node.left = build_tree(x,y,left_indices,current_depth+1,max_depth)
    if right_indices:
        node.right = build_tree(x,y,right_indices,current_depth+1,max_depth)
    return node 

def print_tree(node,feature_names=None,depth=None):
    if feature_names is not None and node.feature is not None:
        feature_name = feature_names[node.feature]
    else:
        feature_name = f"Feature {node.feature}"
    if node.is_leaf():
        print(f"{' '*depth} leaf node,class distrubution:{node.values}")
    else:
        print(f"{' '*depth}{feature_name} (entropy = {node.entropy}) (sample = {node.samples})")
    if node.left:
        print(f"{' '*depth}Left:")
        print_tree(node.left,feature_names,depth+1)
    if node.right:
        print(f"{' '*depth}Right:")
        print_tree(node.right,feature_names,depth+1)

def predict(node,sample_input):
    while not node.is_leaf():
        feature_index = node.feature
        if sample_input[feature_index] == 1:
            node = node.right
        else:
            node = node.left 
    return np.argmax(node.values)


node = build_tree(x,y,list(range(len(y))),0,6)
feature_names = df.drop('Decision',axis=1).columns.to_list()
print_tree(node,feature_names,0)
sample = [1,1,0,0,1,0]
pred = predict(node,sample)
print(f"Predicition:{pred}")

Outlook_Overcast (entropy = 0.9402859586706311) (sample = 14)
Left:
  leaf node,class distrubution:[0 4]
Right:
 Humidity (entropy = 1.0) (sample = 10)
 Left:
  Outlook_Rain (entropy = 0.8112781244591328) (sample = 4)
  Left:
    leaf node,class distrubution:[0 1]
  Right:
    leaf node,class distrubution:[3 0]
 Right:
  Wind (entropy = 0.9182958340544896) (sample = 6)
  Left:
   Temp (entropy = 0.9182958340544896) (sample = 3)
   Left:
     leaf node,class distrubution:[0 1]
   Right:
     leaf node,class distrubution:[2 0]
  Right:
    leaf node,class distrubution:[0 3]
Predicition:1


CART

In [26]:
import pandas as pd 
import numpy as np 
df = pd.read_csv('data.csv')
df['Decision'] = df['Decision'].map({'No':0,'Yes':1})
df['Wind'] = df['Wind'].map({'Weak':0,'Strong':1})
df = pd.get_dummies(df,columns=['Outlook'])
df['Outlook_Overcast'] = df['Outlook_Overcast'].map({False:0,True:1})
df['Outlook_Sunny'] = df['Outlook_Sunny'].map({False:0,True:1})
df['Outlook_Rain'] = df['Outlook_Rain'].map({False:0,True:1})

def convertHum(val):
    if val > df['Humidity'].mean():
        return 1 
    else:
        return 0
    
def convertTemp(val):
    if val > df['Temp'].mean():
        return 1 
    else:
        return 0
df['Humidity'] = df['Humidity'].apply(convertHum)
df['Temp'] = df['Temp'].apply(convertTemp)
x = df.drop('Decision',axis=1).values
y = df['Decision'].values

def compute_gini(y):
    gini = 0
    if len(y) > 0:
        p1 = len(y[y==1])/len(y)
        if p1 not in (0,1):
            gini = 1 - (p1**2 + (1-p1)**2)
    return gini

def split_dataset(x,node_indices,feature):
    left_indices,right_indices = [],[]
    for i in node_indices:
        if x[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices,right_indices

def compute_weighted_gini(x,y,node_indices,feature):
    left_indices,right_indices = split_dataset(x,node_indices,feature)
    x_node,y_node = x[node_indices],y[node_indices]
    x_left,y_left = x[left_indices],y[left_indices]
    x_right,y_right = x[right_indices],y[right_indices]
    w_left,w_right = len(x_left)/len(x),len(x_right)/len(x)
    left_gini = compute_gini(y_left)
    right_gini = compute_gini(y_right)
    weighted_gini = w_left*left_gini + w_right*right_gini
    return weighted_gini

def get_best_split(x,y,node_indices):
    num_features = x.shape[1]
    best_feature = None  
    low_weighted_gini = np.inf
    for feature in range(num_features):
        weighted_gini = compute_weighted_gini(x,y,node_indices,feature)
        if weighted_gini < low_weighted_gini:
            low_weighted_gini = weighted_gini
            best_feature = feature
    return best_feature

class TreeNode:
    def __init__(self,gini=None,features=None,values=None,samples=None,left=None,right=None):
        self.gini = gini
        self.features = features
        self.values = values
        self.left = left
        self.right = right
        self.samples = samples

    def is_leaf(self):
        return self.right is None and self.left is None
    
def build_tree(x,y,node_indices,current_depth,max_depth):
    gini = compute_gini(y[node_indices])
    values = np.bincount(y[node_indices],minlength=2)
    node = TreeNode(gini=gini,samples=len(node_indices),values=values)
    if current_depth == max_depth or gini == 0 or len(np.unique(y[node_indices])) == 1:
        return node 
    best_feature = get_best_split(x,y,node_indices)
    if best_feature is None:
        return node 
    node.features = best_feature
    left_indices,right_indices = split_dataset(x,node_indices,best_feature)
    if left_indices is not None:
        node.left = build_tree(x,y,left_indices,current_depth+1,max_depth)
    if right_indices is not None:
        node.right = build_tree(x,y,right_indices,current_depth+1,max_depth)
    return node 

def print_tree(node,feature_names=None,depth=0):
    if feature_names is not None and node.features is not None:
        feature_name = feature_names[node.features]
    else:
        feature_name = f"Feature {node.features}"
    if node.is_leaf():
        print(f"{' '*depth}Leaf node,Class distrubution:{node.values}")
    else:
        print(f"{' '*depth}{feature_name} (gini = {node.gini}) (samples = {node.samples})")
    if node.left:
        print(f"{' '*depth}Left:")
        print_tree(node.left,feature_names,depth+1)
    if node.right: 
        print(f"{' '*depth}Left:")
        print_tree(node.right,feature_names,depth+1)

def predict(node,sample_input):
    while not node.is_leaf():
        feature_index = node.features
        if sample_input[feature_index] == 1:
            node = node.right
        else:
            node = node.left 
    return np.argmax(node.values)

node = build_tree(x,y,list(range(len(y))),0,6)
feature_names = df.drop('Decision',axis=1).columns.to_list()
print_tree(node,feature_names,0)
sample = [1,1,0,0,1,0]
pred = predict(node,sample)
print(f"Predicition:{pred}")



Outlook_Overcast (gini = 0.4591836734693877) (samples = 14)
Left:
 Leaf node,Class distrubution:[0 4]
Left:
 Humidity (gini = 0.5) (samples = 10)
 Left:
  Outlook_Rain (gini = 0.375) (samples = 4)
  Left:
   Leaf node,Class distrubution:[0 1]
  Left:
   Leaf node,Class distrubution:[3 0]
 Left:
  Wind (gini = 0.4444444444444444) (samples = 6)
  Left:
   Temp (gini = 0.4444444444444444) (samples = 3)
   Left:
    Leaf node,Class distrubution:[0 1]
   Left:
    Leaf node,Class distrubution:[2 0]
  Left:
   Leaf node,Class distrubution:[0 3]
Predicition:1
