In [21]:
import numpy as np
import pandas as pd
import math 

# Location of the training spreadsheet.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot be re-run on another machine without editing it.
file_path = 'C:/Users/2022503035/Documents/machine_learning_2022503035/decisiontree/dataset.xlsx'
# Read the sheet into a DataFrame and echo every row to the cell output.
df = pd.read_excel(file_path)
print(df)
# Column holding the class label that the trees below learn to predict.
target_data = df['CLASS:BUYS_COMPUTER']

            AGE  INCOME STUDENT CREDIT_RATING CLASS:BUYS_COMPUTER
0        youth     high      no          fair                  no
1        youth     high      no     excellent                  no
2   middle_aged    high      no          fair                 yes
3        senior  medium      no          fair                 yes
4        senior     low     yes          fair                 yes
5        senior     low     yes     excellent                  no
6   middle_aged     low     yes     excellent                 yes
7        youth   medium      no          fair                  no
8        youth      low     yes          fair                 yes
9        senior  medium     yes          fair                 yes
10       youth   medium     yes     excellent                 yes
11  middle_aged  medium      no     excellent                 yes
12  middle_aged    high     yes          fair                 yes
13       senior  medium      no     excellent                  no


In [22]:
def entropy(x):
    """Return the Shannon entropy, in bits, of the labels in ``x``.

    Args:
        x: Sized iterable of hashable class labels (for example a pandas
            Series or a list).

    Returns:
        ``-sum(p * log2(p))`` taken over the distinct labels, where ``p`` is
        each label's relative frequency. An empty input yields ``0``.
    """
    # Tally how many times each distinct label occurs.
    tally = {}
    for label in x:
        tally[label] = tally.get(label, 0) + 1

    n = len(x)
    bits = 0
    # Accumulate -p*log2(p) label by label, in first-seen order.
    for occurrences in tally.values():
        p = occurrences / n
        bits -= p * math.log2(p)
    return bits

# Entropy of the class-label column over the full dataset, i.e. the impurity
# before any split is made.
dataentropy = entropy (target_data)
print(f"Entropy of the dataset is :{dataentropy:.4f}")
    

Entropy of the dataset is :0.9403


In [23]:
def info_gain(df, attribute, target_column):
    """Information gain obtained by partitioning ``df`` on ``attribute``.

    Args:
        df: DataFrame containing both ``attribute`` and ``target_column``.
        attribute: Name of the column to split on.
        target_column: Name of the class-label column.

    Returns:
        Entropy of ``target_column`` over all rows minus the size-weighted
        average entropy of ``target_column`` within each distinct value of
        ``attribute``.
    """
    row_count = len(df)
    baseline = entropy(df[target_column])

    # Sum each partition's entropy weighted by its share of the rows.
    conditional = 0
    for level in df[attribute].unique():
        part = df[df[attribute] == level]
        conditional += (len(part) / row_count) * entropy(part[target_column])
    return baseline - conditional

target_column = 'CLASS:BUYS_COMPUTER'
# Pick the predictor columns by name instead of by position, so the result does
# not depend on the label being the last column. 'RID' (presumably a row-id
# column, when present) is skipped the same way the tree-building cells do it.
attributes = [col for col in df.columns if col not in ['RID', target_column]]

# Information gain of a split on each predictor, keyed by column name.
info_gain_values = {}

for attribute in attributes:
    info_gain_values[attribute] = info_gain(df, attribute, target_column)

# Print Information Gain for each attribute
print("\nInformation Gain for each attribute:")
for attr, ig in info_gain_values.items():
    print(f"{attr}: {ig:.4f}")

# Find the best attribute for splitting (the one with the largest gain)
best_attribute = max(info_gain_values, key=info_gain_values.get)
print(f"\nBest attribute for splitting: {best_attribute}")
        
    


Information Gain for each attribute:
AGE: 0.2467
INCOME: 0.0292
STUDENT: 0.1518
CREDIT_RATING: 0.0481

Best attribute for splitting: AGE


In [29]:
def id3(df, target_column, attributes):
    """Recursively build a decision tree using information gain.

    Args:
        df: DataFrame of training rows for the current node.
        target_column: Name of the class-label column.
        attributes: Column names still available for splitting.

    Returns:
        A class label when the node is a leaf, otherwise a nested dict of the
        form ``{attribute: {value: subtree, ...}}`` with one entry per distinct
        value of the chosen attribute among the rows of ``df``.
    """
    labels = df[target_column].unique()
    # Pure node: every row carries the same label.
    if len(labels) == 1:
        return labels[0]
    # Nothing left to split on: fall back to a most frequent label.
    if len(attributes) == 0:
        return df[target_column].mode()[0]

    # Score every candidate and keep the one with the highest gain.
    gains = {}
    for candidate in attributes:
        gains[candidate] = info_gain(df, candidate, target_column)
    split_on = max(gains, key=gains.get)

    leftover = [name for name in attributes if name != split_on]

    # One branch per distinct value of the chosen attribute.
    branches = {}
    for branch_value in df[split_on].unique():
        rows = df[df[split_on] == branch_value]
        branches[branch_value] = id3(rows, target_column, leftover)
    return {split_on: branches}


def print_tree(tree, indent=""):
    """Print a nested-dict decision tree to stdout, one node per line.

    Args:
        tree: Either a leaf value (anything that is not a dict) or a dict of
            the form ``{attribute: {value: subtree, ...}}``.
        indent: Prefix prepended to every line printed for this subtree.

    Returns:
        None. The tree is written with ``print``.
    """
    # Leaf: print the predicted label and stop recursing.
    if not isinstance(tree, dict):
        print(indent + "→ " + str(tree))
        return
    for key, value in tree.items():
        # str() keeps this working when the column label is not a string
        # (for example integer headers); plain concatenation would raise
        # TypeError.
        print(indent + str(key))
        for sub_key, sub_value in value.items():
            print(indent + f" ├── {sub_key}")
            print_tree(sub_value, indent + " │   ")


# Build the information-gain tree over every predictor column and display it.
target_column = 'CLASS:BUYS_COMPUTER'
attributes = [col for col in df.columns if col != 'RID' and col != target_column]

decision_tree = id3(df, target_column, attributes)
print_tree(decision_tree)


AGE
 ├── youth 
 │   STUDENT
 │    ├── no
 │    │   → no
 │    ├── yes
 │    │   → yes
 ├── middle_aged
 │   → yes
 ├── senior
 │   CREDIT_RATING
 │    ├── fair
 │    │   → yes
 │    ├── excellent
 │    │   → no


In [37]:
def _gini_from_counts(class_counts):
    """Gini impurity ``1 - sum(p_i ** 2)`` for a Series of per-class counts."""
    total = class_counts.sum()
    if total == 0:
        # No labelled rows: treat the (empty) partition as pure.
        return 0.0
    return 1.0 - sum((count / total) ** 2 for count in class_counts)


def gini_split(df, attribute, target_column):
    """Size-weighted Gini impurity of partitioning ``df`` on ``attribute``.

    The impurity is computed here from the class counts, so the function does
    not rely on a ``gini_impurity`` helper having been defined earlier in the
    kernel session.

    Args:
        df: DataFrame containing both ``attribute`` and ``target_column``.
        attribute: Name of the column to split on.
        target_column: Name of the class-label column.

    Returns:
        Sum over the distinct values of ``attribute`` of
        ``(rows with that value / total rows) * Gini(labels of those rows)``.
        Returns ``0`` when ``df`` has no rows.
    """
    attribute_values = df[attribute].value_counts()
    total_instances = len(df)
    weighted_gini = 0
    for value in attribute_values.keys():
        subset = df[df[attribute] == value]
        subset_class_counts = subset[target_column].value_counts()
        gini_A = _gini_from_counts(subset_class_counts)
        # Weight each partition's impurity by its share of the rows.
        weighted_gini += (len(subset) / total_instances) * gini_A

    return weighted_gini

# Compute the weighted Gini impurity of a split on each predictor column.
gini_attribute = {}
for attr in df.columns:
    # The class label itself is not a candidate split.
    if attr == 'CLASS:BUYS_COMPUTER':
        continue
    gini_attribute[attr] = gini_split(df, attr, 'CLASS:BUYS_COMPUTER')
    print(f'Gini for {attr} is {gini_attribute[attr]:.3f}')




Gini for AGE is 0.343
Gini for INCOME is 0.440
Gini for STUDENT is 0.367
Gini for CREDIT_RATING is 0.429


In [39]:
def cart(df, target_column, attributes):
    """Recursively build a decision tree using weighted Gini impurity.

    The split is multiway: the chosen attribute gets one branch per distinct
    value found in ``df`` (it is not a binary split).

    Args:
        df: DataFrame of training rows for the current node.
        target_column: Name of the class-label column.
        attributes: Column names still available for splitting.

    Returns:
        A class label when the node is a leaf, otherwise a nested dict of the
        form ``{attribute: {value: subtree, ...}}``.
    """
    labels = df[target_column].unique()
    # Pure node: every row carries the same label.
    if len(labels) == 1:
        return labels[0]
    # Nothing left to split on: fall back to a most frequent label.
    if len(attributes) == 0:
        return df[target_column].mode()[0]

    # Score every candidate; the lowest weighted Gini wins.
    scores = {}
    for candidate in attributes:
        scores[candidate] = gini_split(df, candidate, target_column)
    split_on = min(scores, key=scores.get)

    leftover = [name for name in attributes if name != split_on]

    # Grow one subtree per distinct value of the chosen attribute.
    branches = {}
    for branch_value in df[split_on].unique():
        rows = df[df[split_on] == branch_value]
        branches[branch_value] = cart(rows, target_column, leftover)
    return {split_on: branches}

def print_tree(tree, indent=""):
    """Print a nested-dict decision tree to stdout, one node per line.

    NOTE(review): this re-defines the ``print_tree`` already declared in the
    ID3 cell earlier in the notebook; keep the two in sync or drop one.

    Args:
        tree: Either a leaf value (anything that is not a dict) or a dict of
            the form ``{attribute: {value: subtree, ...}}``.
        indent: Prefix prepended to every line printed for this subtree.

    Returns:
        None. The tree is written with ``print``.
    """
    # Leaf: print the predicted label and stop recursing.
    if not isinstance(tree, dict):
        print(indent + "→ " + str(tree))
        return
    for key, value in tree.items():
        # str() keeps this working when the column label is not a string
        # (for example integer headers); plain concatenation would raise
        # TypeError.
        print(indent + str(key))
        for sub_key, sub_value in value.items():
            print(indent + f" ├── {sub_key}")
            print_tree(sub_value, indent + " │   ")


# Build the Gini-based tree over every predictor column and display it.
target_column = 'CLASS:BUYS_COMPUTER'
attributes = [col for col in df.columns if col != 'RID' and col != target_column]

cart_tree = cart(df, target_column, attributes)
print("\nCART Decision Tree")
print_tree(cart_tree)

# Root-level choice: the attribute whose split has the lowest weighted Gini,
# taken from the per-attribute scores computed in the Gini cell.
selected_attribute = min(gini_attribute, key=gini_attribute.get)
print(f'The selected attribute is: {selected_attribute}')


CART Decision Tree
AGE
 ├── youth 
 │   STUDENT
 │    ├── no
 │    │   → no
 │    ├── yes
 │    │   → yes
 ├── middle_aged
 │   → yes
 ├── senior
 │   CREDIT_RATING
 │    ├── fair
 │    │   → yes
 │    ├── excellent
 │    │   → no
The selected attribute is: AGE
