In [12]:
# Import necessary libraries
import pandas as pd  # For handling the dataset
import numpy as np  # For numerical operations
from math import log2  # For calculating logarithms

# Load the dataset
# NOTE(review): the path is hard-coded and absolute; the CSV must already exist
# at this location when the cell runs, otherwise read_csv raises.
file_path = '/content/id3.csv'  # Path to the uploaded dataset
# `data` is the module-level DataFrame later passed to id3() as the training set.
data = pd.read_csv(file_path)  # Read the CSV file into a pandas DataFrame

# Function to calculate entropy
def entropy(data, target_attr):
    """Return the base-2 Shannon entropy of the ``target_attr`` column.

    Args:
        data: Table indexable by column name (``data[target_attr]``),
            e.g. a pandas DataFrame.
        target_attr: Name of the column holding the class labels.

    Returns:
        The sum over distinct labels of ``-p * log2(p)``, where ``p`` is the
        fraction of rows carrying that label. ``0`` when the column is empty.
    """
    # Only the per-class counts matter here; the labels themselves are unused.
    _, counts = np.unique(data[target_attr], return_counts=True)
    # Total row count is loop-invariant, so compute it once up front.
    total = counts.sum()
    entropy_value = 0  # Start with 0 entropy
    for count in counts:  # One term per class
        proportion = count / total  # Fraction of rows for this class
        entropy_value -= proportion * log2(proportion)
    return entropy_value


# Function to calculate information gain
def information_gain(data, feature, target_attr):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    Args:
        data: Table supporting column lookup and boolean-mask row selection
            (``data[data[feature] == value]``), e.g. a pandas DataFrame.
        feature: Name of the column to split on.
        target_attr: Name of the column holding the class labels.

    Returns:
        Entropy of the whole table minus the size-weighted average entropy
        of the subsets produced by each distinct value of ``feature``.
    """
    # Entropy before the split
    total_entropy = entropy(data, target_attr)

    # Distinct feature values and how many rows carry each of them
    column = data[feature]
    values, counts = np.unique(column, return_counts=True)
    # Row count is the same for every term, so compute it once.
    n_rows = counts.sum()

    # Entropy after the split: each subset's entropy weighted by its share of rows
    weighted_entropy = sum(
        (count / n_rows) * entropy(data[column == value], target_attr)
        for value, count in zip(values, counts)
    )

    # Information gain is the reduction in entropy
    return total_entropy - weighted_entropy

# Function to build the decision tree using the ID3 algorithm
def id3(data, features, target_attr, parent_node_class=None):
    """Recursively build a decision tree with the ID3 algorithm.

    Args:
        data: pandas DataFrame of training rows (column lookup, boolean-mask
            row selection and ``Series.mode`` are used on it).
        features: Collection of column names still available for splitting.
        target_attr: Name of the column holding the class labels.
        parent_node_class: Majority class of the parent node's rows; returned
            as the prediction when ``data`` contains no rows.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    classes = np.unique(data[target_attr])

    # Base case 1: If all target values are the same, return that class
    if len(classes) == 1:
        return classes[0]

    # Base case 2: No rows to learn from, so fall back to the parent's majority class
    if len(data) == 0:
        return parent_node_class

    # Majority class of the current rows
    majority_class = data[target_attr].mode()[0]

    # Base case 3: Rows remain but no features are left to split on. The
    # prediction must come from the current rows, not the parent's majority
    # (which is None on a top-level call).
    if len(features) == 0:
        return majority_class

    # Choose the best feature to split on using information gain
    # (on ties, max() keeps the first feature in `features` order)
    gains = {feature: information_gain(data, feature, target_attr) for feature in features}
    best_feature = max(gains, key=gains.get)

    # A feature is used at most once along any root-to-leaf path
    remaining_features = [f for f in features if f != best_feature]

    # Split on every value of the best feature present in the current rows and
    # build each subtree recursively
    branches = {}
    for value in np.unique(data[best_feature]):
        subset = data[data[best_feature] == value]
        branches[value] = id3(subset, remaining_features, target_attr, majority_class)

    return {best_feature: branches}

# Define the features and the target attribute
# These strings are used as column keys into `data` (data[feature],
# data[target_attr]), so they must match the CSV's column headers exactly.
features = ['Outlook', 'Temperature', 'Humidity', 'Wind']  # Predictors
target_attr = 'PlayTennis'  # Target variable

# Build the decision tree
# id3() returns either a bare class label or a nested dict
# {feature: {feature_value: subtree_or_label}}; parent_node_class keeps its
# default (None) for this top-level call.
decision_tree = id3(data, features, target_attr)
# Function to print the decision tree in a tree-like structure
def print_tree(tree, depth=0):
    """Print a nested-dict decision tree to stdout, one node or branch per line.

    Args:
        tree: Either a leaf (any non-dict value, printed as-is) or a dict
            mapping a node name to a dict of ``{branch_value: subtree}``.
        depth: Current nesting level; each level adds one ``'|   '`` prefix.

    Each branch is written as ``<value> ->`` followed, on the same line, by
    whatever the recursive call prints first for the child (a leaf label, or
    the indented name of the child node).
    """
    # Leaf: just emit the label and stop
    if not isinstance(tree, dict):
        print(f"{tree}")
        return

    node_indent = '|   ' * depth
    branch_indent = '|   ' * (depth + 1)

    for node, branches in tree.items():
        print(f"{node_indent}{node}")

        # A node whose value is not a dict of branches is printed one level deeper
        if not isinstance(branches, dict):
            print_tree(branches, depth + 1)
            continue

        for branch_value, child in branches.items():
            # Keep the cursor on this line so the child's first line follows the arrow
            print(f"{branch_indent}{branch_value} ->", end=" ")
            print_tree(child, depth + 2)


# Print the resulting decision tree in a readable format
# Header line first, then the recursive dump of the tree built above
# (starting at the default depth of 0).
print("Decision Tree:")
print_tree(decision_tree)



Decision Tree:
Outlook
|   overcast -> yes
|   rain -> |   |   Wind
|   |   |   strong -> no
|   |   |   weak -> yes
|   sunny -> |   |   Humidity
|   |   |   high -> no
|   |   |   normal -> yes
