<a href="https://colab.research.google.com/github/Mansa8296/ML-Lab/blob/main/L2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from collections import Counter

class DecisionTree:
    """ID3-style decision tree classifier for categorical features.

    The fitted tree is stored in ``self.tree`` as nested dictionaries of the
    form ``{feature: {value: subtree_or_label}}``; a leaf is a bare class
    label taken from the training labels.
    """

    def __init__(self):
        # ``None`` marks a model that has not been fitted yet.
        self.tree = None

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``.

        Args:
            y: Sized iterable of hashable class labels.

        Returns:
            The entropy in bits; ``0`` when ``y`` is empty.
        """
        total = len(y)
        entropy = 0
        for count in Counter(y).values():
            prob = count / total
            entropy -= prob * np.log2(prob)
        return entropy

    def information_gain(self, X, y, feature_name):
        """Return the information gain of splitting on ``feature_name``.

        Args:
            X: Table indexable by feature name (e.g. a ``pandas.DataFrame``)
                whose columns support element-wise ``==``.
            y: Labels aligned positionally with the rows of ``X``.
            feature_name: Column of ``X`` to evaluate.

        Returns:
            Parent entropy minus the size-weighted entropy of the subsets
            produced by each distinct value of the feature.
        """
        # Work positionally so lists, arrays and pandas Series all behave alike.
        labels = np.asarray(y)
        column = X[feature_name]
        total = len(labels)
        entropy_children = 0
        # dict.fromkeys keeps first-appearance order, so the floating-point
        # summation order (and therefore tie-breaking) is reproducible.
        for value in dict.fromkeys(column):
            subset = labels[np.asarray(column == value)]
            entropy_children += (len(subset) / total) * self.entropy(subset)
        return self.entropy(labels) - entropy_children

    def find_best_split(self, X, y, features):
        """Return the feature with the highest information gain.

        Ties go to the feature listed first in ``features``. Returns ``None``
        when ``features`` is empty.
        """
        best_feature = None
        max_info_gain = -np.inf
        for feature in features:
            info_gain = self.information_gain(X, y, feature)
            # Strict comparison: the earliest feature wins a tie.
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                best_feature = feature
        return best_feature

    def fit(self, X, y, features):
        """Build the tree from training data and store it in ``self.tree``.

        Args:
            X: Training table indexable by feature name.
            y: Class labels (list, NumPy array or pandas Series), one per row
                of ``X`` in the same order.
            features: Names of the columns of ``X`` that may be split on.

        Raises:
            ValueError: If ``y`` is empty.
        """
        labels = np.asarray(y)
        if len(labels) == 0:
            raise ValueError("cannot fit a decision tree on an empty training set")
        self.tree = self.build_tree(X, labels, features)

    def build_tree(self, X, y, features):
        """Recursively build and return a (sub)tree for the given rows.

        Args:
            X: Rows to split, indexable by feature name and by boolean mask.
            y: Non-empty labels aligned positionally with the rows of ``X``.
            features: Features still available for splitting.

        Returns:
            A class label when the node is a leaf, otherwise a dictionary
            ``{best_feature: {value: subtree_or_label}}``.
        """
        # Positional array: ``y[0]`` on a pandas Series is a *label* lookup and
        # fails once the rows have been subset and the index no longer holds 0.
        labels = np.asarray(y)

        if len(set(labels)) == 1:  # Pure node: every sample has the same class
            return labels[0]
        # len() rather than truthiness so array-like feature lists also work.
        if len(features) == 0:  # Nothing left to split on: majority class
            return Counter(labels).most_common(1)[0][0]

        best_feature = self.find_best_split(X, labels, features)
        remaining_features = [feature for feature in features if feature != best_feature]
        column = X[best_feature]
        branches = {}
        for value in dict.fromkeys(column):  # distinct values, stable order
            mask = np.asarray(column == value)
            branches[value] = self.build_tree(X[mask], labels[mask], remaining_features)
        return {best_feature: branches}

    def predict_sample(self, sample):
        """Return the predicted class label for a single sample.

        Args:
            sample: Mapping from feature name to that sample's value.

        Returns:
            The label stored at the leaf reached by following the sample's
            feature values down the tree.

        Raises:
            AttributeError: If no tree has been fitted or assigned yet.
            KeyError: If ``sample`` lacks a feature the tree tests, or holds a
                value for it that was not present in the training data.
        """
        if getattr(self, "tree", None) is None:
            raise AttributeError("DecisionTree is not fitted; call fit() first")

        current_node = self.tree
        while isinstance(current_node, dict):
            # Every internal node is a single-entry dict: {feature: branches}.
            feature, branches = next(iter(current_node.items()))
            value = sample[feature]
            if value not in branches:
                raise KeyError(
                    f"value {value!r} for feature {feature!r} was not seen during training"
                )
            current_node = branches[value]
        return current_node

# Example usage: fit the tree on six animals described by yes/no traits.
features = ['Has Fur', 'Has Feathers', 'Lays Eggs', 'Can Fly']

# One row per animal; the answers in each row follow the order of `features`.
X_train = pd.DataFrame(
    [
        ['Yes', 'No', 'No', 'No'],
        ['Yes', 'No', 'No', 'No'],
        ['No', 'Yes', 'No', 'Yes'],
        ['No', 'No', 'Yes', 'No'],
        ['No', 'Yes', 'Yes', 'Yes'],
        ['No', 'No', 'Yes', 'No'],
    ],
    columns=features,
)

# Class label for each row of X_train, in the same order.
y_train = np.array(['Mammal', 'Mammal', 'Bird', 'Reptile', 'Bird', 'Reptile'])

tree = DecisionTree()
tree.fit(X_train, y_train, features)

# Classify a new sample
new_sample = dict(zip(features, ['Yes', 'No', 'No', 'No']))
predicted_class = tree.predict_sample(new_sample)
print("Predicted class:", predicted_class)

Predicted class: Mammal
