<a href="https://colab.research.google.com/github/Nachiketha237/BMSCE_LAB/blob/main/6th_sem/ML_Lab/week2/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np



In [13]:
class Node:
    """Plain record holding the attributes of a single tree node.

    Every field defaults to ``None``. The constructor only stores what it is
    given; no validation is performed.

    NOTE(review): presumably ``feature``/``threshold``/``gain`` describe a
    split, ``left``/``right`` are child nodes and ``value`` is a leaf
    prediction -- nothing else in this file constructs a ``Node`` to confirm.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, gain=None, value=None):
        # Split description.
        self.feature, self.threshold = feature, threshold
        # Children.
        self.left, self.right = left, right
        # Split quality and stored value.
        self.gain, self.value = gain, value

In [14]:


class DecisionTree:
    """Entropy-based decision-tree classifier for numeric features.

    After ``fit`` the tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``. Anything
    that is not a tuple is a leaf and holds the predicted label.

    ``fit`` stacks ``X`` and ``y`` into one array whose last column is the
    label, so labels take on the stacked array's common dtype (e.g. integer
    labels next to float features are returned as floats).
    """

    def __init__(self, min_samples=2, max_depth=2):
        """Store the stopping criteria.

        Args:
            min_samples: a node with fewer rows than this becomes a leaf.
            max_depth: nodes at this depth (root is depth 0) become leaves.
        """
        self.min_samples = min_samples
        self.max_depth = max_depth

    def split_data(self, dataset, feature, threshold):
        """Partition rows on ``dataset[:, feature] <= threshold``.

        Returns:
            ``(left, right)`` where ``left`` holds the rows whose feature
            value is ``<= threshold`` and ``right`` the rows ``> threshold``.
        """
        column = dataset[:, feature]
        left_dataset = dataset[column <= threshold]
        right_dataset = dataset[column > threshold]
        return left_dataset, right_dataset

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # np.unique only reports labels that occur, so no probability is 0
        # and log2 is always finite.
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, parent, left, right, parent_entropy=None):
        """Return the entropy reduction achieved by splitting ``parent``.

        Args:
            parent: rows before the split (label in the last column).
            left: rows sent to the left child.
            right: rows sent to the right child.
            parent_entropy: optional precomputed entropy of ``parent``'s
                labels; computed here when omitted. Lets callers that score
                many splits of the same parent avoid recomputing it.
        """
        if parent_entropy is None:
            parent_entropy = self.entropy(parent[:, -1])
        weight_left = len(left) / len(parent)
        weight_right = len(right) / len(parent)
        weighted_entropy = (
            weight_left * self.entropy(left[:, -1])
            + weight_right * self.entropy(right[:, -1])
        )
        return parent_entropy - weighted_entropy

    def best_split(self, dataset):
        """Search every feature/threshold pair for the highest information gain.

        Candidate thresholds are the unique values of each feature column.
        Splits that leave one side empty are ignored.

        Returns:
            ``(feature_index, threshold)`` of the best split, or
            ``(None, None)`` when no split has strictly positive gain.
        """
        best_gain = 0
        best_feature = None
        best_threshold = None
        n_features = dataset.shape[1] - 1  # last column is the label

        # The parent entropy is the same for every candidate split, so it is
        # computed once instead of once per threshold.
        parent_entropy = self.entropy(dataset[:, -1])
        if parent_entropy == 0:
            # A pure node cannot be improved: every split would have gain 0.
            return best_feature, best_threshold

        for feature_index in range(n_features):
            for threshold in np.unique(dataset[:, feature_index]):
                left_dataset, right_dataset = self.split_data(dataset, feature_index, threshold)
                if len(left_dataset) == 0 or len(right_dataset) == 0:
                    continue
                gain = self.information_gain(
                    dataset, left_dataset, right_dataset, parent_entropy
                )
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_index
                    best_threshold = threshold

        return best_feature, best_threshold

    def _majority_label(self, dataset):
        """Return the most frequent label (last column) in ``dataset``."""
        values, counts = np.unique(dataset[:, -1], return_counts=True)
        return values[np.argmax(counts)]

    def build_tree(self, dataset, depth=0):
        """Recursively build the nested-tuple tree for ``dataset``.

        A leaf (the majority label) is returned when ``depth`` reaches
        ``max_depth``, when the node has fewer than ``min_samples`` rows, or
        when no split with positive information gain exists.
        """
        if depth >= self.max_depth or len(dataset) < self.min_samples:
            return self._majority_label(dataset)

        best_feature, best_threshold = self.best_split(dataset)
        if best_feature is None:
            return self._majority_label(dataset)

        left_dataset, right_dataset = self.split_data(dataset, best_feature, best_threshold)
        left_subtree = self.build_tree(left_dataset, depth + 1)
        right_subtree = self.build_tree(right_dataset, depth + 1)

        return (best_feature, best_threshold, left_subtree, right_subtree)

    def fit(self, X, y):
        """Build the tree from features ``X`` and labels ``y``.

        Raises:
            ValueError: if there are no training rows (or if ``X`` and ``y``
                cannot be stacked column-wise).
        """
        dataset = np.column_stack((X, y))
        if len(dataset) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.tree = self.build_tree(dataset)

    def predict(self, X):
        """Predict a label for each sample in ``X``.

        Args:
            X: 2-D array-like of samples, or a single 1-D sample (treated as
                one row).

        Returns:
            A list with one predicted label per sample.

        Raises:
            AttributeError: if called before ``fit``.
        """
        if not hasattr(self, "tree"):
            raise AttributeError("DecisionTree.predict called before fit")

        samples = np.asarray(X)
        if samples.ndim == 1 and samples.size:
            # A lone feature vector: promote it to a one-row batch.
            samples = samples[np.newaxis, :]

        predictions = []
        for x in samples:
            node = self.tree
            # Walk down until a non-tuple (leaf label) is reached.
            while isinstance(node, tuple):
                feature, threshold, left_subtree, right_subtree = node
                node = left_subtree if x[feature] <= threshold else right_subtree
            predictions.append(node)
        return predictions


In [15]:
def train_test_split(X, y, random_state=41, test_size=0.2):
    """Shuffle the samples and split them into train and test subsets.

    Note: this definition rebinds the name ``train_test_split`` imported from
    ``sklearn.model_selection`` earlier in the file.

    Args:
        X: array-like of samples; the first axis indexes samples.
        y: array-like of labels with the same number of entries as ``X``.
        random_state: seed for the shuffle. A private ``RandomState`` is used
            so the process-wide NumPy RNG is left untouched; the permutation
            is the same one ``np.random.seed(random_state)`` followed by
            ``np.random.permutation`` would give.
        test_size: fraction of samples placed in the test subset; the count
            is ``int(n_samples * test_size)`` (rounded toward zero).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises:
        ValueError: if ``X`` and ``y`` have different numbers of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
        )

    rng = np.random.RandomState(random_state)
    shuffled_indices = rng.permutation(np.arange(n_samples))

    n_test = int(n_samples * test_size)

    # The first n_test shuffled indices form the test set, the rest train.
    test_indices = shuffled_indices[:n_test]
    train_indices = shuffled_indices[n_test:]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

In [16]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to arrays and flattened, so lists, 1-D arrays
    and column vectors are all compared element by element.

    Args:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels, same number of entries.

    Returns:
        ``correct / total`` as a float in ``[0, 1]``.

    Raises:
        ValueError: if the two inputs do not contain the same number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred have different lengths: {len(y_true)} != {len(y_pred)}"
        )
    total_samples = len(y_true)
    correct_predictions = np.sum(y_true == y_pred)
    return correct_predictions / total_samples

In [17]:
# Load the Iris data set and split it with the train_test_split defined
# above (seed 41, test fraction 0.2).
iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=41, test_size=0.2
)

In [18]:
# Fit the from-scratch tree (min_samples=2, max_depth=2) on the training split.
model = DecisionTree(min_samples=2, max_depth=2)
model.fit(X_train, y_train)

# Predict the held-out samples and report the fraction classified correctly.
predictions = model.predict(X_test)

print(f"Model's Accuracy: {accuracy(y_test, predictions)}")

Model's Accuracy: 0.8333333333333334


In [25]:
# Classify one hand-written sample. predict iterates over its argument row by
# row, so the single feature vector is wrapped in a list to form one row.
X_test1 = [6.1, 2.8, 4.2, 3.1]
prediction = model.predict([X_test1])