<a href="https://colab.research.google.com/github/Is7ac/ML/blob/main/lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

In [19]:
# Training data: one row per record, columns are (Age, Car).
# Car is integer-coded as 0: Vintage, 1: Sports, 2: SUV.
X = np.array([
    [25, 1],
    [20, 0],
    [25, 1],
    [45, 2],
    [20, 1],
    [25, 2],
])
# Risk label for each row of X, in the same order (H: High, L: Low).
y = np.array(['L', 'H', 'L', 'H', 'H', 'H'])

In [20]:
# Shannon entropy of a collection of class labels
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    The entropy is ``-sum(p * log2(p))`` over the relative frequency ``p``
    of every distinct label found in ``y``.
    """
    # Only the per-label counts matter; the label values themselves do not.
    _, label_counts = np.unique(y, return_counts=True)
    p = label_counts / len(y)
    return -(p * np.log2(p)).sum()

# Function to calculate information gain
def information_gain(X, y, feature_index, split_value):
    """Return the information gain of a binary threshold split.

    Rows whose value in column ``feature_index`` is ``<= split_value`` go to
    the left partition, rows whose value is ``> split_value`` go to the right
    partition. The gain is the entropy of ``y`` minus the size-weighted
    average entropy of the two partitions.

    Args:
        X: 2-D NumPy array of feature values, one row per sample.
        y: 1-D NumPy array of labels aligned with the rows of ``X``.
        feature_index: Column of ``X`` to split on.
        split_value: Threshold compared against that column.

    Returns:
        The information gain as a float; ``0.0`` when ``y`` is empty.
    """
    total_instances = len(y)
    if total_instances == 0:
        # Nothing to split; also avoids dividing by zero below.
        return 0.0

    # Extract the column once and derive both partitions from it. Only the
    # labels are needed, so the feature rows themselves are never copied.
    column = X[:, feature_index]
    y_left = y[column <= split_value]
    y_right = y[column > split_value]

    # Weight each partition's entropy by its share of the samples.
    weight_left = len(y_left) / total_instances
    weight_right = len(y_right) / total_instances
    weighted_child_entropy = (
        weight_left * entropy(y_left) + weight_right * entropy(y_right)
    )

    return entropy(y) - weighted_child_entropy



In [21]:
# Calculate the information gain for every candidate split: each unique value
# of each feature is tried as a "<= value" threshold.
information_gains = []
for feature_index in range(X.shape[1]):
    unique_values = np.unique(X[:, feature_index])
    for value in unique_values:
        gain = information_gain(X, y, feature_index, value)
        information_gains.append((feature_index, value, gain))

# Select the split with the highest information gain. max() returns the first
# maximal candidate, so ties go to the earliest (feature, value) pair.
best_split = max(information_gains, key=lambda candidate: candidate[2])
best_feature_index, best_split_value, best_gain = best_split


def _majority_class(labels, fallback):
    """Return the most frequent label in ``labels``, or ``fallback`` if empty.

    Ties are resolved in favour of the label that sorts first, because
    ``np.unique`` returns sorted labels and ``np.argmax`` picks the first
    maximum.
    """
    if len(labels) == 0:
        return fallback
    classes, counts = np.unique(labels, return_counts=True)
    # .item() converts the NumPy scalar to a plain Python value.
    return classes[np.argmax(counts)].item()


# Construct the decision tree (a single split with two leaves). Each leaf is
# labelled with the majority class of the training rows that fall into it,
# instead of assuming that the left side is always 'L' and the right 'H'.
left_mask = X[:, best_feature_index] <= best_split_value
parent_class = _majority_class(y, None)
decision_tree = {
    'feature_index': best_feature_index,
    'split_value': best_split_value,
    # An empty partition falls back to the majority class of the whole set.
    'left': {'class': _majority_class(y[left_mask], parent_class)},
    'right': {'class': _majority_class(y[~left_mask], parent_class)},
}

# Classify a point by walking the decision tree down to a leaf
def classify(point, tree):
    """Return the class label that ``tree`` assigns to ``point``.

    Internal nodes are dicts with the keys 'feature_index', 'split_value',
    'left' and 'right'; leaf nodes are dicts with the key 'class'. At each
    internal node the walk goes left when ``point[feature_index]`` is
    ``<= split_value`` and right otherwise.
    """
    node = tree
    while True:
        goes_left = point[node['feature_index']] <= node['split_value']
        node = node['left'] if goes_left else node['right']
        if 'class' in node:
            return node['class']




In [22]:
# Run the fitted tree on the query record: Age=27, Car=Vintage (car code 0)
point_to_classify = np.array([27, 0])
classification = classify(point=point_to_classify, tree=decision_tree)
print("Classification for point (Age=27, Car=Vintage):", classification)

Classification for point (Age=27, Car=Vintage): H


a. True: High entropy means uncertainty or disorder in the dataset. It indicates that the dataset contains a mix of different classes, so the partitions in classification are not pure.
b. False: A low Gini index does not necessarily mean that a node or leaf is pure. The Gini index measures the impurity of a node in a decision tree. While a low Gini index indicates little impurity, it does not guarantee purity: any non-zero value means that more than one class is still present in the node. Only a Gini index of exactly 0 indicates perfect purity.