<a href="https://colab.research.google.com/github/Is7ac/ML/blob/main/lab4_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import math
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_text

In [2]:
# Data: one tuple per instance -> (first attribute, second attribute, numerical attribute).
# classes[i] is the label belonging to Ins[i].
Ins = [
    ("T", "T", 5.0),
    ("T", "T", 7.0),
    ("T", "F", 8.0),
    ("F", "F", 3.0),
    ("F", "T", 7.0),
    ("F", "T", 4.0),
    ("F", "F", 5.0),
    ("T", "F", 6.0),
    ("F", "T", 1.0),
]
classes = [
    "Y", "Y", "N",
    "Y", "N", "N",
    "N", "Y", "N",
]

In [3]:
def entropy(class_list):
    """Return the Shannon entropy, in bits, of the labels in ``class_list``.

    Args:
        class_list: Sequence of hashable class labels.

    Returns:
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of each
        distinct label. An empty sequence yields ``0``.
    """
    n_labels = len(class_list)

    # Tally occurrences per label; dict order follows first appearance.
    frequencies = {}
    for label in class_list:
        frequencies[label] = frequencies.get(label, 0) + 1

    result = 0
    for occurrences in frequencies.values():
        proportion = occurrences / n_labels
        result -= proportion * math.log2(proportion)
    return result

In [4]:
def gini_index(class_list):
    """Return the Gini impurity of the labels in ``class_list``.

    Args:
        class_list: Sequence of hashable class labels.

    Returns:
        ``1 - sum(p ** 2)`` taken over the relative frequency ``p`` of each
        distinct label. An empty sequence yields ``0.0``: with no samples
        there is nothing to misclassify, and this matches ``entropy``, which
        also reports zero impurity for empty input.
    """
    total = len(class_list)
    if total == 0:
        # Without this guard the loop below never runs and the function
        # would report the starting value 1 for an empty set.
        return 0.0

    class_counts = {}
    for label in class_list:
        class_counts[label] = class_counts.get(label, 0) + 1

    gini_value = 1
    for count in class_counts.values():
        p = count / total
        gini_value -= p ** 2
    return gini_value

In [5]:
def info_gain(data, classes, split_function):
    """Return the information gain obtained by partitioning ``data``.

    Args:
        data: Instances handed unchanged to ``split_function``.
        classes: Class labels, indexable by the positions that
            ``split_function`` returns.
        split_function: Callable ``(data, classes) -> iterable of index
            lists``; each index list describes one partition.

    Returns:
        ``entropy(classes)`` minus the size-weighted entropy of the
        partitions.
    """
    parent_entropy = entropy(classes)
    partitions = split_function(data, classes)
    n_total = len(classes)

    weighted_child_entropy = 0
    for indices in partitions:
        child_labels = [classes[i] for i in indices]
        weight = len(indices) / n_total
        weighted_child_entropy += weight * entropy(child_labels)

    return parent_entropy - weighted_child_entropy


In [6]:
def gini_gain(data, classes, split_function):
    """Return the reduction in Gini impurity obtained by partitioning ``data``.

    Args:
        data: Instances handed unchanged to ``split_function``.
        classes: Class labels, indexable by the positions that
            ``split_function`` returns.
        split_function: Callable ``(data, classes) -> iterable of index
            lists``; each index list describes one partition.

    Returns:
        ``gini_index(classes)`` minus the size-weighted Gini impurity of the
        partitions.
    """
    parent_gini = gini_index(classes)
    partitions = split_function(data, classes)
    n_total = len(classes)

    weighted_child_gini = 0
    for indices in partitions:
        child_labels = [classes[i] for i in indices]
        weight = len(indices) / n_total
        weighted_child_gini += weight * gini_index(child_labels)

    return parent_gini - weighted_child_gini

In [7]:
def split_by_first_attribute(data, classes):
    """Partition instance positions by the value of the first attribute.

    Args:
        data: Iterable of indexable instances; element 0 is compared
            against ``"T"`` and ``"F"``.
        classes: Not used by this function; present so the signature matches
            the ``split_function(data, classes)`` call convention.

    Returns:
        ``[positions where element 0 == "T", positions where element 0 == "F"]``.
        Instances with any other value appear in neither list.
    """
    def positions_where(expected):
        # One full scan of `data` per expected value.
        return [idx for idx, row in enumerate(data) if row[0] == expected]

    return [positions_where("T"), positions_where("F")]


In [8]:
def split_by_second_attribute(data, classes):
    """Partition instance positions by the value of the second attribute.

    Args:
        data: Iterable of indexable instances; element 1 is compared
            against ``"T"`` and ``"F"``.
        classes: Not used by this function; present so the signature matches
            the ``split_function(data, classes)`` call convention.

    Returns:
        ``[positions where element 1 == "T", positions where element 1 == "F"]``.
        Instances with any other value appear in neither list.
    """
    # Outer loop fixes the wanted flag ("T" first, then "F"); the inner
    # comprehension scans `data` once for that flag.
    return [
        [idx for idx, row in enumerate(data) if row[1] == flag]
        for flag in ("T", "F")
    ]

In [9]:
def split_by_numerical_attribute(data, classes, threshold):
    """Partition instance positions around ``threshold`` on the third attribute.

    Args:
        data: Iterable of indexable instances; element 2 is compared with
            ``threshold``.
        classes: Not used by this function; present so the signature lines
            up with the other split helpers.
        threshold: Cut point for the comparison.

    Returns:
        ``[positions where element 2 <= threshold,
        positions where element 2 > threshold]``.
    """
    def select(predicate):
        # One full scan of `data` per predicate.
        return [idx for idx, row in enumerate(data) if predicate(row[2])]

    at_or_below = select(lambda value: value <= threshold)
    above = select(lambda value: value > threshold)
    return [at_or_below, above]

In [10]:
# Information Gain and Gini Gain for the first and second (T/F) attributes.
# Each generator yields the gain for the first-attribute split, then the
# second-attribute split, and is unpacked into the matching pair of names.
ig_first, ig_second = (
    info_gain(Ins, classes, splitter)
    for splitter in (split_by_first_attribute, split_by_second_attribute)
)

gg_first, gg_second = (
    gini_gain(Ins, classes, splitter)
    for splitter in (split_by_first_attribute, split_by_second_attribute)
)

In [11]:
# Information Gain and Gini Gain for the numerical attribute.
# Every distinct observed value is tried as a "<= threshold" cut point and the
# largest gain over all candidates is kept for each criterion.
thresholds = sorted({x[2] for x in Ins})

ig_numerical = max(
    info_gain(
        Ins,
        classes,
        # `cut=threshold` pins the current candidate to this lambda.
        lambda data, cls, cut=threshold: split_by_numerical_attribute(data, cls, cut),
    )
    for threshold in thresholds
)
gg_numerical = max(
    gini_gain(
        Ins,
        classes,
        lambda data, cls, cut=threshold: split_by_numerical_attribute(data, cls, cut),
    )
    for threshold in thresholds
)

# Report: information gain per attribute first, then Gini gain per attribute.
print(f"Information Gain for first attribute: {ig_first}")
print(f"Information Gain for second attribute: {ig_second}")
print(f"Information Gain for numerical attribute: {ig_numerical}")

print(f"Gini Gain for first attribute: {gg_first}")
print(f"Gini Gain for second attribute: {gg_second}")
print(f"Gini Gain for numerical attribute: {gg_numerical}")

Information Gain for first attribute: 0.22943684069673975
Information Gain for second attribute: 0.007214618474517431
Information Gain for numerical attribute: 0.10218717094933338
Gini Gain for first attribute: 0.1493827160493828
Gini Gain for second attribute: 0.00493827160493826
Gini Gain for numerical attribute: 0.04938271604938271


In [13]:
pip install scikit-learn



In [14]:

# Data: one tuple per instance -> (first attribute, second attribute, numerical attribute).
# classes[i] is the label belonging to Ins[i].
Ins = [
    ("T", "T", 5.0),
    ("T", "T", 7.0),
    ("T", "F", 8.0),
    ("F", "F", 3.0),
    ("F", "T", 7.0),
    ("F", "T", 4.0),
    ("F", "F", 5.0),
    ("T", "F", 6.0),
    ("F", "T", 1.0),
]
classes = [
    "Y", "Y", "N",
    "Y", "N", "N",
    "N", "Y", "N",
]



In [18]:

# Convert categorical data to numerical data:
# "T" -> 1 and anything else -> 0 for the first two attributes; the numerical
# third attribute is carried over unchanged.
data = [
    [int(row[0] == "T"), int(row[1] == "T"), row[2]]
    for row in Ins
]

# Convert class labels to binary: "Y" -> 1, anything else -> 0.
class_labels = [int(label == "Y") for label in classes]



In [19]:
# Fit the decision tree classifier.
# random_state is fixed because scikit-learn randomly permutes the features at
# each split, so when several splits improve the criterion equally the chosen
# one (and therefore the printed tree and importances) can differ between runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(data, class_labels)

# Print the decision tree as indented text rules, labelling the three columns of `data`
tree_rules = export_text(clf, feature_names=["First Attribute", "Second Attribute", "Numerical Attribute"])
print(tree_rules)

# Print feature importances (one value per column of `data`, in column order)
feature_importances = clf.feature_importances_
print("Feature importances:", feature_importances)

|--- First Attribute <= 0.50
|   |--- Second Attribute <= 0.50
|   |   |--- Numerical Attribute <= 4.00
|   |   |   |--- class: 1
|   |   |--- Numerical Attribute >  4.00
|   |   |   |--- class: 0
|   |--- Second Attribute >  0.50
|   |   |--- class: 0
|--- First Attribute >  0.50
|   |--- Numerical Attribute <= 7.50
|   |   |--- class: 1
|   |--- Numerical Attribute >  7.50
|   |   |--- class: 0

Feature importances: [0.3025 0.135  0.5625]
