In [1]:
import pandas as pd
import numpy as np
from math import log2

# Load the dataset
data = pd.read_csv('ContactLens.csv')
data.head()

Unnamed: 0,Age,SpectaclePrescrip,Astigmatism,TearProdRate,ContactLens
0,young,myope,no,reduced,none
1,young,myope,no,normal,soft
2,young,myope,yes,reduced,none
3,young,myope,yes,normal,hard
4,young,hypermetrope,no,reduced,none


In [4]:
# Define a function to calculate the entropy of a dataset
def entropy(data):
    target = 'ContactLens'
    values = data[target].unique()
    entropy = 0
    for value in values:
        p = len(data[data[target] == value]) / len(data)
        entropy -= p * log2(p)
    return entropy

# Define a function to calculate the information gain of a split
def information_gain(data, feature):
    target = 'ContactLens'
    target_entropy = entropy(data)
    values = data[feature].unique()
    feature_entropy = 0
    for value in values:
        p = len(data[data[feature] == value]) / len(data)
        feature_data = data[data[feature] == value]
        feature_entropy += p * entropy(feature_data)
    return target_entropy - feature_entropy

# Define the ID3 algorithm
def id3(data, features, target):
    # If all target values are the same, return that value
    if len(data[target].unique()) == 1:
        return data[target].iloc[0]
    # If there are no more features to split on, return the most common target value
    if len(features) == 0:
        return data[target].value_counts().idxmax()
    # Otherwise, choose the best feature to split on
    gains = [information_gain(data, feature) for feature in features]
    best_feature_index = gains.index(max(gains))
    best_feature = features[best_feature_index]
    # Create a new decision tree node with the chosen feature
    tree = {best_feature: {}}
    features = [feature for feature in features if feature != best_feature]
    # Recursively grow the tree using the subsets of the data
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        subtree = id3(subset, features, target)
        tree[best_feature][value] = subtree
    return tree

# Run the ID3 algorithm on the dataset
features = list(data.columns[:-1])
target = 'ContactLens'
tree = id3(data, features, target)

# Print the decision tree
print(tree)

{'TearProdRate': {'reduced': 'none', 'normal': {'Astigmatism': {'no': {'Age': {'young': 'soft', 'pre-presbyopic': 'soft', 'presbyopic': {'SpectaclePrescrip': {'myope': 'none', 'hypermetrope': 'soft'}}}}, 'yes': {'SpectaclePrescrip': {'myope': 'hard', 'hypermetrope': {'Age': {'young': 'hard', 'pre-presbyopic': 'none', 'presbyopic': 'none'}}}}}}}}


In [5]:
def print_tree(tree, indent=''):
    if isinstance(tree, dict):
        print(f'{indent}{list(tree.keys())[0]}')
        for key, value in tree[list(tree.keys())[0]].items():
            print(f'{indent}  {key}')
            print_tree(value, indent + '    ')
    else:
        print(f'{indent}==> {tree}')
        
print_tree(tree)

TearProdRate
  reduced
    ==> none
  normal
    Astigmatism
      no
        Age
          young
            ==> soft
          pre-presbyopic
            ==> soft
          presbyopic
            SpectaclePrescrip
              myope
                ==> none
              hypermetrope
                ==> soft
      yes
        SpectaclePrescrip
          myope
            ==> hard
          hypermetrope
            Age
              young
                ==> hard
              pre-presbyopic
                ==> none
              presbyopic
                ==> none


In [6]:
def shannon_entropy(data):
    # Кількість елементів у вибірці
    n = len(data)
    # Підрахунок кількості унікальних значень у вибірці
    unique, counts = np.unique(data, return_counts=True)
    # Розрахунок частоти кожного унікального значення у вибірці
    freqs = counts / n
    # Розрахунок ентропії методом Шеннона
    entropy = -np.sum(freqs * np.log2(freqs))
    return entropy

entropy = shannon_entropy(data)
print(f"Ентропія даних: {entropy}")

Ентропія даних: 5.911050026085454


In [7]:
# обчислення ентропії для всього датасету
def entropy(data):
    # знаходимо всі унікальні значення класів
    classes = set([row[-1] for row in data])
    n_instances = float(len(data))
    entropy = 0.0
    for cls in classes:
        # знаходимо кількість елементів у класі
        count = [row[-1] for row in data].count(cls)
        # обчислюємо відносну частоту класу
        prob = count / n_instances
        # обчислюємо ентропію для класу
        entropy += (-prob) * log2(prob)
    return entropy

print(entropy(data))

0.8718671876502407
