## Step 1: Load the Dataset

In [1]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset: one DataFrame column per measurement
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)

# Attach the class column, translating the integer codes to species names
# in a single step instead of assigning the codes and re-mapping them.
data['species'] = pd.Series(iris.target, index=data.index).map(
    {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
)

# Display the first few rows of the dataset
print(data.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


## Step 2: Implement the ID3 Algorithm with Proper Encoding
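No separate encoding step is needed here: this ID3 implementation treats every distinct value of a feature as its own category (one branch per unique value), and the `species` column, already mapped to species names, is used directly as the target.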

In [2]:
import numpy as np

def entropy(s):
    """Return the Shannon entropy, in bits, of the values in ``s``.

    The entropy is ``-sum(p * log2(p))`` over the relative frequency ``p``
    of each distinct value in ``s``.
    """
    # Only the per-value counts matter, not the values themselves.
    counts = np.unique(s, return_counts=True)[1]
    probabilities = counts / len(s)

    total = 0
    for p in probabilities:
        if p > 0:
            total = total + p * np.log2(p)
    return -total

def information_gain(data, feature, target):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    The gain is the entropy of the ``target`` column minus the weighted
    average entropy of ``target`` within each group of rows sharing the same
    ``feature`` value, each group weighted by its share of the rows.
    """
    baseline = entropy(data[target])

    values, counts = np.unique(data[feature], return_counts=True)
    n_rows = sum(counts)

    remainder = 0
    for value, count in zip(values, counts):
        subset_labels = data[data[feature] == value][target]
        remainder = remainder + (count / n_rows) * entropy(subset_labels)

    return baseline - remainder

def _majority_class(labels):
    # Most frequent value in `labels`; ties go to the first value in
    # np.unique's sorted order.
    classes, counts = np.unique(labels, return_counts=True)
    return classes[np.argmax(counts)]


def id3(data, original_data, features, target, parent_node_class=None):
    """Build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : DataFrame
        Rows reaching the current node.
    original_data : DataFrame
        The full training set; used as a fallback label source when ``data``
        is empty and no parent majority class is available.
    features : sequence of column names
        Candidate features still available for splitting.
    target : str
        Name of the class column.
    parent_node_class : optional
        Majority class of the parent node; used to label an empty subset.

    Returns
    -------
    A class label for a leaf, or ``{feature: {value: subtree, ...}}`` with one
    branch per distinct value of the chosen feature in ``data``.
    """
    # Empty subset: no evidence here, so inherit the parent's majority class
    # (falling back to the whole training set when called without a parent).
    if len(data) == 0:
        if parent_node_class is not None:
            return parent_node_class
        return _majority_class(original_data[target])

    labels = data[target]
    distinct_labels = np.unique(labels)

    # Pure node: every row has the same class.
    if len(distinct_labels) == 1:
        return distinct_labels[0]

    # Features exhausted but classes still mixed: label the leaf with the
    # majority class of the rows that actually reached this node. (Using the
    # parent's majority here would ignore this node's own data and would
    # return None at the root.)
    if len(features) == 0:
        return _majority_class(labels)

    node_majority = _majority_class(labels)

    # Split on the feature with the highest information gain.
    gains = [information_gain(data, feature, target) for feature in features]
    best_feature = features[int(np.argmax(gains))]
    remaining_features = [f for f in features if f != best_feature]

    branches = {}
    for value in np.unique(data[best_feature]):
        subset = data[data[best_feature] == value]
        branches[value] = id3(
            subset, original_data, remaining_features, target, node_majority
        )
    return {best_feature: branches}

## Step 3: Train the Decision Tree

In [3]:
import pprint

target = 'species'
# Select every column except the target by name, so this cell does not rely
# on the target happening to be the last column of the frame.
features = data.columns.drop(target)

# Build the decision tree
tree = id3(data, data, features, target)

# Print the decision tree
pprint.pprint(tree)

{'petal length (cm)': {1.0: 'setosa',
                       1.1: 'setosa',
                       1.2: 'setosa',
                       1.3: 'setosa',
                       1.4: 'setosa',
                       1.5: 'setosa',
                       1.6: 'setosa',
                       1.7: 'setosa',
                       1.9: 'setosa',
                       3.0: 'versicolor',
                       3.3: 'versicolor',
                       3.5: 'versicolor',
                       3.6: 'versicolor',
                       3.7: 'versicolor',
                       3.8: 'versicolor',
                       3.9: 'versicolor',
                       4.0: 'versicolor',
                       4.1: 'versicolor',
                       4.2: 'versicolor',
                       4.3: 'versicolor',
                       4.4: 'versicolor',
                       4.5: {'sepal length (cm)': {4.9: 'virginica',
                                                   5.4: 'versicolor',
               

## Step 4: Classify a New Sample
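We'll write a function to classify new samples using the constructed decision tree. Because the tree branches on exact feature values, a sample whose value for a split feature never appeared in the training data matches no branch, and the function returns `None` for it.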

In [4]:
def classify(sample, tree, default=None):
    """Classify ``sample`` by walking the nested-dict decision ``tree``.

    Parameters
    ----------
    sample : mapping
        Maps feature names to values; indexed as ``sample[feature]``.
    tree : dict or label
        Either ``{feature: {value: subtree, ...}}`` or a bare class label
        (a tree consisting of a single leaf).
    default : optional
        Returned when the sample's value matches no branch. Defaults to
        ``None``.

    Returns
    -------
    The class label of the leaf reached, or ``default`` if no branch matches.
    """
    # A bare label is a one-leaf tree; return it instead of failing on .keys().
    if not isinstance(tree, dict):
        return tree

    for attribute, branches in tree.items():
        value = sample[attribute]
        if value in branches:
            return classify(sample, branches[value], default)

    # No branch matched the sample's value for any split attribute.
    return default

# Example sample to classify: one value per feature the tree may split on
new_sample = {
    'sepal length (cm)': 5.1,
    'sepal width (cm)': 3.5,
    'petal length (cm)': 1.4,
    'petal width (cm)': 0.2,
}

# Walk the tree with the sample and report the predicted species
prediction = classify(new_sample, tree)
print(f"The new sample is classified as: {prediction}")

The new sample is classified as: setosa
