# In [2]:
import pandas as pd
import numpy as np
import random

# Define the dataset
# Column names, in the order the values appear in each record below.
_COLUMNS = ('Weather', 'Temperature', 'Humidity', 'Windy', 'Play Tennis')

# One tuple per observation (row-wise view of the 14-row toy dataset).
_RECORDS = [
    ('Sunny', 'Hot', 'High', False, 'No'),
    ('Sunny', 'Hot', 'High', True, 'No'),
    ('Overcast', 'Hot', 'High', False, 'Yes'),
    ('Rainy', 'Mild', 'High', False, 'Yes'),
    ('Rainy', 'Cool', 'Normal', False, 'Yes'),
    ('Rainy', 'Cool', 'Normal', True, 'No'),
    ('Overcast', 'Cool', 'Normal', True, 'Yes'),
    ('Sunny', 'Mild', 'High', False, 'No'),
    ('Sunny', 'Cool', 'Normal', False, 'Yes'),
    ('Rainy', 'Mild', 'Normal', False, 'Yes'),
    ('Sunny', 'Mild', 'Normal', True, 'Yes'),
    ('Overcast', 'Mild', 'High', True, 'Yes'),
    ('Overcast', 'Hot', 'Normal', False, 'Yes'),
    ('Rainy', 'Mild', 'High', True, 'No'),
]

# Pivot the records into a column-name -> list-of-values mapping.
data = {
    column: [record[position] for record in _RECORDS]
    for position, column in enumerate(_COLUMNS)
}

df = pd.DataFrame(data)

# Function to calculate entropy
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in *target_col*.

    Args:
        target_col: Array-like of class labels (anything ``np.unique`` accepts).

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. A column with a single distinct value (or no values)
        yields zero.
    """
    _, label_counts = np.unique(target_col, return_counts=True)
    total = np.sum(label_counts)

    terms = []
    for count in label_counts:
        probability = count / total
        terms.append(probability * np.log2(probability))
    return -np.sum(terms)

# Function to calculate information gain
def information_gain(data, split_attribute_name, target_name):
    """Return the information gain of splitting *data* on one attribute.

    The gain is the entropy of the target column minus the weighted average
    entropy of the target within each partition induced by the distinct
    values of the split attribute.

    Args:
        data: DataFrame containing both the split and the target columns.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the target (class label) column.

    Returns:
        The information gain as a float.
    """
    total_entropy = entropy(data[target_name])

    split_column = data[split_attribute_name]
    vals, counts = np.unique(split_column, return_counts=True)
    total = np.sum(counts)

    # Select each partition's target values with a boolean mask. The previous
    # `data.where(mask).dropna()` discarded every row containing a NaN in
    # *any* column, so missing values in unrelated columns silently shrank
    # the partitions while the weights still counted those rows. Only rows
    # whose target itself is missing are dropped here.
    weighted_entropy = np.sum([
        (count / total)
        * entropy(data.loc[split_column == val, target_name].dropna())
        for val, count in zip(vals, counts)
    ])
    return total_entropy - weighted_entropy

# Function for ID3 Algorithm (recursive decision tree construction)
def _majority_class(target_values):
    """Return the most frequent value in *target_values* (first one on ties)."""
    classes, counts = np.unique(target_values, return_counts=True)
    return classes[np.argmax(counts)]


def id3_algorithm(data, original_data, features, target_attribute_name, parent_node_class):
    """Recursively build an ID3 decision tree.

    Args:
        data: DataFrame holding the rows of the current partition.
        original_data: DataFrame used as the fallback when *data* is empty.
        features: Sequence of column names still available for splitting.
        target_attribute_name: Name of the target (class label) column.
        parent_node_class: Majority class of the parent partition; returned
            when no features remain.

    Returns:
        Either a class label (leaf) or a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # The empty check must come first: np.unique of an empty column is empty,
    # so testing "at most one class" first would index [0] on nothing and
    # raise IndexError instead of falling back to the overall majority class.
    if len(data) == 0:
        return _majority_class(original_data[target_attribute_name])

    target_classes = np.unique(data[target_attribute_name])
    if len(target_classes) <= 1:
        # Pure partition: every row carries the same label.
        return target_classes[0]

    if len(features) == 0:
        return parent_node_class

    # Majority class of this partition, handed down to the children.
    parent_node_class = _majority_class(data[target_attribute_name])

    # Split on the feature with the highest information gain.
    gains = [information_gain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]

    # The chosen feature is not reconsidered further down this branch.
    remaining_features = [feature for feature in features if feature != best_feature]

    branches = {}
    for value in np.unique(data[best_feature]):
        # Boolean-mask selection keeps rows that have NaN in other columns;
        # `data.where(mask).dropna()` would have dropped them.
        sub_data = data[data[best_feature] == value]
        branches[value] = id3_algorithm(
            sub_data, original_data, remaining_features, target_attribute_name, parent_node_class
        )
    return {best_feature: branches}

# Function to make predictions
def predict(query, tree, default='Yes'):
    """Classify *query* by walking the decision *tree*.

    Args:
        query: Mapping-like object (e.g. a dict or a DataFrame row) exposing
            ``keys()`` and item access from feature name to feature value.
        tree: Nested dict of the form ``{feature: {value: subtree_or_label}}``.
        default: Label returned when the query cannot be routed through the
            tree (no query key matches the node, or the value has no branch).

    Returns:
        The label stored at the leaf that is reached, or *default*.
    """
    for key in query.keys():
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except (KeyError, TypeError):
            # KeyError: no branch for this value; TypeError: unhashable value.
            # A bare `except:` here would also swallow KeyboardInterrupt and
            # SystemExit.
            return default
        if isinstance(result, dict):
            # Internal node: keep descending with the same query.
            return predict(query, result, default)
        return result
    return default

# Function to split the data into train and test sets
def train_test_split(df, test_size, random_state=None):
    """Randomly split *df* into a training and a test DataFrame.

    Args:
        df: DataFrame to split.
        test_size: Number of test rows, or, when a float, the fraction of
            rows to hold out (rounded to the nearest row count).
        random_state: Optional seed for a reproducible split. When ``None``
            the module-level ``random`` generator is used.

    Returns:
        ``(train_df, test_df)``. The training rows keep their original
        order; the test rows appear in the order they were sampled.

    Raises:
        ValueError: If ``test_size`` is negative or larger than ``len(df)``
            (raised by ``random.sample``).
    """
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    rng = random if random_state is None else random.Random(random_state)

    # Sample row positions rather than index labels: with duplicate labels,
    # label-based `.loc` / `.drop` would select or remove more rows than
    # requested.
    positions = range(len(df))
    test_positions = rng.sample(positions, k=test_size)
    held_out = set(test_positions)
    train_positions = [position for position in positions if position not in held_out]

    # Copies, so callers can add columns to either part freely.
    test_df = df.iloc[test_positions].copy()
    train_df = df.iloc[train_positions].copy()
    return train_df, test_df

# Function to train the model using ID3 algorithm
def fit(df, target_attribute_name, features):
    """Build and return an ID3 decision tree from *df*.

    Args:
        df: Training DataFrame; it also serves as the fallback data set
            passed to ``id3_algorithm`` as ``original_data``.
        target_attribute_name: Name of the target (class label) column.
        features: Column names that may be used for splitting.

    Returns:
        Whatever ``id3_algorithm`` returns for the full training set, with
        no parent class (``None``) at the root.
    """
    decision_tree = id3_algorithm(
        data=df,
        original_data=df,
        features=features,
        target_attribute_name=target_attribute_name,
        parent_node_class=None,
    )
    return decision_tree

# Function to calculate the accuracy of the model
def get_accuracy(df, tree, target_attribute_name):
    """Return the fraction of rows in *df* that *tree* classifies correctly.

    Each row is classified with ``predict(row, tree)`` and compared with the
    value in the target column. *df* is not modified: earlier versions wrote
    ``classification`` and ``classification_correct`` columns into the
    caller's frame as a side effect.

    Args:
        df: DataFrame with the feature columns and the target column.
        tree: Decision tree as accepted by ``predict``.
        target_attribute_name: Name of the target (class label) column.

    Returns:
        The mean of the per-row "prediction equals target" flags, or NaN
        when *df* has no rows.
    """
    if len(df) == 0:
        # No rows to score; the mean of nothing is undefined.
        return float("nan")

    predictions = df.apply(lambda row: predict(row, tree), axis=1)
    is_correct = predictions == df[target_attribute_name]
    return is_correct.mean()

# Column roles for this experiment.
TARGET_COLUMN = 'Play Tennis'
FEATURE_COLUMNS = ['Weather', 'Temperature', 'Humidity', 'Windy']

# Hold out 20% of the rows for evaluation; the rest is used for training.
train_data, test_data = train_test_split(df, test_size=0.2)

# Learn the tree from the training rows only.
tree = fit(train_data, TARGET_COLUMN, FEATURE_COLUMNS)

# Score the learned tree on the held-out rows.
accuracy = get_accuracy(test_data, tree, TARGET_COLUMN)

# Report the learned tree and its test accuracy.
print("Decision Tree:", tree, sep="\n")
print("Accuracy:", accuracy)


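# Example output from one run (the train/test split is random, so the tree
# and the accuracy vary between runs):
#
# Decision Tree:
# {'Weather': {'Overcast': 'Yes', 'Rainy': {'Windy': {False: 'Yes', True: 'No'}}, 'Sunny': {'Temperature': {'Hot': 'No', 'Mild': 'Yes'}}}}
# Accuracy: 0.6666666666666666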
