<a href="https://colab.research.google.com/github/IsaacFigNewton/Alcatraz-Landscape-Construction-Website/blob/master/Modified_ID3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [12]:
import pandas as pd
import math
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Important Functions

Based on code from [serengil's decision-trees-for-ml repository](https://github.com/serengil/decision-trees-for-ml/tree/master).

In [16]:
def calculateEntropy(df):
    """
    Calculate the Shannon entropy (base 2) of the 'Decision' column.

    Args:
    df (pandas.DataFrame): The dataset containing the 'Decision' column.

    Returns:
    float: The calculated entropy value in bits. The result is 0 when the
    dataset is empty or every row belongs to the same class.
    """
    instances = df.shape[0]
    # Count every class once up front instead of re-running value_counts()
    # (a full pass over the column) for each class inside the loop.
    class_counts = df['Decision'].value_counts()
    entropy = 0

    # Iterating the counts directly keeps the original summation order and
    # avoids looking counts up again by label.
    for num_of_decisions in class_counts:
        class_probability = num_of_decisions / instances
        # A zero-count entry (e.g. an unused category) contributes nothing;
        # skipping it also avoids evaluating log2(0).
        if class_probability > 0:
            entropy -= class_probability * np.log2(class_probability)

    return entropy

In [17]:
def findDecision(df):
    """
    Find the best feature to split the dataset on using the ID3 algorithm.

    For every feature column, the information gain is the entropy of the
    whole dataset minus the size-weighted entropy of the subsets produced by
    each distinct value of that feature.

    Args:
    df (pandas.DataFrame): The dataset to analyze. It must contain a
    'Decision' column and at least one other (feature) column.

    Returns:
    str: The name of the feature with the highest information gain. Ties go
    to the feature that appears first in the column order.

    Raises:
    ValueError: If the dataset has no feature column besides 'Decision'.
    """
    entropy = calculateEntropy(df)
    instances = df.shape[0]

    # Pick candidate features by name, not by position: the target is always
    # read as 'Decision', so it must be excluded wherever it sits in the
    # frame rather than assuming it is the last column.
    feature_names = [name for name in df.columns if name != 'Decision']
    if not feature_names:
        raise ValueError(
            "findDecision requires at least one feature column besides 'Decision'"
        )

    gains = []
    for column_name in feature_names:
        gain = entropy

        for current_class in df[column_name].value_counts().keys():
            subdataset = df[df[column_name] == current_class]
            # Weight each subset's entropy by the share of rows it holds.
            class_probability = subdataset.shape[0] / instances
            gain -= class_probability * calculateEntropy(subdataset)

        gains.append(gain)

    # list.index returns the first maximum, which keeps tie-breaking stable.
    return feature_names[gains.index(max(gains))]

In [18]:
def buildDecisionTree(df):
    """
    Grow an ID3 decision tree from the dataset by recursive splitting.

    Args:
    df (pandas.DataFrame): Training rows; the class label lives in the
    'Decision' column and every other column is treated as a feature.

    Returns:
    dict: A nested mapping of the form {feature: {value: subtree}}. A subtree
    is either another such mapping or a bare class label (leaf). When the
    dataset is already pure, the bare label itself is returned.
    """
    labels = df['Decision']

    # Pure node: every row agrees on the class, so emit a leaf.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Only the 'Decision' column remains: fall back to the majority class.
    if df.shape[1] == 1:
        return labels.value_counts().idxmax()

    split_feature = findDecision(df)

    # One branch per observed value; the chosen feature is dropped so it
    # cannot be selected again further down that branch.
    branches = {
        feature_value: buildDecisionTree(
            df[df[split_feature] == feature_value].drop(columns=[split_feature])
        )
        for feature_value in df[split_feature].unique()
    }

    return {split_feature: branches}

In [19]:
def _collectLeafLabels(tree, counts):
    """Tally into `counts` how many leaves under `tree` carry each class label."""
    if not isinstance(tree, dict):
        counts[tree] = counts.get(tree, 0) + 1
        return counts

    # A node has the shape {feature: {value: subtree}}.
    for branches in tree.values():
        for subtree in branches.values():
            _collectLeafLabels(subtree, counts)
    return counts


def predict(tree, instance):
    """
    Make a prediction for a single instance using the decision tree.

    Args:
    tree (dict): The decision tree, shaped {feature: {value: subtree}}, or a
    bare class label when the tree is a single leaf.
    instance (pandas.Series): A single instance to classify. Any mapping that
    supports instance[feature] works.

    Returns:
    str: The predicted class. If the instance has a feature value that the
    tree has no branch for, the most frequent leaf label below the current
    node is returned (first one encountered on ties). Returns None only if
    that node has no leaves at all.
    """
    if not isinstance(tree, dict):
        return tree

    root = list(tree.keys())[0]
    branches = tree[root]
    value = instance[root]

    if value in branches:
        return predict(branches[value], instance)

    # Unseen feature value. The tree does not store training counts, so the
    # most common leaf label under this node stands in for the majority
    # class. The fallback must be a class label, never a branch key.
    leaf_counts = _collectLeafLabels(tree, {})
    if not leaf_counts:
        return None
    return max(leaf_counts, key=leaf_counts.get)

In [45]:
def print_tree(tree, indent=0):
    """
    Print the decision tree as an indented outline, one entry per line.

    Args:
    tree (dict): The decision tree, shaped {feature: {value: subtree}}. A
    bare class label (a tree that is a single leaf) is also accepted and is
    printed on its own line.
    indent (int): Number of leading spaces for the current level. Each nested
    level adds 4.
    """
    # A single-class training set yields a bare label instead of a dict;
    # print it rather than failing on the missing .items().
    if not isinstance(tree, dict):
        print(' ' * indent + str(tree))
        return

    for key, value in tree.items():
        print(' ' * indent + str(key))
        # Nested dicts and leaf values are both printed one level deeper.
        print_tree(value, indent + 4)

# Run code

In [48]:
# Evaluate the ID3 implementation with k-fold cross-validation: for each fold,
# build a tree on the training rows, print it, predict the held-out rows, and
# report weighted precision / recall / F1 plus accuracy.

# Load the dataset (fetched over the network; must contain a 'Decision' column)
df = pd.read_csv("https://raw.githubusercontent.com/serengil/decision-trees-for-ml/master/dataset/golf.txt")

# Prepare for k-fold cross-validation
# shuffle=True with a fixed random_state keeps the fold assignment the same on every run.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Lists to store evaluation metrics (one entry per fold)
precisions, recalls, accuracies, f1_scores = [], [], [], []

# enumerate(..., 1) makes the printed fold numbers start at 1
for fold, (train_index, test_index) in enumerate(kf.split(df), 1):
    print(f"Fold {fold}")

    # Split the data (kf.split yields positional indices, hence iloc)
    train_data = df.iloc[train_index]
    test_data = df.iloc[test_index]

    # Build the decision tree from the training rows only
    tree = buildDecisionTree(train_data)

    # Print the decision tree
    print_tree(tree)
    print()

    # Make predictions: predict() is applied row by row (axis=1) to the held-out rows
    y_true = test_data['Decision']
    y_pred = test_data.apply(lambda x: predict(tree, x), axis=1)

    # Calculate metrics using sklearn
    # average='weighted' averages the per-class scores weighted by class support;
    # zero_division=0 reports 0 (instead of warning) where a score is undefined.
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)
    f1_scores.append(f1)

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print()

# Print average scores: mean across folds, with np.std of the per-fold scores as the +/- spread
print()
print("Average Scores:")
print(f"Precision: {np.mean(precisions):.4f} (+/- {np.std(precisions):.4f})")
print(f"Recall: {np.mean(recalls):.4f} (+/- {np.std(recalls):.4f})")
print(f"Accuracy: {np.mean(accuracies):.4f} (+/- {np.std(accuracies):.4f})")
print(f"F1 Score: {np.mean(f1_scores):.4f} (+/- {np.std(f1_scores):.4f})")

Fold 1
Outlook
    Sunny
        Humidity
            High
                No
            Normal
                Yes
    Overcast
        Yes
    Rain
        Wind
            Weak
                Yes
            Strong
                No

Precision: 1.0000
Recall: 1.0000
Accuracy: 1.0000
F1 Score: 1.0000

Fold 2
Outlook
    Sunny
        Humidity
            High
                No
            Normal
                Yes
    Overcast
        Yes
    Rain
        Wind
            Weak
                Yes
            Strong
                No

Precision: 1.0000
Recall: 1.0000
Accuracy: 1.0000
F1 Score: 1.0000

Fold 3
Outlook
    Sunny
        Humidity
            High
                No
            Normal
                Yes
    Rain
        Wind
            Weak
                Yes
            Strong
                No
    Overcast
        Yes

Precision: 1.0000
Recall: 1.0000
Accuracy: 1.0000
F1 Score: 1.0000

Fold 4
Outlook
    Sunny
        Temp.
            Hot
                No
  