In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, export_text
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_text

In [2]:
# Load the wine dataset
data = load_wine()

# Convert to DataFrame for easier handling
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target



# Split the dataset into features and target variable
X = df[data.feature_names].values
y = df['target'].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)





In [3]:
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [4]:
def entropy(y):
    unique, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities))

def conditional_entropy(y, y_left, y_right):
    p_left = len(y_left) / len(y)
    p_right = len(y_right) / len(y)
    return p_left * entropy(y_left) + p_right * entropy(y_right)

def gini(y):
    unique, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities ** 2)

def gini_split(y, y_left, y_right):
    p_left = len(y_left) / len(y)
    p_right = len(y_right) / len(y)
    return p_left * gini(y_left) + p_right * gini(y_right)

In [5]:

class DecisionTree:
    def __init__(self, max_depth=None, criterion='entropy', min_samples_leaf=1):
        self.max_depth = max_depth
        self.criterion = criterion
        self.min_samples_leaf = min_samples_leaf
        self.tree = None
        self.feature_names = None

    def fit(self, X, y,feature_names=None):
        self.feature_names = data.feature_names
        self.tree = self._build_tree(X, y, 0)

    def _build_tree(self, X, y, depth):
        if len(np.unique(y)) == 1 or (self.max_depth and depth >= self.max_depth) or len(y) < self.min_samples_leaf:
            return np.argmax(np.bincount(y))

        best_split = self._find_best_split(X, y)
        if not best_split:
            return np.argmax(np.bincount(y))

        left_indices, right_indices = best_split['indices']
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return {'feature': best_split['feature'], 'threshold': best_split['threshold'], 'left': left_subtree, 'right': right_subtree}

    def print_tree(self, node=None, depth=0):
        # Define print_tree function locally
        def print_tree(node, depth=0):
            if isinstance(node, dict):
                feature_name = self.feature_names[node['feature']] if self.feature_names else node['feature']
                print("  " * depth + f"[{feature_name} <= {node['threshold']}]")
                print_tree(node['left'], depth + 1)
                print_tree(node['right'], depth + 1)
            else:
                print("  " * depth + f"[Class: {node}]")
        # Call the local print_tree function to print the actual tree
        if node is None:
            node = self.tree
        print_tree(node, depth)



    def _find_best_split(self, X, y):
        if self.criterion == 'gini':
            return self._best_split_gini(X, y)
        elif self.criterion == 'entropy':
            return self._best_split_entropy(X, y)
        else:
            raise ValueError("Criterion not recognized.")

    def _best_split_gini(self, X, y):
        best_gini = float('inf')
        best_split = None

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = np.where(X[:, feature] <= threshold)[0]
                right_indices = np.where(X[:, feature] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gini = self._gini_index(y[left_indices], y[right_indices])

                if gini < best_gini:
                    best_gini = gini
                    best_split = {'feature': feature, 'threshold': threshold, 'indices': (left_indices, right_indices)}

        return best_split

    def _gini_index(self, left_y, right_y):
        total = len(left_y) + len(right_y)
        if total == 0:
            return 0

        p_left = len(left_y) / total
        p_right = len(right_y) / total

        gini_left = 1 - sum((np.bincount(left_y) / len(left_y)) ** 2) if len(left_y) > 0 else 0
        gini_right = 1 - sum((np.bincount(right_y) / len(right_y)) ** 2) if len(right_y) > 0 else 0

        return p_left * gini_left + p_right * gini_right

    def _best_split_entropy(self, X, y):
        best_entropy = float('inf')
        best_split = None

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = np.where(X[:, feature] <= threshold)[0]
                right_indices = np.where(X[:, feature] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                entropy = self._entropy_index(y[left_indices], y[right_indices])

                if entropy < best_entropy:
                    best_entropy = entropy
                    best_split = {'feature': feature, 'threshold': threshold, 'indices': (left_indices, right_indices)}

        return best_split

    def _entropy_index(self, left_y, right_y):
        total = len(left_y) + len(right_y)
        if total == 0:
            return 0

        p_left = len(left_y) / total
        p_right = len(right_y) / total

        entropy_left = -sum((np.bincount(left_y) / len(left_y)) * np.log2(np.bincount(left_y) / len(left_y) + 1e-9)) if len(left_y) > 0 else 0
        entropy_right = -sum((np.bincount(right_y) / len(right_y)) * np.log2(np.bincount(right_y) / len(right_y) + 1e-9)) if len(right_y) > 0 else 0

        return p_left * entropy_left + p_right * entropy_right

    def predict(self, X):
        return np.array([self._predict_sample(sample, self.tree) for sample in X])

    def _predict_sample(self, sample, tree):
        if not isinstance(tree, dict):
            return tree

        feature = tree['feature']
        threshold = tree['threshold']

        if sample[feature] <= threshold:
            return self._predict_sample(sample, tree['left'])
        else:
            return self._predict_sample(sample, tree['right'])

In [6]:
# Train and evaluate the custom decision tree with entropy
tree_entropy = DecisionTree(max_depth=2, criterion='entropy',min_samples_leaf=5)
tree_entropy.fit(X_train, y_train)
# Predict on training and testing data
y_pred_train_entropy = tree_entropy.predict(X_train)
y_pred_test_entropy = tree_entropy.predict(X_test)
# Calculate training and testing accuracy
accuracy_train_entropy = accuracy_score(y_train, y_pred_train_entropy)
accuracy_test_entropy = accuracy_score(y_test, y_pred_test_entropy)
# Print training and testing accuracy
print(f"\nCustom Decision Tree (Entropy) - Training Accuracy: {accuracy_train_entropy:.4f}")
print(f"Custom Decision Tree (Entropy) - Testing Accuracy: {accuracy_test_entropy:.4f}")

# Train and evaluate the custom decision tree with gini
tree_gini = DecisionTree(max_depth=2, criterion='gini',min_samples_leaf=5)
tree_gini.fit(X_train, y_train)
# Predict on training and testing data
y_pred_train_gini = tree_gini.predict(X_train)
y_pred_test_gini = tree_gini.predict(X_test)
# Calculate training and testing accuracy
accuracy_train_gini = accuracy_score(y_train, y_pred_train_gini)
accuracy_test_gini = accuracy_score(y_test, y_pred_test_gini)
# Print training and testing accuracy
print(f"\nCustom Decision Tree (Gini) - Training Accuracy: {accuracy_train_gini:.4f}")
print(f"Custom Decision Tree (Gini) - Testing Accuracy: {accuracy_test_gini:.4f}")



# Example usage
# Assuming X_train and y_train are your training data
dt = DecisionTree(max_depth=3)
dt.fit(X_train, y_train, feature_names=data.feature_names)  # Pass feature names
dt.print_tree()


Custom Decision Tree (Entropy) - Training Accuracy: 0.9437
Custom Decision Tree (Entropy) - Testing Accuracy: 0.9167

Custom Decision Tree (Gini) - Training Accuracy: 0.9366
Custom Decision Tree (Gini) - Testing Accuracy: 0.8611
[od280/od315_of_diluted_wines <= 2.15]
  [color_intensity <= 3.8]
    [Class: 1]
    [flavanoids <= 1.39]
      [Class: 2]
      [Class: 1]
  [alcohol <= 12.72]
    [Class: 1]
    [proline <= 630.0]
      [Class: 1]
      [Class: 0]


In [7]:
# Train and evaluate sklearn's decision tree with entropy
clf_entropy = DecisionTreeClassifier(max_depth=2, criterion='entropy', min_samples_leaf=5)  # Add min_samples_leaf
clf_entropy.fit(X_train, y_train)
# Predict on training and testing data
y_pred_train_sklearn_entropy = clf_entropy.predict(X_train)  # Predict on training data
y_pred_test_sklearn_entropy = clf_entropy.predict(X_test)  # Predict on testing data
# Calculate training and testing accuracy
accuracy_train_sklearn_entropy = accuracy_score(y_train, y_pred_train_sklearn_entropy)
accuracy_test_sklearn_entropy = accuracy_score(y_test, y_pred_test_sklearn_entropy)
# Print the tree structure using export text
print("\nDecision Tree (Entropy) Structure:")
tree_text = export_text(clf_entropy, feature_names=data.feature_names)
print(tree_text)
# Print training and testing accuracy
print(f"\nsklearn Decision Tree (Entropy) - Training Accuracy: {accuracy_train_sklearn_entropy:.4f}")
print(f"sklearn Decision Tree (Entropy) - Testing Accuracy: {accuracy_test_sklearn_entropy:.4f}")

# Train and evaluate sklearn's decision tree with gini
clf_gini = DecisionTreeClassifier(max_depth=2, criterion='gini', min_samples_leaf=5)  # Add min_samples_leaf
clf_gini.fit(X_train, y_train)
# Predict on training and testing data
y_pred_train_sklearn_gini = clf_gini.predict(X_train)  # Predict on training data
y_pred_test_sklearn_gini = clf_gini.predict(X_test)  # Predict on testing data
# Calculate training and testing accuracy
accuracy_train_sklearn_gini = accuracy_score(y_train, y_pred_train_sklearn_gini)
accuracy_test_sklearn_gini = accuracy_score(y_test, y_pred_test_sklearn_gini)
# Print the tree structure using export text
print("\nDecision Tree (Gini) Structure:")
tree_text = export_text(clf_gini, feature_names=data.feature_names)
print(tree_text)
# Print training and testing accuracy

print(f"\nsklearn Decision Tree (Gini) - Training Accuracy: {accuracy_train_sklearn_gini:.4f}")
print(f"sklearn Decision Tree (Gini) - Testing Accuracy: {accuracy_test_sklearn_gini:.4f}")


Decision Tree (Entropy) Structure:
|--- od280/od315_of_diluted_wines <= 2.19
|   |--- color_intensity <= 3.82
|   |   |--- class: 1
|   |--- color_intensity >  3.82
|   |   |--- class: 2
|--- od280/od315_of_diluted_wines >  2.19
|   |--- alcohol <= 12.79
|   |   |--- class: 1
|   |--- alcohol >  12.79
|   |   |--- class: 0


sklearn Decision Tree (Entropy) - Training Accuracy: 0.9437
sklearn Decision Tree (Entropy) - Testing Accuracy: 0.9167

Decision Tree (Gini) Structure:
|--- color_intensity <= 3.82
|   |--- proline <= 790.00
|   |   |--- class: 1
|   |--- proline >  790.00
|   |   |--- class: 1
|--- color_intensity >  3.82
|   |--- flavanoids <= 1.40
|   |   |--- class: 2
|   |--- flavanoids >  1.40
|   |   |--- class: 0


sklearn Decision Tree (Gini) - Training Accuracy: 0.9225
sklearn Decision Tree (Gini) - Testing Accuracy: 0.8611


In [8]:

# Create a dictionary to store the results
results = {
    'Model': ['Custom (Entropy)', 'Custom (Gini)', 'sklearn (Entropy)', 'sklearn (Gini)'],
    'Training Accuracy': [accuracy_train_entropy, accuracy_train_gini, accuracy_train_sklearn_entropy, accuracy_train_sklearn_gini],
    'Testing Accuracy': [accuracy_test_entropy, accuracy_test_gini, accuracy_test_sklearn_entropy, accuracy_test_sklearn_gini]
}

# Create a pandas DataFrame from the results dictionary
results_df = pd.DataFrame(results)

# Display the table
display(results_df)

Unnamed: 0,Model,Training Accuracy,Testing Accuracy
0,Custom (Entropy),0.943662,0.916667
1,Custom (Gini),0.93662,0.861111
2,sklearn (Entropy),0.943662,0.916667
3,sklearn (Gini),0.922535,0.861111
