In [None]:
# <================================= Feature Selection and Decision Tree Hyperparameter Optimization (Using PSO) ====================================>

import numpy as np
import random
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd


# Decision Tree Class Implementation
class DecisionTree:
    """Minimal binary classification tree that splits on Gini impurity.

    ``fit`` does not store the tree on the instance: it returns the tree as
    nested dicts (internal nodes) with integer class labels as leaves, and
    that object must be handed back to ``predict``.

    Hyperparameters (``None`` disables the corresponding constraint):
        max_depth: depth at which recursion stops and a leaf is emitted.
        min_samples_split: nodes holding fewer samples become leaves.
        min_samples_leaf: candidate splits that leave fewer samples on
            either side are rejected.
        max_features: number of columns sampled without replacement (via
            ``np.random.choice``) at every node; ignored when it exceeds
            the number of columns in ``X``.
    """

    def __init__(self, max_depth=None, min_samples_split=None, min_samples_leaf=None, max_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features

    def gini(self, labels):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of ``labels``."""
        unique, counts = np.unique(labels, return_counts=True)
        proportions = counts / np.sum(counts)
        return 1 - np.sum(proportions ** 2)

    def split(self, X, y, feature_idx, threshold):
        """Partition ``X``/``y`` on ``X[:, feature_idx] <= threshold``.

        Returns ``(left_X, left_y, right_X, right_y)`` where the left part
        holds the rows with a value ``<= threshold`` and the right part the
        rows with a value ``> threshold``.
        """
        left_idxs = np.where(X[:, feature_idx] <= threshold)[0]
        right_idxs = np.where(X[:, feature_idx] > threshold)[0]
        left_X, left_y = X[left_idxs], y[left_idxs]
        right_X, right_y = X[right_idxs], y[right_idxs]
        return left_X, left_y, right_X, right_y

    def best_split(self, X, y):
        """Find the split with the lowest weighted Gini impurity.

        Every distinct value of every candidate column is tried as a
        threshold. Returns ``(feature_idx, threshold)``, or ``(None, None)``
        when no admissible split exists.
        """
        best_feature_idx, best_threshold, best_gini = None, None, np.inf
        features = np.arange(X.shape[1])
        if self.max_features is not None and self.max_features <= X.shape[1]:
            features = np.random.choice(features, size=int(self.max_features), replace=False)
        for feature_idx in features:
            column = X[:, feature_idx]
            for threshold in np.unique(column):
                # Only the labels are needed to score a candidate, so the
                # feature matrix is not copied here (same partition rule as
                # ``split``: "<=" goes left, ">" goes right).
                left_y = y[column <= threshold]
                right_y = y[column > threshold]
                if len(left_y) == 0 or len(right_y) == 0:
                    continue
                if self.min_samples_leaf is not None:
                    if len(left_y) < self.min_samples_leaf or len(right_y) < self.min_samples_leaf:
                        continue
                left_gini = self.gini(left_y)
                right_gini = self.gini(right_y)
                weighted_gini = (len(left_y) / len(y)) * left_gini + (len(right_y) / len(y)) * right_gini
                if weighted_gini < best_gini:
                    best_feature_idx = feature_idx
                    best_threshold = threshold
                    best_gini = weighted_gini
        return best_feature_idx, best_threshold

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (non-negative integers)."""
        return np.bincount(y).argmax()

    def fit(self, X, y, depth=0):
        """Recursively build and return the tree for ``X``/``y``.

        Args:
            X: 2-D feature array (indexed as ``X[:, feature_idx]``).
            y: 1-D array of non-negative integer labels (``np.bincount``
                is used to pick the majority class).
            depth: current recursion depth; callers leave it at 0.

        Returns:
            Either an integer class label (leaf) or a dict with the keys
            ``'feature_idx'``, ``'threshold'``, ``'left'`` and ``'right'``.
        """
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return self._majority_class(y)
        if self.min_samples_split is not None and len(y) < self.min_samples_split:
            return self._majority_class(y)
        # A node this small cannot be cut into two leaves of the minimum size.
        if self.min_samples_leaf is not None and len(y) < 2 * self.min_samples_leaf:
            return self._majority_class(y)
        feature_idx, threshold = self.best_split(X, y)
        if feature_idx is None:
            return self._majority_class(y)
        left_X, left_y, right_X, right_y = self.split(X, y, feature_idx, threshold)
        left_subtree = self.fit(left_X, left_y, depth + 1)
        right_subtree = self.fit(right_X, right_y, depth + 1)
        return {'feature_idx': feature_idx, 'threshold': threshold, 'left': left_subtree, 'right': right_subtree}

    def predict(self, X, tree):
        """Return the class label of one sample.

        Args:
            X: 1-D feature vector of a single sample.
            tree: object returned by ``fit`` (a dict node or a bare leaf).

        Anything that is not a dict is treated as a leaf, so the label may
        be any integer type (``np.int64``, ``np.int32``, plain ``int``)
        instead of only ``np.int64``.
        """
        node = tree
        while isinstance(node, dict):
            go_left = X[node['feature_idx']] <= node['threshold']
            node = node['left'] if go_left else node['right']
        return node



# Breast-cancer dataset with a fixed 80/20 train/test split
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# <=================================== Decision Tree Without FS and Without HP Optimization ========================>
# Baseline: unconstrained tree trained on every feature.
dt = DecisionTree()
tree_dt = dt.fit(X_train, y_train)
y_pred = [dt.predict(sample, tree_dt) for sample in X_test]

print("<================= Decision Tree without HP and without FS Optimization ===========================>")
# Comparing the prediction list with the label array is element-wise, so the
# sum counts the correct predictions.
accuracy = sum(y_pred == y_test) / len(y_test)
print("Accuracy without HP and without FS: ", accuracy, "\n\n")







# <========================================== Decision Tree With FS Optimization ==================================>
# Define the fitness function
def fitness_func(features):
    """Score a binary feature mask by the accuracy of a default tree.

    ``features`` is a 0/1 sequence with one entry per dataset column; the
    columns flagged with 1 are used to train a ``DecisionTree`` on the
    module-level training split, and the returned value is the fraction of
    correct predictions on the module-level test split.
    """
    column_mask = np.array(features) == 1

    model = DecisionTree()
    fitted_tree = model.fit(X_train[:, column_mask], y_train)

    predictions = [model.predict(row, fitted_tree) for row in X_test[:, column_mask]]

    # list == ndarray compares element-wise, so this counts the hits
    hits = sum(predictions == y_test)
    return hits / len(y_test)


# PSO for feature Selection
def PSO_for_feature_selection(fitness_func, num_particles, num_features, num_iterations,
                              inertia_weight=0.7, cognitive_component=1.4,
                              social_component=1.4, verbose=True):
    """Search for a high-fitness binary feature mask with a particle swarm.

    Args:
        fitness_func: callable taking a 1-D 0/1 array of length
            ``num_features`` and returning a score to maximize.
        num_particles: number of particles in the swarm.
        num_features: length of each binary position vector.
        num_iterations: number of swarm updates to perform.
        inertia_weight: weight of the previous velocity.
        cognitive_component: pull towards each particle's own best position.
        social_component: pull towards the swarm's best position.
        verbose: when True (default) print the swarm state every iteration.

    Returns:
        A 1-D 0/1 array holding the best position found. It is a copy, so
        it is independent of the swarm's internal arrays.
    """
    # Positions and initial velocities are random bits drawn from [0, 2)
    positions = np.random.randint(2, size=(num_particles, num_features))
    velocities = np.random.randint(2, size=(num_particles, num_features))

    # Every particle starts with its initial position as its personal best
    personal_best_positions = positions.copy()
    personal_best_fitness_values = np.apply_along_axis(fitness_func, 1, personal_best_positions)

    global_best_index = np.argmax(personal_best_fitness_values)
    # Copy: a plain row index would be a view that changes whenever that
    # row of personal_best_positions is overwritten.
    global_best_position = personal_best_positions[global_best_index].copy()
    global_best_fitness_value = personal_best_fitness_values[global_best_index]

    for i in range(num_iterations):
        # The right-hand side still reads the previous velocities/positions
        velocities = (
            inertia_weight * velocities
            + cognitive_component * np.random.rand(num_particles, num_features) * (personal_best_positions - positions)
            + social_component * np.random.rand(num_particles, num_features) * (global_best_position - positions)
        )

        # Binarize: a feature is switched on when its velocity exceeds 0.5
        positions = np.where(velocities > 0.5, 1, 0)

        new_fitness_values = np.apply_along_axis(fitness_func, 1, positions)

        # Keep only strict improvements as new personal bests
        mask = new_fitness_values > personal_best_fitness_values
        personal_best_positions[mask] = positions[mask]
        personal_best_fitness_values[mask] = new_fitness_values[mask]

        index = np.argmax(personal_best_fitness_values)
        if personal_best_fitness_values[index] > global_best_fitness_value:
            global_best_position = personal_best_positions[index].copy()
            global_best_fitness_value = personal_best_fitness_values[index]

        if verbose:
            print(f"Iteration {i + 1}:")
            print("Personal Best: ", personal_best_positions)
            print("Best Feature Subset:", global_best_position)
            print("Best Fitness Value:", global_best_fitness_value)
            print("--------------------------------------------")

    return global_best_position



# Swarm configuration for the feature-selection run
num_particles, num_iterations = 10, 20
num_features = X.shape[1]  # one bit per dataset column

print("<================= Decision Tree with FS Optimization ===========================>")
best_feature_subset = PSO_for_feature_selection(
    fitness_func,
    num_particles,
    num_features,
    num_iterations,
)
print("Best Feature Subset:", best_feature_subset)
print("\n\n\n")








# <================================= Decision Tree with HP Optimization using PSO ======================================>

# Define the fitness function for DT with HP optimization (all features, no FS)
def fitness_function_with_hp(params):
    """Score one hyperparameter vector by test-split accuracy.

    ``params`` holds, in order, ``max_depth``, ``min_samples_split``,
    ``min_samples_leaf`` and ``max_features``. Each entry is truncated to an
    integer and its sign is dropped before a ``DecisionTree`` is trained on
    the module-level training split with all features.
    """
    depth = int(params[0])
    split_min = int(params[1])
    leaf_min = int(params[2])
    feature_cap = int(params[3])

    # Negative coordinates are mirrored onto the positive axis
    depth, split_min, leaf_min, feature_cap = (
        abs(depth), abs(split_min), abs(leaf_min), abs(feature_cap)
    )

    model = DecisionTree(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        max_features=feature_cap,
    )
    fitted_tree = model.fit(X_train, y_train)

    predictions = [model.predict(row, fitted_tree) for row in X_test]

    # list == ndarray compares element-wise, so this counts the hits
    hits = sum(predictions == y_test)
    return hits / len(y_test)

# Define the PSO function
def PSO_with_HP(fitness_function_with_hp, num_particles, num_dimensions, num_iterations, param_ranges,
                inertia_weight=0.7, cognitive_component=1.4, social_component=1.4, verbose=True):
    """Maximize a fitness function over a box-bounded continuous space.

    Args:
        fitness_function_with_hp: callable taking a 1-D position of length
            ``num_dimensions`` and returning a score to maximize.
        num_particles: number of particles in the swarm.
        num_dimensions: number of coordinates per particle; must match
            ``len(param_ranges)``.
        num_iterations: number of swarm updates to perform.
        param_ranges: sequence of ``(low, high)`` pairs, one per dimension.
        inertia_weight: weight of the previous velocity.
        cognitive_component: pull towards each particle's own best position.
        social_component: pull towards the swarm's best position.
        verbose: when True (default) print the best position every iteration.

    Returns:
        ``(best_position, best_fitness)``; ``best_position`` is a copy and
        always lies inside ``param_ranges``.
    """
    lower_bounds, upper_bounds = (np.asarray(bound, dtype=float) for bound in zip(*param_ranges))

    positions = np.random.uniform(lower_bounds, upper_bounds, (num_particles, num_dimensions))
    velocities = np.random.uniform(-1, 1, (num_particles, num_dimensions))

    # Every particle starts with its initial position as its personal best
    personal_best_positions = positions.copy()
    personal_best_fitness_values = np.apply_along_axis(fitness_function_with_hp, 1, personal_best_positions)

    global_best_index = np.argmax(personal_best_fitness_values)
    # Copy: a plain row index would be a view that changes whenever that
    # row of personal_best_positions is overwritten.
    global_best_position = personal_best_positions[global_best_index].copy()
    global_best_fitness_value = personal_best_fitness_values[global_best_index]

    for i in range(num_iterations):
        # The right-hand side still reads the previous velocities/positions
        velocities = (
            inertia_weight * velocities
            + cognitive_component
            * np.random.rand(num_particles, num_dimensions)
            * (personal_best_positions - positions)
            + social_component
            * np.random.rand(num_particles, num_dimensions)
            * (global_best_position - positions)
        )
        # Keep particles inside the declared search space; without this the
        # swarm drifts to values outside param_ranges (including negatives).
        positions = np.clip(positions + velocities, lower_bounds, upper_bounds)

        new_fitness_values = np.apply_along_axis(fitness_function_with_hp, 1, positions)

        # Keep only strict improvements as new personal bests
        mask = new_fitness_values > personal_best_fitness_values
        personal_best_positions[mask] = positions[mask]
        personal_best_fitness_values[mask] = new_fitness_values[mask]

        index = np.argmax(personal_best_fitness_values)
        if personal_best_fitness_values[index] > global_best_fitness_value:
            global_best_position = personal_best_positions[index].copy()
            global_best_fitness_value = personal_best_fitness_values[index]

        if verbose:
            print(f"Iteration {i + 1}:")
            print("Best Global Position:", global_best_position)
            print("Best Fitness Value:", global_best_fitness_value)
            print("--------------------------------------------")

    return global_best_position, global_best_fitness_value



# Search box, in order: max_depth, min_samples_split, min_samples_leaf, max_features
param_ranges = [(1, 10), (2, 20), (1, 10), (1, 4)]

num_particles, num_dimensions, num_iterations = 10, 4, 10

print("<================= Decision Tree with HP Optimization ===========================>")
best_hyperparameters, best_fitness_value = PSO_with_HP(
    fitness_function_with_hp,
    num_particles,
    num_dimensions,
    num_iterations,
    param_ranges,
)

print("Best Hyperparameters:")
# Coordinates are reported truncated to integers
print("max_depth:", int(best_hyperparameters[0]))
print("min_samples_split:", int(best_hyperparameters[1]))
print("min_samples_leaf:", int(best_hyperparameters[2]))
print("max_features:", int(best_hyperparameters[3]))
print("Best Fitness Value:", best_fitness_value)
print("\n\n")



# <============================== Decision Tree FS and HP Optimization ================================>

# Define the fitness function for DT with FS and HP optimization
def fitness_function_with_fs_and_hp(params, features=best_feature_subset):
    """Score a hyperparameter vector on a fixed feature subset.

    Args:
        params: sequence holding, in order, ``max_depth``,
            ``min_samples_split``, ``min_samples_leaf`` and ``max_features``.
            Each entry is truncated to an integer and its sign is dropped.
        features: 0/1 mask with one entry per dataset column. The default is
            the value ``best_feature_subset`` had when this function was
            defined, not its value at call time.

    Returns:
        Fraction of correct predictions on the module-level test split of a
        ``DecisionTree`` trained on the masked module-level training split.
    """
    max_depth = abs(int(params[0]))
    min_samples_split = abs(int(params[1]))
    min_samples_leaf = abs(int(params[2]))
    max_features = abs(int(params[3]))

    # Build the column mask once and apply it to both splits
    column_mask = np.array(features) == 1
    selected_features = X_train[:, column_mask]
    selected_features_test = X_test[:, column_mask]

    clf = DecisionTree(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features
    )
    tree = clf.fit(selected_features, y_train)

    y_pred = [clf.predict(sample, tree) for sample in selected_features_test]

    # list == ndarray compares element-wise, so the sum counts the hits
    accuracy = sum(y_pred == y_test) / len(y_test)
    return accuracy


# Define the PSO function
def PSO_with_FS_and_HP(fitness_function_with_fs_and_hp, num_particles, num_dimensions, num_iterations, param_ranges,
                       inertia_weight=0.7, cognitive_component=1.4, social_component=1.4, verbose=True):
    """Maximize a fitness function over a box-bounded continuous space.

    Args:
        fitness_function_with_fs_and_hp: callable taking a 1-D position of
            length ``num_dimensions`` and returning a score to maximize.
        num_particles: number of particles in the swarm.
        num_dimensions: number of coordinates per particle; must match
            ``len(param_ranges)``.
        num_iterations: number of swarm updates to perform.
        param_ranges: sequence of ``(low, high)`` pairs, one per dimension.
        inertia_weight: weight of the previous velocity.
        cognitive_component: pull towards each particle's own best position.
        social_component: pull towards the swarm's best position.
        verbose: when True (default) print the best position every iteration.

    Returns:
        ``(best_position, best_fitness)``; ``best_position`` is a copy and
        always lies inside ``param_ranges``.
    """
    lower_bounds, upper_bounds = (np.asarray(bound, dtype=float) for bound in zip(*param_ranges))

    positions = np.random.uniform(lower_bounds, upper_bounds, (num_particles, num_dimensions))
    velocities = np.random.uniform(-1, 1, (num_particles, num_dimensions))

    # Every particle starts with its initial position as its personal best
    personal_best_positions = positions.copy()
    personal_best_fitness_values = np.apply_along_axis(fitness_function_with_fs_and_hp, 1, personal_best_positions)

    global_best_index = np.argmax(personal_best_fitness_values)
    # Copy: a plain row index would be a view that changes whenever that
    # row of personal_best_positions is overwritten.
    global_best_position = personal_best_positions[global_best_index].copy()
    global_best_fitness_value = personal_best_fitness_values[global_best_index]

    for i in range(num_iterations):
        # The right-hand side still reads the previous velocities/positions
        velocities = (
            inertia_weight * velocities
            + cognitive_component
            * np.random.rand(num_particles, num_dimensions)
            * (personal_best_positions - positions)
            + social_component
            * np.random.rand(num_particles, num_dimensions)
            * (global_best_position - positions)
        )
        # Keep particles inside the declared search space; without this the
        # swarm drifts to values outside param_ranges (including negatives).
        positions = np.clip(positions + velocities, lower_bounds, upper_bounds)

        new_fitness_values = np.apply_along_axis(fitness_function_with_fs_and_hp, 1, positions)

        # Keep only strict improvements as new personal bests
        mask = new_fitness_values > personal_best_fitness_values
        personal_best_positions[mask] = positions[mask]
        personal_best_fitness_values[mask] = new_fitness_values[mask]

        index = np.argmax(personal_best_fitness_values)
        if personal_best_fitness_values[index] > global_best_fitness_value:
            global_best_position = personal_best_positions[index].copy()
            global_best_fitness_value = personal_best_fitness_values[index]

        if verbose:
            print(f"Iteration {i + 1}:")
            print("Best Global Position:", global_best_position)
            print("Best Fitness Value:", global_best_fitness_value)
            print("--------------------------------------------")

    return global_best_position, global_best_fitness_value


# Search box, in order: max_depth, min_samples_split, min_samples_leaf, max_features
param_ranges = [(1, 10), (2, 20), (1, 10), (1, 4)]

num_particles, num_dimensions, num_iterations = 10, 4, 10

print("<================= Decision Tree with HP and with FS Optimization ===========================>")
best_hyperparameters, best_fitness_value = PSO_with_FS_and_HP(
    fitness_function_with_fs_and_hp,
    num_particles,
    num_dimensions,
    num_iterations,
    param_ranges,
)

print("Best Hyperparameters:")
# Coordinates are reported truncated to integers
print("max_depth:", int(best_hyperparameters[0]))
print("min_samples_split:", int(best_hyperparameters[1]))
print("min_samples_leaf:", int(best_hyperparameters[2]))
print("max_features:", int(best_hyperparameters[3]))
print("Best Fitness Value:", best_fitness_value)
print("\n\n")










Accuracy without HP and without FS:  0.9298245614035088 


Iteration 1:
Personal Best:  [[0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 1 1 1]
 [0 1 0 1 0 1 1 0 0 1 0 1 0 0 0 1 1 0 1 0 1 1 1 0 1 0 0 1 1 0]
 [0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0]
 [1 0 1 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0 1]
 [1 1 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 1 1 0 0 1]
 [1 1 0 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 1 1 0 1 0 1 1 1 1 0 0 1]
 [0 1 1 1 0 0 1 0 1 1 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 1 0 0 0 0]
 [1 1 0 0 1 1 1 0 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0 0 0 1 0 0 1]
 [1 1 0 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 1]
 [0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 1 1 1 0 1 1 1 0 1 1 0 0]]
Best Feature Subset: [1 1 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 1 1 0 0 1]
Best Fitness Value: 0.9649122807017544
--------------------------------------------
Iteration 2:
Personal Best:  [[0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 1 1 1]
 [0 1 0 0 1 1 0 0 1 0 