In [7]:
import csv
import time

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Dataset identifier; also used as the prefix of the exported feature file
filename = "NF-BOT-IOT"

# Read the preprocessed train and test splits
train_data = pd.read_csv(
    '/content/drive/MyDrive/CS548 Wireless Project/NF-BOT-IOT_train_preprocessed.csv',
    sep=',',
    encoding='utf-8',
)
test_data = pd.read_csv(
    '/content/drive/MyDrive/CS548 Wireless Project/NF-BOT-IOT_test_preprocessed.csv',
    sep=',',
    encoding='utf-8',
)

# Separate the 'label' target column from the remaining feature columns
y_train = train_data['label']
y_test = test_data['label']
X_train = train_data.drop(columns=['label'])
X_test = test_data.drop(columns=['label'])

# Define custom Pathfinder algorithm for feature selection
class PathfinderAlgorithm:
    """Population-based random search for a feature subset.

    Each "pathfinder" is a boolean mask over the feature columns. In every
    iteration each non-empty mask is scored by fitting a fresh estimator on
    the masked columns and scoring it on the same data, so the fitness is a
    training score, not a held-out one. The better half of the population is
    then kept and the other half is redrawn at random.

    Parameters
    ----------
    num_pathfinders : int
        Number of candidate masks in the population.
    max_iter : int
        Number of evaluate/update rounds.
    random_state : int or None
        Seed for a private ``numpy.random.RandomState``. With ``None`` (the
        default) the global ``numpy.random`` state is used.
    estimator_factory : callable or None
        Zero-argument callable returning a new, unfitted estimator that
        provides ``fit(X, y)`` and ``score(X, y)`` (higher is better). With
        ``None`` (the default) ``LogisticRegression(solver='liblinear')`` is
        used.

    Attributes set by ``fit``
    -------------------------
    selected_features : numpy.ndarray
        Column positions of the best mask found; empty if no mask was ever
        evaluated.
    best_accuracy : float
        Score of that mask (0.0 if no mask was ever evaluated).
    exec_time : float
        Wall-clock duration of ``fit`` in seconds.
    """

    def __init__(self, num_pathfinders=10, max_iter=5, random_state=None,
                 estimator_factory=None):
        self.num_pathfinders = num_pathfinders
        self.max_iter = max_iter
        self.random_state = random_state
        self.estimator_factory = estimator_factory

    @staticmethod
    def _take_columns(X, column_indices):
        """Return the columns of ``X`` at the given integer positions."""
        if hasattr(X, "iloc"):
            # Positional selection keeps the result a DataFrame
            return X.iloc[:, column_indices]
        return np.asarray(X)[:, column_indices]

    def _make_estimator(self):
        """Build a fresh, unfitted estimator for one fitness evaluation."""
        if self.estimator_factory is not None:
            return self.estimator_factory()
        return LogisticRegression(solver='liblinear')

    def fit(self, X, y):
        """Search for the feature subset with the highest training score.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D numpy.ndarray
            Feature matrix with one column per candidate feature.
        y : array-like
            Target labels, passed unchanged to the estimator.

        Returns
        -------
        PathfinderAlgorithm
            ``self``, with ``selected_features``, ``best_accuracy`` and
            ``exec_time`` populated.
        """
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments
        start_time = time.perf_counter()

        # The numpy.random module and RandomState both expose randint, so
        # the default path keeps drawing from the global state
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        num_features = X.shape[1]
        num_elites = self.num_pathfinders // 2
        best_feature_set = None
        best_accuracy = 0.0

        # Initialize random positions of pathfinders
        positions = rng.randint(
            0, 2, size=(self.num_pathfinders, num_features), dtype=bool
        )

        for _ in range(self.max_iter):
            # Evaluate the fitness of each pathfinder's solution
            fitness_values = np.zeros(self.num_pathfinders)
            for i in range(self.num_pathfinders):
                selected_features = np.where(positions[i])[0]
                if len(selected_features) == 0:
                    continue  # An empty mask cannot be fitted; fitness stays 0

                clf = self._make_estimator()
                X_subset = self._take_columns(X, selected_features)
                clf.fit(X_subset, y)
                accuracy = clf.score(X_subset, y)
                fitness_values[i] = accuracy

                # The first evaluated mask always becomes the incumbent, even
                # with a score of 0; later masks must strictly improve on it
                if best_feature_set is None or accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_feature_set = selected_features

            # Keep the better half, redraw the rest. Indexing with an index
            # array copies the rows, so the elites are snapshotted before any
            # row of `positions` is overwritten.
            ranking = np.argsort(fitness_values)[::-1]
            positions[:num_elites] = positions[ranking[:num_elites]]
            for i in range(num_elites, self.num_pathfinders):
                positions[i] = rng.randint(0, 2, size=num_features, dtype=bool)

        if best_feature_set is None:
            # Nothing was ever evaluated (e.g. no features or no iterations):
            # expose an empty selection instead of None
            best_feature_set = np.empty(0, dtype=np.intp)

        self.exec_time = time.perf_counter() - start_time
        self.selected_features = best_feature_set
        self.best_accuracy = best_accuracy
        return self

# Instantiate and run the Pathfinder Algorithm for feature selection
pathfinder = PathfinderAlgorithm()
pathfinder.fit(X_train, y_train)

# Map the selected column positions back to column names
selected_feature_indices = pathfinder.selected_features
if selected_feature_indices is None:
    # Guard against a search that produced no selection at all
    selected_feature_indices = np.empty(0, dtype=int)
selected_features = X_train.columns[selected_feature_indices]
num_selected_features = len(selected_features)

# Save selected features to a CSV file
feature_name = filename + "_Pathfinder_features.csv"
optimizer_name = "Pathfinder Algorithm"
execution_time = pathfinder.exec_time

# Write CSV file with optimizer name, execution time, number of selected features, and their names.
# csv.writer takes care of quoting, so feature names containing commas or
# quotes cannot corrupt the row; the names are stored comma-joined in one field.
with open(feature_name, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, lineterminator='\n')
    writer.writerow(
        ["optimizer", "execution time", "num of selected features", "selected features"]
    )
    writer.writerow(
        [
            optimizer_name,
            execution_time,
            num_selected_features,
            # str() so non-string column labels do not break the join
            ",".join(map(str, selected_features)),
        ]
    )

# Print number of selected features and their names
print("Number of selected features:", num_selected_features)
print("Selected features:", selected_features.tolist())

# Filter both training and testing data with selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]


Number of selected features: 6
Selected features: ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_PKTS', 'TCP_FLAGS']
