In [37]:
import csv
import time

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Dataset identifier; reused further down to name the output artifact
filename = "NF-BOT-IOT"

# Read the preprocessed train and test splits from disk
train_data = pd.read_csv(
    '/content/sample_data/NF-BOT-IOT_train_preprocessed.csv',
    sep=',',
    encoding='utf-8',
)
test_data = pd.read_csv(
    '/content/sample_data/NF-BOT-IOT_test_preprocessed.csv',
    sep=',',
    encoding='utf-8',
)

# Split each frame into the 'label' target and the remaining predictor columns
y_train = train_data['label']
X_train = train_data.drop(columns=['label'])
y_test = test_data['label']
X_test = test_data.drop(columns=['label'])

# Define custom Bat Algorithm (BA) algorithm for feature selection
class BAFeatureSelection:
    """Wrapper-style feature selection driven by a binary Bat Algorithm.

    Every bat is a boolean mask over the feature columns. On each step a bat
    proposes a new mask (its current mask shifted by its velocity, or a fresh
    random mask), the proposal is scored, and the best-scoring mask seen so
    far is remembered.

    Parameters
    ----------
    num_bats : int
        Number of candidate masks evolved in parallel.
    max_iter : int
        Number of sweeps over the whole population.
    loudness : float
        Stored for API compatibility; the search loop does not currently
        consult it.
    pulse_rate : float
        A proposal is replaced by a uniformly random mask whenever a uniform
        draw in [0, 1) exceeds this value.
    alpha : float
        Step size of the velocity pull toward the best mask found so far.
    gamma : float
        Scale of the zero-centred uniform noise added to the velocity.
    random_state : int or None, keyword-only
        Seed for a private ``numpy.random.RandomState``. ``None`` (default)
        draws from NumPy's global random state, as before.
    fitness_func : callable or None, keyword-only
        ``fitness_func(X_subset, y) -> float``; larger is better. ``None``
        (default) scores a subset by the training accuracy of a liblinear
        ``LogisticRegression`` fitted on it.

    Attributes
    ----------
    selected_features : numpy.ndarray
        Column positions of the best mask (empty if nothing was evaluated).
    best_accuracy : float or None
        Fitness of the best mask, or ``None`` if nothing was evaluated.
    exec_time : float
        Wall-clock seconds spent inside ``fit``.
    """

    def __init__(self, num_bats=2, max_iter=5, loudness=0.5, pulse_rate=0.5,
                 alpha=0.5, gamma=0.5, *, random_state=None, fitness_func=None):
        self.num_bats = num_bats
        self.max_iter = max_iter
        self.loudness = loudness
        self.pulse_rate = pulse_rate
        self.alpha = alpha
        self.gamma = gamma
        self.random_state = random_state
        self.fitness_func = fitness_func

    @staticmethod
    def _training_accuracy(X_subset, y):
        """Default fitness: training accuracy of a liblinear logistic regression."""
        clf = LogisticRegression(solver='liblinear')
        clf.fit(X_subset, y)
        return accuracy_score(y, clf.predict(X_subset))

    @staticmethod
    def _take_columns(X, column_indices):
        """Return the given column positions of a DataFrame or array-like."""
        if hasattr(X, "iloc"):
            return X.iloc[:, column_indices]
        return np.asarray(X)[:, column_indices]

    def fit(self, X, y):
        """Search for a feature mask that maximises the fitness function.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like
            Samples in rows, features in columns.
        y : array-like
            Targets passed through to the fitness function.

        Returns
        -------
        BAFeatureSelection
            ``self``, with ``selected_features``, ``best_accuracy`` and
            ``exec_time`` populated.
        """
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        # None keeps the historical behaviour of drawing from the global state.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        if self.fitness_func is None:
            evaluate = self._training_accuracy
        else:
            evaluate = self.fitness_func

        num_features = X.shape[1]
        best_feature_set = None
        # Start below any real score so that a first fitness of 0.0 still
        # registers as the best solution seen so far.
        best_accuracy = -np.inf

        # Initialize bat positions and velocities
        bat_positions = rng.randint(0, 2, size=(self.num_bats, num_features), dtype=bool)
        bat_velocities = np.zeros((self.num_bats, num_features), dtype=float)

        for _ in range(self.max_iter):
            for i in range(self.num_bats):
                # Move the bat, then binarise: a feature is on only when the
                # shifted position crosses the midpoint, so a tiny velocity
                # can no longer switch a feature on.
                new_position = (bat_positions[i] + bat_velocities[i]) > 0.5

                # Occasionally discard the move and jump to a random mask
                if rng.rand() > self.pulse_rate:
                    new_position = rng.randint(0, 2, size=num_features, dtype=bool)

                # Evaluate the fitness of the new solution
                selected_features = np.flatnonzero(new_position)
                if selected_features.size == 0:
                    continue  # Skip if no features are selected
                accuracy = evaluate(self._take_columns(X, selected_features), y)

                # Update best solution found
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_feature_set = new_position.copy()

                # Update bat position and velocity
                bat_positions[i] = new_position
                if best_feature_set is not None:
                    # Signed difference: pulls a feature on where the best
                    # mask has it, and off where the best mask lacks it.
                    bat_velocities[i] += self.alpha * (
                        best_feature_set.astype(float) - new_position.astype(float)
                    )
                bat_velocities[i] += self.gamma * (rng.rand(num_features) - 0.5)

        self.exec_time = time.perf_counter() - start_time
        if best_feature_set is None:
            # No proposal was ever scored (every mask was empty or unrankable)
            self.best_accuracy = None
            self.selected_features = np.empty(0, dtype=np.intp)
        else:
            self.best_accuracy = best_accuracy
            self.selected_features = np.flatnonzero(best_feature_set)
        return self

# Instantiate and run the BA algorithm for feature selection
ba = BAFeatureSelection(num_bats=2, max_iter=5)
ba.fit(X_train, y_train)

# Map the selected column positions back to feature names
selected_feature_indices = ba.selected_features
selected_features = X_train.columns[selected_feature_indices]
num_selected_features = len(selected_features)

# Save selected features to a CSV file
feature_name = filename + "_BA_features.csv"
optimizer_name = "BA"
execution_time = ba.exec_time

# Write CSV file with optimizer name, execution time, number of selected
# features, and their names. csv.writer quotes the comma-joined name list and
# escapes any embedded double quotes, which the hand-built row did not.
with open(feature_name, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, lineterminator='\n')
    writer.writerow(
        ["optimizer", "execution time", "num of selected features", "selected features"]
    )
    writer.writerow(
        [
            optimizer_name,
            execution_time,
            num_selected_features,
            ",".join(map(str, selected_features)),
        ]
    )

# Print number of selected features and their names
print("Number of selected features:", num_selected_features)
print("Selected features:", selected_features.tolist())

# Filter both training and testing data with selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]
# Trailing bare expression: in a notebook this renders the filtered test frame
X_test_selected



Number of selected features: 8
Selected features: ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS']




Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS
0,-1.114900,-0.559883,-0.23334,-0.036608,-0.008553,-0.046650,-0.029847,-0.235788
1,1.108007,0.110072,-0.23334,-0.252892,-0.008504,-0.046650,-0.024643,0.011536
2,-0.772350,-0.476342,-0.23334,-0.252892,-0.008504,-0.046650,-0.024643,0.011536
3,1.185416,-0.559883,-0.23334,-0.036608,-0.006928,-0.030454,-0.009033,0.629846
4,0.006402,1.059842,-0.23334,-0.252892,-0.008504,-0.046650,-0.024643,0.011536
...,...,...,...,...,...,...,...,...
178608,1.493495,-0.421145,-0.23334,-0.252892,-0.008553,-0.046650,-0.029847,-2.461703
178609,-0.512046,0.029649,-0.23334,-0.252892,-0.008504,-0.046650,-0.024643,0.011536
178610,1.023946,-0.352909,-0.23334,-0.252892,-0.008553,-0.046650,-0.029847,-2.461703
178611,0.083552,-0.147423,-0.23334,2.497003,-0.008504,-0.046650,-0.024643,0.011536
