In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import time

# Mount Google Drive so the preprocessed CSV files are reachable from Colab
from google.colab import drive
drive.mount('/content/drive')

train_path = '/content/drive/My Drive/wireless/NF-BOT-IOT_train_preprocessed.csv'
test_path = '/content/drive/My Drive/wireless/NF-BOT-IOT_test_preprocessed.csv'

# Read each split and discard the rows whose 'label' value is missing
train_data = pd.read_csv(train_path).dropna(subset=['label'])
test_data = pd.read_csv(test_path).dropna(subset=['label'])

# Separate the target column from the predictor columns
y_train = train_data['label']
X_train = train_data.drop(columns=['label'])
y_test = test_data['label']
X_test = test_data.drop(columns=['label'])

# Define the Crow Search Algorithm class
class CrowSearchAlgorithm:
    """Binary, greedy Crow Search variant used as a wrapper feature selector.

    Each crow is a boolean mask over the columns of ``X``. A crow's move is
    kept only when it improves that crow's own fitness, and every crow that
    is not making a random move is pulled towards the current best mask.

    Args:
        population_size: Number of crows (candidate masks); must be >= 1.
        max_iter: Number of passes over the whole population.
        fl: Flight length; per-feature probability used to build the random
            flip mask of a move.
        ap: Awareness probability; chance that a crow makes a purely random
            move instead of following the best crow.
        random_state: Optional integer seed. ``None`` (the default) keeps
            drawing from NumPy's global random stream, so code that seeds
            with ``np.random.seed`` behaves exactly as before.
    """

    def __init__(self, population_size=5, max_iter=3, fl=0.5, ap=0.1, random_state=None):
        self.population_size = population_size
        self.max_iter = max_iter
        self.fl = fl  # Flight length
        self.ap = ap  # Awareness probability
        self.random_state = random_state

    def fit(self, X, y):
        """Search for the feature mask with the highest fitness.

        Args:
            X: pandas DataFrame of candidate features (columns are selected
                positionally through ``X.iloc``).
            y: Target values forwarded unchanged to ``fitness``.

        Returns:
            A ``(mask, execution_time)`` tuple: ``mask`` is a boolean NumPy
            array with one entry per column of ``X`` and ``execution_time``
            is the elapsed wall-clock time in seconds.

        Raises:
            ValueError: If ``population_size`` is smaller than 1.
        """
        if self.population_size < 1:
            raise ValueError("population_size must be at least 1")

        start_time = time.time()
        # A dedicated generator makes a run repeatable without touching the
        # global NumPy state; with no seed the global stream is used.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        num_features = X.shape[1]
        positions = rng.randint(0, 2, size=(self.population_size, num_features), dtype=bool)
        fitness_memory = np.array(
            [self.fitness(X.iloc[:, mask], y) for mask in positions], dtype=float
        )
        best_idx = int(np.argmax(fitness_memory))

        for _ in range(self.max_iter):
            for i in range(self.population_size):
                # Draw order (scalar first, then the per-feature vector) is
                # kept fixed so seeded runs stay comparable across versions.
                random_move = rng.rand() < self.ap
                flips = rng.rand(num_features) < self.fl
                if random_move:
                    new_pos = positions[i] ^ flips
                else:
                    # Follow the best crow: combine its mask with the flips.
                    new_pos = positions[i] ^ (positions[best_idx] ^ flips)

                new_fit = self.fitness(X.iloc[:, new_pos], y)
                # Greedy acceptance: a crow never moves to a worse mask.
                if new_fit > fitness_memory[i]:
                    positions[i] = new_pos
                    fitness_memory[i] = new_fit
                    if new_fit > fitness_memory[best_idx]:
                        best_idx = i

        execution_time = time.time() - start_time
        # Copy so the caller does not hold a view into the population array.
        return positions[best_idx].copy(), execution_time

    def fitness(self, X_subset, y):
        """Score a feature subset with a scaled logistic-regression model.

        The model is fitted and scored on the same data, so the value is a
        training accuracy, not a held-out estimate.

        Args:
            X_subset: DataFrame restricted to the candidate features.
            y: Target values.

        Returns:
            The accuracy as a float, or ``0.0`` when ``X_subset`` is empty.
        """
        if X_subset.empty:
            return 0.0
        clf = make_pipeline(StandardScaler(), LogisticRegression(solver='lbfgs', max_iter=1000))
        clf.fit(X_subset, y)
        return accuracy_score(y, clf.predict(X_subset))

# Run the Crow Search feature selection on the training split
csa = CrowSearchAlgorithm()
best_features_mask, exec_time = csa.fit(X_train, y_train)
# Translate the boolean mask into the matching column names
best_features = X_train.columns[best_features_mask]

# Report the chosen features and record them in a one-row CSV summary
print("Selected features by CSA:", best_features.tolist())
filename = "NF-BOT-IOT_CSA_features.csv"
header_line = "Algorithm,Execution Time,Number of Features,Feature Names\n"
# Feature names are space-separated so they stay inside one CSV field
result_line = f"CSA,{exec_time},{len(best_features)},{' '.join(best_features)}\n"
with open(filename, 'w') as fh:
    fh.write(header_line)
    fh.write(result_line)

# Keep only the selected columns in both splits
X_train_selected = X_train.loc[:, best_features]
X_test_selected = X_test.loc[:, best_features]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Selected features by CSA: ['L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS']
