In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time

# Define custom Henry Gas Solubility Optimization algorithm for feature selection
class HenryGasSolubilityOptimization:
    """Random-search feature selector scored with a random forest.

    Every iteration draws ``num_agents`` random boolean feature masks, scores
    each one with :meth:`_evaluate`, and keeps the best mask seen so far.

    NOTE(review): despite the class name, no solubility/position update rule
    is applied here -- candidates are drawn independently at random.

    Parameters
    ----------
    num_agents : int
        Number of candidate masks drawn per iteration.
    max_iter : int
        Number of iterations.
    random_state : int or None
        Seed for mask generation and for the classifier. With ``None`` the
        masks are drawn from the global NumPy RNG, so ``np.random.seed``
        keeps working as before.

    Attributes
    ----------
    selected_features : numpy.ndarray of bool
        Best mask found by :meth:`fit` (one entry per column of ``X``).
    execution_time : float
        Wall-clock seconds spent in :meth:`fit`.
    num_selected_features : int
        Number of ``True`` entries in ``selected_features``.
    """

    # Fraction of rows held out for scoring each candidate mask.
    _VALIDATION_FRACTION = 0.3

    def __init__(self, num_agents=1, max_iter=2, random_state=None):
        self.num_agents = num_agents
        self.max_iter = max_iter
        self.random_state = random_state

    def fit(self, X, y):
        """Search for the feature mask with the best hold-out accuracy.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like
            Feature matrix; columns are the candidate features.
        y : array-like
            Target labels, one per row of ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` has no columns.
        """
        num_features = X.shape[1]
        if num_features < 1:
            raise ValueError("X must contain at least one feature column")

        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the system clock is adjusted while fitting.
        start_time = time.perf_counter()

        # Fall back to the global NumPy RNG when unseeded so that callers who
        # rely on np.random.seed() keep their reproducibility.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        best_feature_set = self._random_mask(rng, num_features)
        best_accuracy = self._evaluate(X, y, best_feature_set)

        for _ in range(self.max_iter):
            # Generate new candidate solutions
            candidate_feature_sets = [
                self._random_mask(rng, num_features)
                for _ in range(self.num_agents)
            ]

            # Evaluate candidate solutions, keeping the first best on ties
            for candidate in candidate_feature_sets:
                accuracy = self._evaluate(X, y, candidate)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_feature_set = candidate

        self.selected_features = best_feature_set
        self.execution_time = time.perf_counter() - start_time
        self.num_selected_features = int(np.sum(best_feature_set))
        return self

    def _random_mask(self, rng, num_features):
        """Draw a random boolean mask that selects at least one feature."""
        mask = rng.choice([True, False], size=num_features)
        if not mask.any():
            # An empty selection cannot be fitted; force one random feature on.
            mask[rng.randint(num_features)] = True
        return mask

    def _evaluate(self, X, y, selected_features):
        """Return hold-out accuracy of a random forest on the masked columns.

        The classifier is fitted on one part of the data and scored on the
        remaining rows; scoring on the rows it was fitted on would rate almost
        every mask as near-perfect.
        """
        if hasattr(X, "iloc"):
            X_subset = X.iloc[:, selected_features]
        else:
            X_subset = np.asarray(X)[:, selected_features]

        # Use one fixed split so that every candidate is scored on the same
        # rows and accuracies are directly comparable.
        split_seed = 0 if self.random_state is None else self.random_state
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_subset,
            y,
            test_size=self._VALIDATION_FRACTION,
            random_state=split_seed,
        )

        clf = RandomForestClassifier(random_state=self.random_state)
        clf.fit(X_fit, y_fit)
        return accuracy_score(y_val, clf.predict(X_val))

# Load your dataset
# Dataset name; used to build both the input paths and the output file name
filename = "NF-BOT-IOT"
data_dir = '/content/drive/MyDrive'
train_data = pd.read_csv(f'{data_dir}/{filename}_train_preprocessed.csv', sep=',', encoding='utf-8')
test_data = pd.read_csv(f'{data_dir}/{filename}_test_preprocessed.csv', sep=',', encoding='utf-8')

# Prepare training data
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']

# Prepare test data
X_test = test_data.drop(columns=['label'])
y_test = test_data['label']

# Instantiate and run the Henry Gas Solubility Optimization algorithm for feature selection on training data
henry_gas_optimization = HenryGasSolubilityOptimization(num_agents=1, max_iter=2)
henry_gas_optimization.fit(X_train, y_train)

# Translate the boolean mask found on the training data into column names
selected_features_train = X_train.columns[henry_gas_optimization.selected_features]

# Print number of selected features and list them for the training data
num_selected_features_train = henry_gas_optimization.num_selected_features
print("Number of selected features (training data):", num_selected_features_train)
print("Selected Features (training data):", selected_features_train.tolist())

# Use selected features to filter columns in X_train
X_train_selected = X_train[selected_features_train]

# Output the dataframe with selected features for the training data
print("DataFrame with selected features for training data:")
print(X_train_selected.head())

# Apply the same features to the test data BY NAME. Re-applying the positional
# mask to X_test.columns would silently pick different features if the test
# file's column order differed from the training file's; selecting by name
# raises a KeyError instead if a selected feature is missing from the test set.
selected_features_test = selected_features_train.copy()

# Print number of selected features and list them for the test data
num_selected_features_test = len(selected_features_test)
print("Number of selected features (test data):", num_selected_features_test)
print("Selected Features (test data):", selected_features_test.tolist())

# Use selected features to filter columns in X_test
X_test_selected = X_test[selected_features_test]

# Output the dataframe with selected features for the test data
print("DataFrame with selected features for test data:")
print(X_test_selected.head())

# Create a one-row DataFrame summarising the optimization run
optimization_results = pd.DataFrame({
    "Optimization": ["HenryGasSolubilityOptimization"],
    "Execution Time of Optimizer": [henry_gas_optimization.execution_time],
    "No of Feature Selected": [num_selected_features_train],
    "Selected Feature": [', '.join(selected_features_train.tolist())],
})

# Save optimization results to a CSV file
optimization_results.to_csv(f'{filename}_HenryGasSolubilityOptimization_feature.csv', index=False)


Number of selected features (training data): 6
Selected Features (training data): ['L4_SRC_PORT', 'L4_DST_PORT', 'OUT_BYTES', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']
DataFrame with selected features for training data:
   L4_SRC_PORT  L4_DST_PORT  OUT_BYTES  OUT_PKTS  TCP_FLAGS  \
0     0.165107    -0.331298  -0.007950  0.001374   0.629846   
1    -3.856688    -0.565481  -0.008504 -0.024643  -0.235788   
2     0.194481    -0.331298  -0.007950  0.001374   0.629846   
3     0.306015     2.671561  -0.008553 -0.029847  -2.709027   
4     0.151803    -0.331298  -0.007950  0.001374   0.629846   

   FLOW_DURATION_MILLISECONDS  
0                    0.495722  
1                    0.495841  
2                    0.495720  
3                   -2.083921  
4                    0.495721  
Number of selected features (test data): 6
Selected Features (test data): ['L4_SRC_PORT', 'L4_DST_PORT', 'OUT_BYTES', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']
DataFrame with selected