In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Dataset configuration. `filename` is the dataset stem; it is used for the
# input CSV name below and for output file names built later in the notebook.
filename = "NF-BOT-IOT"
DATA_DIR = '/content/drive/MyDrive/Wireless dataset'
OPT_TRAIN_FRACTION = 0.01  # share of the training rows handed to the feature search
SPLIT_SEED = 7             # fixes which rows end up in that sample

# Load the preprocessed training split. The path is derived from `filename`
# so the input file and the name stem cannot drift apart.
train_data = pd.read_csv(f"{DATA_DIR}/{filename}_train_preprocessed.csv", sep=',', encoding='utf-8')

# Separate the feature matrix from the target column.
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']

# Keep only a small random sample for the feature search; the complementary
# part returned by train_test_split is discarded.
X_train_opt, _, y_train_opt, _ = train_test_split(
    X_train, y_train, train_size=OPT_TRAIN_FRACTION, random_state=SPLIT_SEED
)

# Define custom Human Learning Optimization (HLO) algorithm for feature selection
class HLOFeatureSelection:
    """Randomised feature-subset search scored with a random forest.

    Every iteration draws ``num_candidates`` random column subsets, scores each
    one by fitting a ``RandomForestClassifier`` on one part of the data and
    measuring accuracy on a held-out part, and keeps the best subset seen so
    far across all iterations.

    Note: candidates are drawn independently in every iteration; scores from
    earlier iterations do not influence later draws.

    Parameters
    ----------
    num_iterations : int, default=5
        Number of rounds of candidate generation and scoring.
    num_candidates : int, default=10
        Number of random feature subsets drawn per round.
    num_top_candidates : int, default=5
        How many of the best-scoring subsets of a round may replace the
        current best subset.
    validation_size : float or int, default=0.3
        Passed as ``test_size`` to ``train_test_split``; the held-out rows are
        used only for scoring, never for fitting.
    random_state : int or None, default=None
        Seed for candidate generation, the hold-out split and the forests.
        With ``None`` the global ``np.random`` stream is used for candidate
        generation (so ``np.random.seed`` still applies) and the split and
        forests are unseeded.

    Attributes
    ----------
    selected_features : ndarray of int or None
        Positional column indices of the best subset found by ``fit``;
        ``None`` before ``fit`` or when no candidate was evaluated.
    best_score_ : float or None
        Held-out accuracy of ``selected_features``.
    """

    def __init__(self, num_iterations=5, num_candidates=10, num_top_candidates=5,
                 validation_size=0.3, random_state=None):
        self.num_iterations = num_iterations
        self.num_candidates = num_candidates
        self.num_top_candidates = num_top_candidates
        self.validation_size = validation_size
        self.random_state = random_state
        self.selected_features = None
        self.best_score_ = None

    def fit(self, X, y):
        """Search for the feature subset with the best held-out accuracy.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix; columns are addressed positionally via ``iloc``.
        y : array-like
            Target labels aligned with the rows of ``X``.

        Returns
        -------
        HLOFeatureSelection
            ``self``, with ``selected_features`` and ``best_score_`` set.
        """
        num_features = X.shape[1]
        rng = self._make_rng()
        # Split once so every candidate in every iteration is scored on the
        # same held-out rows and the scores are directly comparable.
        split = self._holdout_split(X, y)

        best_feature_set = None
        best_accuracy = None

        for _ in range(self.num_iterations):
            candidates = self._generate_candidates(num_features, rng)
            ranked = self._rank_candidates(candidates, *split)

            # Reuse the scores computed during ranking instead of training a
            # second forest for each top candidate.
            for candidate_set, accuracy in ranked[:self.num_top_candidates]:
                if best_accuracy is None or accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_feature_set = candidate_set

        self.selected_features = best_feature_set
        self.best_score_ = best_accuracy
        return self

    def _make_rng(self):
        """Return the random source used to draw candidate subsets."""
        if self.random_state is None:
            # Fall back to the global stream so np.random.seed() keeps working.
            return np.random
        return np.random.RandomState(self.random_state)

    def _holdout_split(self, X, y):
        """Split ``(X, y)`` into ``X_fit, X_val, y_fit, y_val``."""
        return train_test_split(
            X, y, test_size=self.validation_size, random_state=self.random_state
        )

    def _score_subset(self, candidate_set, X_fit, X_val, y_fit, y_val):
        """Fit a forest on the chosen columns and return held-out accuracy."""
        clf = RandomForestClassifier(random_state=self.random_state)
        clf.fit(X_fit.iloc[:, candidate_set], y_fit)
        return accuracy_score(y_val, clf.predict(X_val.iloc[:, candidate_set]))

    def _rank_candidates(self, candidate_sets, X_fit, X_val, y_fit, y_val):
        """Return ``(candidate_set, accuracy)`` pairs sorted best-first."""
        scored = [
            (candidate_set, self._score_subset(candidate_set, X_fit, X_val, y_fit, y_val))
            for candidate_set in candidate_sets
        ]
        # Sort on the score only; the index arrays themselves are not orderable.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored

    def _generate_candidates(self, num_features, rng=None):
        """Draw ``num_candidates`` random subsets of ``range(num_features)``.

        Each subset has a random size between 1 and ``num_features`` and
        contains no repeated index.

        Raises
        ------
        ValueError
            If ``num_features`` is smaller than 1.
        """
        if num_features < 1:
            raise ValueError("X must have at least one feature column")
        if rng is None:
            rng = self._make_rng()

        candidate_sets = []
        for _ in range(self.num_candidates):
            subset_size = rng.randint(1, num_features + 1)  # upper bound is exclusive
            candidate_sets.append(rng.choice(num_features, subset_size, replace=False))
        return candidate_sets

    def _select_top_candidates(self, candidate_sets, X, y):
        """Return the ``num_top_candidates`` best subsets by held-out accuracy."""
        ranked = self._rank_candidates(candidate_sets, *self._holdout_split(X, y))
        return [candidate_set for candidate_set, _ in ranked[:self.num_top_candidates]]

# Build the selector and fit it on the small optimisation sample in one step
# (fit returns the selector itself).
hlo = HLOFeatureSelection(
    num_iterations=5,
    num_candidates=10,
    num_top_candidates=5,
).fit(X_train_opt, y_train_opt)

# Translate the winning positional indices back into column labels.
selected_feature_indices = hlo.selected_features
selected_features = X_train_opt.columns[selected_feature_indices]
num_selected_features = selected_features.size

# Write the chosen column names to a CSV in the current working directory.
feature_name = f"{filename}_HLO_feature.csv"
pd.Series(selected_features).to_csv(feature_name, header=True, index=False)

# Report how many features were kept and which ones.
print("Number of selected features:", num_selected_features)
print("Selected Features:", selected_features.tolist())  # plain list reads better than an Index repr

# Restrict the full training matrix to the chosen columns.
X_train_selected = X_train.loc[:, selected_features]

# Preview the reduced frame.
X_train_selected.head()


Number of selected features: 9
Selected Features: ['PROTOCOL', 'OUT_BYTES', 'IN_BYTES', 'OUT_PKTS', 'L4_DST_PORT', 'L4_SRC_PORT', 'IN_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']


Unnamed: 0,PROTOCOL,OUT_BYTES,IN_BYTES,OUT_PKTS,L4_DST_PORT,L4_SRC_PORT,IN_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS
0,-0.23334,-0.00795,-0.01864,0.001374,-0.331298,0.165107,-0.022356,0.629846,0.495722
1,-0.23334,-0.008504,-0.019469,-0.024643,-0.565481,-3.856688,-0.04665,-0.235788,0.495841
2,-0.23334,-0.00795,-0.01864,0.001374,-0.331298,0.194481,-0.022356,0.629846,0.49572
3,4.057829,-0.008553,-0.019494,-0.029847,2.671561,0.306015,-0.04665,-2.709027,-2.083921
4,-0.23334,-0.00795,-0.01864,0.001374,-0.331298,0.151803,-0.022356,0.629846,0.495721
