In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time

# Load your dataset
filename = "NF-BOT-IOT"
train_data = pd.read_csv('/content/drive/MyDrive/dataset/NF-BOT-IOT_train_preprocessed.csv', sep=',', encoding='utf-8')
test_data = pd.read_csv('/content/drive/MyDrive/dataset/NF-BOT-IOT_test_preprocessed.csv', sep=',', encoding='utf-8')

# Prepare training data
X_train = train_data.drop(columns=['label'], axis=1)
y_train = train_data['label']

# Prepare test data
X_test = test_data.drop(columns=['label'], axis=1)
y_test = test_data['label']

# Split a small portion of training data for optimization (Symbiotic Organisms Search)
X_t, _, y_t, _ = train_test_split(X_train, y_train, train_size=0.01, random_state=7)
X_test_t, _, y_test_t, _ = train_test_split(X_test, y_test, train_size=0.01, random_state=7)

# Define custom Symbiotic Organisms Search algorithm for feature selection
class SymbioticOrganismsSearch:
    def __init__(self, num_iterations=10):
        self.num_iterations = num_iterations

    def fit(self, X, y):
        start_time = time.time()  # Start the timer
        num_features = X.shape[1]
        mutualism_gain = np.ones(num_features)  # Initialize mutualism gain
        best_feature_set = None
        best_accuracy = 0.0

        for iteration in range(self.num_iterations):
            # Generate a random feature subset based on mutualism gain
            feature_mask = (np.random.rand(num_features) < mutualism_gain)
            selected_features = np.where(feature_mask)[0]

            # Evaluate subset using a simple model (e.g., Random Forest)
            clf = RandomForestClassifier()
            X_subset = X.iloc[:, selected_features]
            clf.fit(X_subset, y)
            accuracy = accuracy_score(y, clf.predict(X_subset))

            # Update mutualism gain based on accuracy
            mutualism_gain[selected_features] += accuracy / num_features

            # Update best solution found
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature_set = selected_features

        end_time = time.time()  # End the timer
        execution_time = end_time - start_time  # Calculate execution time

        self.selected_features = best_feature_set
        self.execution_time = execution_time
        self.num_selected_features = len(best_feature_set) if best_feature_set is not None else 0  # Number of selected features
        return self

# Instantiate and run the Symbiotic Organisms Search algorithm for feature selection on training data
symbiotic_search = SymbioticOrganismsSearch(num_iterations=10)
symbiotic_search.fit(X_t, y_t)

# Apply selected features to training data
selected_features_train = X_train.columns[symbiotic_search.selected_features]

# Print number of selected features and list them for the training data
num_selected_features_train = len(selected_features_train)
print("Number of selected features (training data):", num_selected_features_train)
print("Selected Features (training data):", selected_features_train.tolist())

# Use selected features to filter columns in X_train
X_train_selected = X_train[selected_features_train]

# Output the dataframe with selected features for the training data
print("DataFrame with selected features for training data:")
print(X_train_selected.head())

# Apply selected features to test data
selected_features_test = X_test.columns[symbiotic_search.selected_features]

# Print number of selected features and list them for the test data
num_selected_features_test = len(selected_features_test)
print("Number of selected features (test data):", num_selected_features_test)
print("Selected Features (test data):", selected_features_test.tolist())

# Use selected features to filter columns in X_test
X_test_selected = X_test[selected_features_test]

# Output the dataframe with selected features for the test data
print("DataFrame with selected features for test data:")
print(X_test_selected.head())

# Create a DataFrame for optimization results
optimization_results = pd.DataFrame({
    "Optimization": ["SymbioticOrganismsSearch"],
    "Execution Time of Optimizer": [symbiotic_search.execution_time],
    "No of Feature Selected": [symbiotic_search.num_selected_features],
    "Selected Feature": [', '.join(selected_features_train.tolist())]
})

# Save optimization results to a CSV file
optimization_results.to_csv('NF-BOT-IOT_SymbioticOrganismsSearch_feature.csv', index=False)


Number of selected features (training data): 10
Selected Features (training data): ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']
DataFrame with selected features for training data:
   L4_SRC_PORT  L4_DST_PORT  PROTOCOL  L7_PROTO  IN_BYTES  OUT_BYTES  \
0     0.165107    -0.331298 -0.233340 -0.252892 -0.018640  -0.007950   
1    -3.856688    -0.565481 -0.233340  7.100761 -0.019469  -0.008504   
2     0.194481    -0.331298 -0.233340 -0.252892 -0.018640  -0.007950   
3     0.306015     2.671561  4.057829 -0.252892 -0.019494  -0.008553   
4     0.151803    -0.331298 -0.233340 -0.252892 -0.018640  -0.007950   

    IN_PKTS  OUT_PKTS  TCP_FLAGS  FLOW_DURATION_MILLISECONDS  
0 -0.022356  0.001374   0.629846                    0.495722  
1 -0.046650 -0.024643  -0.235788                    0.495841  
2 -0.022356  0.001374   0.629846                    0.495720  
3 -0.046650 -0.029847  -2.709027  

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time

# Load your dataset
filename = "NF-BOT-IOT"
train_data = pd.read_csv('/content/drive/MyDrive/dataset/NF-BOT-IOT_train_preprocessed.csv', sep=',', encoding='utf-8')
test_data = pd.read_csv('/content/drive/MyDrive/dataset/NF-BOT-IOT_test_preprocessed.csv', sep=',', encoding='utf-8')

# Prepare training data
X_train = train_data.drop(columns=['label'], axis=1)
y_train = train_data['label']

# Prepare test data
X_test = test_data.drop(columns=['label'], axis=1)
y_test = test_data['label']

# Split a small portion of training data for optimization (Tree Growth Algorithm)
X_t, _, y_t, _ = train_test_split(X_train, y_train, train_size=0.01, random_state=7)
X_test_t, _, y_test_t, _ = train_test_split(X_test, y_test, train_size=0.01, random_state=7)

# Define custom Tree Growth Algorithm for feature selection
class TreeGrowthAlgorithm:
    def __init__(self, max_depth=5, num_trees=100):
        self.max_depth = max_depth
        self.num_trees = num_trees

    def fit(self, X, y):
        start_time = time.time()  # Start the timer
        num_features = X.shape[1]
        feature_importance = np.zeros(num_features)  # Initialize feature importance
        best_feature_set = None
        best_accuracy = 0.0

        for tree in range(self.num_trees):
            # Grow a random forest with limited depth
            clf = RandomForestClassifier(max_depth=self.max_depth)
            clf.fit(X, y)

            # Accumulate feature importance across all trees
            feature_importance += clf.feature_importances_

            # Update best solution found
            if max(feature_importance) > best_accuracy:
                best_accuracy = max(feature_importance)
                best_feature_set = np.where(feature_importance > 0)[0]

        end_time = time.time()  # End the timer
        execution_time = end_time - start_time  # Calculate execution time

        self.selected_features = best_feature_set
        self.execution_time = execution_time
        self.num_selected_features = len(best_feature_set) if best_feature_set is not None else 0  # Number of selected features
        return self

# Instantiate and run the Tree Growth Algorithm for feature selection on training data
tree_grower = TreeGrowthAlgorithm(max_depth=5, num_trees=100)
tree_grower.fit(X_t, y_t)

# Apply selected features to training data
selected_features_train = X_train.columns[tree_grower.selected_features]

# Print number of selected features and list them for the training data
num_selected_features_train = len(selected_features_train)
print("Number of selected features (training data):", num_selected_features_train)
print("Selected Features (training data):", selected_features_train.tolist())

# Use selected features to filter columns in X_train
X_train_selected = X_train[selected_features_train]

# Output the dataframe with selected features for the training data
print("DataFrame with selected features for training data:")
print(X_train_selected.head())

# Apply selected features to test data
selected_features_test = X_test.columns[tree_grower.selected_features]

# Print number of selected features and list them for the test data
num_selected_features_test = len(selected_features_test)
print("Number of selected features (test data):", num_selected_features_test)
print("Selected Features (test data):", selected_features_test.tolist())

# Use selected features to filter columns in X_test
X_test_selected = X_test[selected_features_test]

# Output the dataframe with selected features for the test data
print("DataFrame with selected features for test data:")
print(X_test_selected.head())

# Create a DataFrame for optimization results
optimization_results = pd.DataFrame({
    "Optimization": ["TreeGrowthAlgorithm"],
    "Execution Time of Optimizer": [tree_grower.execution_time],
    "No of Feature Selected": [tree_grower.num_selected_features],
    "Selected Feature": [', '.join(selected_features_train.tolist())]
})

# Save optimization results to a CSV file
optimization_results.to_csv('NF-BOT-IOT_TreeGrowthAlgorithm_feature.csv', index=False)


Number of selected features (training data): 10
Selected Features (training data): ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']
DataFrame with selected features for training data:
   L4_SRC_PORT  L4_DST_PORT  PROTOCOL  L7_PROTO  IN_BYTES  OUT_BYTES  \
0     0.165107    -0.331298 -0.233340 -0.252892 -0.018640  -0.007950   
1    -3.856688    -0.565481 -0.233340  7.100761 -0.019469  -0.008504   
2     0.194481    -0.331298 -0.233340 -0.252892 -0.018640  -0.007950   
3     0.306015     2.671561  4.057829 -0.252892 -0.019494  -0.008553   
4     0.151803    -0.331298 -0.233340 -0.252892 -0.018640  -0.007950   

    IN_PKTS  OUT_PKTS  TCP_FLAGS  FLOW_DURATION_MILLISECONDS  
0 -0.022356  0.001374   0.629846                    0.495722  
1 -0.046650 -0.024643  -0.235788                    0.495841  
2 -0.022356  0.001374   0.629846                    0.495720  
3 -0.046650 -0.029847  -2.709027  