In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [None]:
import pandas as pd
import numpy as np
import warnings
import os
import time
import joblib
import matplotlib.pyplot as plt

from math import ceil

from ucimlrepo import fetch_ucirepo

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.impute import KNNImputer

from sklearn.utils import resample

from multiprocessing import Pool, cpu_count

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from scipy.spatial.distance import cosine

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.metrics import classification_report, accuracy_score, make_scorer

from sklearn.exceptions import UndefinedMetricWarning
from sklearn.exceptions import ConvergenceWarning

from warnings import catch_warnings, filterwarnings

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# fetch dataset
rt_iot2022 = fetch_ucirepo(id=942)

# data (as pandas dataframes)
X = rt_iot2022.data.features
y = rt_iot2022.data.targets

In [None]:
# metadata
print(rt_iot2022.metadata)

{'uci_id': 942, 'name': 'RT-IoT2022 ', 'repository_url': 'https://archive.ics.uci.edu/dataset/942/rt-iot2022', 'data_url': 'https://archive.ics.uci.edu/static/public/942/data.csv', 'abstract': 'The RT-IoT2022, a proprietary dataset derived from a real-time IoT infrastructure, is introduced as a comprehensive resource integrating a diverse range of IoT devices and sophisticated network attack methodologies. This dataset encompasses both normal and adversarial network behaviours, providing a general representation of real-world scenarios.\nIncorporating data from IoT devices such as ThingSpeak-LED, Wipro-Bulb, and MQTT-Temp, as well as simulated attack scenarios involving Brute-Force SSH attacks, DDoS attacks using Hping and Slowloris, and Nmap patterns, RT-IoT2022 offers a detailed perspective on the complex nature of network traffic. The bidirectional attributes of network traffic are meticulously captured using the Zeek network monitoring tool and the Flowmeter plugin. Researchers can

In [None]:
# Set display options
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

# variable information
print(rt_iot2022.variables)

integer_count = 0
categorical_count = 0
continuous_count = 0

for i in range(len(rt_iot2022.variables)):  # Corrected the loop to iterate over indices
    var = rt_iot2022.variables.iloc[i]  # Get the variable information at index i
    var_type = var['type']  # Extract the 'type' of the variable

    if var_type == 'Integer':
        integer_count += 1
    elif var_type == 'Categorical':
        categorical_count += 1
    elif var_type == 'Continuous':
        continuous_count += 1

print("Integer variables count:", integer_count)
print("Categorical variables count:", categorical_count)
print("Continuous variables count:", continuous_count)


                        name     role         type demographic description  \
0                  id.orig_p  Feature      Integer        None        None   
1                  id.resp_p  Feature      Integer        None        None   
2                      proto  Feature  Categorical        None        None   
3                    service  Feature   Continuous        None        None   
4              flow_duration  Feature   Continuous        None        None   
5               fwd_pkts_tot  Feature      Integer        None        None   
6               bwd_pkts_tot  Feature      Integer        None        None   
7          fwd_data_pkts_tot  Feature      Integer        None        None   
8          bwd_data_pkts_tot  Feature      Integer        None        None   
9           fwd_pkts_per_sec  Feature   Continuous        None        None   
10          bwd_pkts_per_sec  Feature   Continuous        None        None   
11         flow_pkts_per_sec  Feature   Continuous        None  

In [None]:
# variable information
print(rt_iot2022.variables.columns)
print(rt_iot2022.data.features.shape)

Index(['name', 'role', 'type', 'demographic', 'description', 'units',
       'missing_values'],
      dtype='object')
(123117, 83)


In [None]:
# Set display options
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

# Assuming rt_iot2022.variables is a Pandas DataFrame
print(rt_iot2022.variables)


                        name     role         type demographic description  \
0                  id.orig_p  Feature      Integer        None        None   
1                  id.resp_p  Feature      Integer        None        None   
2                      proto  Feature  Categorical        None        None   
3                    service  Feature   Continuous        None        None   
4              flow_duration  Feature   Continuous        None        None   
5               fwd_pkts_tot  Feature      Integer        None        None   
6               bwd_pkts_tot  Feature      Integer        None        None   
7          fwd_data_pkts_tot  Feature      Integer        None        None   
8          bwd_data_pkts_tot  Feature      Integer        None        None   
9           fwd_pkts_per_sec  Feature   Continuous        None        None   
10          bwd_pkts_per_sec  Feature   Continuous        None        None   
11         flow_pkts_per_sec  Feature   Continuous        None  

In [None]:
print(X.iloc[:, 3].unique())
print(y.iloc[:, 0].unique())

['mqtt' '-' 'http' 'dns' 'ntp' 'ssl' 'dhcp' 'irc' 'ssh' 'radius']
['MQTT_Publish' 'Thing_Speak' 'Wipro_bulb' 'ARP_poisioning'
 'DDOS_Slowloris' 'DOS_SYN_Hping' 'Metasploit_Brute_Force_SSH'
 'NMAP_FIN_SCAN' 'NMAP_OS_DETECTION' 'NMAP_TCP_scan' 'NMAP_UDP_SCAN'
 'NMAP_XMAS_TREE_SCAN']


In [None]:
number_dataset_name = []
categorical_dataset_name = []

for i in rt_iot2022.variables.iterrows():
    if i[1]['type'] == 'Categorical' or i[1]['name'] == 'service':
        categorical_dataset_name.append(i[1]['name'])
    else:
        number_dataset_name.append(i[1]['name'])

In [None]:
max_length = max(len(number_dataset_name), len(categorical_dataset_name))

for i in range(max_length + 1):
    print("number:", i, number_dataset_name[i] if i < len(number_dataset_name) else "-")
    print("categorical:", i, categorical_dataset_name[i] if i < len(categorical_dataset_name) else "-")


number: 0 id.orig_p
categorical: 0 proto
number: 1 id.resp_p
categorical: 1 service
number: 2 flow_duration
categorical: 2 Attack_type
number: 3 fwd_pkts_tot
categorical: 3 -
number: 4 bwd_pkts_tot
categorical: 4 -
number: 5 fwd_data_pkts_tot
categorical: 5 -
number: 6 bwd_data_pkts_tot
categorical: 6 -
number: 7 fwd_pkts_per_sec
categorical: 7 -
number: 8 bwd_pkts_per_sec
categorical: 8 -
number: 9 flow_pkts_per_sec
categorical: 9 -
number: 10 down_up_ratio
categorical: 10 -
number: 11 fwd_header_size_tot
categorical: 11 -
number: 12 fwd_header_size_min
categorical: 12 -
number: 13 fwd_header_size_max
categorical: 13 -
number: 14 bwd_header_size_tot
categorical: 14 -
number: 15 bwd_header_size_min
categorical: 15 -
number: 16 bwd_header_size_max
categorical: 16 -
number: 17 flow_FIN_flag_count
categorical: 17 -
number: 18 flow_SYN_flag_count
categorical: 18 -
number: 19 flow_RST_flag_count
categorical: 19 -
number: 20 fwd_PSH_flag_count
categorical: 20 -
number: 21 bwd_PSH_flag_count


In [None]:
X_number = pd.DataFrame()
X_categorical = pd.DataFrame()

for i in range(len(X.columns)):
    col_name = X.columns[i]
    if col_name in number_dataset_name:
        X_number[col_name] = X[col_name]
    elif col_name in categorical_dataset_name:
        X_categorical[col_name] = X[col_name]

In [None]:
# Set display options
pd.set_option('display.max_rows', 5)  # Show rows
pd.set_option('display.max_columns', None)  # Show columns

# Assuming X_continuous is your DataFrame
print(X_number)
print()
print(X_categorical)


        id.orig_p  id.resp_p  flow_duration  fwd_pkts_tot  bwd_pkts_tot  \
0           38667       1883      32.011598             9             5   
1           51143       1883      31.883584             9             5   
...           ...        ...            ...           ...           ...   
123115      59247      65000       0.000006             1             1   
123116      59247      65129       0.000006             1             1   

        fwd_data_pkts_tot  bwd_data_pkts_tot  fwd_pkts_per_sec  \
0                       3                  3          0.281148   
1                       3                  3          0.282277   
...                   ...                ...               ...   
123115                  0                  0     167772.160000   
123116                  0                  0     167772.160000   

        bwd_pkts_per_sec  flow_pkts_per_sec  down_up_ratio  \
0               0.156193           0.437341       0.555556   
1               0.156821    

In [None]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical columns using Z-score normalization
X_number_scaled = scaler.fit_transform(X_number)

# Convert the scaled array back to a DataFrame with the original column names
X_number_scaled_df = pd.DataFrame(X_number_scaled, columns=X_number.columns)

# Print unique values for each column in the scaled DataFrame
for column in X_number_scaled_df.columns:
    unique_values = X_number_scaled_df[column].unique()
    print(f"Unique values in scaled column '{column}': {unique_values}")

# Print values in the column 'X' of the scaled DataFrame
print(X_number_scaled_df['bwd_URG_flag_count'])

prepared_X_number = X_number_scaled_df.drop('bwd_URG_flag_count', axis=1)

Unique values in scaled column 'id.orig_p': [ 0.21120223  0.8654049   0.53075265 ...  1.57162496 -0.31023022
 -0.31101677]
Unique values in scaled column 'id.resp_p': [ 0.16526578 -0.17774786 -0.18288451 ...  8.4984237   9.16314343
 10.96134919]
Unique values in scaled column 'flow_duration': [ 0.21693057  0.21594588  0.21779557 ... -0.02902422 -0.02930261
 -0.02930286]
Unique values in scaled column 'fwd_pkts_tot': [ 3.01353487e-01  2.56583664e-01  3.90893133e-01  3.46123310e-01
 -5.68050970e-02  2.11813841e-01  1.67044018e-01 -1.20352740e-02
  7.42611011e+01  9.56870685e+00  3.27345490e-02  1.22274195e-01
  4.80432779e-01  4.01724880e+00  1.68921800e+00  7.49051717e-01
  7.04281894e-01  1.33105942e+00  7.75043720e-02 -1.01574920e-01
  6.14742248e-01  3.12185234e+00  6.59512071e-01  5.25202602e-01
  4.35662956e-01  5.69972425e-01  1.01767065e+00  1.46536888e+00
  1.59967835e+00  8.83361186e-01  3.21139198e+00  3.52478074e+00
  9.28131009e-01  8.38591363e-01  3.92770915e+00  7.93821540

In [None]:
print(X_categorical.columns.values)
print(X_categorical['proto'].unique())
print(X_categorical['service'].unique())

['proto' 'service']
['tcp' 'udp' 'icmp']
['mqtt' '-' 'http' 'dns' 'ntp' 'ssl' 'dhcp' 'irc' 'ssh' 'radius']


In [None]:
# Replace missing values in X_categorical
X_categorical_filled = X_categorical.replace('-', np.NaN)

# Separate 'service' column into two sets based on missing values
X_temp = X_categorical_filled[['service']].copy()  # Make a copy to avoid SettingWithCopyWarning
X_service_no_missing = X_temp.dropna()  # Rows with no missing values in 'service'
X_service_missing = X_temp[X_temp.isnull().any(axis=1)]  # Rows with missing values in 'service'

# Assuming X_combined has 'proto' column
proto_encoder = LabelEncoder()
service_encoder = LabelEncoder()

# Encode 'proto' column
X_categorical_filled['proto'] = proto_encoder.fit_transform(X_categorical_filled['proto'])

# Encode 'service' column for rows with no missing values
X_service_no_missing.loc[:, 'service'] = service_encoder.fit_transform(X_service_no_missing['service'])

# Replace 'service' column in X_temp with the encoded values from X_service_no_missing and change data type
X_temp.loc[X_service_no_missing.index, 'service'] = X_service_no_missing['service']
X_categorical_filled['service'] = X_temp['service']

# Insert X_categorical_filled after column 2 in X_combined
insert_index = 2
X_combined = pd.concat([prepared_X_number.iloc[:, :insert_index], X_categorical_filled, prepared_X_number.iloc[:, insert_index:]], axis=1)

In [None]:
# Create a copy of X_combined and remove null values
X_combined_cleaned = X_combined.dropna().copy()

# Convert 'service' column to float64 type
X_combined_cleaned['service'] = X_combined_cleaned['service'].astype(float)

# Reindex y to match the indices of X_combined_cleaned
y_reduced = y.reindex(X_combined_cleaned.index)

# Drop rows with NaN values in y_reduced if necessary
y_reduced = y_reduced.dropna()

# Convert y_reduced to a 1d array using ravel()
y_reduced = np.ravel(y_reduced)


In [None]:
pd.set_option('display.max_rows', 5)  # Show rows
pd.set_option('display.max_columns', None)  # Show columns

print(X_combined)

        id.orig_p  id.resp_p  proto service  flow_duration  fwd_pkts_tot  \
0        0.211202   0.165266      1       4       0.216931      0.301353   
1        0.865405   0.165266      1       4       0.215946      0.301353   
...           ...        ...    ...     ...            ...           ...   
123115   1.290353  12.173026      1     NaN      -0.029303     -0.056805   
123116   1.290353  12.197568      1     NaN      -0.029303     -0.056805   

        bwd_pkts_tot  fwd_data_pkts_tot  bwd_data_pkts_tot  fwd_pkts_per_sec  \
0           0.093600           0.077860           0.067497         -0.948870   
1           0.093600           0.077860           0.067497         -0.948870   
...              ...                ...                ...               ...   
123115     -0.027546          -0.074928          -0.025400         -0.496366   
123116     -0.027546          -0.074928          -0.025400         -0.496366   

        bwd_pkts_per_sec  flow_pkts_per_sec  down_up_ratio  \


In [None]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the string labels in y_reduced to integer labels
y_encoded_reduced = label_encoder.fit_transform(y_reduced)

# Initialize XGBoost classifier
xgb_classifier = XGBClassifier(n_estimators=100, random_state=0)

# Train XGBoost classifier on the encoded target variable
xgb_classifier.fit(X_combined_cleaned, y_encoded_reduced)

# Retrieve feature importance scores from XGBoost
xgb_feature_importance_scores = xgb_classifier.feature_importances_

# Normalize feature importance scores for XGBoost
scaler = MinMaxScaler()
xgb_normalized_importance = scaler.fit_transform(xgb_feature_importance_scores.reshape(-1, 1)).flatten()

# Create DataFrame to visualize feature importance for XGBoost
feature_importance_df_xgb = pd.DataFrame({'Feature': X_combined_cleaned.columns, 'Importance': xgb_normalized_importance})
feature_importance_df_xgb = feature_importance_df_xgb.sort_values(by='Importance', ascending=False)

# Set display options
pd.set_option('display.max_rows', None)  # Show rows
pd.set_option('display.max_columns', None)  # Show columns

# Print the sorted feature importance DataFrame for XGBoost
print("XGBoost Feature Importance:")
print(feature_importance_df_xgb)

XGBoost Feature Importance:
                     Feature  Importance
43               fwd_iat.min    1.000000
37      bwd_pkts_payload.std    0.965452
41     flow_pkts_payload.avg    0.543097
53              flow_iat.min    0.222397
19       flow_FIN_flag_count    0.217896
80      bwd_init_window_size    0.199713
10          bwd_pkts_per_sec    0.166216
81      fwd_last_window_size    0.148826
5               fwd_pkts_tot    0.110042
79      fwd_init_window_size    0.103345
26       flow_CWR_flag_count    0.050727
18       bwd_header_size_max    0.043866
30      fwd_pkts_payload.tot    0.040355
38     flow_pkts_payload.min    0.037288
31      fwd_pkts_payload.avg    0.037085
3                    service    0.036179
9           fwd_pkts_per_sec    0.030600
28      fwd_pkts_payload.min    0.026999
7          fwd_data_pkts_tot    0.025001
1                  id.resp_p    0.022227
29      fwd_pkts_payload.max    0.021828
15       fwd_header_size_max    0.020946
12             down_up_ratio 

In [None]:
# Set the threshold for feature importance
threshold = 0.001

# Filter features with importance scores above the threshold
selected_features_xgb = feature_importance_df_xgb[feature_importance_df_xgb['Importance'] >= threshold]['Feature']

# Print the selected features
print("Selected Features based on XGBoost Feature Importance:")
print(selected_features_xgb)

# Filter X_combined_cleaned DataFrame to keep only selected features
X_combined_selected = X_combined[selected_features_xgb]

# Print the shape of the filtered DataFrame
print("Shape of X_selected_xgb:", X_combined_selected.shape)
# print("Shape of X_selected_xgb:", X_combined.shape)

Selected Features based on XGBoost Feature Importance:
43                 fwd_iat.min
37        bwd_pkts_payload.std
41       flow_pkts_payload.avg
53                flow_iat.min
19         flow_FIN_flag_count
80        bwd_init_window_size
10            bwd_pkts_per_sec
81        fwd_last_window_size
5                 fwd_pkts_tot
79        fwd_init_window_size
26         flow_CWR_flag_count
18         bwd_header_size_max
30        fwd_pkts_payload.tot
38       flow_pkts_payload.min
31        fwd_pkts_payload.avg
3                      service
9             fwd_pkts_per_sec
28        fwd_pkts_payload.min
7            fwd_data_pkts_tot
1                    id.resp_p
29        fwd_pkts_payload.max
15         fwd_header_size_max
12               down_up_ratio
16         bwd_header_size_tot
32        fwd_pkts_payload.std
0                    id.orig_p
77                    idle.avg
62           bwd_subflow_bytes
60            bwd_subflow_pkts
69                  active.min
8            bw

In [None]:
# Assuming data is your DataFrame and y is your target variable
data = X_combined_selected.merge(y.reset_index(drop=True), left_index=True, right_index=True)

# Get the unique class labels and their counts dynamically
class_counts = dict(y.value_counts())

# Calculate the mean class count
mean_class_count = (sum(class_counts.values()) / len(class_counts))

# Define a function for oversampling and undersampling
def sample_data(label_count):
    label, count = label_count
    label = label[0]

    if count < mean_class_count:
        # Add data to reach the mean for classes with counts less than mean
        additional_samples = int(mean_class_count) - count
        sampled_data = data[data[y.columns[0]] == label].sample(n=additional_samples, replace=True, random_state=42)
        return sampled_data
    elif count > mean_class_count:
        # Remove data to match the mean for classes with counts greater than mean
        reduced_samples = count - int(mean_class_count)
        if reduced_samples > 0:  # Ensure taking positive number of samples
            sampled_data = data[data[y.columns[0]] == label].sample(n=int(mean_class_count), random_state=42)
            return sampled_data
    return pd.DataFrame()  # Return empty DataFrame if no sampling is needed

# Apply sampling function to each class count sequentially
sampled_results = []
for label_count in class_counts.items():
    sampled_results.append(sample_data(label_count))

# Combine oversampled and undersampled data
combined_data = pd.concat(sampled_results)

# Separate the combined data back into X_filled and y
X_filled_combined = combined_data.drop(columns=[y.columns[0]])  # Use list for single column drop
y_combined = combined_data[y.columns[0]]

# Set display options
pd.set_option('display.max_rows', None)  # Show rows
pd.set_option('display.max_columns', None)  # Show columns

print(class_counts)

# Print the number of data samples in each class
print("Number of data samples in each class:")
print(y_combined.value_counts())


{('DOS_SYN_Hping',): 94659, ('Thing_Speak',): 8108, ('ARP_poisioning',): 7750, ('MQTT_Publish',): 4146, ('NMAP_UDP_SCAN',): 2590, ('NMAP_XMAS_TREE_SCAN',): 2010, ('NMAP_OS_DETECTION',): 2000, ('NMAP_TCP_scan',): 1002, ('DDOS_Slowloris',): 534, ('Wipro_bulb',): 253, ('Metasploit_Brute_Force_SSH',): 37, ('NMAP_FIN_SCAN',): 28}
Number of data samples in each class:
Attack_type
DOS_SYN_Hping                 10259
NMAP_FIN_SCAN                 10231
Metasploit_Brute_Force_SSH    10222
Wipro_bulb                    10006
DDOS_Slowloris                 9725
NMAP_TCP_scan                  9257
NMAP_OS_DETECTION              8259
NMAP_XMAS_TREE_SCAN            8249
NMAP_UDP_SCAN                  7669
MQTT_Publish                   6113
ARP_poisioning                 2509
Thing_Speak                    2151
Name: count, dtype: int64


In [None]:
def impute_missing_values(chunk):
    print("Input Chunk Shape:", chunk.shape)  # Debugging print
    imputer = KNNImputer(n_neighbors=5)
    imputed_chunk = imputer.fit_transform(chunk)
    print("Imputed Chunk Shape:", imputed_chunk.shape)  # Debugging print
    if imputed_chunk.shape[1] != len(chunk.columns):
        raise ValueError("Number of columns in imputed_chunk does not match the original DataFrame")
    return pd.DataFrame(imputed_chunk, columns=chunk.columns)

# Check if 'service' feature exists in X_filled_combined
if 'service' in X_filled_combined.columns:
    # Split X_filled_combined into chunks based on the number of CPU cores
    num_cores = cpu_count()
    total_rows = len(X_filled_combined)
    chunk_size = total_rows // num_cores
    remainder = total_rows % num_cores  # Calculate remainder rows

    # Adjust chunk sizes for uneven division
    chunks = [X_filled_combined[i:i + chunk_size] for i in range(0, total_rows - remainder, chunk_size)]
    if remainder:
        chunks[-1] = pd.concat([chunks[-1], X_filled_combined[-remainder:]])  # Add remaining rows to the last chunk

    # Apply imputation to each chunk sequentially
    imputed_chunks = []
    for chunk in chunks:
        imputed_chunks.append(impute_missing_values(chunk))

    # Concatenate the imputed chunks back into a single DataFrame
    X_filled = pd.concat(imputed_chunks)
else:
    X_filled = X_filled_combined.copy()  # If 'service' feature is not present, simply copy X_filled_combined


Input Chunk Shape: (47325, 70)
Imputed Chunk Shape: (47325, 70)
Input Chunk Shape: (47325, 70)
Imputed Chunk Shape: (47325, 70)


In [None]:
# Check for NaN values
nan_locations = pd.isna(X_filled)

# Print all occurrences of NaN values
print("NaN values and their locations:")
for idx, row in nan_locations.iterrows():
    for col, value in row.items():
        if value:
            print(f"Index: {idx}, Column: {col}, Value: NaN")


NaN values and their locations:


In [None]:
# Initialize an empty DataFrame to track time used by each model
time_df = pd.DataFrame(columns=['Classifier', 'Time Used'])
y_array = y_combined.to_numpy()

# Create a DataFrame to store classification reports
classification_reports_df = pd.DataFrame(columns=['Classifier', 'Classification Report'])

def evaluate_classifier(classifier):
    start_time = time.time()  # Record start time

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UndefinedMetricWarning)
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        warnings.simplefilter("ignore", category=FutureWarning)

        cross = cross_validate(classifier, X_filled, y_array.ravel(), scoring=('accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'), cv=kfold, error_score='raise')
    accuracy = cross['test_accuracy']
    error = 1 - accuracy.mean()

    # Calculate time used
    end_time = time.time()
    time_used = end_time - start_time

    # Append data to time_df
    time_df.loc[len(time_df)] = [classifier.__class__.__name__, time_used]

    return {
        'Classifier': classifier.__class__.__name__,
        'Error': error,
        'Accuracy': accuracy.mean(),
        'Precision': cross['test_precision_weighted'].mean(),
        'Recall': cross['test_recall_weighted'].mean(),
        'F1 Score': cross['test_f1_weighted'].mean()
    }

if __name__ == '__main__':
    num_cores = 1  # Sequential processing

    num_folds = 5
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    classifiers = [
        DecisionTreeClassifier(),
        GaussianNB(),
        KNeighborsClassifier(),
        MLPClassifier(),
        SVC(),
        RandomForestClassifier(),
        AdaBoostClassifier()
    ]

    results = []
    time_df = pd.DataFrame(columns=['Classifier', 'Time Used'])  # Initialize time_df

    for classifier in classifiers:
        # Print the classifier's name
        print(f"Evaluating {classifier.__class__.__name__}...")

        # Record start time
        start_time = time.time()

        # Evaluate classifier
        result = evaluate_classifier(classifier)
        results.append(result)

        # Calculate time used
        end_time = time.time()
        time_used = end_time - start_time

        # Append data to time_df
        time_df.loc[len(time_df)] = [classifier.__class__.__name__, time_used]

        # Print the time used by each model
        print(f"Time used by {classifier.__class__.__name__} model: {time_used:.2f} seconds")

    # Create results_df after evaluating classifiers
    results_df = pd.DataFrame(results)

    # Print the number of features in X_filled
    print(f"\nNumber of features in X: {len(X_filled.columns)}\n")

    print("\t Classifier".ljust(30), "\t Error".ljust(20), "Accuracy".ljust(20), "Precision".ljust(20),
          "Recall".ljust(20), "F1 Score".ljust(20))
    print("\t" + "-" * 133)
    max_classifier_length = max(results_df['Classifier'].apply(len))

    for index, row in results_df.iterrows():
        classifier_column = f"{row['Classifier']:<{max_classifier_length}}"
        error_column = f"{row['Error']:<20.6f}"
        accuracy_column = f"{row['Accuracy']:<20.6f}"
        precision_column = f"{row['Precision']:<20.6f}"
        recall_column = f"{row['Recall']:<20.6f}"
        f1_score_column = f"{row['F1 Score']:<20.6f}"

        if row['Classifier'] == 'RandomForestClassifier':
            print(
                f"\t{classifier_column}\t\t{error_column}{accuracy_column}{precision_column}{recall_column}{f1_score_column}")
        else:
            print(
                f"\t{classifier_column}\t\t{error_column}{accuracy_column}{precision_column}{recall_column}{f1_score_column}")

Evaluating DecisionTreeClassifier...
Time used by DecisionTreeClassifier model: 7.99 seconds
Evaluating GaussianNB...
Time used by GaussianNB model: 5.96 seconds
Evaluating KNeighborsClassifier...
Time used by KNeighborsClassifier model: 62.26 seconds
Evaluating MLPClassifier...
Time used by MLPClassifier model: 398.72 seconds
Evaluating SVC...
Time used by SVC model: 204.90 seconds
Evaluating RandomForestClassifier...
Time used by RandomForestClassifier model: 39.33 seconds
Evaluating AdaBoostClassifier...
Time used by AdaBoostClassifier model: 79.91 seconds

Number of features in X: 70

	 Classifier                   	 Error              Accuracy             Precision            Recall               F1 Score            
	-------------------------------------------------------------------------------------------------------------------------------------
	DecisionTreeClassifier		0.001384            0.998616            0.998613            0.998616            0.998614            
	Gaussi

In [None]:
print(y_array.shape)

(94650,)


In [None]:
# Define the number of folds for cross-validation
num_folds = 5
k_fold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

def classification_report_with_accuracy_score(y_true, y_pred):
    print(classification_report(y_true, y_pred))  # print classification report
    return accuracy_score(y_true, y_pred)  # return accuracy score

rf_classifier = RandomForestClassifier()
cross = 4(rf_classifier, X_filled, y_array, scoring=make_scorer(classification_report_with_accuracy_score), cv=k_fold, return_estimator=True)

# Access trained model for each fold
trained_classifiers = cross['estimator']
for fold, trained_classifier in enumerate(trained_classifiers):
    # Save the trained model
    joblib.dump(trained_classifier, f'/content/drive/MyDrive/RT_IOT/{trained_classifier}_fold_{fold + 1}.joblib')

                            precision    recall  f1-score   support

            ARP_poisioning       0.99      1.00      0.99       494
            DDOS_Slowloris       1.00      1.00      1.00      1967
             DOS_SYN_Hping       1.00      1.00      1.00      2014
              MQTT_Publish       1.00      1.00      1.00      1197
Metasploit_Brute_Force_SSH       1.00      1.00      1.00      2000
             NMAP_FIN_SCAN       1.00      1.00      1.00      2016
         NMAP_OS_DETECTION       1.00      1.00      1.00      1688
             NMAP_TCP_scan       1.00      1.00      1.00      1919
             NMAP_UDP_SCAN       1.00      1.00      1.00      1510
       NMAP_XMAS_TREE_SCAN       1.00      1.00      1.00      1651
               Thing_Speak       0.99      0.99      0.99       432
                Wipro_bulb       1.00      1.00      1.00      2042

                  accuracy                           1.00     18930
                 macro avg       1.00      1.0