In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics 
from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

import os
import warnings
import time
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [2]:
#The machine learning algorithms to be used are defined in a dictionary (ml_list).
ml_list={
"Naive Bayes":GaussianNB(),
"QDA":QDA(),
"Random Forest": RandomForestClassifier(max_depth=5, n_estimators=50, max_features=1),
"ID3": DecisionTreeClassifier(max_depth=5, min_samples_split=10, min_samples_leaf=5, criterion="entropy"),
"AdaBoost":AdaBoostClassifier(),
"MLP":MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500),
"Nearest Neighbors":KNeighborsClassifier(3)}

In [3]:
features = ['Source_IP_Encoded', 'Timestamp', 'Packet Length Std', 'Idle Max', 'Source Port', 'Total Length of Fwd Packets', 'Fwd IAT Max',
                     'Flow IAT Max', 'Fwd IAT Std', 'Packet Length Variance', 'Destination_IP_Encoded', 'Destination Port', 'Packet Length Mean', 
                     'Subflow Fwd Bytes', 'Avg Bwd Segment Size', 'Subflow Bwd Bytes', 'Bwd Packet Length Mean', 'Total Length of Bwd Packets', 
                     'Idle Mean', 'Bwd Packet Length Max','Label']




In [4]:
features

['Source_IP_Encoded',
 'Timestamp',
 'Packet Length Std',
 'Idle Max',
 'Source Port',
 'Total Length of Fwd Packets',
 'Fwd IAT Max',
 'Flow IAT Max',
 'Fwd IAT Std',
 'Packet Length Variance',
 'Destination_IP_Encoded',
 'Destination Port',
 'Packet Length Mean',
 'Subflow Fwd Bytes',
 'Avg Bwd Segment Size',
 'Subflow Bwd Bytes',
 'Bwd Packet Length Mean',
 'Total Length of Bwd Packets',
 'Idle Mean',
 'Bwd Packet Length Max',
 'Label']

In [5]:
csv_files=["smote_smote_processed_files_all_data.csv"]
filename = "smote_smote_processed_files_all_data.csv"
repetition=10

In [6]:
df_1 = pd.read_csv("smote_smote_processed_files_all_data.csv")
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1824600 entries, 0 to 1824599
Data columns (total 84 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Flow_ID_Encoded              uint64 
 1   Source_IP_Encoded            float64
 2   Source Port                  int64  
 3   Destination_IP_Encoded       float64
 4   Destination Port             int64  
 5   Protocol                     int64  
 6   Timestamp                    int64  
 7   Flow Duration                int64  
 8   Total Fwd Packets            int64  
 9   Total Backward Packets       int64  
 10  Total Length of Fwd Packets  float64
 11  Total Length of Bwd Packets  float64
 12  Fwd Packet Length Max        float64
 13  Fwd Packet Length Min        float64
 14  Fwd Packet Length Mean       float64
 15  Fwd Packet Length Std        float64
 16  Bwd Packet Length Max        float64
 17  Bwd Packet Length Min        float64
 18  Bwd Packet Length Mean       float64
 19  

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

for filename in csv_files:
    if filename.endswith('.csv'):
        print(f"Processing file: {filename}")

        df = pd.read_csv(filename, usecols=features).fillna(0)
        
        # Process the 'Label' column: change 'BENIGN' to 1 and all other labels to 0
        df["Label"] = df["Label"].apply(lambda x: 1 if x == "BENIGN" else 0)

        # Split the data into features (X) and target (y)
        y = df["Label"]
        del df["Label"]  # Remove the label column from the dataframe
        features.remove('Label')  # Remove 'Label' from the feature list
        X = df[features]
    
        cv = StratifiedKFold(n_splits=10)  # 10-fold CV

        # Print the output header
        print('%-17s %-17s  %-15s %-15s %-15s %-15s %-15s' % ("File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"))
        
        for ii in ml_list:
            precision = []
            recall = []
            f1 = []
            accuracy = []
            t_time = []
        
            for i in range(repetition):  # Repeated cross-validation
                second = time.time()  # time stamp for processing time
        
                # Use cross-validation for more robust performance metrics
                clf = ml_list[ii]
                
                # Stratified cross-validation
                accuracy_scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
                precision_scores = cross_val_score(clf, X, y, cv=cv, scoring='precision_macro')
                recall_scores = cross_val_score(clf, X, y, cv=cv, scoring='recall_macro')
                f1_scores = cross_val_score(clf, X, y, cv=cv, scoring='f1_macro')
                
                accuracy.append(np.mean(accuracy_scores))
                precision.append(np.mean(precision_scores))
                recall.append(np.mean(recall_scores))
                f1.append(np.mean(f1_scores))
                t_time.append(float((time.time()-second)) )
        
            print('%-17s %-17s  %-15s %-15s %-15s %-15s %-15s' % (filename[12:], ii, 
                  str(round(np.mean(accuracy), 2)), str(round(np.mean(precision), 2)), 
                  str(round(np.mean(recall), 2)), str(round(np.mean(f1), 2)), 
                  str(round(np.mean(t_time), 4))))

Processing file: smote_smote_processed_files_all_data.csv
File              ML algorithm       accuracy        Precision       Recall          F1-score        Time           
processed_files_all_data.csv Naive Bayes        0.91            0.82            0.94            0.86            26.6729        
processed_files_all_data.csv QDA                0.82            0.73            0.79            0.73            358.5927       
