In [24]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics 
from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

import os
import warnings
import time
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [6]:
#The machine learning algorithms to be used are defined in a dictionary (ml_list).
ml_list={
"Naive Bayes":GaussianNB(),
"QDA":QDA(),
"Random Forest":RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
"ID3" :DecisionTreeClassifier(max_depth=5,criterion="entropy"),
"AdaBoost":AdaBoostClassifier(),
"MLP":MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500),
"Nearest Neighbors":KNeighborsClassifier(3)}

In [41]:
features = {'FTP-Patator.csv': ['Source Port','Bwd Packet Length Mean','Destination_IP_Encoded','Packet Length Std','Timestamp','Packet Length Variance','Fwd Packet Length Max','Avg Fwd Segment Size','Max Packet Length','Packet Length Mean','Average Packet Size','Fwd Packet Length Std','Fwd Packet Length Mean','Source_IP_Encoded','Destination Port','Label'],
 'PortScan.csv': ['Flow IAT Max','Bwd Packet Length Min','Total Fwd Packets','PSH Flag Count','Source_IP_Encoded','Flow Duration','Bwd Packets/s','Fwd Packet Length Max','Avg Fwd Segment Size','Average Packet Size','Packet Length Mean','Fwd Packet Length Mean','Timestamp','Total Length of Fwd Packets','Subflow Fwd Bytes','Label'],
 'DoS Slowhttptest.csv': ['Flow Duration','Timestamp','Active Mean','min_seg_size_forward','Flow IAT Min','Idle Max','Fwd IAT Std','Fwd IAT Total','Fwd IAT Mean','Flow IAT Std','Fwd IAT Min','Flow IAT Mean','Fwd IAT Max','Flow IAT Max','Source_IP_Encoded','Label'],
 'Web Attack.csv': ['Destination Port','Fwd IAT Max','Fwd IAT Std','Flow IAT Mean','Flow Duration','Fwd IAT Mean','Destination_IP_Encoded','Flow IAT Std','Fwd IAT Total','Init_Win_bytes_forward','Fwd IAT Min','Source Port','Source_IP_Encoded','Init_Win_bytes_backward','Timestamp','Label'],
 'DoS Hulk.csv': ['Bwd Packet Length Mean','Bwd Packets/s','Flow IAT Std','Flow IAT Max','Destination Port','Idle Min','Min Packet Length','Fwd IAT Max','Idle Max','Idle Mean','Timestamp','Fwd IAT Std','Destination_IP_Encoded','Packet Length Variance','Source_IP_Encoded','Label'],
 'Heartbleed.csv': ['Subflow Fwd Bytes','Flow IAT Min','Flow Duration','Subflow Bwd Bytes','Bwd Packet Length Std','Packet Length Variance','Bwd Packet Length Mean','Average Packet Size','Total Fwd Packets','Total Length of Bwd Packets','Total Backward Packets','Packet Length Mean','Subflow Bwd Packets','Max Packet Length','Bwd Header Length','Label'],
 'DoS slowloris.csv': ['Min Packet Length','Fwd IAT Total','Timestamp','Subflow Bwd Bytes','Fwd IAT Min','Destination Port','Fwd IAT Max','Flow IAT Mean','Fwd IAT Mean','min_seg_size_forward','Total Length of Bwd Packets','Avg Bwd Segment Size','Bwd Packet Length Mean','Destination_IP_Encoded','Source_IP_Encoded','Label'],
 'Bot.csv': ['Flow IAT Min','Bwd IAT Min','Total Length of Bwd Packets','Bwd Packets/s','Bwd Packet Length Max','Source Port','Packet Length Mean','Subflow Bwd Bytes','Average Packet Size','Init_Win_bytes_forward','Init_Win_bytes_backward','Avg Bwd Segment Size','Bwd Packet Length Mean','Destination Port','Timestamp','Label'],
 'Infiltration.csv': ['Bwd IAT Total','Fwd IAT Max','Fwd Packet Length Max','Fwd IAT Total','Fwd IAT Min','Flow Duration','Bwd IAT Std','Fwd Packet Length Std','Avg Fwd Segment Size','Subflow Fwd Bytes','Total Length of Fwd Packets','Fwd Packet Length Mean','Destination_IP_Encoded','Destination Port','Source_IP_Encoded','Label'],
 'DoS GoldenEye.csv': ['Idle Min','Destination Port','Timestamp','Fwd Packets/s','Fwd IAT Total','Destination_IP_Encoded','Fwd IAT Mean','min_seg_size_forward','Idle Mean','Fwd IAT Max','Flow IAT Mean','Flow Duration','Flow Packets/s','Flow IAT Max','Source_IP_Encoded','Label'],
 'SSH-Patator.csv': ['Average Packet Size','Avg Bwd Segment Size','Bwd Packets/s','Bwd Packet Length Min','min_seg_size_forward','Init_Win_bytes_forward','Source Port','Min Packet Length','Fwd Packet Length Min','Flow Bytes/s','Destination_IP_Encoded','Init_Win_bytes_backward','Timestamp','Source_IP_Encoded','Destination Port','Label'],
 'DDoS.csv': ['Packet Length Variance','Fwd IAT Total','Total Fwd Packets','Fwd IAT Std','Subflow Fwd Packets','Total Length of Fwd Packets','Source Port','Fwd Header Length','act_data_pkt_fwd','Destination_IP_Encoded','Avg Fwd Segment Size','Fwd Packet Length Max','Fwd Packet Length Mean','Source_IP_Encoded','Timestamp','Label']}



In [46]:
csv_files=os.listdir("smote_processed_files")
folder_path = 'smote_processed_files/'
path="smote_processed_files/"
repetition=10

csv_files

['smote_SSH-Patator.csv',
 'smote_DoS Slowhttptest.csv',
 'smote_DoS Hulk.csv',
 'smote_DDoS.csv',
 'smote_Heartbleed.csv',
 'smote_Bot.csv',
 'smote_Web Attack.csv',
 'smote_DoS GoldenEye.csv',
 'smote_DoS slowloris.csv',
 'smote_FTP-Patator.csv',
 'smote_Infiltration.csv',
 'smote_PortScan.csv']

In [None]:
csv_files = os.listdir("smote_processed_files")
folder_path = 'smote_processed_files/'

# Print the output header
print('%-17s %-17s  %-15s %-15s %-15s %-15s %-15s' % ("File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"))

for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {filename}")

        key = filename[6:]
        if key in features:
            feature_list = list(features[key])
        df = pd.read_csv(folder_path + filename, usecols=feature_list).fillna(0)


        # Process the 'Label' column: change 'BENIGN' to 1 and all other labels to 0
        df["Label"] = df["Label"].apply(lambda x: 1 if x == "BENIGN" else 0)

        # Split the data into features (X) and target (y)
        y = df["Label"]
        del df["Label"]  # Remove the label column from the dataframe
        feature_list.remove('Label')  # Remove 'Label' from the feature list
        X = df[feature_list]
    
    for ii in ml_list: #this loop runs on the list containing the machine learning algorithm names. Operations are repeated for all the 7 algorithm
        precision=[]
        recall=[]
        f1=[]
        accuracy=[]
        t_time=[]
        for i in range(repetition): # This loop allows cross-validation and machine learning algorithm to be repeated 10 times
            second=time.time()#time stamp for processing time

            # cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y,#  data (X) and labels (y) are divided into 2 parts to be sent to the machine learning algorithm (80% train,%20 test). 
                test_size = 0.20, random_state = repetition)#  So, in total there are 4 tracks: training data(X_train), training tag (y_train), test data(X_test) and test tag(y_test).


            #machine learning algorithm is applied in this section
            clf = ml_list[ii]#choose algorithm from ml_list dictionary                                                                          
            clf.fit(X_train, y_train)
            predict =clf.predict(X_test)
        
            #makes "classification report" and assigns the precision, f-measure, and recall values.s.    
                  
            f_1=f1_score(y_test, predict, average='macro')
            pr=precision_score(y_test, predict, average='macro')
            rc=recall_score(y_test, predict, average='macro')
          
            precision.append(float(pr))
            recall.append(float(rc))
            f1.append(float(f_1))
            accuracy.append(clf.score(X_test, y_test))
            t_time.append(float((time.time()-second)) )

            
        print ('%-17s %-17s  %-15s %-15s %-15s %-15s %-15s' % (filename[0:-4],ii,str(round(np.mean(accuracy),2)),str(round(np.mean(precision),2)), 
            str(round(np.mean(recall),2)),str(round(np.mean(f1),2)),str(round(np.mean(t_time),4))))#the result of the ten repetitions is printed on the screen.

        

File              ML algorithm       accuracy        Precision       Recall          F1-score        Time           
Processing file: smote_SSH-Patator.csv
smote_SSH-Patator Naive Bayes        0.98            0.98            0.98            0.98            0.0112         
smote_SSH-Patator QDA                0.5             0.25            0.5             0.33            0.0427         
smote_SSH-Patator Random Forest      1.0             1.0             1.0             1.0             0.0525         
smote_SSH-Patator ID3                1.0             1.0             1.0             1.0             0.0238         
smote_SSH-Patator AdaBoost           1.0             1.0             1.0             1.0             0.6564         
smote_SSH-Patator MLP                0.96            0.96            0.96            0.96            0.918          
smote_SSH-Patator Nearest Neighbors  1.0             1.0             1.0             1.0             0.4791         
Processing file: smote_Do