In [None]:
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the dataset CSV is read from
# a path below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Allow up to 200 rows to be rendered before pandas truncates the display.
pd.set_option("display.max_rows", 200)

In [None]:
# Read the CIC-IDS-2018 CSV from the mounted Drive and show the loaded frame.
dataset_path = "/content/drive/MyDrive/Copy of CIC-IDS-2018(15).csv"
df = pd.read_csv(dataset_path)
df

In [None]:
# Columns removed before modelling.
columns_to_drop = [
  'Dst Port',
  'Timestamp',
  'Fwd PSH Flags',
  'Bwd PSH Flags',
  'Fwd URG Flags',
  'Bwd URG Flags',
  'Flow Byts/s',
  'Flow Pkts/s']

# Integer code per traffic class: 0 is benign, 1-12 are the attack types.
attack_mapping = {
  'Benign': 0,
  'DDOS attack-HOIC': 1,
  'DoS attacks-Hulk': 2,
  'Bot': 3,
  'FTP-BruteForce': 4,
  'SSH-Bruteforce': 5,
  'Infilteration': 6,
  'DoS attacks-GoldenEye': 7,
  'DoS attacks-Slowloris': 8,
  'DDOS attack-LOIC-UDP': 9,
  'Brute Force -Web':10,
  'Brute Force -XSS':11,
  'SQL Injection':12,
}

# NOTE: this cell overwrites `df`; re-run the loading cell before re-running
# it, otherwise the column drop fails because the columns are already gone.

# Rows whose Label is the literal string "Label" (presumably header lines
# repeated inside the CSV) are removed first, so the numeric clean-up below
# never has to look at them.
df = df[df["Label"] != "Label"].drop(columns=columns_to_drop)

# Make sure every feature is numeric before looking for inf/NaN/duplicates:
# if the columns were loaded as text, "1" and 1 would not compare equal and
# textual infinities would not match np.inf.
numeric_cols = df.columns.drop("Label")
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna()
      .drop_duplicates()
)

# Encode the labels. A label missing from attack_mapping maps to NaN; report
# and drop such rows explicitly instead of letting NaN labels leak downstream.
label_codes = df["Label"].map(attack_mapping)
unmapped_rows = label_codes.isna()
if unmapped_rows.any():
    print("Dropping rows whose label is not in attack_mapping:")
    print(df.loc[unmapped_rows, "Label"].value_counts())
df = df.loc[~unmapped_rows].copy()
df["Label"] = label_codes[~unmapped_rows].astype(int)
df

In [None]:
# Class balance after cleaning. dropna=False keeps a NaN bucket visible, so
# labels that failed to map to an integer code cannot go unnoticed.
df["Label"].value_counts(dropna=False)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scale every feature column to the [0, 1] range; the Label column is copied
# over unchanged.
# NOTE(review): the scaler is fitted on the whole data set, i.e. before the
# train/test split done later, so test rows influence the scaling.
feature_cols = df.columns.drop('Label')
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[feature_cols] = scaler.fit_transform(df[feature_cols])

# Order the rows by class, keeping the original order inside each class, and
# discard rows without a label. This spells out what the previous
# groupby("Label").apply(lambda a: a) did implicitly; the result of that
# identity-apply depends on the installed pandas version.
df_scaled = df_scaled.dropna(subset=["Label"]).sort_values("Label", kind="stable")
df_scaled

In [None]:
def metrics(output,y_test,y_test_types,att_type):
    """Compute binary detection metrics plus the zero-day detection rate.

    A sample is predicted as an attack when its score is >= 0.5.

    Parameters
    ----------
    output : array-like of scores, either flat or shaped (n, 1).
    y_test : array-like of binary ground truth (0 = benign, 1 = attack).
    y_test_types : array-like with the multi-class label of each sample.
    att_type : label value in ``y_test_types`` treated as the zero-day class.

    Returns
    -------
    list
        ``[accuracy, dr, far, zdr]`` as percentages:
        accuracy = (tp + tn) / all samples, dr = tp / (tp + fn),
        far = fp / (fp + tn), zdr = share of ``att_type`` samples that were
        predicted as attacks. A rate whose denominator is zero is returned
        as ``nan`` instead of raising ``ZeroDivisionError``.

    Raises
    ------
    ValueError
        If the three inputs do not contain the same number of samples.
    """
    # Work on flat positional arrays so that a pandas Series with a
    # non-default index and an (n, 1) prediction matrix are both handled.
    scores = np.asarray(output, dtype=float).ravel()
    actual = np.asarray(y_test).ravel()
    sample_types = np.asarray(y_test_types).ravel()
    if not (scores.shape == actual.shape == sample_types.shape):
        raise ValueError(
            "output, y_test and y_test_types must have the same number of samples"
        )

    flagged = scores >= 0.5
    tp = int(np.count_nonzero(flagged & (actual == 1)))
    fp = int(np.count_nonzero(flagged & (actual != 1)))
    tn = int(np.count_nonzero(~flagged & (actual == 0)))
    fn = int(np.count_nonzero(~flagged & (actual != 0)))

    is_zero_day = sample_types == att_type
    tpz = int(np.count_nonzero(is_zero_day & flagged))
    fnz = int(np.count_nonzero(is_zero_day & ~flagged))

    def _percent(numerator, denominator):
        # Undefined rates (empty denominator) are reported as nan.
        return (numerator * 100) / denominator if denominator else float("nan")

    accuracy = _percent(tp + tn, tp + tn + fp + fn)
    dr = _percent(tp, tp + fn)
    far = _percent(fp, fp + tn)
    zdr = _percent(tpz, tpz + fnz)
    return [accuracy, dr, far, zdr]

In [None]:
import tensorflow as tf
from tensorflow.keras import layers # type: ignore
from tensorflow.keras.callbacks import EarlyStopping # type: ignore
from sklearn.model_selection import train_test_split

Simulating Zero-Day Attacks

Each attack type in turn is withheld from the training data and appears only in the test set.

In [None]:
metricarr=[]

# Seed shared by the split, the shuffles and the model initialisation so the
# whole experiment is repeatable.
SEED = 52

# The 80/20 split does not depend on the held-out attack type, so it is
# computed once instead of once per loop iteration.
df_train, df_test_base = train_test_split(df_scaled, train_size=0.8, random_state=SEED)

for att_type in range(1,13):

    # Organizing data for zero day attack: every sample of `att_type` is
    # moved out of the training set and into the test set.
    zday = df_train[df_train["Label"] == att_type]
    df_test = (
        pd.concat([df_test_base, zday])
        .sample(frac=1, random_state=SEED)
        .reset_index(drop=True)
    )
    zday_train = (
        df_train[df_train["Label"] != att_type]
        .sample(frac=1, random_state=SEED)
        .reset_index(drop=True)
    )
    X_train = zday_train.drop(columns=["Label"])
    # Binary target: any attack code (> 0) is 1, benign (0) is 0.
    y_train = (zday_train["Label"] > 0).astype(int)
    X_test = df_test.drop(columns=["Label"])
    y_test_types = df_test["Label"]
    y_test = (y_test_types > 0).astype(int)

    # Declaring the model (input width follows the data instead of a
    # hard-coded feature count).
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(SEED)
    n_features = X_train.shape[1]
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(n_features,)),
        layers.Dense(100, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
        layers.Dense(100, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0001)),
        layers.Dense(1, activation='sigmoid')
    ])
    # Early stopping watches the training loss; no validation data is passed
    # to fit() below.
    early_stopping = EarlyStopping(monitor='loss', patience=3, verbose=1, restore_best_weights=True)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy'])

    # Training the model
    model.fit(X_train, y_train, epochs=50, callbacks=[early_stopping], batch_size=64)

    # Getting the output
    output = model.predict(X_test)

    # Storing metrics in metric array: one [accuracy, DR, FAR, ZDR] row per
    # held-out attack type.
    metricarr.append(metrics(output, y_test, y_test_types, att_type))


In [None]:
# One line per held-out attack type, in the order the runs were executed.
print("\n".join(str(row) for row in metricarr))

In [None]:
# Class names ordered by their integer code, so att_types[code] is the name
# of that label. Derived from attack_mapping so the two cannot drift apart.
att_types = sorted(attack_mapping, key=attack_mapping.get)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

mdf = pd.DataFrame(metricarr, columns=['Accuracy', 'DR', 'FAR', 'ZDR'])

# Row i of mdf comes from the run that held out attack code i + 1, so the
# bars are labelled with that attack's name instead of a bare number.
attack_names = [att_types[code] for code in mdf.index + 1]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=attack_names, y=mdf['DR'], label='DR', ax=ax)
ax.set_xlabel('Attack Types')
ax.set_ylabel('Detection Rate')
ax.set_title('Detection Rates')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Row i of mdf comes from the run that held out attack code i + 1, so the
# bars are labelled with that attack's name instead of a bare number.
attack_names = [att_types[code] for code in mdf.index + 1]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=attack_names, y=mdf['ZDR'], label='ZDR', ax=ax)
ax.set_xlabel('Attack Types')
ax.set_ylabel('Zero Day Detection Rate')
ax.set_title('Zero Day Detection Rates')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

Wasserstein Distance and Kolmogorov-Smirnov Calculation

In [None]:
from scipy.stats import wasserstein_distance as wd
from scipy.stats import ks_2samp as ks

In [None]:
# For every attack type, compare its feature distributions with those of all
# other attacks (benign traffic excluded) and average the per-feature
# Wasserstein distance and Kolmogorov-Smirnov statistic.
distance_cols = df_scaled.columns.drop("Label")
attacks_only = df_scaled[df_scaled["Label"] != 0]

wd_means = []
ks_means = []
for att_type in range(1,13):
    is_zero_day = attacks_only["Label"] == att_type
    X_z = attacks_only.loc[is_zero_day, distance_cols]
    X_nz = attacks_only.loc[~is_zero_day, distance_cols]

    wd_per_feature = [wd(X_nz[col], X_z[col]) for col in distance_cols]
    # ks_2samp returns (statistic, pvalue); only the statistic is a distance,
    # so the p-value must not be mixed into the average.
    ks_per_feature = [ks(X_nz[col], X_z[col]).statistic for col in distance_cols]

    wd_means.append(np.mean(wd_per_feature))
    ks_means.append(np.mean(ks_per_feature))

# One entry per attack code 1..12, in that order.
wdarr = np.array(wd_means)
ksarr = np.array(ks_means)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
wdf = pd.DataFrame(wdarr, columns=["WD"])

# Entry i of wdarr belongs to attack code i + 1, so the bars are labelled
# with that attack's name instead of a bare number.
attack_names = [att_types[code] for code in wdf.index + 1]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=attack_names, y=wdf['WD'], label='Wasserstein Distance', ax=ax)
ax.set_xlabel('Attack Types')
ax.set_ylabel('Wasserstein Distance')
ax.set_title('Wasserstein Distances')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
ksdf = pd.DataFrame(ksarr, columns=["KS"])

# Entry i of ksarr belongs to attack code i + 1, so the bars are labelled
# with that attack's name instead of a bare number.
attack_names = [att_types[code] for code in ksdf.index + 1]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=attack_names, y=ksdf['KS'], label='Kolmogorov-Smirnov', ax=ax)
ax.set_xlabel('Attack Types')
ax.set_ylabel('Kolmogorov-Smirnov Test')
ax.set_title('Kolmogorov-Smirnov Test')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()