# Evaluation Code for KSQL and ML
Before evaluating, run the KSQL infrastructure with the `test.pcap` generated by `../datasets/preparation2.py`, then run `../rule-detection/evaluation/consumer/script.py` to consume the results from Kafka, and finally run `../rule-detection/evaluation/ksql_evaluation_dos.py` and `../rule-detection/evaluation/ksql_evaluation_ps.py` to obtain the list of packets in the pcap (identified by index) that are malicious/detected.

**Metrics**
- Accuracy, Precision, Recall, F1-Score
- Inference Time for ML

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib
import time

In [2]:
# Both model directories live under the same notebooks folder; only the
# sub-directory differs between the DoS and the Port Scan models.
_NOTEBOOKS_DIR = "../ai-detection/notebooks"
DOS_MODEL_LOCATION = f"{_NOTEBOOKS_DIR}/models/postfs"
PS_MODEL_LOCATION = f"{_NOTEBOOKS_DIR}/modelsps/postfs"

In [3]:
# Validation data: X_val holds the feature columns (the first CSV column is
# used as the index) and y_val is the column named '0' of the label file.
# NOTE(review): X_val is read from `models/` while y_val is read from
# `modelsps/` - confirm that both files belong to the same split.
# The paths contain no placeholders, so they are plain strings rather than
# f-strings.
X_val = pd.read_csv("../ai-detection/notebooks/models/X_val.csv", index_col=0)
y_val = pd.read_csv("../ai-detection/notebooks/modelsps/y_val.csv", index_col=0)['0']

## Inference Time

In [38]:
# Number of timed repetitions handed to calculate_inference_time_in_ms as
# its `tries` argument for every model measured below.
TRIES = 100

In [None]:
def calculate_inference_time_in_ms(foo, tries: int, n_samples=None):
    """Return the mean per-sample wall-clock time of ``foo`` in milliseconds.

    ``foo`` is invoked ``tries`` times. The elapsed time of every run is
    divided by ``n_samples`` and converted from seconds to milliseconds; the
    per-run values are then averaged.

    Args:
        foo: Zero-argument callable to time.
        tries: Number of timed invocations; must be at least 1.
        n_samples: Divisor used to turn a run's duration into a per-sample
            figure. Defaults to ``len(X_val)`` (the module-level validation
            frame), which is what every existing caller relies on.

    Returns:
        The average per-sample duration, in milliseconds (not seconds).

    Raises:
        ValueError: If ``tries`` is smaller than 1 (averaging an empty list
            would otherwise yield NaN with only a warning).
    """
    if tries < 1:
        raise ValueError("tries must be at least 1")
    if n_samples is None:
        n_samples = len(X_val)

    times = []
    for _ in range(tries):
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() can jump and is coarser.
        start = time.perf_counter()
        foo()
        end = time.perf_counter()
        times.append(((end - start) / n_samples) * 1000)
    return np.average(times)

In [9]:
def print_time(name: str, in_ms):
    """Print a short timing report for ``name``.

    The report consists of a header line, the value in milliseconds, the
    same value divided by 1000 as seconds, and a trailing blank line.
    """
    header = f"Average Inference time for {name}"
    print(header)
    for amount, unit in ((in_ms, 'milliseconds'), (in_ms / 1000, 'seconds')):
        print(amount, unit)
    print()

## DoS

In [43]:
# Load the tuned DoS models in one pass; the names on the left line up
# positionally with the file stems listed on the right.
dt, rf, et, xg, lgbm, cat, stk1, stk2, stk3 = (
    joblib.load(f"{DOS_MODEL_LOCATION}/{stem}_tuned.pkl")
    for stem in ("dt", "rf", "et", "xg", "lgbm", "cat", "stk1", "stk2", "stk3")
)

# Columns of X_val that are fed to the DoS models.
fs = [
    'Fwd IAT Min', 'Bwd IAT Mean', 'Fwd Seg Size Min',
    'Flow Packets/s', 'Fwd Packets/s', 'Total Length of Bwd Packet',
    'Bwd Act Data Pkts', 'Bwd IAT Min', 'Flow IAT Mean',
    'FIN Flag Count', 'Bwd Init Win Bytes', 'Fwd Bulk Rate Avg',
    'Active Min', 'Fwd Packet Length Max', 'Fwd Packet Length Min',
    'Bwd Packets/s', 'Bwd Header Length', 'Bwd Packet Length Mean',
    'Subflow Bwd Bytes', 'Flow Duration', 'Flow IAT Min',
    'Total Bwd packets', 'Fwd Segment Size Avg', 'FWD Init Win Bytes',
    'Packet Length Mean', 'Down/Up Ratio', 'ACK Flag Count',
    'Fwd IAT Total', 'SYN Flag Count', 'Bwd Packet Length Max',
    'Subflow Fwd Packets', 'Fwd Bytes/Bulk Avg', 'Flow IAT Max',
    'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd Header Length',
]

# Names of the three base models that the stacking timers below call.
top_3_models = ['rf', 'xg', 'lgbm']

X_test_fs = X_val[fs]

In [44]:
# Time `predict` on X_test_fs for each stand-alone DoS model. Every entry
# is a (label, timing) pair, in the order the models are listed here.
dos_inference_times: list[tuple] = [
    (label, calculate_inference_time_in_ms(lambda m=model: m.predict(X_test_fs), TRIES))
    for label, model in (
        ("DT", dt),      # Decision Tree
        ("RF", rf),      # Random Forest
        ("ET", et),      # Extra Trees
        ("XG", xg),      # XGBoost
        ("LGBM", lgbm),  # LightGBM
        ("Cat", cat),    # CatBoost
    )
]

# Ensemble 1 (update the base models if the training's top 3 change)
def stk1_timer():
    """Run one pass of the stk1 pipeline: base predictions, then stk1."""
    # Each base model's predictions become one column of the meta input.
    prediction_columns = [
        base.predict(X_test_fs).reshape(-1, 1) for base in (rf, xg, lgbm)
    ]
    stk1.predict(np.concatenate(prediction_columns, axis=1))

dos_inference_times.append(
    ("STK Traditional", calculate_inference_time_in_ms(stk1_timer, TRIES))
)

# Ensemble 2 (update the base models if the training's top 3 change)
def stk2_timer():
    """Run one pass of the stk2 pipeline: base probabilities, then stk2."""
    # The predict_proba outputs are placed side by side as the meta input.
    probability_blocks = [
        base.predict_proba(X_test_fs) for base in (rf, xg, lgbm)
    ]
    stk2.predict(np.concatenate(probability_blocks, axis=1))

dos_inference_times.append(
    ("STK Confidence", calculate_inference_time_in_ms(stk2_timer, TRIES))
)

# Ensemble 3 (update the base models if the training's top 3 change)
def stk3_timer():
    """Run one pass of the stk3 pipeline: predictions + probabilities, then stk3."""
    base_models = (rf, xg, lgbm)

    # Prediction columns come first, followed by the predict_proba outputs.
    prediction_columns = [
        base.predict(X_test_fs).reshape(-1, 1) for base in base_models
    ]
    probability_blocks = [base.predict_proba(X_test_fs) for base in base_models]

    meta_input = np.concatenate(prediction_columns + probability_blocks, axis=1)
    stk3.predict(meta_input)

dos_inference_times.append(
    ("STK Hybrid", calculate_inference_time_in_ms(stk3_timer, TRIES))
)

In [45]:
# Order a copy of the DoS timings ascending by the timing value (index 1).
dos_times = list(dos_inference_times)
dos_times.sort(key=lambda entry: entry[1])
dos_times

[('DT', 0.00010634339957432386),
 ('XG', 0.00011878373927395645),
 ('Cat', 0.00013363732454691875),
 ('LGBM', 0.00047556850306581535),
 ('ET', 0.0011712116401777232),
 ('RF', 0.002062725721245142),
 ('STK Confidence', 0.00280482610908058),
 ('STK Traditional', 0.002927671784206637),
 ('STK Hybrid', 0.005490533262931026)]

In [46]:
# Show each DoS timing as a fixed-point string with five decimals.
[[label, f"{elapsed:.5f}"] for label, elapsed in dos_times]

[['DT', '0.00011'],
 ['XG', '0.00012'],
 ['Cat', '0.00013'],
 ['LGBM', '0.00048'],
 ['ET', '0.00117'],
 ['RF', '0.00206'],
 ['STK Confidence', '0.00280'],
 ['STK Traditional', '0.00293'],
 ['STK Hybrid', '0.00549']]

## Port Scan

In [47]:
# Load the tuned Port Scan models in one pass; the names on the left line
# up positionally with the file stems listed on the right.
dt, rf, et, xg, lgbm, cat, stk1, stk2, stk3 = (
    joblib.load(f"{PS_MODEL_LOCATION}/{stem}_tuned.pkl")
    for stem in ("dt", "rf", "et", "xg", "lgbm", "cat", "stk1", "stk2", "stk3")
)

# Columns of X_val that are fed to the Port Scan models.
fs = [
    'Bwd RST Flags', 'FWD Init Win Bytes', 'RST Flag Count',
    'Flow Duration', 'Packet Length Max', 'Flow Packets/s',
    'Protocol', 'Total Length of Fwd Packet', 'Flow IAT Max',
    'Fwd Seg Size Min', 'Average Packet Size', 'Packet Length Mean',
    'Fwd Packet Length Max', 'Bwd IAT Mean', 'Total Fwd Packet',
    'Flow Bytes/s', 'Fwd Act Data Pkts', 'Bwd Packets/s',
    'Bwd Packet Length Std', 'Fwd IAT Total', 'Flow IAT Mean',
    'Flow IAT Min', 'Fwd Segment Size Avg', 'Fwd Header Length',
    'Bwd Packet Length Mean', 'Packet Length Std', 'Fwd IAT Min',
    'Packet Length Variance', 'Bwd IAT Max',
]

# Names of the three base models that the stacking timers below call.
top_3_models = ['et', 'rf', 'lgbm']

X_test_fs = X_val[fs]

In [48]:
# Time `predict` on X_test_fs for each stand-alone Port Scan model. Every
# entry is a (label, timing) pair, in the order the models are listed here.
ps_inference_times: list[tuple] = [
    (label, calculate_inference_time_in_ms(lambda m=model: m.predict(X_test_fs), TRIES))
    for label, model in (
        ("DT", dt),      # Decision Tree
        ("RF", rf),      # Random Forest
        ("ET", et),      # Extra Trees
        ("XG", xg),      # XGBoost
        ("LGBM", lgbm),  # LightGBM
        ("Cat", cat),    # CatBoost
    )
]

# Ensemble 1 (update the base models if the training's top 3 change)
def stk1_timer():
    """Run one pass of the stk1 pipeline: base predictions, then stk1."""
    # Each base model's predictions become one column of the meta input.
    prediction_columns = [
        base.predict(X_test_fs).reshape(-1, 1) for base in (et, rf, lgbm)
    ]
    stk1.predict(np.concatenate(prediction_columns, axis=1))

ps_inference_times.append(
    ("STK Traditional", calculate_inference_time_in_ms(stk1_timer, TRIES))
)

# Ensemble 2 (update the base models if the training's top 3 change)
def stk2_timer():
    """Run one pass of the stk2 pipeline: base probabilities, then stk2."""
    # The predict_proba outputs are placed side by side as the meta input.
    probability_blocks = [
        base.predict_proba(X_test_fs) for base in (et, rf, lgbm)
    ]
    stk2.predict(np.concatenate(probability_blocks, axis=1))

ps_inference_times.append(
    ("STK Confidence", calculate_inference_time_in_ms(stk2_timer, TRIES))
)

# Ensemble 3 (update the base models if the training's top 3 change)
def stk3_timer():
    """Run one pass of the stk3 pipeline: predictions + probabilities, then stk3."""
    base_models = (et, rf, lgbm)

    # Prediction columns come first, followed by the predict_proba outputs.
    prediction_columns = [
        base.predict(X_test_fs).reshape(-1, 1) for base in base_models
    ]
    probability_blocks = [base.predict_proba(X_test_fs) for base in base_models]

    meta_input = np.concatenate(prediction_columns + probability_blocks, axis=1)
    stk3.predict(meta_input)

ps_inference_times.append(
    ("STK Hybrid", calculate_inference_time_in_ms(stk3_timer, TRIES))
)

In [49]:
# Order a copy of the Port Scan timings ascending by the timing value (index 1).
ps_times = list(ps_inference_times)
ps_times.sort(key=lambda entry: entry[1])
ps_times

[('DT', 6.200067718335812e-05),
 ('XG', 6.931041101604196e-05),
 ('Cat', 0.00010874639148311167),
 ('LGBM', 0.00030189499523552743),
 ('RF', 0.001089385293817583),
 ('ET', 0.00408097313355778),
 ('STK Traditional', 0.005626291905190202),
 ('STK Confidence', 0.005728392452822213),
 ('STK Hybrid', 0.011192935011438698)]

In [50]:
# Show each Port Scan timing as a fixed-point string with five decimals.
[[label, f"{elapsed:.5f}"] for label, elapsed in ps_times]

[['DT', '0.00006'],
 ['XG', '0.00007'],
 ['Cat', '0.00011'],
 ['LGBM', '0.00030'],
 ['RF', '0.00109'],
 ['ET', '0.00408'],
 ['STK Traditional', '0.00563'],
 ['STK Confidence', '0.00573'],
 ['STK Hybrid', '0.01119']]