In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import glob
from pathlib import Path
import os
from omegaconf import OmegaConf
from pprint import pprint as pp
from icecream import ic
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from EvalCV import *


In [2]:
# Project root is the parent of the directory the notebook is run from.
project = Path.cwd().resolve().parent
configs = project / "config"

# data.yaml holds the raw / interim / processed data directories; echo it so
# the paths in use are visible in the notebook output.
data_paths = OmegaConf.load(configs / "data.yaml")
print(OmegaConf.to_yaml(data_paths))

# Load the prepared training set from the processed-data directory.
data_int = Path(data_paths.processed, "modsec_audit_train_v1_prepared.parquet")
df = pd.read_parquet(data_int)

raw: ../data/raw
interim: ../data/interim
processed: ../data/processed



In [3]:
# Display the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,layer_type,method,local_port,remote_port,req_content_length,resp_content_length,resp_vary,status_code,target,cf_datacenter,content_length_ratio,large_req,large_resp
0,SINGLE_LAYERED,OPTIONS,80,39486,0,0,"Access-Control-Request-Method, Access-Control-...",204,ssrf,HKG,0.000000,0,0
1,SINGLE_LAYERED,POST,80,39486,50,62,Origin,401,ssrf,HKG,1.240000,0,0
2,SINGLE_LAYERED,OPTIONS,80,39486,0,0,"Access-Control-Request-Method, Access-Control-...",204,ssrf,HKG,0.000000,0,0
3,SINGLE_LAYERED,POST,80,39486,57,33,Origin,200,ssrf,HKG,0.578947,0,0
4,SINGLE_LAYERED,GET,80,39486,0,8846,Origin,200,ssrf,HKG,0.000000,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20003,Unknown,POST,1234,47098,57,62,Origin,401,Unknown,HKG,1.087719,0,0
20004,SINGLE_LAYERED,POST,1234,58032,72,64,Origin,403,sql_injection,HKG,0.888889,0,1
20005,SINGLE_LAYERED,POST,1234,39592,74,33,Origin,200,sql_injection,HKG,0.445946,0,0
20006,SINGLE_LAYERED,POST,1234,39592,74,33,Origin,200,sql_injection,HKG,0.445946,0,0


# Data Modeling

In [4]:
# Pretty-print the candidate (estimator, name) pairs that will be evaluated.
# NOTE(review): `models` is not defined in this notebook; presumably it is
# pulled in by `from EvalCV import *` — confirm and consider importing it by name.
pp(models)

[(LogisticRegression(class_weight='balanced', max_iter=1000), 'lr_clf'),
 (SVC(class_weight='balanced'), 'svm_rbf_clf'),
 (KNeighborsClassifier(n_neighbors=3), 'knn3_clf'),
 (KNeighborsClassifier(), 'knn5_clf'),
 (KNeighborsClassifier(n_neighbors=7), 'knn7_clf'),
 (GaussianNB(), 'gnb_clf'),
 (DecisionTreeClassifier(class_weight='balanced'), 'dt_clf'),
 (RandomForestClassifier(class_weight='balanced', n_jobs=-1), 'rfc_clf'),
 (ExtraTreesClassifier(class_weight='balanced', n_jobs=-1), 'etc_clf'),
 (GradientBoostingClassifier(), 'gbc_clf'),
 (HistGradientBoostingClassifier(class_weight='balanced'), 'hgb_clf'),
 (XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=N

In [5]:
# Integer code for each target label; a label's position in the tuple is its code.
target2enc = {
    label: code
    for code, label in enumerate(("Unknown", "ssrf", "xss", "sql_injection"))
}

# Reverse lookup: integer code -> target label.
enc2target = dict(zip(target2enc.values(), target2enc.keys()))

In [6]:
# Show both lookup tables (label -> code, code -> label).
target2enc, enc2target

({'Unknown': 0, 'ssrf': 1, 'xss': 2, 'sql_injection': 3},
 {0: 'Unknown', 1: 'ssrf', 2: 'xss', 3: 'sql_injection'})

In [7]:
# Label column, selected by name. Picking it by position in the list of
# non-numeric columns would silently choose a different column (and leak the
# real label into X) as soon as the column order of `df` changes.
target = "target"
if target not in df.columns:
    raise KeyError(f"Expected label column {target!r} in df, got {list(df.columns)}")

# Split the remaining columns by dtype: numeric vs. everything else.
num_cols = [col for col in df.select_dtypes(include=np.number).columns if col != target]
cat_cols = [col for col in df.select_dtypes(exclude=np.number).columns if col != target]
X = df[num_cols + cat_cols]

# Encode the labels as integers. Series.map leaves NaN for labels missing from
# target2enc, so report them explicitly instead of failing on the int cast.
y_encoded = df[target].map(target2enc)
unmapped = df.loc[y_encoded.isna(), target].unique()
if len(unmapped):
    raise ValueError(f"Labels missing from target2enc: {sorted(map(str, unmapped))}")
y = y_encoded.astype(int)

In [8]:
# Run every candidate in `models` through evaluate_models and keep the returned
# summary. NOTE(review): evaluate_models comes from EvalCV; n_splits=5 is
# presumably the number of cross-validation folds — confirm.
df_results = evaluate_models(
    X,
    y,
    models=models,
    num_cols=num_cols,
    cat_cols=cat_cols,
    n_splits=5,
)

Evaluating models:   0%|          | 0/12 [00:00<?, ?it/s]


=== Model: lr_clf ===
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        66
           1       1.00      0.95      0.98     19842
           2       0.07      0.77      0.12        26
           3       0.09      0.91      0.17        74

    accuracy                           0.95     20008
   macro avg       0.53      0.90      0.56     20008
weighted avg       0.99      0.95      0.97     20008

Confusion Matrix:
[[   65     0     0     1]
 [    4 18914   272   652]
 [    0     4    20     2]
 [    0     4     3    67]]

=== Model: svm_rbf_clf ===
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        66
           1       1.00      0.99      0.99     19842
           2       0.58      0.81      0.68        26
           3       0.23      0.88      0.36        74

    accuracy                           0.99     20008
   macro avg   

In [11]:
# Persist the per-model summary. The results directory is created first so the
# write does not fail on a fresh checkout where ../data/results does not exist.
results_path = Path("../data/results/model_eval_results.parquet")
results_path.parent.mkdir(parents=True, exist_ok=True)
df_results.to_parquet(results_path)

# Display the summary table as the cell output.
df_results

Unnamed: 0,model,accuracy_mean,accuracy_std,f1_mean,f1_std,precision_mean,precision_std,recall_mean,recall_std
0,lr_clf,0.952918,0.013297,0.568486,0.03177,0.533859,0.02403,0.902557,0.050573
1,svm_rbf_clf,0.987655,0.002441,0.75445,0.037037,0.709457,0.038218,0.910862,0.059476
2,knn3_clf,0.998101,0.000752,0.873096,0.049559,0.9322,0.047834,0.838234,0.062584
3,knn5_clf,0.997601,0.000583,0.813783,0.055394,0.905617,0.069122,0.771078,0.061224
4,knn7_clf,0.997301,0.000332,0.785428,0.051316,0.896325,0.082672,0.736279,0.043414
5,gnb_clf,0.028539,0.004913,0.278526,0.00366,0.509024,0.001153,0.635029,0.0336
6,dt_clf,0.99885,0.0002,0.923342,0.012771,0.956012,0.041736,0.906115,0.033443
7,rfc_clf,0.999,0.000474,0.933094,0.040706,0.983107,0.020279,0.904511,0.066972
8,etc_clf,0.99875,0.000474,0.90842,0.038317,0.968439,0.03279,0.872832,0.050791
9,gbc_clf,0.997601,0.00128,0.849542,0.056357,0.915796,0.033807,0.82518,0.062199
