In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import glob
from pathlib import Path
import os
from omegaconf import OmegaConf
from pprint import pprint as pp
from icecream import ic
from typing import List

# The project root is taken to be the parent of the current working directory;
# the data-directory layout is read from <project>/config/data.yaml.
project = Path(".").resolve().parent
configs = project / "config"
data_paths = OmegaConf.load(configs.joinpath("data.yaml"))

# Echo the loaded configuration as YAML so the resolved layout is visible.
print(OmegaConf.to_yaml(data_paths))

raw: ../data/raw
interim: ../data/interim
processed: ../data/processed



In [3]:
# Read the training extract from the configured "processed" directory.
# With on_bad_lines="warn", pandas reports malformed rows with a warning and
# skips them instead of raising.
data_int = Path(data_paths.processed).joinpath("modsec_audit_train_v1_new.csv")
df = pd.read_csv(
    data_int,
    engine="python",
    on_bad_lines="warn",
)


In [4]:
import re
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils import resample


In [5]:
def clean_and_engineer(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the audit-log frame with derived size features.

    The input frame is left untouched; all work happens on a copy. Steps:

    1. Drop the bookkeeping columns ``Unnamed: 0``, ``id``, ``timestamp`` and
       ``req_cf_ray`` if they are present.
    2. Fill missing ``req_content_length`` / ``resp_content_length`` with 0
       and cast both columns to ``int``.
    3. Add ``content_length_ratio`` = response length / request length,
       set to 0 where the request length is 0.
    4. Add 0/1 flags ``large_req`` / ``large_resp`` for rows strictly above
       the 75th percentile of the respective length column.
    5. Fill missing values in ``layer_type``, ``method``, ``status_code``,
       ``target`` and ``resp_vary`` with ``"Unknown"`` and cast them to
       ``category``.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Frame containing at least ``req_content_length``,
        ``resp_content_length``, ``layer_type``, ``method``, ``status_code``,
        ``target`` and ``resp_vary``.

    Returns
    -------
    pd.DataFrame
        New frame with the dropped columns removed and the three engineered
        columns appended after the original ones.

    Raises
    ------
    KeyError
        If one of the required columns listed above is missing.
    """
    df = df_raw.copy()

    # ---------- basic cleaning ----------
    # errors="ignore": these columns are pure bookkeeping, so their absence
    # (e.g. no "Unnamed: 0" when the CSV was written without its index) must
    # not abort the whole preparation step.
    # Note: timestamp and req_cf_ray are dropped without deriving features
    # from them; the earlier date-part and datacenter-code extraction is
    # intentionally disabled.
    df = df.drop(
        columns=["Unnamed: 0", "id", "timestamp", "req_cf_ray"],
        errors="ignore",
    )

    # ---------- feature engineering ----------
    for col in ["req_content_length", "resp_content_length"]:
        df[col] = df[col].fillna(0).astype(int)

    # The division is evaluated for every row; np.where then replaces the
    # entries belonging to zero-length requests with 0.
    df["content_length_ratio"] = np.where(
        df["req_content_length"] == 0, 0,
        df["resp_content_length"] / df["req_content_length"]
    )

    # Thresholds are computed from the frame being processed, so the same row
    # can be flagged differently depending on which frame it arrives in.
    q75_req = df["req_content_length"].quantile(0.75)
    q75_resp = df["resp_content_length"].quantile(0.75)
    df["large_req"] = (df["req_content_length"] > q75_req).astype(int)
    df["large_resp"] = (df["resp_content_length"] > q75_resp).astype(int)

    # ---------- tidy up categorical ----------
    # NOTE(review): for a numeric column such as status_code that actually
    # contains missing values, the "Unknown" fill presumably produces mixed
    # numeric/string categories -- confirm this is acceptable downstream.
    cat_cols = ["layer_type", "method", "status_code", "target", "resp_vary"]
    for c in cat_cols:
        df[c] = df[c].fillna("Unknown").astype("category")
    return df


In [6]:
# Run the cleaning / feature-engineering step on the loaded frame and show
# the result as the cell output.
df_fe = df.pipe(clean_and_engineer)
df_fe

Unnamed: 0,layer_type,method,local_port,remote_port,req_content_length,resp_content_length,resp_vary,status_code,target,content_length_ratio,large_req,large_resp
0,Unknown,GET,1234,37530,0,50,Origin,401,ssrf,0.000000,0,0
1,SINGLE_LAYERED,POST,1234,37562,86,33,Origin,200,sql_injection,0.383721,0,0
2,SINGLE_LAYERED,POST,1234,37562,86,33,Origin,200,sql_injection,0.383721,0,0
3,SINGLE_LAYERED,POST,1234,37562,74,62,Origin,401,sql_injection,0.837838,0,1
4,SINGLE_LAYERED,POST,1234,37562,77,64,Origin,403,sql_injection,0.831169,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
406,SINGLE_LAYERED,POST,1234,45610,87,62,Origin,401,sql_injection,0.712644,0,1
407,SINGLE_LAYERED,POST,1234,45610,86,64,Origin,403,sql_injection,0.744186,0,1
408,SINGLE_LAYERED,POST,1234,45610,86,33,Origin,200,sql_injection,0.383721,0,0
409,SINGLE_LAYERED,POST,1234,45610,86,33,Origin,200,sql_injection,0.383721,0,0


In [7]:
# Persist the prepared frame next to the other processed data, once as
# Parquet and once as CSV, both without the row index.
df_fe.to_parquet(
    Path(data_paths.processed).joinpath("modsec_audit_train_v1_prepared_new.parquet"),
    index=False,
)
df_fe.to_csv(
    Path(data_paths.processed).joinpath("modsec_audit_train_v1_prepared_new.csv"),
    index=False,
)
