# Used libraries

In [1]:
import os
import pandas as pd

from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer

from sklearn.metrics import average_precision_score

# Used algorithms

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Data

In [3]:
# The attack recording and the normal-operation recording are stored as two separate
# CSV files; both are read here so that they can be merged into one dataset later on.

path_1 = os.path.join(os.getcwd(), "../../datasets/WADI_attackdataLABLE.csv")
path_2 = os.path.join(os.getcwd(), "../../datasets/WADI_14days_new.csv")

# header=1 -> the attack file's column names are read from its second line;
# header=0 -> the normal-operation file's column names are read from its first line.
df_1 = pd.read_csv(path_1, sep=',', header=1)
df_2 = pd.read_csv(path_2, sep=',', header=0)

In [4]:
# The row column is trivial and the date/time columns are not relevant for testing,
# so all three are removed from both frames.
# Note the header spelling differs between the files: the attack frame uses 'Row ' and
# 'Date ' (trailing space), the normal-operation frame uses 'Row' and 'Date'.

df_1 = df_1.drop(['Row ', 'Date ', 'Time'], axis=1)
df_2 = df_2.drop(['Row', 'Date', 'Time'], axis=1)

In [5]:
# The normal-operation recording ships without an attack label, so the column is created
# here and filled with 1 -- the value the column name itself assigns to "No Attack".

df_2['Attack LABLE (1:No Attack, -1:Attack)'] = 1

# Row-wise concatenation: attack rows first, normal-operation rows after them.
# The original row labels of both frames are kept as they are.
df = pd.concat(objs=[df_1, df_2], axis='index')

In [6]:
# These four columns contain nothing but missing values, so they are removed
# before the remaining gaps are imputed.

df = df.drop(
    labels=['2_LS_001_AL', '2_LS_002_AL', '2_P_001_STATUS', '2_P_002_STATUS'],
    axis='columns',
)

In [7]:
# Remaining missing values are filled with the per-column mean (SimpleImputer, strategy = MEAN).
# NOTE(review): the imputer is fitted on the complete frame, which at this point still
# contains the attack-label column and all rows later used for cross-validation --
# confirm that this is intended.

imp = SimpleImputer(strategy="mean")
imp.fit(df)

# The frame is rebuilt without explicit column names or index here;
# both are restored in the next cell.
df_prep = pd.DataFrame(imp.transform(df))

In [8]:
# Put the original row index and column names back onto the imputed frame,
# then let `df` refer to the imputed data from here on.

df_prep.index, df_prep.columns = df.index, df.columns
df = df_prep

# Training and testing all algorithms for measuring Average Precision Score:

In [9]:
# Target: the attack-label column. Features: every other column of the frame.
y = df['Attack LABLE (1:No Attack, -1:Attack)']
X = df.drop(columns=['Attack LABLE (1:No Attack, -1:Attack)'])

In [10]:
# VERY IMPORTANT: the labels are re-coded so that the minority class (Attack) becomes
# the class that is evaluated: Attack (-1) -> 1, No Attack (1) -> 0.

y = y.replace(to_replace={-1: 1, 1: 0})

## Define all classification models

In [11]:
# Display name -> estimator instance. Every model is created with random_state=42 and
# otherwise default settings; the insertion order below is the evaluation order.
classifiers = {
    display_name: estimator_class(random_state=42)
    for display_name, estimator_class in (
        ('AdaBoost', AdaBoostClassifier),
        ('Bagging', BaggingClassifier),
        ('Decision Tree', DecisionTreeClassifier),
        ('Random Forest', RandomForestClassifier),
        ('Extra Trees', ExtraTreesClassifier),
        ('Gradient Boosting', GradientBoostingClassifier),
        ('HistGradientBoosting', HistGradientBoostingClassifier),
    )
}

## Calculate the Average Precision Score for all trained models (note: the printed label "Precision-Recall AUC" is a misnomer; the values shown are Average Precision Scores)

In [12]:
# Evaluate every model with 10-fold cross-validated probability predictions and report
# its average precision. The printed text says "Precision-Recall AUC", but the number
# is the value returned by average_precision_score.
for clf_name, clf in classifiers.items():
    fold_probabilities = cross_val_predict(
        clf,
        X,
        y,
        cv=10,
        method='predict_proba',
        n_jobs=3,
    )
    # Keep only column 1 -- presumably the probability of the attack class
    # (label 1 after the re-coding of y); verify against the fitted classes_ order.
    y_pred_proba = fold_probabilities[:, 1]
    score = average_precision_score(y, y_pred_proba)
    # One result line followed by one blank line per classifier.
    print(f"{clf_name}: Precision-Recall AUC = {score:.4f}", end="\n\n")

AdaBoost: Precision-Recall AUC = 0.0743

Bagging: Precision-Recall AUC = 0.0592

Decision Tree: Precision-Recall AUC = 0.0336

Random Forest: Precision-Recall AUC = 0.0911

Extra Trees: Precision-Recall AUC = 0.1169

Gradient Boosting: Precision-Recall AUC = 0.0943

HistGradientBoosting: Precision-Recall AUC = 0.0634

