# Used libraries

In [1]:
import os
import pandas as pd

from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Used algorithms

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Data

In [3]:
# We have to merge the dataset containing attack data and the dataset containing data under normal operation

path_1 = os.path.join(os.getcwd(), "../../datasets/WADI_attackdataLABLE.csv")
path_2 = os.path.join(os.getcwd(), "../../datasets/WADI_14days_new.csv")

df_1 = pd.read_csv(path_1, header = 1, sep = ',')
df_2 = pd.read_csv(path_2, header = 0, sep = ',')

In [4]:
# Row column is trivial, Date and Time columns are not relevant for testing so we drop them

df_1 = df_1.drop(columns = ['Row ', 'Date ', 'Time'])
df_2 = df_2.drop(columns = ['Row', 'Date', 'Time'])

In [5]:
# Initially the normal operation dataset did not have an attack label feature so we have to add it ourselves

df_2['Attack LABLE (1:No Attack, -1:Attack)'] = 1

df = pd.concat([df_1, df_2], axis = 0)

In [6]:
# We also have to drop the following columns because they only have missing values

df = df.drop(columns = ['2_LS_001_AL', '2_LS_002_AL', '2_P_001_STATUS', '2_P_002_STATUS'])

In [7]:
# We use the fillna pandas method with a forward filling strategy for the rest of the missing values -> strategy = LAST VALUE ABOVE

df.fillna(method='ffill', inplace=True)

In [9]:
# Finally, we define a custom dictionary for all the scoring functions that we will use in evaluating algorithms

scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1': make_scorer(f1_score)
          }

# Training and testing algorithms:

In [10]:
X = df.drop('Attack LABLE (1:No Attack, -1:Attack)', axis=1)
y = df['Attack LABLE (1:No Attack, -1:Attack)']

## AdaBoostClassifier

In [11]:
model_abclf = AdaBoostClassifier(algorithm='SAMME', random_state=42)
scores = cross_validate(model_abclf, X, y, cv=10, scoring=scoring)

In [12]:
for score_name, score_values in scores.items():
    print(f"{score_name.capitalize()}: {score_values.mean():.4f} ± {score_values.std():.4f}")
    print()

Fit_time: 214.4625 ± 4.0250

Score_time: 1.7773 ± 0.1254

Test_accuracy: 0.9640 ± 0.0655

Test_precision: 0.9914 ± 0.0022

Test_recall: 0.9722 ± 0.0681

Test_f1: 0.9803 ± 0.0373



## BaggingClassifier

In [13]:
model_bclf = BaggingClassifier(random_state=42)
scores = cross_validate(model_bclf, X, y, cv=10, scoring=scoring)

In [14]:
for score_name, score_values in scores.items():
    print(f"{score_name.capitalize()}: {score_values.mean():.4f} ± {score_values.std():.4f}")
    print()

Fit_time: 354.3228 ± 59.6237

Score_time: 0.7812 ± 0.0814

Test_accuracy: 0.9399 ± 0.1050

Test_precision: 0.9930 ± 0.0028

Test_recall: 0.9459 ± 0.1064

Test_f1: 0.9655 ± 0.0625



## DecisionTreeClassifier

In [15]:
model_dtclf = DecisionTreeClassifier(random_state=42)
scores = cross_validate(model_dtclf, X, y, cv=10, scoring=scoring)

In [16]:
for score_name, score_values in scores.items():
    print(f"{score_name.capitalize()}: {score_values.mean():.4f} ± {score_values.std():.4f}")
    print()

Fit_time: 50.9079 ± 11.8715

Score_time: 0.1404 ± 0.0084

Test_accuracy: 0.9418 ± 0.1001

Test_precision: 0.9933 ± 0.0028

Test_recall: 0.9475 ± 0.1010

Test_f1: 0.9669 ± 0.0590



## RandomForestClassifier

In [17]:
model_rfclf = RandomForestClassifier(random_state=42)
scores = cross_validate(model_dtclf, X, y, cv=10, scoring=scoring)

In [18]:
for score_name, score_values in scores.items():
    print(f"{score_name.capitalize()}: {score_values.mean():.4f} ± {score_values.std():.4f}")
    print()

Fit_time: 50.7097 ± 11.6313

Score_time: 0.1358 ± 0.0023

Test_accuracy: 0.9418 ± 0.1001

Test_precision: 0.9933 ± 0.0028

Test_recall: 0.9475 ± 0.1010

Test_f1: 0.9669 ± 0.0590



## ExtraTreesClassifier

In [19]:
model_etclf = ExtraTreesClassifier(random_state=42)
scores = cross_validate(model_etclf, X, y, cv=10, scoring=scoring)

In [20]:
for score_name, score_values in scores.items():
    print(f"{score_name.capitalize()}: {score_values.mean():.4f} ± {score_values.std():.4f}")
    print()

Fit_time: 40.8681 ± 1.7544

Score_time: 0.4043 ± 0.0201

Test_accuracy: 0.9734 ± 0.0434

Test_precision: 0.9943 ± 0.0028

Test_recall: 0.9788 ± 0.0440

Test_f1: 0.9860 ± 0.0233



## GradientBoostingClassifier

In [None]:
model_gbclf = GradientBoostingClassifier(random_state=42)
scores = cross_validate(model_gbclf, X, y, cv=10, scoring=scoring)

In [None]:
for score_name, score_values in scores.items():
    print(f"{score_name.capitalize()}: {score_values.mean():.4f} ± {score_values.std():.4f}")
    print()

## HistGradientBoostingClassifier

In [None]:
model_hgbclf = HistGradientBoostingClassifier(random_state=42)
scores = cross_validate(model_hgbclf, X, y, cv=10, scoring=scoring)

In [None]:
for score_name, score_values in scores.items():
    print(f"{score_name.capitalize()}: {score_values.mean():.4f} ± {score_values.std():.4f}")
    print()