# Used libraries

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score, classification_report

# Used algorithms

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Data

In [3]:
# Two CSV files are read here: the dataset containing attack data and the
# dataset containing data under normal operation. They are merged further down.

path_1, path_2 = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in (
        "../../datasets/WADI_attackdataLABLE.csv",
        "../../datasets/WADI_14days_new.csv",
    )
)

# header=1 takes the column names from row 1 (0-based) of the attack file,
# header=0 from row 0 of the normal-operation file.
df_1 = pd.read_csv(path_1, sep=',', header=1)
df_2 = pd.read_csv(path_2, sep=',', header=0)

In [4]:
# The Row column is trivial and the Date and Time columns are not relevant
# for testing, so all three are removed from both frames.
# Note the trailing spaces in 'Row ' and 'Date ' for the attack dataset.

df_1 = df_1.drop(['Row ', 'Date ', 'Time'], axis=1)
df_2 = df_2.drop(['Row', 'Date', 'Time'], axis=1)

In [5]:
# The normal-operation dataset has no attack label column, so one is added
# here with the constant value 1 (the "No Attack" code of the label column).

df_2['Attack LABLE (1:No Attack, -1:Attack)'] = 1

# Stack the two frames row-wise. ignore_index=True gives the merged frame a
# fresh 0..n-1 index; without it each frame keeps its own row numbers and the
# result carries duplicate index labels, so a label-based lookup such as
# df.loc[0] would return one row from each source frame.
df = pd.concat([df_1, df_2], axis = 0, ignore_index = True)

# target_names is used later when generating the classification reports.
# classification_report pairs the names with the labels in sorted order,
# hence -1 -> 'Attack' comes first and 1 -> 'Not attack' second.

target_names = ['Attack', 'Not attack']

In [6]:
# These four columns have only missing values, so they are removed from the
# merged frame as well.

all_missing_columns = ['2_LS_001_AL', '2_LS_002_AL', '2_P_001_STATUS', '2_P_002_STATUS']
df = df.drop(columns = all_missing_columns)

In [7]:
# The remaining missing values are filled in with a simple imputer that uses
# each column's most frequent value.
# NOTE(review): the imputer is fitted on the full frame, which at this point
# still contains the label column and the rows that later end up in the test
# split - confirm this is intended.
# The imputer output is wrapped in a new DataFrame built without column names
# or index, so it carries default labels at this point.

imp = SimpleImputer(strategy="most_frequent")
imp.fit(df)
df_prep = pd.DataFrame(imp.transform(df))

In [8]:
# The frame rebuilt from the imputer output was created without labels, so
# the column names and the row index are copied over from the original frame.
# From here on df refers to the imputed frame.

df_prep.columns, df_prep.index = df.columns, df.index
df = df_prep

# Training and testing algorithms:

In [9]:
# Target: the attack label column; features: every other column.
label_column = 'Attack LABLE (1:No Attack, -1:Attack)'
y = df[label_column]
X = df.drop(columns=[label_column])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible.
# NOTE(review): the split is not stratified although the reports below show a
# heavily imbalanced test set (1957 attack rows vs 189518 normal rows) -
# consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## AdaBoostClassifier

In [10]:
# AdaBoost ensemble with 50 estimators and the SAMME boosting algorithm;
# random_state is fixed so the fit is reproducible.
model_abclf = AdaBoostClassifier(
    algorithm='SAMME',
    n_estimators=50,
    random_state=42,
)
model_abclf.fit(X_train, y_train)

In [11]:
# Predict labels for the held-out test rows with the fitted AdaBoost model.
y_pred = model_abclf.predict(X=X_test)

In [12]:
# output_dict=True makes classification_report return a nested dict of
# per-class and averaged metrics instead of a formatted string; the dict is
# printed as-is.
abclf_report = classification_report(
    y_test,
    y_pred,
    target_names=target_names,
    output_dict=True,
)
print(abclf_report)

{'Attack': {'precision': 0.8035714285714286, 'recall': 0.41389882473173223, 'f1-score': 0.5463743676222597, 'support': 1957.0}, 'Not attack': {'precision': 0.9939779594365428, 'recall': 0.9989552443567365, 'f1-score': 0.9964603865942077, 'support': 189518.0}, 'accuracy': 0.9929755842799322, 'macro avg': {'precision': 0.8987746940039857, 'recall': 0.7064270345442344, 'f1-score': 0.7714173771082337, 'support': 191475.0}, 'weighted avg': {'precision': 0.9920318798914166, 'recall': 0.9929755842799322, 'f1-score': 0.9918602124768132, 'support': 191475.0}}


## ExtraTreesClassifier

In [13]:
# Extra-trees ensemble with 100 estimators; random_state is fixed so the fit
# is reproducible.
model_etclf = ExtraTreesClassifier(
    random_state=42,
    n_estimators=100,
)
model_etclf.fit(X_train, y_train)

In [14]:
# Predict labels for the held-out test rows with the fitted extra-trees model.
y_pred = model_etclf.predict(X=X_test)

In [15]:
# output_dict=True makes classification_report return a nested dict of
# per-class and averaged metrics instead of a formatted string; the dict is
# printed as-is.
etclf_report = classification_report(
    y_test,
    y_pred,
    target_names=target_names,
    output_dict=True,
)
print(etclf_report)

{'Attack': {'precision': 0.99846547314578, 'recall': 0.9974450689831375, 'f1-score': 0.9979550102249489, 'support': 1957.0}, 'Not attack': {'precision': 0.999973617560152, 'recall': 0.9999841703690414, 'f1-score': 0.9999788939367557, 'support': 189518.0}, 'accuracy': 0.9999582190886539, 'macro avg': {'precision': 0.999219545352966, 'recall': 0.9987146196760894, 'f1-score': 0.9989669520808523, 'support': 191475.0}, 'weighted avg': {'precision': 0.999958203335742, 'recall': 0.9999582190886539, 'f1-score': 0.9999582085186906, 'support': 191475.0}}


## GradientBoostingClassifier

In [16]:
# Gradient boosting with 100 estimators, learning rate 0.1 and max_depth 3;
# random_state is fixed so the fit is reproducible.
model_gbclf = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100,
    random_state=42,
)
model_gbclf.fit(X_train, y_train)

In [17]:
# Predict labels for the held-out test rows with the fitted gradient boosting model.
y_pred = model_gbclf.predict(X=X_test)

In [18]:
# output_dict=True makes classification_report return a nested dict of
# per-class and averaged metrics instead of a formatted string; the dict is
# printed as-is.
gbclf_report = classification_report(
    y_test,
    y_pred,
    target_names=target_names,
    output_dict=True,
)
print(gbclf_report)

{'Attack': {'precision': 0.9977628635346756, 'recall': 0.911599386816556, 'f1-score': 0.9527369826435247, 'support': 1957.0}, 'Not attack': {'precision': 0.999087971236827, 'recall': 0.9999788938253886, 'f1-score': 0.9995332340027162, 'support': 189518.0}, 'accuracy': 0.9990755973364669, 'macro avg': {'precision': 0.9984254173857513, 'recall': 0.9557891403209723, 'f1-score': 0.9761351083231204, 'support': 191475.0}, 'weighted avg': {'precision': 0.9990744277675849, 'recall': 0.9990755973364669, 'f1-score': 0.9990549456417818, 'support': 191475.0}}
