# Used libraries

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score, classification_report

# Used algorithms

In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Data

In [3]:
# The attack recordings and the normal-operation recordings live in two separate CSV files.
# Both are read here; they are merged into a single frame further down.

path_1, path_2 = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in (
        "../../datasets/WADI_attackdataLABLE.csv",
        "../../datasets/WADI_14days_new.csv",
    )
)

# header=1: the attack file's column names are read from its second line
df_1 = pd.read_csv(path_1, sep=',', header=1)
# header=0: the normal-operation file's column names are on its first line
df_2 = pd.read_csv(path_2, sep=',', header=0)

In [4]:
# The Row column is trivial and the Date/Time columns are not relevant for testing,
# so all three are removed from both frames.
# Note that the attack file spells two of these headers with a trailing space ('Row ', 'Date ').

df_1 = df_1.drop(['Row ', 'Date ', 'Time'], axis='columns')
df_2 = df_2.drop(['Row', 'Date', 'Time'], axis='columns')

In [5]:
# The normal-operation dataset has no attack label of its own, so the column is added here.
# Every one of its rows gets 1, which the column name defines as "No Attack".

df_2['Attack LABLE (1:No Attack, -1:Attack)'] = 1

# Stack the two datasets row-wise. ignore_index=True gives the merged frame a fresh 0..n-1 index;
# without it the row labels of df_1 and df_2 would both be kept and therefore repeat inside df.

df = pd.concat([df_1, df_2], axis = 0, ignore_index = True)

# target_names is used later for generating classification reports.
# classification_report pairs these names with the labels in sorted order,
# so -1 maps to 'Attack' and 1 maps to 'Not attack' (assuming the labels are exactly -1 and 1).

target_names = ['Attack', 'Not attack']

# Training and testing algorithms:

In [6]:
# Features are every column except the attack label; the label column alone is the target.
label_column = 'Attack LABLE (1:No Attack, -1:Attack)'

X = df.drop(columns=label_column)
y = df[label_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle reproducible.
# NOTE(review): the split is random and not stratified on the label - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## BaggingClassifier

In [7]:
# Bagging ensemble with default settings apart from the fixed seed, trained on the training split.
model_bclf = BaggingClassifier(random_state=42)
model_bclf.fit(X=X_train, y=y_train)

In [8]:
# Predict labels for the held-out test rows with the bagging model.
y_pred = model_bclf.predict(X=X_test)

In [9]:
# Print the bagging model's classification report; output_dict=True makes it a dict.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
        output_dict=True,
    )
)

{'Attack': {'precision': 0.9989769820971867, 'recall': 0.99795605518651, 'f1-score': 0.9984662576687117, 'support': 1957.0}, 'Not attack': {'precision': 0.9999788940481216, 'recall': 0.9999894469126943, 'f1-score': 0.9999841704525667, 'support': 189518.0}, 'accuracy': 0.9999686643164905, 'macro avg': {'precision': 0.9994779380726542, 'recall': 0.9989727510496021, 'f1-score': 0.9992252140606392, 'support': 191475.0}, 'weighted avg': {'precision': 0.9999686538512919, 'recall': 0.9999686643164905, 'f1-score': 0.999968656389018, 'support': 191475.0}}


## DecisionTreeClassifier

In [10]:
# Single decision tree with default settings apart from the fixed seed, trained on the training split.
model_dtclf = DecisionTreeClassifier(random_state=42)
model_dtclf.fit(X=X_train, y=y_train)

In [11]:
# Predict labels for the held-out test rows with the decision tree.
y_pred = model_dtclf.predict(X=X_test)

In [12]:
# Print the decision tree's classification report; output_dict=True makes it a dict.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
        output_dict=True,
    )
)

{'Attack': {'precision': 0.9989775051124744, 'recall': 0.9984670413898825, 'f1-score': 0.9987222080245336, 'support': 1957.0}, 'Not attack': {'precision': 0.9999841704525667, 'recall': 0.9999894469126943, 'f1-score': 0.9999868086756701, 'support': 189518.0}, 'accuracy': 0.9999738869304087, 'macro avg': {'precision': 0.9994808377825206, 'recall': 0.9992282441512884, 'f1-score': 0.9993545083501019, 'support': 191475.0}, 'weighted avg': {'precision': 0.9999738816729842, 'recall': 0.9999738869304087, 'f1-score': 0.9999738836281481, 'support': 191475.0}}


## HistGradientBoostingClassifier

In [13]:
# Histogram-based gradient boosting with default settings apart from the fixed seed,
# trained on the training split.
model_hgbclf = HistGradientBoostingClassifier(random_state=42)
model_hgbclf.fit(X=X_train, y=y_train)

In [14]:
# Predict labels for the held-out test rows with the gradient boosting model.
y_pred = model_hgbclf.predict(X=X_test)

In [15]:
# Print the gradient boosting model's classification report; output_dict=True makes it a dict.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
        output_dict=True,
    )
)

{'Attack': {'precision': 0.9994882292732856, 'recall': 0.99795605518651, 'f1-score': 0.998721554589619, 'support': 1957.0}, 'Not attack': {'precision': 0.9999788941594863, 'recall': 0.9999947234563471, 'f1-score': 0.9999868087452742, 'support': 189518.0}, 'accuracy': 0.9999738869304087, 'macro avg': {'precision': 0.9997335617163859, 'recall': 0.9989753893214286, 'f1-score': 0.9993541816674466, 'support': 191475.0}, 'weighted avg': {'precision': 0.9999738792427488, 'recall': 0.9999738869304087, 'f1-score': 0.9999738770185077, 'support': 191475.0}}


## RandomForestClassifier

In [16]:
# Random forest with default settings apart from the fixed seed, trained on the training split.
model_rfclf = RandomForestClassifier(random_state=42)
model_rfclf.fit(X=X_train, y=y_train)

In [18]:
# Predict labels for the held-out test rows with the random forest.
y_pred = model_rfclf.predict(X=X_test)

In [19]:
# Print the random forest's classification report; output_dict=True makes it a dict.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
        output_dict=True,
    )
)

{'Attack': {'precision': 0.9984670413898825, 'recall': 0.9984670413898825, 'f1-score': 0.9984670413898825, 'support': 1957.0}, 'Not attack': {'precision': 0.9999841703690414, 'recall': 0.9999841703690414, 'f1-score': 0.9999841703690414, 'support': 189518.0}, 'accuracy': 0.9999686643164905, 'macro avg': {'precision': 0.999225605879462, 'recall': 0.999225605879462, 'f1-score': 0.999225605879462, 'support': 191475.0}, 'weighted avg': {'precision': 0.9999686643164905, 'recall': 0.9999686643164905, 'f1-score': 0.9999686643164905, 'support': 191475.0}}


In [None]:
# Report how many rows of the merged dataset belong to each class.
# The counts are computed outside the f-strings: reusing single quotes inside a
# single-quoted f-string is a SyntaxError on Python versions before 3.12.
label_column = 'Attack LABLE (1:No Attack, -1:Attack)'

normal_rows = int((df[label_column] == 1).sum())    # label 1  -> no attack
attack_rows = int((df[label_column] == -1).sum())   # label -1 -> attack

print(f'Rows with activity under normal operation: {normal_rows}')
print(f'Rows with activity under attack: {attack_rows}')