In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch

import copy
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from torch import nn, optim

import torch.nn.functional as F
from tqdm import tqdm

from tensorflow import keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers 
from tensorflow.keras import optimizers 
from tensorflow.keras import backend as K
from sklearn import metrics

# scikit-multiflow
from skmultiflow.data import FileStream
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.evaluation import EvaluatePrequential

# scikit-multiflow drift detection
from skmultiflow.drift_detection import PageHinkley
from skmultiflow.drift_detection import ADWIN
from skmultiflow.drift_detection import KSWIN
from skmultiflow.drift_detection import HDDM_A
from skmultiflow.drift_detection import HDDM_W
from skmultiflow.drift_detection import DDM

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x1c4f3cca5d0>

In [3]:
df = pd.read_csv('../swat_dataset_preprocessed', parse_dates=[" Timestamp"]).iloc[:,1:]
df

Unnamed: 0,Timestamp,FIT101,AIT201,AIT203,DPIT301,LIT301,AIT402,AIT503,AIT504,Normal/Attack
0,2015-12-28 10:00:00,2.427057,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,Normal
1,2015-12-28 10:00:01,2.446274,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,Normal
2,2015-12-28 10:00:02,2.489191,262.0161,328.6337,19.69076,956.4855,156.0882,264.5475,12.03538,Normal
3,2015-12-28 10:00:03,2.534350,262.0161,328.6337,19.69076,956.8060,156.0882,264.5475,12.03538,Normal
4,2015-12-28 10:00:04,2.569260,262.0161,328.6337,19.69076,957.0864,156.0882,264.5475,12.03538,Normal
...,...,...,...,...,...,...,...,...,...,...
449914,2016-02-01 14:59:55,2.559972,168.0979,301.9226,20.39823,974.5498,145.6037,257.1136,14.80390,Normal
449915,2016-02-01 14:59:56,2.549082,168.0979,301.9226,20.39823,974.5898,145.6037,257.1136,14.80390,Normal
449916,2016-02-01 14:59:57,2.531467,168.0979,301.9226,20.33101,974.2695,145.5524,257.1136,14.80390,Normal
449917,2016-02-01 14:59:58,2.521218,168.0979,301.9226,20.29579,974.2294,145.5524,257.1136,14.80390,Normal


In [4]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Normal/Attack'])

In [5]:
label_encoder.classes_

array(['Attack', 'Normal'], dtype=object)

In [6]:
df['class'] = encoded_labels

In [7]:
df = df.drop(['Normal/Attack'],axis=1)
df.head()

Unnamed: 0,Timestamp,FIT101,AIT201,AIT203,DPIT301,LIT301,AIT402,AIT503,AIT504,class
0,2015-12-28 10:00:00,2.427057,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,1
1,2015-12-28 10:00:01,2.446274,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,1
2,2015-12-28 10:00:02,2.489191,262.0161,328.6337,19.69076,956.4855,156.0882,264.5475,12.03538,1
3,2015-12-28 10:00:03,2.53435,262.0161,328.6337,19.69076,956.806,156.0882,264.5475,12.03538,1
4,2015-12-28 10:00:04,2.56926,262.0161,328.6337,19.69076,957.0864,156.0882,264.5475,12.03538,1


In [8]:
rows = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    row_data = dict(
        day_of_week=row[" Timestamp"].dayofweek,
        day_of_month=row[" Timestamp"].day,
        week_of_year=row[" Timestamp"].week,
        month=row[" Timestamp"].month,
        label = row['class'],
        FIT101 = row['FIT101'],
        AIT201 = row[' AIT201'],
        AIT203 = row['AIT203'],
        DPIT301 = row['DPIT301'],
        LIT301 = row['LIT301'],
        AIT402 = row['AIT402'],
        AIT503 = row['AIT503'],
        AIT504 = row['AIT504'],
    )
    
    rows.append(row_data)
    
features_df = pd.DataFrame(rows)   

100%|████████████████████████████████████████████████████████████████████████| 449919/449919 [01:29<00:00, 5051.62it/s]


In [9]:
features_df.shape

(449919, 13)

In [10]:
features_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,label,FIT101,AIT201,AIT203,DPIT301,LIT301,AIT402,AIT503,AIT504
0,0,28,53,12,1,2.427057,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538
1,0,28,53,12,1,2.446274,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538
2,0,28,53,12,1,2.489191,262.0161,328.6337,19.69076,956.4855,156.0882,264.5475,12.03538
3,0,28,53,12,1,2.53435,262.0161,328.6337,19.69076,956.806,156.0882,264.5475,12.03538
4,0,28,53,12,1,2.56926,262.0161,328.6337,19.69076,957.0864,156.0882,264.5475,12.03538


In [11]:
train_size = int(len(features_df) * 0.8)
test_size = len(features_df) - train_size
train, test = features_df.iloc[0:train_size], features_df.iloc[train_size:len(features_df)]

In [12]:
train.shape

(359935, 13)

In [13]:
train.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,label,FIT101,AIT201,AIT203,DPIT301,LIT301,AIT402,AIT503,AIT504
0,0,28,53,12,1,2.427057,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538
1,0,28,53,12,1,2.446274,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538
2,0,28,53,12,1,2.489191,262.0161,328.6337,19.69076,956.4855,156.0882,264.5475,12.03538
3,0,28,53,12,1,2.53435,262.0161,328.6337,19.69076,956.806,156.0882,264.5475,12.03538
4,0,28,53,12,1,2.56926,262.0161,328.6337,19.69076,957.0864,156.0882,264.5475,12.03538


In [14]:
df_train = train.reindex(['day_of_week','day_of_month','week_of_year','month','FIT101','AIT201','AIT203','DPIT301','LIT301','AIT402','AIT503','AIT504','label'], axis=1)

In [15]:
df_train.shape

(359935, 13)

In [16]:
df_test = test.reindex(['day_of_week','day_of_month','week_of_year','month','FIT101','AIT201','AIT203','DPIT301','LIT301','AIT402','AIT503','AIT504','label'], axis=1)

In [17]:
df_test.shape

(89984, 13)

In [18]:
df_train.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,FIT101,AIT201,AIT203,DPIT301,LIT301,AIT402,AIT503,AIT504,label
0,0,28,53,12,2.427057,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,1
1,0,28,53,12,2.446274,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,1
2,0,28,53,12,2.489191,262.0161,328.6337,19.69076,956.4855,156.0882,264.5475,12.03538,1
3,0,28,53,12,2.53435,262.0161,328.6337,19.69076,956.806,156.0882,264.5475,12.03538,1
4,0,28,53,12,2.56926,262.0161,328.6337,19.69076,957.0864,156.0882,264.5475,12.03538,1


In [19]:
#!pip install scikit-multiflow

In [20]:
df_train.to_csv('../df_train.csv') 

In [21]:
df_test.to_csv('../df_test.csv') 

In [22]:
train_file_stream = FileStream('../df_train.csv')
test_file_stream = FileStream('../df_test.csv')

In [23]:
# Adaptive Random Forest with Page Hinkley
arf_PageHinkley = AdaptiveRandomForestClassifier(drift_detection_method = PageHinkley(), warning_detection_method = PageHinkley())

# Adaptive Random Forest with ADWIN
arf_ADWIN = AdaptiveRandomForestClassifier(drift_detection_method = ADWIN(), warning_detection_method = ADWIN())

# Adaptive Random Forest with HDDM_W
arf_HDDM_W = AdaptiveRandomForestClassifier(drift_detection_method = HDDM_W(), warning_detection_method = HDDM_W())

In [24]:
# training data stream evaluation
evaluator_train = EvaluatePrequential(pretrain_size = 1000, max_samples = 359935, output_file = 'results_train2.csv', metrics = ['accuracy', 'f1'])
evaluator_train.evaluate(
    stream = train_file_stream, 
    model = [arf_PageHinkley, arf_ADWIN, arf_HDDM_W], 
    model_names = ['arf_PageHinkley', 'arf_ADWIN', 'arf_HDDM_W']
)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 1000 sample(s).
Evaluating...
 #################### [100%] [4545.28s]
Processed samples: 359935
Mean performance:
arf_PageHinkley - Accuracy     : 0.9990
arf_PageHinkley - F1 score: 0.9994
arf_ADWIN - Accuracy     : 0.9992
arf_ADWIN - F1 score: 0.9995
arf_HDDM_W - Accuracy     : 0.9992
arf_HDDM_W - F1 score: 0.9996


[AdaptiveRandomForestClassifier(binary_split=False, disable_weighted_vote=False,
                                drift_detection_method=PageHinkley(alpha=0.9999,
                                                                   delta=0.005,
                                                                   min_instances=30,
                                                                   threshold=50),
                                grace_period=50, lambda_value=6,
                                leaf_prediction='nba', max_byte_size=33554432,
                                max_features=4, memory_estimate_period=2000000,
                                n_estimators=10, nb_threshold=0,
                                no_preprune=False, nominal_attributes=None,
                                performance_metric='acc', random_state=None,
                                remove_poor_atts=False, split_confidence=0.01,
                                split_criterion='info_gain',
         

In [25]:
# testing data stream evaluation
evaluator_test = EvaluatePrequential(
    pretrain_size = 1000, 
    max_samples = 89984, 
    output_file = 'results_test2.csv', 
    metrics = ['accuracy', 'f1']
)
evaluator_test.evaluate(
    stream = test_file_stream, 
    model = [arf_PageHinkley, arf_ADWIN, arf_HDDM_W], 
    model_names = ['arf_PageHinkley', 'arf_ADWIN', 'arf_HDDM_W']
)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 1000 sample(s).
Evaluating...
 #################### [100%] [1175.88s]
Processed samples: 89984
Mean performance:
arf_PageHinkley - Accuracy     : 0.9986
arf_PageHinkley - F1 score: 0.9993
arf_ADWIN - Accuracy     : 0.9990
arf_ADWIN - F1 score: 0.9995
arf_HDDM_W - Accuracy     : 0.9989
arf_HDDM_W - F1 score: 0.9994


[AdaptiveRandomForestClassifier(binary_split=False, disable_weighted_vote=False,
                                drift_detection_method=PageHinkley(alpha=0.9999,
                                                                   delta=0.005,
                                                                   min_instances=30,
                                                                   threshold=50),
                                grace_period=50, lambda_value=6,
                                leaf_prediction='nba', max_byte_size=33554432,
                                max_features=4, memory_estimate_period=2000000,
                                n_estimators=10, nb_threshold=0,
                                no_preprune=False, nominal_attributes=None,
                                performance_metric='acc', random_state=None,
                                remove_poor_atts=False, split_confidence=0.01,
                                split_criterion='info_gain',
         

In [44]:
df_test_results = pd.read_csv('results_test2.csv', skiprows = 7)

In [30]:
#!pip install plotly

In [48]:
import plotly.express as px

# mean accuracies
fig = px.bar(x = df_test_results.iloc[449,1:7:2].keys(), y = df_test_results.iloc[449,1:7:2].values, range_y = [0.99, 1])
fig.update_layout(
    font_family="Times New Roman",
)

In [51]:
# mean f1
fig = px.bar(x = df_test_results.iloc[449,7:15:2].keys(), y = df_test_results.iloc[449,7:15:2].values, range_y = [0.99, 1])
fig.update_layout(
    font_family="Times New Roman",
)