### Import libraries

In [1]:
import json
import time
from pathlib import Path
from typing import List, Dict, Union, Literal

import pandas as pd
from capymoa.stream import NumpyStream

from base import util
from src import iids_util

ImportError: cannot import name 'iids_util' from 'src' (/home/iwan/p3apps/p312/ToN_IoT-based-Industrial-IDS/src/__init__.py)

### Loading sample datasets

In [None]:
# Load the 'modbus' edge dataset. `header` is returned together with the
# features and labels but is not used by the code shown in this notebook.
# NOTE(review): sample_size=0.1 presumably selects a 10% sample - confirm in
# iids_util.loading_edge_dataset.
feature, target, header = iids_util.loading_edge_dataset(
    base_device='modbus',
    all_in_fusion=True,
    load_all=False,
    low_memory=False,
    sample_size=0.1,
    random_seed=80,
)

# Normalize the features using a 'StandardScaler' model and a window of 60.
scaled = iids_util.online_normalization(
    data=feature,
    window_size=60,
    scaler_model='StandardScaler',
)

# Re-map the labels with 'normal' passed as class 0.
target = iids_util.map_as_binary_class(ntarget=target, class_0='normal')

feature dtype:  float64


### Convert the dataset into a stream

In [3]:

# Wrap `scaled` (output of online_normalization) and `target` (output of
# map_as_binary_class) in a NumpyStream; the simulation functions below
# consume it one instance at a time.
stream_cls = NumpyStream(X=scaled, y=target)


### Define the simulation functions

In [None]:
# Function for Classifier Models
def run_classifier(stream_cls: NumpyStream,
                   model: str,
                   random_seed: int = 1,
                   time_limit: int = 120):
    """Run a test-then-train evaluation of one classifier over the stream.

    Every instance is first used for prediction and then for training. When
    the learner returns ``None``, the majority of the predictions made so far
    (0 or 1, ties resolved to 0) is used as the prediction instead.

    Args:
        stream_cls: Stream to evaluate on; it is restarted before the loop.
        model: Name forwarded to ``iids_util.load_classifier_model`` as
            ``method``.
        random_seed: Seed forwarded to the model loader.
        time_limit: Wall-clock budget in seconds; the loop stops as soon as
            the elapsed time exceeds it.

    Returns:
        The metrics mapping produced by ``iids_util.evaluation_metrics``,
        extended with the ``'instance_seen'`` and ``'runtime'`` entries.
    """
    learner = iids_util.load_classifier_model(stream=stream_cls,
                                              method=model,
                                              random_seed=random_seed)

    stream_cls.restart()
    instance_seen = 0  # instances that were predicted AND trained on
    err_instances = 0  # instances whose predict/train step raised
    class_0_count = 0  # number of predictions equal to 0
    class_1_count = 0  # number of predictions equal to 1
    y_true = []  # ground-truth labels
    y_pred = []  # labels predicted by the model
    start = time.time()  # the runtime covers the test-then-train loop only

    while stream_cls.has_more_instances():
        curr_ins = stream_cls.next_instance()
        try:
            # test
            predict = learner.predict(curr_ins)
            if predict is None:
                # No prediction available: fall back to the majority class.
                predict = 1 if class_1_count > class_0_count else 0

            # update the majority-class counters
            if predict == 0:
                class_0_count += 1
            elif predict == 1:
                class_1_count += 1

            # NOTE: the prediction is recorded before training, so an instance
            # whose training step raises is still part of y_true/y_pred even
            # though it is not counted in instance_seen.
            y_true.append(curr_ins.y_index)
            y_pred.append(predict)
            # train
            learner.train(curr_ins)
            instance_seen += 1
        except Exception:
            # Best effort: skip the failing instance and keep going. Only
            # Exception is caught so that KeyboardInterrupt/SystemExit can
            # still stop a long-running simulation.
            print("Error Instance:", curr_ins, end='\r', flush=True)
            err_instances += 1

        if instance_seen % 100 == 0 or not stream_cls.has_more_instances():
            meter = instance_seen / stream_cls._len
            msg = util.progress_meter(progress=meter)
            print(f"{msg}. {model}: instance_seen:{instance_seen:,}. Error instances: {err_instances:,}", end='\r', flush=True)

        # stop early when the run exceeds its time budget
        if time.time() - start > time_limit:
            break

    runtime = time.time() - start
    evaluator = iids_util.evaluation_metrics(y_pred=y_pred, y_true=y_true)
    evaluator.update({'instance_seen': instance_seen})
    evaluator.update({'runtime': round(runtime, 3)})

    return evaluator

# Function for Anomaly Detector
def run_detector(stream_cls: NumpyStream,
                 model: str,
                 random_seed: int = 1,
                 time_limit: int = 120):
    """Run a test-then-train evaluation of one anomaly detector over the stream.

    Every instance is scored first, the score is turned into a label through
    ``iids_util.proba_prediction_rules`` and ``iids_util.voting_decision``,
    and the instance is then used for training.

    Args:
        stream_cls: Stream to evaluate on; it is restarted before the loop.
        model: Name forwarded to ``iids_util.load_anomaly_model`` as
            ``method``.
        random_seed: Seed forwarded to the model loader.
        time_limit: Wall-clock budget in seconds; the loop stops as soon as
            the elapsed time exceeds it.

    Returns:
        The metrics mapping produced by ``iids_util.evaluation_metrics``,
        extended with the ``'instance_seen'`` and ``'runtime'`` entries.
    """
    learner = iids_util.load_anomaly_model(stream=stream_cls,
                                           method=model,
                                           random_seed=random_seed)
    stream_cls.restart()
    instance_seen = 0  # instances that were scored AND trained on
    err_instances = 0  # instances whose score/train step raised
    y_true = []  # ground-truth labels
    y_pred = []  # labels predicted by the model
    start = time.time()  # the runtime covers the test-then-train loop only

    while stream_cls.has_more_instances():
        # Fetch outside the try block (as run_classifier does) so the error
        # handler always reports the instance that actually failed instead of
        # a stale or undefined one.
        curr_ins = stream_cls.next_instance()
        try:
            # test
            score = learner.score_instance(curr_ins)
            # The same score is supplied three times to the rule/voting helpers.
            # NOTE(review): presumably one slot per rule - confirm in iids_util.
            y_scores = [score, score, score]
            y_models = iids_util.proba_prediction_rules(nscore=y_scores)
            y_predict = iids_util.voting_decision(npredicts=y_models)
            # train
            learner.train(curr_ins)
            # update results only once both steps succeeded
            y_true.append(curr_ins.y_index)
            y_pred.append(y_predict)
            instance_seen += 1
        except Exception:
            # Best effort: skip the failing instance and keep going. Only
            # Exception is caught so that KeyboardInterrupt/SystemExit can
            # still stop a long-running simulation.
            print("Error Instance:", curr_ins, end='\r', flush=True)
            err_instances += 1

        if instance_seen % 100 == 0 or not stream_cls.has_more_instances():
            meter = instance_seen / stream_cls._len
            msg = util.progress_meter(progress=meter)
            print(f"{msg}. {model}: instance_seen:{instance_seen:,}. Error Instances: {err_instances:,}", end='\r', flush=True)

        # stop early when the run exceeds its time budget
        if time.time() - start > time_limit:
            break

    runtime = time.time() - start
    evaluator = iids_util.evaluation_metrics(y_pred=y_pred, y_true=y_true)
    evaluator.update({'instance_seen': instance_seen})
    evaluator.update({'runtime': round(runtime, 3)})

    return evaluator

### Run simulations - Classifier Models
The available models are inherited from `capymoa.classifier`.

In [7]:
# Names of the classifiers to benchmark; each one is passed to run_classifier
# as `model`.
classifier_models = ["AdaptiveRandomForestClassifier", "DynamicWeightedMajority", "EFDT",
          "HoeffdingAdaptiveTree", "KNN", "LeveragingBagging", "NaiveBayes",
          "OnlineAdwinBagging", "OnlineBagging", "OnlineSmoothBoost",
          "OzaBoost", "PassiveAggressiveClassifier", "SGDClassifier",
          "StreamingGradientBoostedTrees", "StreamingRandomPatches",
          "HoeffdingTree"]

eval_fname = 'output/Classifier_Evaluation_Table.json'
# Create the output directory before the long benchmark starts, so the
# results cannot be lost to a FileNotFoundError when they are written.
Path(eval_fname).parent.mkdir(parents=True, exist_ok=True)

# model name -> metrics mapping returned by run_classifier
classifier_evaluator = {}

for model in classifier_models:
    classifier_evaluator[model] = run_classifier(stream_cls=stream_cls,
                                                 model=model,
                                                 random_seed=80,
                                                 time_limit=120)

with open(eval_fname, 'w', encoding='utf-8') as file:
    json.dump(classifier_evaluator, file)

# Build the result table cell by cell: one row per model, one column per metric.
c_table = pd.DataFrame()
for model, model_metrics in classifier_evaluator.items():
    for metrics, value in model_metrics.items():
        c_table.loc[model, metrics] = value

# Best model (highest MCC) first; the model names become the 'Classifier' column.
c_table = c_table.sort_values('MCC', ascending=False).reset_index(names='Classifier')
c_table

|||||||||||||||||||||||||      | : 80.42%. EFDT: instance_seen:22,600nstance_seen:28,103:28,103

17800
19400
21600


|||||||||||||||||||||||||||||||| : 100.0%. EFDT: instance_seen:28,103

23400
25000


|||||||||||||||||||||||||||||||| : 100.0%. NaiveBayes: instance_seen:28,103:28,103,103

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


|                              | : 1.42%. PassiveAggressiveClassifier: instance_seen:400



|||||||||||||||||||||||||||||||| : 100.0%. HoeffdingTree: instance_seen:28,103en:28,103:28,103

Unnamed: 0,Classifier,Accuracy,Kappa,F1 score,MCC,Precision,Recall,instance_seen,runtime
0,AdaptiveRandomForestClassifier,98.883,0.97,97.584,0.97,96.574,98.616,28103.0,23.494
1,StreamingRandomPatches,98.829,0.97,97.461,0.97,96.751,98.181,28103.0,39.06
2,PassiveAggressiveClassifier,98.626,0.96,96.999,0.96,96.984,97.014,28103.0,12.042
3,LeveragingBagging,98.114,0.95,95.922,0.95,94.929,96.937,28103.0,17.417
4,HoeffdingAdaptiveTree,97.619,0.93,94.938,0.93,92.468,97.543,28103.0,1.457
5,OzaBoost,97.306,0.92,94.135,0.92,93.808,94.464,28103.0,2.065
6,StreamingGradientBoostedTrees,97.128,0.92,93.716,0.92,93.855,93.578,28103.0,54.721
7,SGDClassifier,96.958,0.91,93.253,0.91,94.665,91.883,28103.0,12.055
8,DynamicWeightedMajority,96.612,0.9,92.401,0.9,94.932,90.002,28103.0,1.267
9,OnlineAdwinBagging,95.926,0.88,90.852,0.88,93.428,88.415,28103.0,9.296


### Run simulations - Anomaly Models
The available models are inherited from `capymoa.anomaly`.

In [11]:
# Names of the anomaly detectors to benchmark; each one is passed to
# run_detector as `model`.
anomaly_models = ["Autoencoder","HalfSpaceTrees", "OnlineIsolationForest",
          "StreamRHF", "StreamingIsolationForest", "RobustRandomCutForest",
          "AdaptiveIsolationForest"]

eval_fname = 'output/Anomaly_Evaluation_Table.json'
# Create the output directory before the long benchmark starts, so the
# results cannot be lost to a FileNotFoundError when they are written.
Path(eval_fname).parent.mkdir(parents=True, exist_ok=True)

# model name -> metrics mapping returned by run_detector
detector_evaluator = {}
stream_cls.restart()

for model in anomaly_models:
    detector_evaluator[model] = run_detector(stream_cls=stream_cls,
                                             model=model,
                                             random_seed=80,
                                             time_limit=120)

with open(eval_fname, 'w', encoding='utf-8') as file:
    json.dump(detector_evaluator, file)

# Build the result table cell by cell: one row per model, one column per metric.
a_table = pd.DataFrame()
for model, model_metrics in detector_evaluator.items():
    for metrics, value in model_metrics.items():
        a_table.loc[model, metrics] = value

# Best model (highest MCC) first; the model names become the 'Detector' column.
a_table = a_table.sort_values('MCC', ascending=False).reset_index(names='Detector')
a_table

||||||                         | : 18.86%. OnlineIsolationForest: instance_seen:5,300

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


|                              | : 0.71%. StreamingIsolationForest: instance_seen:200

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Error Instance: LabeledInstance( : 2.85%. StreamingIsolationForest: instance_seen:800
    Schema(No_Name),
    x=[ 1.266 -1.342 -0.505 ... -2.127 -0.502 -0.874],
    y_index=0,
    y_label='0'
)
Error Instance: LabeledInstance(
    Schema(No_Name),
    x=[-0.759  1.557 -0.56  ... -0.638  0.718 -0.057],
    y_index=0,
    y_label='0'
)
Error Instance: LabeledInstance(
    Schema(No_Name),
    x=[ 1.636 -0.254 -0.956 ...  0.846 -0.331 -0.965],
    y_index=0,
    y_label='0'
)
Error Instance: LabeledInstance(
    Schema(No_Name),
    x=[-0.189  0.519 -0.798 ...  0.891  0.278 -1.33 ],
    y_index=0,
    y_label='0'
)
Error Instance: LabeledInstance(
    Schema(No_Name),
    x=[-0.602 -0.514 -0.429 ...  0.845  0.278 -1.141],
    y_index=0,
    y_label='0'
)
Error Instance: LabeledInstance(
    Schema(No_Name),
    x=[-0.26   0.826 -1.315 ...  0.714  0.811 -1.141],
    y_index=0,
    y_label='0'
)
Error Instance: LabeledInstance(
    Schema(No_Name),
    x=[-1.468  1.329  0.97  ...  0.712  1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Error Instance: LabeledInstance( : 38.07%. AdaptiveIsolationForest: instance_seen:10,700
    Schema(No_Name),
    x=[-0.684 -0.585  1.286 ...  0.692 -0.288 -1.885],
    y_index=0,
    y_label='0'
)
|||||||||||||||||||||||||||||| | : 100.0%. AdaptiveIsolationForest: instance_seen:28,102

Unnamed: 0,Detector,Accuracy,Kappa,F1 score,MCC,Precision,Recall,instance_seen,runtime
0,HalfSpaceTrees,35.95,0.09,41.676,0.21,26.323,100.0,28103.0,3.39
1,StreamingIsolationForest,76.704,0.08,14.577,0.11,45.477,8.679,28073.0,109.761
2,AdaptiveIsolationForest,65.607,0.06,28.834,0.06,27.385,30.446,28102.0,14.726
3,OnlineIsolationForest,99.381,0.0,0.0,0.0,0.0,0.0,5328.0,120.001
4,StreamRHF,72.965,0.0,0.0,0.0,0.0,0.0,344.0,120.015
5,RobustRandomCutForest,0.085,0.0,0.0,0.0,0.0,0.0,2364.0,120.011
6,Autoencoder,51.375,-0.24,8.258,-0.24,7.266,9.563,28103.0,6.913
