# An example of how to test our IForestASD Implementation & Compare IForestASD against HSTrees

## Install skmultiflow if needed
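The cell below installs scikit-multiflow directly from its GitHub repository, so `git` must be installed and available on your PATH.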

In [1]:
try:
    import skmultiflow
except ImportError as e:
    print("scikit-multiflow package installation")
    !pip install -U git+https://github.com/scikit-multiflow/scikit-multiflow

## Importations and configurations

In [2]:
# Enable the interactive notebook backend for matplotlib figures.
%matplotlib notebook
# NOTE(review): `plt` aliases the top-level `matplotlib` package here, not
# `matplotlib.pyplot` as the name conventionally suggests.
import matplotlib as plt
plt.interactive(True)
# Project-local module; `func` is the Comparison instance used by every
# experiment cell below (get_dataset / run_comparison).
import functions
func = functions.Comparison()
# Used to timestamp the result folder of each experiment.
import datetime

## General parameters for the evaluation

In [3]:
# Hyper-parameter grid: each window size is evaluated with each ensemble size.
window_sizes = [100, 300, 500, 1000]
n_estimators = [10, 30, 50, 100]

# Threshold passed as `anomaly` to the comparison (open question: 0.5 or 0.6?).
anomaly_threshold = 0.5

# Sample cap per run: roughly the size of the smallest dataset (Shuttle),
# so that all datasets are evaluated on the same basis.
max_sample = 50000

# The evaluation step size.
n_wait = max_sample

# Metrics used in the evaluation. Only use metrics available in skmultiflow.
metrics = [
    'accuracy',
    'f1',
    'kappa',
    'kappa_m',
    'running_time',
    'model_size',
]

## Test with generated dataset
This section can be deleted: it is only a quick test on synthetic data.

In [4]:
# Run the HSTrees vs. IForestASD comparison on a generated stream for every
# (window size, number of estimators) combination.
dataset_name = "Generator"
# Build the result folder name from a timestamp without spaces or colons,
# so the folder can be created on every OS (colons are not allowed on Windows).
test_name = dataset_name+'_'+datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# TODO(review): should the drift rate be the same as the noise_percentage of the dataset?
drift_rate = 0.3
# TODO(review): confirm that noise_percentage in SEAGenerator is the percentage of anomalies in the dataset.
stream = func.get_dataset(dataset_name=dataset_name, classification_function=0,noise_percentage=0.7, random_state=1)
for window in window_sizes:
    for n_estimator in n_estimators:
        # Banner separating the console output of each configuration.
        print("")
        print("******************************** Window = "+str(window)+" and n_estimator = "+str(n_estimator)+" ********************************")
        func.run_comparison(stream=stream, stream_n_features=stream.n_features, window = window, 
                             estimators = n_estimator, anomaly = anomaly_threshold, drift_rate = drift_rate, 
                             result_folder=test_name, max_sample=max_sample, n_wait=n_wait, metrics=metrics)

<IPython.core.display.Javascript object>

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 1 sample(s).
Evaluating...
 #################### [100%] [58.48s]
Processed samples: 500
Mean performance:
HSTrees - Accuracy     : 0.5471
HSTrees - Kappa        : 0.0834
HSTrees - Kappa M      : -0.0320
HSTrees - F1 score: 0.4910
HSTrees - Training time (s)  : 13.82
HSTrees - Testing time  (s)  : 0.52
HSTrees - Total time    (s)  : 14.34
HSTrees - Size (kB)          : 377603.8740
iForestASD - Accuracy     : 0.4429
iForestASD - Kappa        : 0.0592
iForestASD - Kappa M      : -0.2694
iForestASD - F1 score: 0.4283
iForestASD - Training time (s)  : 0.51
iForestASD - Testing time  (s)  : 43.59
iForestASD - Total time    (s)  : 44.10
iForestASD - Size (kB)          : 3611.7041

Please find evaluation results here results/Generator_2020-03-25 23:19:28.844935/result_for_WS100_NE50.csv


## Test with Shuttle dataset
TODO: add a description of the dataset covering all the important information.

Dataset Name : Shuttle
Instances : 49097
Attributes : 9
Anomaly Percentage : 7.15%

In [None]:
# Run the comparison on the Shuttle dataset for every
# (window size, number of estimators) combination.
dataset_name = "Shuttle"
# Build the result folder name from a timestamp without spaces or colons,
# so the folder can be created on every OS (colons are not allowed on Windows).
test_name = dataset_name+'_'+datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# TODO(review): should the drift rate be the same as the anomaly percentage of the dataset?
# NOTE(review): this value is expressed in percent (7.15 %), whereas the Generator
# cell passes 0.3 -- confirm which unit run_comparison expects.
drift_rate = 7.15
stream = func.get_dataset(dataset_name=dataset_name)
for window in window_sizes:
    for n_estimator in n_estimators:
        # Banner separating the console output of each configuration.
        print("")
        print("******************************** Window = "+str(window)+" and n_estimator = "+str(n_estimator)+" ********************************")
        func.run_comparison(stream=stream, stream_n_features=stream.n_features, window = window, 
                             estimators = n_estimator, anomaly = anomaly_threshold, drift_rate = drift_rate, 
                             result_folder=test_name, max_sample=max_sample, n_wait=n_wait, metrics=metrics)

## Test with SMTP dataset
TODO: add a description of the dataset covering all the important information.
   
Dataset Name : SMTP
Instances : 95156
Attributes : 3
Anomaly Percentage : 0.03%

In [None]:
# Run the comparison on the SMTP dataset for every
# (window size, number of estimators) combination.
dataset_name = "SMTP"
# Build the result folder name from a timestamp without spaces or colons,
# so the folder can be created on every OS (colons are not allowed on Windows).
test_name = dataset_name+'_'+datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# TODO(review): should the drift rate be the same as the anomaly percentage of the dataset?
# NOTE(review): this value is expressed in percent (0.03 %), whereas the Generator
# cell passes 0.3 -- confirm which unit run_comparison expects.
drift_rate = 0.03
stream = func.get_dataset(dataset_name=dataset_name)
for window in window_sizes:
    for n_estimator in n_estimators:
        # Banner separating the console output of each configuration.
        print("")
        print("******************************** Window = "+str(window)+" and n_estimator = "+str(n_estimator)+" ********************************")
        func.run_comparison(stream=stream, stream_n_features=stream.n_features, window = window, 
                             estimators = n_estimator, anomaly = anomaly_threshold, drift_rate = drift_rate, 
                             result_folder=test_name, max_sample=max_sample, n_wait=n_wait, metrics=metrics)

## Test with ForestCover dataset
TODO: add a description of the dataset covering all the important information.
   
Dataset Name : ForestCover
Instances : 286048
Attributes : 10
Anomaly Percentage : 0.96%

In [None]:
# Run the comparison on the ForestCover dataset for every
# (window size, number of estimators) combination.
dataset_name = "ForestCover"
# Build the result folder name from a timestamp without spaces or colons,
# so the folder can be created on every OS (colons are not allowed on Windows).
test_name = dataset_name+'_'+datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# TODO(review): should the drift rate be the same as the anomaly percentage of the dataset?
# NOTE(review): this value is expressed in percent (0.96 %), whereas the Generator
# cell passes 0.3 -- confirm which unit run_comparison expects.
drift_rate = 0.96
stream = func.get_dataset(dataset_name=dataset_name)
for window in window_sizes:
    for n_estimator in n_estimators:
        # Banner separating the console output of each configuration.
        print("")
        print("******************************** Window = "+str(window)+" and n_estimator = "+str(n_estimator)+" ********************************")
        func.run_comparison(stream=stream, stream_n_features=stream.n_features, window = window, 
                             estimators = n_estimator, anomaly = anomaly_threshold, drift_rate = drift_rate, 
                             result_folder=test_name, max_sample=max_sample, n_wait=n_wait, metrics=metrics)

## Test with HTTP dataset
TODO: add a description of the dataset covering all the important information.

Dataset Name : HTTP
Instances : 567498
Attributes : 3
Anomaly Percentage : 0.39%

In [None]:
# Run the comparison on the HTTP dataset for every
# (window size, number of estimators) combination.
dataset_name = "HTTP"
# Build the result folder name from a timestamp without spaces or colons,
# so the folder can be created on every OS (colons are not allowed on Windows).
test_name = dataset_name+'_'+datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# TODO(review): should the drift rate be the same as the anomaly percentage of the dataset?
# NOTE(review): this value is expressed in percent (0.39 %), whereas the Generator
# cell passes 0.3 -- confirm which unit run_comparison expects.
drift_rate = 0.39
stream = func.get_dataset(dataset_name=dataset_name)
for window in window_sizes:
    for n_estimator in n_estimators:
        # Banner separating the console output of each configuration.
        print("")
        print("******************************** Window = "+str(window)+" and n_estimator = "+str(n_estimator)+" ********************************")
        func.run_comparison(stream=stream, stream_n_features=stream.n_features, window = window, 
                             estimators = n_estimator, anomaly = anomaly_threshold, drift_rate = drift_rate, 
                             result_folder=test_name, max_sample=max_sample, n_wait=n_wait, metrics=metrics)