## Half Space Trees implementation using Scikit-multiflow

In [6]:
from skmultiflow.data import DataStream
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.anomaly_detection import HalfSpaceTrees
from sklearn import preprocessing
import pandas as pd
import datetime

## Set up the Half-Space Trees estimator with default parameters
- Reducing the number of estimators could increase the speed of training with no accuracy impact if the dataset is relatively small
- Increasing the window size (`window_size`) could increase the speed of training
- The documentation specifies that a good setting for size_limit is 0.1 * window_size

In [7]:
# Set up the Half-Space Trees estimator with the default hyperparameter values.
# random_state is pinned so that re-running the notebook from a fresh kernel
# yields the same trees and therefore the same anomaly count; without it the
# reported numbers can change from run to run.
HSTrees_model = HalfSpaceTrees(
    n_estimators=25,
    window_size=250,
    depth=15,
    size_limit=50,
    anomaly_threshold=0.5,
    random_state=42,
)

## A formatting function that can be improved

In [8]:
def format_dataset(dataset):
    """Make every column of ``dataset`` numeric and free of missing values.

    Text columns (object or string dtype) have their missing entries replaced
    by the literal ``"Null"`` and are then label-encoded to integer codes.
    Every other column has its missing entries replaced by ``0``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, after cleaning.

    Notes
    -----
    A fresh ``LabelEncoder`` is fitted per column on every call, so the
    integer codes are only guaranteed to be consistent within a single call.
    """
    for col in dataset.columns:
        column = dataset[col]
        # Also catch dedicated string dtypes, which do not compare equal to
        # 'object' but still need encoding.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if is_text:
            le = preprocessing.LabelEncoder()
            # Assign the filled result back explicitly: calling
            # fillna(inplace=True) on a column selection may act on a
            # temporary copy and leave the frame unchanged.
            dataset[col] = le.fit_transform(column.fillna("Null"))
        else:
            dataset[col] = column.fillna(0)
    return dataset

## The training + testing phase
Reading the file in chunks (`chunksize`) simulates a continuous stream of data and keeps memory usage stable, which is not achievable with a batch method such as Isolation Forest, for example.


In [9]:
# Replay the CSV in fixed-size chunks to imitate a data stream.
start = datetime.datetime.now()
CHUNK = 10000
detected_anomalies = 0
n_samples = 0

chunk_reader = pd.read_csv("continuous_factory_process.csv", chunksize=CHUNK)
for data in chunk_reader:
    data = format_dataset(data)
    stream = DataStream(data)
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # Test-then-train: score the sample before the model learns from it.
        prediction = HSTrees_model.predict(X)
        # A prediction of 1 is counted as an anomaly.
        detected_anomalies += int(prediction[0] == 1)
        HSTrees_model.partial_fit(X, y)
        n_samples += 1

end = datetime.datetime.now()
print(end - start)
print('Half-Space Trees detected {} anomalies out of {} samples'.format(detected_anomalies,n_samples))


0:00:34.494003
Half-Space Trees detected 882 anomalies out of 14088 samples
