In [1]:
from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pyod



In [3]:
import os
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt
import random
from numpy import percentile
import sklearn
from sklearn.utils import shuffle

In [4]:
# For reproducibility
np.random.seed(12)
tf.random.set_seed(12)
random.seed(1)

In [5]:
dir_saved_data = "/content/drive/My Drive/ASHRAEData/"

In [6]:
## CAN BE CHANGED 

## change these lists as per model's input 
site_id= 0 

## window length 
seq_length = 24 

In [7]:
## Reading saved data Non-Anomalous
os.chdir(dir_saved_data)

## Reading training data
with open("./Baseline_data/Not_Conditional/"+ "site_id_" + str(site_id)+"/non_anom/train_data.pkl", 'rb') as f:
  Y_train_non_anom = pickle.load(f)

## Reading validation data
with open("./Baseline_data/Not_Conditional/"+ "site_id_" + str(site_id)+"/non_anom/val_data.pkl", 'rb') as f:
  Y_val_non_anom = pickle.load(f)

## Reading test data
with open("./Baseline_data/Not_Conditional/"+ "site_id_" + str(site_id)+"/non_anom/test_data.pkl", 'rb') as f:
  Y_test_non_anom = pickle.load(f)

In [8]:
## Reading saved data Anomalous
os.chdir(dir_saved_data)

## Reading training data
with open("./Baseline_data/Not_Conditional/"+ "site_id_" + str(site_id)+"/anom/train_data.pkl", 'rb') as f:
  Y_train_anom = pickle.load(f)

## Reading validation data
with open("./Baseline_data/Not_Conditional/"+ "site_id_" + str(site_id)+"/anom/val_data.pkl", 'rb') as f:
  Y_val_anom = pickle.load(f)

## Reading test data
with open("./Baseline_data/Not_Conditional/"+ "site_id_" + str(site_id)+"/anom/test_data.pkl", 'rb') as f:
  Y_test_anom = pickle.load(f)

In [9]:
def create_data_pyod(Y_train_non_anom, Y_train_anom, Y_val_non_anom, Y_val_anom, Y_test_non_anom,Y_test_anom):
  ## def to create data used for training and evaluation of BAseline PyOD models
  
  # Anomalous  - train data
  arr_1 = np.zeros((1,24))
  for i in range(len(Y_train_anom)):
    arr_1 = np.concatenate([arr_1, Y_train_anom[i].reshape(1, 24)], axis=0)
  arr_1 = np.delete(arr_1, (0), axis=0)

  # Non- Anomalous  - train data
  arr_0 = np.zeros((1,24))
  for i in range(len(Y_train_non_anom)):
    arr_0 = np.concatenate([arr_0, Y_train_non_anom[i].reshape(1, 24)], axis=0)
  arr_0 = np.delete(arr_0, (0), axis=0)

  X_train = np.concatenate([arr_0, arr_1], axis=0)

  Y_train = np.zeros((arr_0.shape[0])) 
  Y_train = np.concatenate([Y_train, np.ones((arr_1.shape[0]))], axis=0)

  # Anomalous  - val data
  val_arr_1 = np.zeros((1,24))
  for i in range(len(Y_val_anom)):
    val_arr_1 = np.concatenate([val_arr_1, Y_val_anom[i].reshape(1, 24)], axis=0)
  val_arr_1 = np.delete(val_arr_1, (0), axis=0)

  # Non- Anomalous  - val data
  val_arr_0 = np.zeros((1,24))
  for i in range(len(Y_val_non_anom)):
    val_arr_0 = np.concatenate([val_arr_0, Y_val_non_anom[i].reshape(1, 24)], axis=0)
  val_arr_0 = np.delete(val_arr_0, (0), axis=0)

  X_val = np.concatenate([val_arr_0, val_arr_1], axis=0)

  Y_val = np.zeros((val_arr_0.shape[0])) 
  Y_val = np.concatenate([Y_val, np.ones((val_arr_1.shape[0]))], axis=0)


  # Anomalous  - test data
  test_arr_1 = np.zeros((1,24))
  for i in range(len(Y_test_anom)):
    test_arr_1 = np.concatenate([test_arr_1, Y_test_anom[i].reshape(1, 24)], axis=0)
  test_arr_1 = np.delete(test_arr_1, (0), axis=0)

  # Non- Anomalous  - val data
  test_arr_0 = np.zeros((1,24))
  for i in range(len(Y_test_non_anom)):
    test_arr_0 = np.concatenate([test_arr_0, Y_test_non_anom[i].reshape(1, 24)], axis=0)
  test_arr_0 = np.delete(test_arr_0, (0), axis=0)

  X_test = np.concatenate([test_arr_0, test_arr_1], axis=0)

  Y_test = np.zeros((test_arr_0.shape[0])) 
  Y_test = np.concatenate([Y_test, np.ones((test_arr_1.shape[0]))], axis=0)

  return X_train, Y_train, X_val, Y_val, X_test, Y_test, arr_0, arr_1, val_arr_0, val_arr_1

In [10]:
## Loading Dataset
X_train, Y_train, X_val, Y_val, X_test, Y_test, arr_0, arr_1, val_arr_0, val_arr_1 = create_data_pyod(Y_train_non_anom, Y_train_anom, Y_val_non_anom, Y_val_anom, Y_test_non_anom,Y_test_anom)

In [11]:
## Shuffling the data 
x_train, y_train = shuffle(X_train, Y_train)
x_val, y_val = shuffle(X_val, Y_val)
x_test, y_test = shuffle(X_test, Y_test)

In [12]:
# supress warnings for clean output
import warnings
warnings.filterwarnings("ignore")

# Import all models
from pyod.utils import data
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA

In [13]:
## Define the number of inliers and outliers training data 
training_outliers_fraction = arr_1.shape[0] / (arr_0.shape[0] + arr_1.shape[0])

## Define the number of inliers and outliers validation data
outliers_fraction = val_arr_1.shape[0] / (val_arr_0.shape[0] + val_arr_1.shape[0])


clusters_separation = [0]

# initialize a set of detectors for LSCP
detector_list = [LOF(n_neighbors=5), LOF(n_neighbors=10), LOF(n_neighbors=15),
                 LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30),
                 LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45),
                 LOF(n_neighbors=50)]

random_state = 42

In [14]:
# Define outlier detection tools to be compared
classifiers = {
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=training_outliers_fraction,
              check_estimator=False, random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=training_outliers_fraction,
                       random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)': HBOS(
        contamination=training_outliers_fraction),
    'Isolation Forest': IForest(contamination=training_outliers_fraction,
                                random_state=random_state),
    'K Nearest Neighbors (KNN)': KNN(
        contamination=training_outliers_fraction),
    # 'Average KNN': KNN(method='mean',
    #                    contamination=training_outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=training_outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=training_outliers_fraction),
    'Minimum Covariance Determinant (MCD)': MCD(
        contamination=training_outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=training_outliers_fraction),
    'Principal Component Analysis (PCA)': PCA(
        contamination=training_outliers_fraction, random_state=random_state),
}


In [15]:
# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

Model 1 Cluster-based Local Outlier Factor (CBLOF)
Model 2 Feature Bagging
Model 3 Histogram-base Outlier Detection (HBOS)
Model 4 Isolation Forest
Model 5 K Nearest Neighbors (KNN)
Model 6 Local Outlier Factor (LOF)
Model 7 Minimum Covariance Determinant (MCD)
Model 8 One-class SVM (OCSVM)
Model 9 Principal Component Analysis (PCA)


In [16]:
## Fit the models with the generated data and
## compare model performances

## Fit the model
for i, (clf_name, clf) in enumerate(classifiers.items()):
  print()
  print(i + 1, 'fitting', clf_name)
  # fit the train data and tag outliers
  clf.fit(x_train)

  # get the prediction labels and outlier scores of the training data
  y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
  y_train_scores = clf.decision_scores_  # raw outlier scores

  # evaluate and print the results
  print("\nOn Training Data:")
  data.evaluate_print(clf_name, y_train, y_train_scores)

  ## VALIDATION DATA 
  # get the prediction labels and outlier scores of the validation data
  y_val_scores = clf.decision_function(x_val)  # outlier scores

  # set threshold on the validation data 
  threshold = percentile(y_val_scores, 100 * outliers_fraction)
  print("threshold : ", threshold)
  # convet scores to label using the calculated labels 
  y_val_pred = (y_val_scores > threshold).astype('int')

  # evaluate and print the results on Validation data 
  print("\nOn Validation Data:")
  tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_val, y_val_pred).ravel()
  p = tp/(tp+fp)
  r = tp/(tp+fn)
  f1 = 2*p*r/(p+r)
  print("Precision : {}".format(p))
  print("Recall : {}".format(r))
  print("F1 score : {}".format(f1))

  ## TEST DATA 
  # get the prediction labels and outlier scores of the validation data
  y_test_scores = clf.decision_function(x_test)  # outlier scores

  # convet scores to label using the calculated labels  (threshold from validation data )
  y_test_pred = (y_test_scores > threshold).astype('int')

  # evaluate and print the results on Validation data 
  print("\nOn Test Data:")
  tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_test, y_test_pred).ravel()
  p = tp/(tp+fp)
  r = tp/(tp+fn)
  f1 = 2*p*r/(p+r)
  print("Precision : {}".format(p))
  print("Recall : {}".format(r))
  print("F1 score : {}".format(f1))


1 fitting Cluster-based Local Outlier Factor (CBLOF)

On Training Data:
Cluster-based Local Outlier Factor (CBLOF) ROC:0.2584, precision @ rank n:0.1235
threshold :  1.395656412343072

On Validation Data:
Precision : 0.608433734939759
Recall : 0.19825517993456926
F1 score : 0.299062345780556

On Test Data:
Precision : 0.8532918610679109
Recall : 0.20519852895343763
F1 score : 0.3308376463494297

2 fitting Feature Bagging

On Training Data:
Feature Bagging ROC:0.3674, precision @ rank n:0.1327
threshold :  1.1147436704923286

On Validation Data:
Precision : 0.607764390896921
Recall : 0.1980370774263904
F1 score : 0.29873334430004933

On Test Data:
Precision : 0.8647773279352227
Recall : 0.199713270585302
F1 score : 0.3244885558031193

3 fitting Histogram-base Outlier Detection (HBOS)

On Training Data:
Histogram-base Outlier Detection (HBOS) ROC:0.6175, precision @ rank n:0.0677
threshold :  45.7367826236061

On Validation Data:
Precision : 0.5740236148955495
Recall : 0.137840785169029