In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Evaluation

Article Source: A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data 
Goldstein M, Uchida S (2016) A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data. PLOS ONE 11(4): e0152173. https://doi.org/10.1371/journal.pone.0152173

https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OPQMVF

I'm going to test the following datasets:
1. Pen-Global
1. Pen-Local
1. Speech
1. Aloi

On the following metric: ROC-AUC

## Pen-Global Dataset

Paper version: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/OPQMVF/KQYDN9&version=1.0

Original UCI: https://archive.ics.uci.edu/ml/datasets/Pen-Based%20Recognition%20of%20Handwritten%20Digits

In [None]:
import os
from pathlib import Path

# Column names for the eight pen-stroke points, interleaved as x0, y0, x1, y1, ..., x7, y7.
input_names = [f'{axis}{i}' for i in range(8) for axis in ('x', 'y')]

# Location of the CSV. Set the PEN_GLOBAL_CSV environment variable to point at your own
# copy of the file; the fallback keeps the path this notebook was originally written with.
data_path = Path(os.environ.get('PEN_GLOBAL_CSV', '/Users/maxfrax/Downloads/pen-global-unsupervised-ad.csv'))

# The file has no header row: 16 coordinate columns followed by the 'outlier' label column.
data = pd.read_csv(data_path, names=input_names + ['outlier'])

# Shuffle the rows. The fixed seed makes the order (and therefore the "first" sample plotted
# further down) identical on every Restart & Run All.
data = data.sample(frac=1, random_state=42)

In [None]:
# Feature matrix: only the pen-coordinate columns.
X = data.loc[:, input_names]

# Labels as a plain Python list: -1 for rows marked 'o' (outlier), 1 for every other row.
y = [-1 if label == 'o' else 1 for label in data['outlier']]


### Normalization

In the paper they declare that they scale all their datasets with MinMax (see the Normalization section).

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1]. fit_transform returns a bare array, so it is
# wrapped back into a DataFrame carrying the original column names.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=input_names)

Let's plot the first number to show we understood the dataset correctly. As we can see the plot shows the number 8, which is expected since the normal class is just the number 8.

In [None]:
# Coordinates of the first (unscaled) sample; columns alternate x, y, x, y, ...
first_sample = data[input_names].iloc[0].to_numpy()

# Even positions hold the x coordinates, odd positions the y coordinates.
xs = list(first_sample[0::2])
ys = list(first_sample[1::2])

plt.plot(xs, ys)
plt.show()

# Show the label of that same first row.
data['outlier'].iloc[0:1]

### Replicating: One Class SVM

I'm going to use the sklearn implementation https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html since both the paper and sklearn refer to:

Estimating the support of a high-dimensional distribution Schölkopf, Bernhard, et al. Neural computation 13.7 (2001): 1443-1471.

In the paper they use OneClassSVM in an unsupervised setting.

> In the unsupervised anomaly detection scenario, the one-class SVM is trained using the dataset and afterwards, each instance in the dataset is scored by a normalized distance to the determined decision boundary [40]. The parameter ν needs to be set to a value larger than zero such that the contained anomalies are correctly handled by a soft-margin

The hyperparameters tested are 0.2 < ν < 0.8, together with automatic tuning of the Gaussian kernel.
By "automatic tuning" I guess they mean trial and error; in fact, they describe it as a computationally hard task.

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import uniform
from sklearn.model_selection import PredefinedSplit

# Search space for the randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so scale is the WIDTH of the
# interval, not its upper bound: nu is drawn from [0.2, 0.8] as in the paper.
# gamma is a free parameter of the kernel; here it is sampled from [0, 1000].
distributions = dict(nu=uniform(loc=0.2, scale=0.6), gamma=uniform(loc=0, scale=1000))

# Train on the whole dataset and test on the whole dataset: the data is stacked twice below,
# the first copy is marked -1 (always in the training set, never tested) and the second copy
# is marked 0 (the single test fold). This yields exactly one train/test split per candidate.
test_fold = [-1] * len(X) + [0] * len(X)

# random_state fixes which candidates are sampled, so the search is reproducible.
search = RandomizedSearchCV(
    OneClassSVM(),
    distributions,
    cv=PredefinedSplit(test_fold=test_fold),
    refit=True,
    n_iter=1000,
    scoring='roc_auc',
    n_jobs=-1,
    error_score='raise',
    verbose=2,
    random_state=42,
)

# Two stacked copies of the data and labels, matching the layout of test_fold.
res = search.fit(pd.concat([X, X]), y + y)

# Best candidate, refitted on everything passed to fit (refit=True).
clf = res.best_estimator_

print(res.best_index_)
print(res.best_score_)
print(res.best_params_)


# Hard -1 / 1 predictions on the original (single-copy) data, used for the confusion matrix.
y_pred = clf.predict(X)

In [None]:
# Confusion matrix of the true labels against the hard predictions.
conf_mat = confusion_matrix(y_true=y, y_pred=y_pred)

matrix_plot = ConfusionMatrixDisplay(confusion_matrix=conf_mat)
matrix_plot.plot()
plt.show()

# Raw counts, shown below the figure.
display(conf_mat)

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve built from the continuous per-sample scores rather than the hard predictions.
sample_scores = clf.score_samples(X)
RocCurveDisplay.from_predictions(y, sample_scores)

plt.show()

In [None]:
# One row per sampled candidate; columns come straight from the search's cv_results_ dict.
cv_results = pd.DataFrame.from_dict(res.cv_results_)

# Best-ranked candidates first.
cv_results.sort_values(by='rank_test_score')

### My Algorithm: BSVClassifier

In [None]:
import sys
import os

# Add the parent directory of the working directory to the module search path.
# NOTE(review): presumably this is where the local `flod` package lives — confirm the
# notebook is always started from a direct subdirectory of the repository root.
sys.path.append(os.path.abspath('..'))

# Must come after the sys.path change above.
from flod.classifiers.bsvclassifier import BSVClassifier

In [None]:
# Search space for the randomized search. scipy's uniform(loc, scale) samples from
# [loc, loc + scale], so c is drawn from [0.2, 1.0] and q from [0, 1].
# NOTE(review): if c was meant to mirror the 0.2-0.8 range used for nu in the One Class SVM
# section, scale should be 0.6 — confirm the intended range for BSVClassifier.
distributions = {'c': uniform(loc=0.2, scale=0.8), 'q': uniform(loc=0, scale=1)}

# Reuses the predefined split (test_fold) built for the One Class SVM search.
# random_state fixes which candidates are sampled, so the search is reproducible.
search = RandomizedSearchCV(
    BSVClassifier(normal_class_label=1, outlier_class_label=-1),
    distributions,
    cv=PredefinedSplit(test_fold=test_fold),
    refit=True,
    scoring='roc_auc',
    error_score='raise',
    verbose=2,
    n_jobs=4,
    n_iter=10,
    random_state=42,
)

# Two stacked copies of the data and labels, matching the layout of test_fold.
res = search.fit(pd.concat([X, X]).to_numpy(), y + y)

# Best candidate, refitted on everything passed to fit (refit=True).
clf = res.best_estimator_

print(res.best_index_)
print(res.best_score_)
print(res.best_params_)

# Hard predictions on the original (single-copy) data, used for the confusion matrix.
y_pred = clf.predict(X.to_numpy())

In [None]:
# Confusion matrix of the true labels against the BSVClassifier predictions.
conf_mat = confusion_matrix(y_true=y, y_pred=y_pred)

matrix_plot = ConfusionMatrixDisplay(confusion_matrix=conf_mat)
matrix_plot.plot()
plt.show()

# Raw counts, shown below the figure.
display(conf_mat)

In [None]:
# ROC curve built from the continuous per-sample scores rather than the hard predictions.
sample_scores = clf.score_samples(X.to_numpy())
RocCurveDisplay.from_predictions(y, sample_scores)

plt.show()

In [None]:
# One row per sampled candidate; columns come straight from the search's cv_results_ dict.
cv_results = pd.DataFrame.from_dict(res.cv_results_)

# Best-ranked candidates first.
cv_results.sort_values(by='rank_test_score')

### Federated Learning: BSVClassifier

In [None]:
from flod.classifiers.federatedbsvclassifier import FederatedBSVClassifier
# TODO: global_combine should use ROC-AUC as its metric, as in the searches above