In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colormaps
import numpy as np

# Default size (in inches) for every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (10, 10)

In [None]:
# Verbosity level forwarded as `verbose=` to the hyper-parameter searches below.
VERBOSE = 0
# Number of parallel workers forwarded as `n_jobs=` to RandomizedSearchCV in svm_experiment.
N_JOBS = 4

In [None]:
import sys
import os

# Location of the FederatedLearningOutlierDetection checkout.
# Set the FLOD_PATH environment variable to run the notebook on another machine;
# the original hard-coded location is kept as the fallback.
PATH_FLOD = os.environ.get("FLOD_PATH", "/Users/maxfrax/Desktop/FederatedLearningOutlierDetection")

# Make the `flod` package importable without stacking duplicate entries
# on sys.path every time this cell is re-run.
_flod_root = os.path.abspath(PATH_FLOD)
if _flod_root not in sys.path:
    sys.path.append(_flod_root)

from flod.classifiers.bsvclassifier import BSVClassifier

# Evaluation

Article Source: A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data 
Goldstein M, Uchida S (2016) A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data. PLOS ONE 11(4): e0152173. https://doi.org/10.1371/journal.pone.0152173

https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OPQMVF

I'm going to test the following datasets:
1. Pen-Global
1. Pen-Local
1. Speech
1. Aloi

On the following metrics: ROC-AUC

## Pen-Global Dataset

Paper version: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/OPQMVF/KQYDN9&version=1.0

Original UCI: https://archive.ics.uci.edu/ml/datasets/Pen-Based%20Recognition%20of%20Handwritten%20Digits

In [None]:
# Column names for the 8 (x, y) coordinate pairs, in file order: x0, y0, x1, y1, ..., x7, y7
input_names = [f'{axis}{i}' for i in range(8) for axis in ('x', 'y')]

# The CSV has no header row: 16 coordinate columns followed by the 'outlier' flag.
data = pd.read_csv('/Users/maxfrax/Downloads/pen-global-unsupervised-ad.csv', names=input_names+['outlier'])
# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the same
# ordering (the IID client split further down depends on this ordering).
data = data.sample(frac=1, random_state=42)

In [None]:
# Preview the loaded, shuffled dataset.
data

In [None]:
# Feature matrix: the coordinate columns only.
X = data[input_names]

# Labels: -1 for rows whose 'outlier' column is 'o', +1 for every other row.
y = np.where(data['outlier'] == 'o', -1, 1)

### Dimensionality Reduction

In [None]:
from sklearn.manifold import TSNE

# Embed the original coordinate features into 2-D.
# - The features are read from `data` rather than from the running `X`, so re-running
#   this cell never feeds an already-reduced embedding back into t-SNE.
# - t-SNE is stochastic; the fixed seed keeps the embedding (and every result that
#   depends on it) reproducible across kernel restarts.
X = TSNE(n_components=2, learning_rate='auto', init='pca', random_state=42).fit_transform(data[input_names])

### Normalization

In the paper they state that all their datasets are scaled with MinMax (see its Normalization section).

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the embedded points, then rescale those same points with it.
X = MinMaxScaler().fit(X).transform(X)

In [None]:
# Sanity check: after the 2-component t-SNE the second dimension should be 2.
X.shape

Plot all points after dimensionality reduction:

In [None]:
# Normal points (label 1) are drawn as dots, outliers (label -1) as crosses.
# Title, axis labels and legend are set so the figure can be read on its own.
plt.scatter(X[y==1,0], X[y==1,1], label='normal')
plt.scatter(X[y==-1,0], X[y==-1,1], marker='x', label='outlier')
plt.title('Pen-Global after dimensionality reduction and normalization')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.legend()
plt.show()

### Replicating: One Class SVM

I'm going to use sklearn implementation https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html since both the paper and sklearn refer to:

Estimating the support of a high-dimensional distribution Schölkopf, Bernhard, et al. Neural computation 13.7 (2001): 1443-1471.

In the paper they use OneClassSVM in an unsupervised setting.

> In the unsupervised anomaly detection scenario, the one-class SVM is trained using the dataset and afterwards, each instance in the dataset is scored by a normalized distance to the determined decision boundary [40]. The parameter ν needs to be set to a value lager than zero such that the contained anomalies are correctly handled by a soft-margin

The hyperparameters tested are 0.2 < ν < 0.8, with automatic tuning of the Gaussian kernel.
By automatic tuning I assume they mean trial and error; in fact, they describe it as a computationally hard task.

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import uniform
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

In [None]:
def plot_clf(clf, ix, X=None, y=None, grid_size=50):
    """Draw a fitted classifier's boundary contour together with a labelled point set.

    The contour is the level set ``clf._compute_r(point) == clf.radius_`` evaluated on
    a regular grid over the unit square [0, 1] x [0, 1]. Everything is drawn on the
    current matplotlib figure; the caller is responsible for ``plt.show()``.

    Parameters
    ----------
    clf : fitted classifier exposing ``_compute_r`` and ``radius_`` (and ``X_`` / ``y_``
        when ``X`` or ``y`` is omitted).
    ix : int
        Index used both to pick the colour from the ``tab10`` palette and as the
        legend label.
    X, y : array-like, optional
        Points and their labels (1 = normal, -1 = outlier). When either is ``None``
        the data stored on the classifier (``clf.X_``, ``clf.y_``) is plotted instead.
    grid_size : int, default 50
        Number of grid points per axis used to evaluate the contour (50 matches the
        previous hard-wired ``np.linspace`` default).
    """
    # tab10 only has 10 entries; wrap the index so that values such as
    # total_clients + 1 get a distinct colour instead of all clamping to the last one.
    palette = colormaps['tab10']
    color = palette(ix % palette.N)

    if X is None or y is None:
        X = clf.X_
        y = clf.y_

    gx = np.linspace(0, 1, grid_size)
    gy = np.linspace(0, 1, grid_size)
    gX, gY = np.meshgrid(gx, gy)
    # Evaluate the classifier point by point; distinct loop names avoid shadowing X / y.
    zs = np.array([clf._compute_r(np.array([px, py])) for px, py in zip(np.ravel(gX), np.ravel(gY))])
    gZ = zs.reshape(gX.shape)
    membership_contour = plt.contour(gX, gY, gZ, levels=(clf.radius_, ), colors=[color])
    plt.clabel(membership_contour, inline=1)

    # Normal points as dots, outliers as crosses, both in the classifier's colour.
    plt.scatter(X[y==1,0], X[y==1,1],  color=color, label=str(ix))
    plt.scatter(X[y==-1,0], X[y==-1,1], marker='x', color=color, label=str(ix))

In [None]:
def svm_experiment(classifier, distributions, metric: str, n_iter: int = 10, random_state=None):
    """Tune ``classifier`` with a randomized search and report its results.

    The search trains and evaluates on the whole dataset (module-level ``X`` and ``y``),
    then prints the best candidate, shows its confusion matrix, ROC curve and
    precision-recall curve, and prints the mean / standard deviation of the scores
    over all sampled candidates.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator to tune.
    distributions : dict
        Parameter name -> distribution (or list) to sample from.
    metric : str
        scikit-learn scoring name used to rank the candidates (e.g. ``'roc_auc'``,
        ``'average_precision'``).
    n_iter : int, default 10
        Number of parameter settings sampled.
    random_state : int or None, default None
        Seed for the parameter sampling; ``None`` keeps the sampling unseeded.

    Notes
    -----
    Reads the module-level ``X``, ``y``, ``N_JOBS`` and ``VERBOSE``.
    """
    print(f'One Class Experiment: {metric}')

    # The data is fed to the search twice, concatenated. Marking the first copy as
    # fold 0 and the second as fold 1 means every split trains on one full copy and is
    # scored on the other, i.e. train on the whole dataset / test on the whole dataset.
    test_fold = [0] * len(X) + [1] * len(X)

    search = RandomizedSearchCV(
        classifier,
        distributions,
        cv=PredefinedSplit(test_fold=test_fold),
        refit=True,
        n_iter=n_iter,
        # Rank candidates by the requested metric; this used to be hard-coded to
        # 'average_precision', so the 'roc_auc' run silently optimised the wrong score.
        scoring=metric,
        n_jobs=N_JOBS,
        error_score='raise',
        verbose=VERBOSE,
        random_state=random_state,
    )

    res = search.fit(np.concatenate([X, X]), np.concatenate([y, y]))

    clf = res.best_estimator_

    print(res.best_index_)
    print(res.best_score_)
    print(res.best_params_)

    y_pred = clf.predict(X)

    conf_mat = confusion_matrix(y, y_pred)
    ConfusionMatrixDisplay(conf_mat).plot()
    plt.show()

    display(conf_mat)

    # Score the samples once and reuse the result for both curves.
    sample_scores = clf.score_samples(X)

    RocCurveDisplay.from_predictions(y, sample_scores)
    plt.show()

    PrecisionRecallDisplay.from_predictions(y, sample_scores)
    plt.show()

    # Keep the sorted frame (the sort result used to be discarded).
    cv_results = pd.DataFrame(res.cv_results_).sort_values('rank_test_score')

    print(f"Average Score: {np.average(cv_results['mean_test_score'])}")
    print(f"STD Score: {np.std(cv_results['mean_test_score'])}")

    # Only the BSV classifier exposes what plot_clf needs to draw its boundary.
    if isinstance(clf, BSVClassifier):
        plot_clf(clf, 0)

In [None]:
classifier = OneClassSVM(kernel='rbf')
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
# scale=0.6 gives the 0.2 <= nu <= 0.8 range described above for the paper;
# the previous scale=0.8 sampled nu all the way up to 1.0.
distributions = dict(nu=uniform(loc=0.2, scale=0.6), gamma=uniform(loc=0, scale=1))

svm_experiment(classifier, distributions, 'roc_auc')
svm_experiment(classifier, distributions, 'average_precision')

### My Algorithm: BSVClassifier

In [None]:
classifier = BSVClassifier(normal_class_label=1, outlier_class_label=-1)

# Search space: c is drawn from [0.2, 1.0] and q from [0, 0.1]
# (scipy's uniform(loc, scale) covers [loc, loc + scale]).
distributions = dict(
    c=uniform(loc=0.2, scale=0.8),
    q=uniform(loc=0, scale=.1),
)

# Same experiment, once per metric name.
for metric in ('roc_auc', 'average_precision'):
    svm_experiment(classifier, distributions, metric)

### Federated Learning: BSVClassifier

The whole work is inspired by https://arxiv.org/pdf/1602.05629.pdf paper.

Since we are working in an unsupervised setting, our metric is not Accuracy but Roc-Auc.

We will not use E, number of iterations per client update, since we are doing exact calculation and not gradient descent.
Same thing with the parameter B

In [None]:
import sys
import os

# Location of the FederatedLearningOutlierDetection checkout.
# Set the FLOD_PATH environment variable to run the notebook on another machine;
# the original hard-coded location is kept as the fallback.
PATH_FLOD = os.environ.get("FLOD_PATH", "/Users/maxfrax/Desktop/FederatedLearningOutlierDetection")

# Make the `flod` package importable without stacking duplicate entries
# on sys.path every time this cell is re-run.
_flod_root = os.path.abspath(PATH_FLOD)
if _flod_root not in sys.path:
    sys.path.append(_flod_root)

from flod.classifiers.federatedbsvclassifier import FederatedBSVClassifier

#### IID

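We have about **809 samples**. 
The rows were already shuffled when the data was loaded, so we simply split them evenly among **10 clients**.

The 9 samples left over after the even split are dropped; because of the shuffle, which ones are dropped is random.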

In [None]:
total_clients = 10

# Equal share per client; samples beyond a multiple of total_clients are dropped.
data_per_client = len(X) // total_clients
n_assigned = data_per_client * total_clients

X = X[:n_assigned]
y = y[:n_assigned]

# Balanced split: the first data_per_client rows go to client 0, the next to client 1, ...
client_assignment = np.repeat(np.arange(total_clients), data_per_client)

print(f'len Client assignment {len(client_assignment)} == len X {len(X)}')

In [None]:
# Hyper-parameter grid for the federated classifier.
# NOTE(review): the earlier notes in this cell listed C in {0 (one client), .1, .2, .5, 1}
# and B in {'infinite' (data_per_client), 1% of data_per_client}; the values below are
# the ones actually searched.
parameters = dict(
    client_fraction=[.1, .5,  1],
    max_rounds=[1, 3, 5],
    B=[5, 10],
)

# The dataset is passed twice (concatenated): the first copy is fold 0, the second
# fold 1, so each split trains on one full copy and is scored on the other.
n_samples = len(X)
test_fold = [0] * n_samples + [1] * n_samples

federated_estimator = FederatedBSVClassifier(normal_class_label=1, outlier_class_label=-1, total_clients=total_clients)
search = GridSearchCV(
    federated_estimator,
    parameters,
    cv=PredefinedSplit(test_fold=test_fold),
    refit=True,
    scoring='average_precision',
    error_score='raise',
    verbose=VERBOSE,
)
res_iid = search.fit(
    np.concatenate([X,X]),
    np.concatenate([y,y]),
    client_assignment=np.concatenate([client_assignment, client_assignment]),
)

iid_clf = res_iid.best_estimator_

# Best candidate: index in cv_results_, score, parameters.
for best_item in (res_iid.best_index_, res_iid.best_score_, res_iid.best_params_):
    print(best_item)


y_pred = iid_clf.predict(X)

In [None]:
# Confusion matrix of the best IID classifier, first as a figure, then as raw counts.
conf_mat = confusion_matrix(y, y_pred)
conf_mat_display = ConfusionMatrixDisplay(conf_mat)
conf_mat_display.plot()
plt.show()

display(conf_mat)

In [None]:
# ROC curve of the best IID classifier, built from its per-sample scores.
iid_sample_scores = iid_clf.score_samples(X)
RocCurveDisplay.from_predictions(y, iid_sample_scores)
plt.show()

In [None]:
# Precision-recall curve of the best IID classifier, built from its per-sample scores.
iid_sample_scores = iid_clf.score_samples(X)
PrecisionRecallDisplay.from_predictions(y, iid_sample_scores)
plt.show()

In [None]:
# Grid-search results, best-ranked first, with an extra column computed as
# B * client_fraction * total_clients * max_rounds for every candidate.
iid_cv_results = (
    pd.DataFrame(res_iid.cv_results_)
    .assign(**{
        'Datapoints Estimate': lambda df: df['param_B'] * df['param_client_fraction'] * total_clients * df['param_max_rounds'],
    })
    .sort_values('rank_test_score')
)

iid_cv_results.to_csv('iid_cv_results.csv')

iid_cv_results

In [None]:
# Mean and standard deviation of the score across all grid candidates.
iid_mean_scores = iid_cv_results['mean_test_score']
print(np.average(iid_mean_scores))
print(np.std(iid_mean_scores))

In [None]:
# Line plot of the classifier's recorded `sv_count` sequence
# (presumably support-vector counts over the training rounds — TODO confirm).
plt.plot(iid_clf.sv_count)
plt.show()

In [None]:
# Histogram of the `betas_` values stored on the federated model's inner classifier.
plt.hist(iid_clf.clf.betas_)
plt.show()

In [None]:
# For each hyper-parameter: mean, then standard deviation, of the score per value.
for param_column in ('param_B', 'param_client_fraction', 'param_max_rounds'):
    scores_by_value = iid_cv_results.groupby([param_column])['mean_test_score']
    display(pd.DataFrame(scores_by_value.mean()))
    display(pd.DataFrame(scores_by_value.std()))

In [None]:
# Multiplier that turns the (small) score standard deviations into visible marker sizes.
scale_factor = 10000

# One summary table + scatter per pair of hyper-parameters: colour encodes the mean
# score of the pair, marker size encodes its scaled standard deviation.
param_pairs = [
    ('param_client_fraction', 'param_max_rounds'),  # Plot 1
    ('param_client_fraction', 'param_B'),           # Plot 2
    ('param_max_rounds', 'param_B'),                # Plot 3
]

pair_summaries, pair_axes = [], []
for x_param, y_param in param_pairs:
    summary = (
        iid_cv_results
        .groupby([x_param, y_param])
        .agg(Mean=('mean_test_score', 'mean'), Std=('mean_test_score', 'std'))
        .reset_index()
        .sort_values(by='Mean', ascending=False)
    )
    summary['Scaled Std'] = summary['Std'] * scale_factor
    display(summary)

    # Create the axes explicitly and hand them to pandas. Calling plt.figure() and
    # then DataFrame.plot.scatter without `ax` left an empty figure behind, because
    # pandas opens a figure of its own when no axes are given.
    fig, ax = plt.subplots()
    summary.plot.scatter(x=x_param, y=y_param, c='Mean', s='Scaled Std', cmap='Greens', sharex=False, ax=ax)
    plt.show()

    pair_summaries.append(summary)
    pair_axes.append(ax)

# Keep the names this cell has always exposed.
plot1_data, plot2_data, plot3_data = pair_summaries
ax1, ax2, ax3 = pair_axes

In [None]:
def plot_fbsv(fbsv):
    """Visualise a fitted federated classifier round by round.

    Produces, in order:
    1. the whole dataset coloured by client assignment;
    2. for every entry of ``fbsv.classifiers``: one figure with each client's
       classifier, then one figure with that round's global classifier;
    3. the final classifier (``fbsv.clf``) on the whole dataset.

    Parameters
    ----------
    fbsv : fitted federated classifier exposing ``classifiers`` (an iterable of dicts
        with keys ``'round'``, ``'clients'`` and ``'global'``) and ``clf``.

    Notes
    -----
    Reads the module-level ``X``, ``y``, ``client_assignment`` and ``total_clients``.
    """
    # Plot client assignment.
    # Both scatters share the same colour range so that a client id maps to the same
    # colour for normal points and outliers; otherwise each call normalises over the
    # ids present in its own subset only.
    color_range = dict(vmin=0, vmax=total_clients - 1)
    plt.title('All the dataset colored by client assignment')
    plt.scatter(X[y==1,0], X[y==1,1], c=client_assignment[y==1], **color_range)
    plt.scatter(X[y==-1,0], X[y==-1,1], marker='x', c=client_assignment[y==-1], **color_range)
    plt.show()

    # Plot client training, one pair of figures per round.
    for round_info in fbsv.classifiers:
        plt.title(f'Round {round_info["round"]} - Clients training')
        for client_id, client_clf in round_info['clients'].items():
            plot_clf(client_clf, client_id)

        plt.legend()
        plt.show()
        plt.title(f'Round {round_info["round"]} - Global combine')
        # Indices past the client ids are used for the global / final classifiers.
        plot_clf(round_info['global'], total_clients+1)
        plt.show()

    # Plot final classifier on the whole dataset.
    plt.title("Whole dataset with final classifier")
    plot_clf(fbsv.clf, total_clients+2, X, y)
    plt.show()

In [None]:
# Round-by-round visualisation of the best classifier from the IID grid search.
plot_fbsv(iid_clf)

#### Non IID

In [None]:
from sklearn.cluster import KMeans

# Non-IID split: each client receives one k-means cluster of the embedded points,
# so clients see different regions of the feature space.
# The seed is fixed because k-means initialisation is random and the assignment
# drives every result in this section.
kmeans = KMeans(n_clusters=total_clients, random_state=42)
client_assignment = kmeans.fit_predict(X)

In [None]:
from collections import Counter
# Number of samples assigned to each client (cluster sizes are not balanced by construction).
Counter(client_assignment)

In [None]:
# The dataset is passed twice (concatenated): the first copy is fold 0, the second
# fold 1, so each split trains on one full copy and is scored on the other.
n_samples = len(X)
test_fold = [0] * n_samples + [1] * n_samples

federated_estimator = FederatedBSVClassifier(normal_class_label=1, outlier_class_label=-1, total_clients=total_clients)
search = GridSearchCV(
    federated_estimator,
    parameters,
    cv=PredefinedSplit(test_fold=test_fold),
    refit=True,
    scoring='average_precision',
    error_score='raise',
    verbose=VERBOSE,
)
res_n_iid = search.fit(
    np.concatenate([X,X]),
    np.concatenate([y,y]),
    client_assignment=np.concatenate([client_assignment, client_assignment]),
)

n_iid_clf = res_n_iid.best_estimator_

# Best candidate: index in cv_results_, score, parameters.
for best_item in (res_n_iid.best_index_, res_n_iid.best_score_, res_n_iid.best_params_):
    print(best_item)


y_pred = n_iid_clf.predict(X)

In [None]:
# Confusion matrix of the best non-IID classifier, first as a figure, then as raw counts.
conf_mat = confusion_matrix(y, y_pred)
conf_mat_display = ConfusionMatrixDisplay(conf_mat)
conf_mat_display.plot()
plt.show()

display(conf_mat)

In [None]:
# ROC curve of the best non-IID classifier, built from its per-sample scores.
n_iid_sample_scores = n_iid_clf.score_samples(X)
RocCurveDisplay.from_predictions(y, n_iid_sample_scores)
plt.show()

In [None]:
# Precision-recall curve of the best non-IID classifier, built from its per-sample scores.
n_iid_sample_scores = n_iid_clf.score_samples(X)
PrecisionRecallDisplay.from_predictions(y, n_iid_sample_scores)
plt.show()

In [None]:
# Grid-search results, best-ranked first, with an extra column computed as
# B * client_fraction * total_clients * max_rounds for every candidate.
n_iid_cv_results = (
    pd.DataFrame(res_n_iid.cv_results_)
    .assign(**{
        'Datapoints Estimate': lambda df: df['param_B'] * df['param_client_fraction'] * total_clients * df['param_max_rounds'],
    })
    .sort_values('rank_test_score')
)

n_iid_cv_results.to_csv('n_iid_cv_results.csv')

n_iid_cv_results

In [None]:
# Mean and standard deviation of the score across all grid candidates.
n_iid_mean_scores = n_iid_cv_results['mean_test_score']
print(np.average(n_iid_mean_scores))
print(np.std(n_iid_mean_scores))

In [None]:
# Line plot of the classifier's recorded `sv_count` sequence
# (presumably support-vector counts over the training rounds — TODO confirm).
plt.plot(n_iid_clf.sv_count)
plt.show()

In [None]:
# Histogram of the `betas_` values stored on the federated model's inner classifier.
plt.hist(n_iid_clf.clf.betas_)
plt.show()

In [None]:
# For each hyper-parameter: mean, then standard deviation, of the score per value.
for param_column in ('param_B', 'param_client_fraction', 'param_max_rounds'):
    scores_by_value = n_iid_cv_results.groupby([param_column])['mean_test_score']
    display(pd.DataFrame(scores_by_value.mean()))
    display(pd.DataFrame(scores_by_value.std()))

In [None]:
# Multiplier that turns the (small) score standard deviations into visible marker sizes.
scale_factor = 10000

# One summary table + scatter per pair of hyper-parameters: colour encodes the mean
# score of the pair, marker size encodes its scaled standard deviation.
param_pairs = [
    ('param_client_fraction', 'param_max_rounds'),  # Plot 1
    ('param_client_fraction', 'param_B'),           # Plot 2
    ('param_max_rounds', 'param_B'),                # Plot 3
]

pair_summaries, pair_axes = [], []
for x_param, y_param in param_pairs:
    summary = (
        n_iid_cv_results
        .groupby([x_param, y_param])
        .agg(Mean=('mean_test_score', 'mean'), Std=('mean_test_score', 'std'))
        .reset_index()
        .sort_values(by='Mean', ascending=False)
    )
    summary['Scaled Std'] = summary['Std'] * scale_factor
    display(summary)

    # Create the axes explicitly and hand them to pandas. Calling plt.figure() and
    # then DataFrame.plot.scatter without `ax` left an empty figure behind, because
    # pandas opens a figure of its own when no axes are given.
    fig, ax = plt.subplots()
    summary.plot.scatter(x=x_param, y=y_param, c='Mean', s='Scaled Std', cmap='Blues', sharex=False, ax=ax)
    plt.show()

    pair_summaries.append(summary)
    pair_axes.append(ax)

# Keep the names this cell has always exposed.
plot1_data, plot2_data, plot3_data = pair_summaries
ax1, ax2, ax3 = pair_axes

In [None]:
# Round-by-round visualisation of the best classifier from the non-IID grid search.
plot_fbsv(n_iid_clf)