In [None]:
from sklearn import datasets

# Load the Iris dataset bundled with scikit-learn; `iris.data` is used below
# as the feature matrix.
iris = datasets.load_iris()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import random

In [None]:
import matplotlib.pyplot as plt

In [None]:
from scipy.stats import randint, uniform
from sklearn.model_selection import cross_validate
from tqdm.notebook import tqdm

# Let's try the algorithm

First of all, we select the middle class as the outlier class (label = 1), since it is a bit harder than the others.
In fact, because it lies in the middle of the other two, the sphere projection must split.

In [None]:
# One label per Iris class, in dataset order: 1 marks the outlier class and
# 0 the regular ones. The middle class is fixed as the outlier so that the
# labelling is the same on every run (a random draw without a seed changes
# the whole experiment at each kernel restart).
classes = [0, 1, 0]

X = iris.data
# Map every sample's original class index to its outlier/regular label,
# instead of assuming three consecutive groups of 50 samples.
y = [classes[target] for target in iris.target]
colors = ['red' if i == 1 else 'green' for i in y]

We scale the data and reduce it to two components, just to work in an easy-to-visualize setting.

In [None]:
# Rescale the features, then project the scaled data onto two principal
# components so that it can be drawn on a plane.
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_scaled)

In [None]:
# Scatter plot of the two retained components, coloured by label
# (red = label 1, green = label 0).
first_component, second_component = X_reduced[:, 0], X_reduced[:, 1]
plt.scatter(first_component, second_component, c=colors)
plt.show()

In [None]:
import sys

# Add the parent directory to the module search path; presumably this is
# where the `flod` package lives relative to this notebook.
sys.path.append('..')

from flod.classifiers.bsvclassifier import BSVClassifier

We keep only 33% of the dataset as the test set.

We shuffle and stratify, since the number of outliers differs from the number of regular points.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing. The split is shuffled and
# stratified on `y` so both parts keep the same label proportions.
split = train_test_split(
    X_reduced,
    y,
    test_size=0.33,
    shuffle=True,
    stratify=y,
)
X_train, X_test, y_train, y_test = split

In [None]:
import time
from scipy.stats.distributions import norm

start = time.time()

# Distributions sampled by the randomized search, one entry per
# BSVClassifier hyper-parameter.
params = {
    'q': randint(0, 500),
    'c': uniform(),
    'p1': uniform(0, 10),
    'p2': uniform(0, 10),
    'p3': uniform(0, 10),
    'p4': uniform(0, 10),
}

# `scoring='f1'` makes the search rank candidates by F1. A string passed to
# `refit` only selects among several metrics; with no `scoring` given, the
# candidates are ranked by the estimator's default `score` method, so the
# original `refit='f1'` alone did not optimise F1.
clf = RandomizedSearchCV(
    BSVClassifier(),
    params,
    scoring='f1',
    cv=5,
    n_jobs=4,
    refit=True,
    verbose=10,
    return_train_score=False,
    n_iter=200,
)

clf.fit(X_train, y_train)

print(f'RandomizedSearchCV took: {time.time() - start}')

In [None]:
# Predict the held-out test labels with the fitted search object.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Print every test-set metric on its own line, in a fixed order.
metrics = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
    ('Accuracy', accuracy_score),
)
for label, metric in metrics:
    print(f'{label} {metric(y_test, y_pred)}')

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 summary for the test-set predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Keep a handle on the estimator selected by the search; it is used below
# to draw the decision contour.
best_clf = clf.best_estimator_

# Bare expression so the notebook displays the estimator's repr.
best_clf

In [None]:
# Evaluate the selected model on a 50x50 grid that covers the data with a
# 0.2 margin on every side, then draw the contour at the model's radius.
MARGIN = .2
first_axis, second_axis = X_reduced[:, 0], X_reduced[:, 1]
gx = np.linspace(first_axis.min() - MARGIN, first_axis.max() + MARGIN, 50)
gy = np.linspace(second_axis.min() - MARGIN, second_axis.max() + MARGIN, 50)
gX, gY = np.meshgrid(gx, gy)

# `_compute_r` is called once per grid point; the flat results are then
# folded back into the shape of the grid.
grid_points = zip(gX.ravel(), gY.ravel())
zs = np.array([
    best_clf._compute_r(np.array([px, py]), 'tensorflow')
    for px, py in grid_points
])
gZ = zs.reshape(gX.shape)

membership_contour = plt.contour(gX, gY, gZ, levels=(best_clf.radius_, ))
plt.clabel(membership_contour, inline=1)

# Overlay the samples on top of the contour.
plt.scatter(first_axis, second_axis, c=colors)
plt.show()

In [None]:
from joblib import dump, load
# Persist the fitted search object to `iris.joblib` (path relative to the
# current working directory).
dump(clf, 'iris.joblib') 

# Nested Cross validation

Once the best hyperparameters have been found for each training set, how good and consistent are the metrics?

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, LeaveOneOut


# Three-step pipeline: scale the features, reduce their dimensionality,
# then classify. The scaler set here is only a placeholder, since the
# search space below swaps it out.
pipe = Pipeline([
        ('scaler', MaxAbsScaler()),
        ('reduce_dim', PCA()),
        ('classifier', BSVClassifier())
])

# Search space for the randomized search over the whole pipeline.
params = {
    'scaler': [StandardScaler(), MaxAbsScaler(), RobustScaler(), QuantileTransformer()],
    # scipy's `randint(low, high)` excludes `high`, so sample from
    # 1..n_features inclusive: zero components is not a valid PCA setting,
    # and keeping every feature should remain a candidate.
    'reduce_dim__n_components': randint(1, X.shape[1] + 1),
    'classifier__q': randint(0, 500),
    'classifier__c': uniform()
}

# Metrics computed for every candidate and for every outer fold.
scoring = ['precision', 'recall', 'f1']

Inner loop (the RandomizedSearchCV): finds the best model given the data.
Outer loop: checks whether the best model was just lucky or whether all the best models perform well.

In [None]:
# Outer folds estimate generalisation; inner folds drive the model selection.
outer_cv = StratifiedKFold(n_splits=10, shuffle=True)
inner_cv = StratifiedKFold(n_splits=5, shuffle=True)

# Inner loop: randomized search over the pipeline, refit on the best F1.
r_clf = RandomizedSearchCV(
    pipe,
    params,
    n_jobs=-1,
    cv=inner_cv,
    refit='f1',
    verbose=10,
    return_train_score=False,
    scoring=scoring,
    n_iter=100,
)

# Outer loop: cross-validate the whole search and keep one fitted search
# object per outer fold.
scores = cross_validate(
    r_clf,
    X=X,
    y=y,
    cv=outer_cv,
    n_jobs=-1,
    scoring=scoring,
    verbose=10,
    return_estimator=True,
    return_train_score=True,
)

In [None]:
import pandas as pd

# One row per outer fold: the cross-validation scores plus the parameters
# chosen by that fold's inner search. Rows containing NaN are dropped.
best_params = [est.best_params_ for est in scores['estimator']]
cv_res = (
    pd.DataFrame(scores)
    .assign(estimator_params=best_params)
    .dropna()
)

cv_res

In [None]:
# Mean and standard deviation of each test metric across the outer folds.
# Every figure uses four decimals (the F1 standard deviation was previously
# printed with two, unlike the rest).
for position, metric in enumerate(('precision', 'recall', 'f1')):
    fold_scores = cv_res[f'test_{metric}']
    # A blank line separates each metric from the previous one.
    separator = '\n' if position else ''
    print(f'{separator}Average {metric}: {np.average(fold_scores):.4f}')
    print(f'Std {metric}: {np.std(fold_scores):.4f}')

In [None]:
# Index label of the outer fold with the highest test F1.
ix = cv_res['test_f1'].idxmax()

# Parameters chosen by the inner search on that fold.
cv_res.loc[ix, 'estimator_params']