In [None]:
from sklearn import datasets

# Load the Iris dataset bundled with scikit-learn; `iris.data` is the feature
# matrix used throughout the rest of the notebook.
iris = datasets.load_iris()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
import numpy as np
from sklearn.model_selection import GridSearchCV

In [None]:
import matplotlib.pyplot as plt

# Let's try the algorithm

First of all, we select the middle class as the outlier class (label = 1), since it is a bit harder to handle than the others.
In fact, because it lies between the other two classes, the sphere projection must split.

In [None]:
# Feature matrix: every sample, all original measurements.
X = iris.data

# Binary labels: samples 50..99 (the middle class) are the outliers (1),
# everything else is a regular point (0).
y = [1 if 50 <= idx < 100 else 0 for idx in range(150)]

# Plot colours: outliers in red, regular points in green.
colour_by_label = {1: 'red'}
colors = [colour_by_label.get(label, 'green') for label in y]

We scale the data and reduce it to two components, just to work in an easy-to-visualize environment.

In [None]:
# Scale the features by their maximum absolute value.
max_abs_scaler = MaxAbsScaler()
X_scaled = max_abs_scaler.fit_transform(X)

# Project the scaled data onto its first two principal components so that
# it can be drawn on a plane.
pca_2d = PCA(n_components=2)
X_reduced = pca_2d.fit_transform(X_scaled)

In [None]:
# Scatter plot of the 2-D projection, coloured by outlier/regular label.
first_component = X_reduced[:, 0]
second_component = X_reduced[:, 1]
plt.scatter(first_component, second_component, c=colors)
plt.show()

In [None]:
import sys

# Make the parent directory importable so the `flod` package imported below
# can be resolved. The membership check keeps the cell idempotent: re-running
# it no longer stacks duplicate '..' entries on sys.path.
if '..' not in sys.path:
    sys.path.append('..')

from flod.classifiers.bsvclassifier import BSVClassifier

We keep only 33% of the dataset as the test set.

We shuffle and stratify, since there are fewer outliers than regular points.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the projected data as the test set. The split is shuffled
# and stratified on `y` so both sets keep the outlier/regular ratio.
# `random_state` is fixed so the split (and therefore every metric computed
# below) is the same on each "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced,
    y,
    test_size=0.33,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Hand-picked hyper-parameters for a first try of the classifier.
bsv_hyperparams = {'n_iter': 10, 'q': 10, 'penalization': 200}
clf = BSVClassifier(**bsv_hyperparams)

# Train on the 2-D training split.
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test points with the classifier fitted above.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute every metric first, then report them in a fixed order.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Precision {precision}')
print(f'Recall {recall}')
print(f'F1 {f1}')
print(f'Accuracy {accuracy}')

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 summary for the held-out test set.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Build a 50x50 grid covering the projected data, padded by 0.1 on each side.
gx = np.linspace(X_reduced[:, 0].min() - .1, X_reduced[:, 0].max() + .1, 50)
gy = np.linspace(X_reduced[:, 1].min() - .1, X_reduced[:, 1].max() + .1, 50)
gX, gY = np.meshgrid(gx, gy)

# Evaluate the classifier's radius function at every grid node, one point at
# a time, then fold the flat result back into the grid's shape.
zs = np.array([
    clf._compute_r(np.array([px, py]))
    for px, py in zip(gX.ravel(), gY.ravel())
])
gZ = zs.reshape(gX.shape)

# Draw the single contour at the radius stored for index `clf.sv_i`
# (presumably the support-vector radius, i.e. the decision boundary -- TODO confirm).
boundary_level = clf.radiuses_[clf.sv_i]
membership_contour = plt.contour(gX, gY, gZ, levels=[boundary_level])
plt.clabel(membership_contour, inline=True)

# Overlay all samples, coloured by their outlier label.
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)
plt.show()

In [None]:
from joblib import dump, load
# Persist the fitted classifier to disk.
dump(value=clf, filename='iris.joblib')

# Let's setup a Pipeline

We split the data into train and test sets the same way as above.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the raw (unscaled, unreduced) features as the test set;
# scaling and dimensionality reduction are done by the pipeline built below.
# The split is shuffled and stratified on `y`, and `random_state` is fixed so
# the grid-search results are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold


# Three-stage pipeline: scale -> reduce dimensionality -> classify.
# The step names are the prefixes used by the grid-search parameter keys.
pipe = Pipeline(steps=[
    ('scaler', MaxAbsScaler()),
    ('reduce_dim', PCA()),
    ('classifier', BSVClassifier()),
])

In [None]:
# Dedicated, seeded generator: the randomly drawn candidate values below are
# the same on every run instead of depending on the global NumPy RNG state.
param_rng = np.random.RandomState(42)

# Search space. Keys follow the `<step name>__<parameter>` convention of the
# pipeline; the bare 'scaler' key swaps the whole scaling step.
params = {
    'scaler': [StandardScaler(), MaxAbsScaler(), RobustScaler(), QuantileTransformer()],
    # 1 .. (number of features - 1) principal components.
    'reduce_dim__n_components': range(1, iris.data.shape[1]),
    'classifier__n_iter': [10],
    # 3 penalization values drawn uniformly from [1, 100).
    'classifier__penalization': param_rng.uniform(1, 100, 3),
    # 5 q values drawn uniformly from [0.1, 100).
    'classifier__q': param_rng.uniform(.1, 100, 5)
}

# Metrics recorded for every candidate; the best candidate by 'f1' is refit.
scoring = ['precision', 'recall', 'f1']

# Shuffled stratified 3-fold CV; `random_state` pins the fold assignment so
# scores are comparable between runs.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

search = GridSearchCV(
    pipe,
    params,
    n_jobs=-1,
    cv=cv,
    refit='f1',
    verbose=5,
    return_train_score=True,
    scoring=scoring,
)

In [None]:
# Run the exhaustive grid search on the training split.
search.fit(X=X_train, y=y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
print(
    f'Best params: {search.best_params_}',
    f'Best score: {search.best_score_}',
    sep='\n',
)

In [None]:
# Score the refit best pipeline on the held-out test set. Per scikit-learn's
# documentation, with multi-metric scoring this uses the scorer named by
# `refit`.
search.score(X=X_test, y=y_test)

In [None]:
import pandas as pd

# Tabulate the raw cross-validation results, one row per candidate.
cv_res = pd.DataFrame(data=search.cv_results_)
cv_res

In [None]:
# Pull the fitted classifier step out of the best pipeline found by the search.
best_pipeline = search.best_estimator_
clf = best_pipeline['classifier']

In [None]:
from sklearn.metrics import classification_report

# Per-class report for the best pipeline's predictions on the test set.
y_pred_search = search.predict(X_test)
print(classification_report(y_test, y_pred_search))

In [None]:
# Persist the classifier step taken from the best pipeline.
# NOTE(review): only the classifier is saved; the scaler and PCA steps of the
# pipeline are not part of this file -- confirm that is intended.
dump(value=clf, filename='grid_iris.joblib')

## Parameter Q analysis

Let's see how different Q values perform for the best classifier.

In [None]:
# Mean test-score columns from the CV results ...
mean_test_columns = [
    name for name in cv_res.columns
    if 'mean' in name and 'test' in name
]
# ... followed by the two parameter columns needed for filtering and plotting.
columns = mean_test_columns + [
    'param_classifier__q',
    'param_reduce_dim__n_components',
]

In [None]:
# Keep only the candidates that share every best hyper-parameter except q.
# The scaler is filtered too: without it each q value would keep one row per
# scaler in the grid and the q plot would mix different scalers.
best_params = search.best_params_

# Scalers are estimator objects, so they are matched through their repr
# rather than with `==`.
same_scaler = cv_res['param_scaler'].map(repr) == repr(best_params['scaler'])
same_penalization = (
    cv_res['param_classifier__penalization'] == best_params['classifier__penalization']
)
same_n_components = (
    cv_res['param_reduce_dim__n_components'] == best_params['reduce_dim__n_components']
)

cv_res_fil = cv_res.loc[same_scaler & same_penalization & same_n_components, columns]
# n_components is constant after filtering; drop it so it is not plotted.
cv_res_fil = cv_res_fil.drop('param_reduce_dim__n_components', axis=1)

In [None]:
# Plot every remaining score column against q, in increasing q order.
by_q = cv_res_fil.sort_values(by='param_classifier__q')
by_q.plot(x='param_classifier__q')
plt.show()