# Imports

In [204]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_selection import RFECV
from sklearn.svm import OneClassSVM
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import optuna

# Create an example dataset

Swap out with your dataset.  
The dataset here consists of a single gaussian-sampled inlier cluster with some randomly sampled outliers around it.  
For `n_features = 2` you can plot it. :)

In [205]:
n_good_samples = 500
n_bad_samples = 40
n_features = 10

np.random.seed(42)
# Generate inliers
X, y = make_blobs(n_samples=n_good_samples, centers=[(np.random.randn(n_features))], n_features=n_features, random_state=42)
# Generate random outliers
X_outliers = np.random.uniform(low=-10, high=10, size=(n_bad_samples, n_features))

if n_features == 2:
    plt.plot(X[:, 0], X[:, 1], 'o', label='inliers')
    plt.plot(X_outliers[:, 0], X_outliers[:, 1], 'x', label='outliers')

X = np.concatenate([X, X_outliers], axis=0)
y = np.concatenate([y, np.ones(X_outliers.shape[0]).astype(int)], axis=0)

# Split, prepare and inspect the data

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [207]:
# My code works with pandas inputs
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

In [208]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.747207,0.208184,-0.032336,1.755284,0.058919,-0.948488,3.444987,1.241268,-1.660778,1.199114
1,1.083977,-0.343892,0.730521,0.794858,1.513984,0.805925,0.839442,-0.567176,-0.855147,0.977063
2,0.280754,0.799213,2.282755,2.834723,-1.326295,-0.487813,1.234048,1.256689,0.216879,-0.296333
3,2.666651,-1.561947,-1.204154,2.306620,-0.912796,-0.552374,0.798056,0.507635,1.129064,1.344689
4,0.013653,0.008529,2.259909,2.419869,-0.502684,-1.125329,-0.572603,0.048281,-0.680605,-0.444619
...,...,...,...,...,...,...,...,...,...,...
427,-0.690279,0.919550,0.058576,0.587434,-0.412071,-1.146948,1.161782,1.111489,-0.121133,-0.355270
428,2.301062,-0.329168,1.367446,0.229757,-1.190590,0.238269,3.063329,1.123048,-0.782532,0.541851
429,-1.349474,-1.067775,-0.848841,0.873006,-0.317591,-1.683782,0.657353,-0.236523,-0.262207,0.611904
430,-8.576227,-3.620487,6.897506,-9.534561,6.289370,-4.362905,-7.636703,3.934743,2.578857,7.549440


Note the distribution of the anomaly samples

In [209]:
y_train.value_counts()

0    400
1     32
Name: count, dtype: int64

In [210]:
y_test.value_counts()

0    100
1      8
Name: count, dtype: int64

# One-class classification with RFECV feature selection and hyperparameter optimization using Optuna

## Required Utilities

In [211]:
# A version of the OneClassSVM that outputs 0 = good and 1 = bad instead of -1 = bad and 1 = 0
class BinaryOneClassSVM(OneClassSVM):
    def predict(self, X):
        y_pred = super().predict(X)
        return (y_pred == -1).astype(int) # -1 -> 1, 1 -> 0

# Special kfold for one-class classification with only good samples in the training splits
def split_with_one_class_KFold(X, y, n_splits, shuffle, random_state):
    # Note that StratifiedKFold would cause data leakage because it splits based on the target which is supposed to be one-class
    kfold = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    splits = kfold.split(X) # Iterator of (train_index, valid_index) tuples
    return [(train_index[y.iloc[train_index] == 0], valid_index) for train_index, valid_index in splits]

## Feature Selection

In [212]:
splits = split_with_one_class_KFold(X_train, y_train, n_splits=3, shuffle=True, random_state=42) 

classifier = Pipeline([
    # By putting the scaler into a Pipeline with the clf, the scaler will always be 
    # fitted correctly on the train data in every fold of the CV inside the RFECV
    ("scaler", StandardScaler()),
     # Only linear kernel is supported for feature selection. Needs to be *Binary*OneClassSVM for RFECV.
    ("classifier", BinaryOneClassSVM(kernel="linear"))
])

# Perform RFECV
importance_getter = "named_steps.classifier.coef_" # Important when using a Pipeline
rfecv = RFECV(classifier, cv=splits, n_jobs=3, scoring="f1", importance_getter=importance_getter)
X_train = pd.DataFrame(rfecv.fit_transform(X_train, y_train), index=X_train.index) # X_train now has less features from here on

In [213]:
X_train

Unnamed: 0,0,1,2,3,4
0,-0.032336,1.755284,0.058919,3.444987,1.199114
1,0.730521,0.794858,1.513984,0.839442,0.977063
2,2.282755,2.834723,-1.326295,1.234048,-0.296333
3,-1.204154,2.306620,-0.912796,0.798056,1.344689
4,2.259909,2.419869,-0.502684,-0.572603,-0.444619
...,...,...,...,...,...
427,0.058576,0.587434,-0.412071,1.161782,-0.355270
428,1.367446,0.229757,-1.190590,3.063329,0.541851
429,-0.848841,0.873006,-0.317591,0.657353,0.611904
430,6.897506,-9.534561,6.289370,-7.636703,7.549440


## Hyperparameter optimization

In [214]:
def objective(trial, X_train, y_train):
    # Hyperparameters for OneClassSVM (feel free to add more)
    nu = trial.suggest_float("nu", 0.01, 0.5)
    gamma = trial.suggest_float("gamma", 0.01, 0.5)
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf", "poly", "sigmoid"])

    # Create the classifier
    classifier = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", BinaryOneClassSVM(nu=nu, gamma=gamma, kernel=kernel))
    ])

    # Perform cross-validation
    splits = split_with_one_class_KFold(X_train, y_train, n_splits=5, shuffle=True, random_state=42)
    return cross_val_score(classifier, X_train, y_train, cv=splits, scoring="f1").mean()

In [215]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=200)

[I 2024-10-17 15:01:42,576] A new study created in memory with name: no-name-4171638c-cf78-417a-baa1-6a2898375806
[I 2024-10-17 15:01:42,595] Trial 0 finished with value: 0.17245526138383283 and parameters: {'nu': 0.11997693416829158, 'gamma': 0.35559424893914277, 'kernel': 'poly'}. Best is trial 0 with value: 0.17245526138383283.
[I 2024-10-17 15:01:42,616] Trial 1 finished with value: 0.2996023391812866 and parameters: {'nu': 0.32610577685480063, 'gamma': 0.26871949911833565, 'kernel': 'rbf'}. Best is trial 1 with value: 0.2996023391812866.
[I 2024-10-17 15:01:42,636] Trial 2 finished with value: 0.3228138528138528 and parameters: {'nu': 0.24727189668973953, 'gamma': 0.3028232956615806, 'kernel': 'rbf'}. Best is trial 2 with value: 0.3228138528138528.
[I 2024-10-17 15:01:42,656] Trial 3 finished with value: 0.30161036969547605 and parameters: {'nu': 0.06545331951097559, 'gamma': 0.4427470397234579, 'kernel': 'rbf'}. Best is trial 2 with value: 0.3228138528138528.
[I 2024-10-17 15:01:

## Refit and evaluation of the best model

In [216]:
best_params = study.best_params

In [217]:
# Remove all bad (positive class) samples from the training data for one-class classification
X_train = X_train[y_train == 0]
y_train = y_train[y_train == 0]

In [218]:
# RFECV is already fit and X_train already transformed, so the other steps are also fit beforehand.
# Final pipeline object can be used for predicting.

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)

base_classifier = BinaryOneClassSVM(**best_params)
base_classifier.fit(X_train, y_train)

pipeline_steps = []
pipeline_steps.append(('use_RFECV', rfecv))
pipeline_steps.append(('scaler', scaler)) # Scaler has to come after feature selection
pipeline_steps.append(('classifier', base_classifier))
classifier = Pipeline(pipeline_steps)

In [219]:
# Evaluate on the test set
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       100
           1       0.80      1.00      0.89         8

    accuracy                           0.98       108
   macro avg       0.90      0.99      0.94       108
weighted avg       0.99      0.98      0.98       108

