Skip to content

Commit

Permalink
[MRG+1] Fix LOF and Isolation benchmarks (scikit-learn#9798)
Browse files Browse the repository at this point in the history
  • Loading branch information
albertcthomas authored and ogrisel committed Oct 25, 2017
1 parent f84581b commit 7f19dbe
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 56 deletions.
41 changes: 26 additions & 15 deletions benchmarks/bench_isolation_forest.py
Expand Up @@ -3,6 +3,17 @@
IsolationForest benchmark
==========================================
A test of IsolationForest on classical anomaly detection datasets.
The benchmark is run as follows:
1. The dataset is randomly split into a training set and a test set, both
assumed to contain outliers.
2. Isolation Forest is trained on the training set.
3. The ROC curve is computed on the test set using the knowledge of the labels.
Note that the smtp dataset contains a very small proportion of outliers.
Therefore, depending on the seed of the random number generator, randomly
splitting the dataset might lead to a test set containing no outliers. In this
case a warning is raised when computing the ROC curve.
"""

from time import time
Expand All @@ -12,7 +23,7 @@
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle as sh

print(__doc__)
Expand All @@ -30,15 +41,14 @@ def print_outlier_ratio(y):
print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))


np.random.seed(1)
random_state = 1
fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

# Set this to true for plotting score histograms for each dataset:
with_decision_function_histograms = False

# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

# Loop over all datasets for fitting and scoring the estimator:
for dat in datasets:
Expand All @@ -47,15 +57,16 @@ def print_outlier_ratio(y):
print('====== %s ======' % dat)
print('--- Fetching data...')
if dat in ['http', 'smtp', 'SF', 'SA']:
dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
dataset = fetch_kddcup99(subset=dat, shuffle=True,
percent10=True, random_state=random_state)
X = dataset.data
y = dataset.target

if dat == 'shuttle':
dataset = fetch_mldata('shuttle')
X = dataset.data
y = dataset.target
X, y = sh(X, y)
X, y = sh(X, y, random_state=random_state)
# we remove data with label 4
# normal data are then those of class 1
s = (y != 4)
Expand All @@ -65,7 +76,7 @@ def print_outlier_ratio(y):
print('----- ')

if dat == 'forestcover':
dataset = fetch_covtype(shuffle=True)
dataset = fetch_covtype(shuffle=True, random_state=random_state)
X = dataset.data
y = dataset.target
# normal data are those with attribute 2
Expand All @@ -79,17 +90,17 @@ def print_outlier_ratio(y):
print('--- Vectorizing data...')

if dat == 'SF':
lb = MultiLabelBinarizer()
x1 = lb.fit_transform(X[:, 1])
lb = LabelBinarizer()
x1 = lb.fit_transform(X[:, 1].astype(str))
X = np.c_[X[:, :1], x1, X[:, 2:]]
y = (y != b'normal.').astype(int)
print_outlier_ratio(y)

if dat == 'SA':
lb = MultiLabelBinarizer()
x1 = lb.fit_transform(X[:, 1])
x2 = lb.fit_transform(X[:, 2])
x3 = lb.fit_transform(X[:, 3])
lb = LabelBinarizer()
x1 = lb.fit_transform(X[:, 1].astype(str))
x2 = lb.fit_transform(X[:, 2].astype(str))
x3 = lb.fit_transform(X[:, 3].astype(str))
X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
y = (y != b'normal.').astype(int)
print_outlier_ratio(y)
Expand All @@ -108,7 +119,7 @@ def print_outlier_ratio(y):
y_test = y[n_samples_train:]

print('--- Fitting the IsolationForest estimator...')
model = IsolationForest(n_jobs=-1)
model = IsolationForest(n_jobs=-1, random_state=random_state)
tstart = time()
model.fit(X_train)
fit_time = time() - tstart
Expand Down
69 changes: 28 additions & 41 deletions benchmarks/bench_lof.py
Expand Up @@ -5,6 +5,16 @@
A test of LocalOutlierFactor on classical anomaly detection datasets.
Note that LocalOutlierFactor is not meant to predict on a test set and its
performance is assessed in an outlier detection context:
1. The model is trained on the whole dataset which is assumed to contain
outliers.
2. The ROC curve is computed on the same dataset using the knowledge of the
labels.
In this context there is no need to shuffle the dataset because the model
is trained and tested on the whole dataset. The randomness of this benchmark
is only caused by the random selection of anomalies in the SA dataset.
"""

from time import time
Expand All @@ -14,31 +24,28 @@
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle as sh

print(__doc__)

np.random.seed(2)
random_state = 2 # to control the random selection of anomalies in SA

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['shuttle']

novelty_detection = True # if False, training set polluted by outliers
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

plt.figure()
for dataset_name in datasets:
# loading and vectorization
print('loading data')
if dataset_name in ['http', 'smtp', 'SA', 'SF']:
dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
percent10=False)
dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
random_state=random_state)
X = dataset.data
y = dataset.target

if dataset_name == 'shuttle':
dataset = fetch_mldata('shuttle')
X = dataset.data
y = dataset.target
X, y = sh(X, y)
# we remove data with label 4
# normal data are then those of class 1
s = (y != 4)
Expand All @@ -47,7 +54,7 @@
y = (y != 1).astype(int)

if dataset_name == 'forestcover':
dataset = fetch_covtype(shuffle=True)
dataset = fetch_covtype()
X = dataset.data
y = dataset.target
# normal data are those with attribute 2
Expand All @@ -61,54 +68,34 @@

if dataset_name == 'SF':
lb = LabelBinarizer()
lb.fit(X[:, 1])
x1 = lb.transform(X[:, 1])
x1 = lb.fit_transform(X[:, 1].astype(str))
X = np.c_[X[:, :1], x1, X[:, 2:]]
y = (y != 'normal.').astype(int)
y = (y != b'normal.').astype(int)

if dataset_name == 'SA':
lb = LabelBinarizer()
lb.fit(X[:, 1])
x1 = lb.transform(X[:, 1])
lb.fit(X[:, 2])
x2 = lb.transform(X[:, 2])
lb.fit(X[:, 3])
x3 = lb.transform(X[:, 3])
x1 = lb.fit_transform(X[:, 1].astype(str))
x2 = lb.fit_transform(X[:, 2].astype(str))
x3 = lb.fit_transform(X[:, 3].astype(str))
X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
y = (y != 'normal.').astype(int)
y = (y != b'normal.').astype(int)

if dataset_name == 'http' or dataset_name == 'smtp':
y = (y != 'normal.').astype(int)

n_samples, n_features = np.shape(X)
n_samples_train = n_samples // 2
n_samples_test = n_samples - n_samples_train
y = (y != b'normal.').astype(int)

X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

if novelty_detection:
X_train = X_train[y_train == 0]
y_train = y_train[y_train == 0]

print('LocalOutlierFactor processing...')
model = LocalOutlierFactor(n_neighbors=20)
tstart = time()
model.fit(X_train)
model.fit(X)
fit_time = time() - tstart
tstart = time()

scoring = -model.decision_function(X_test) # the lower, the more normal
predict_time = time() - tstart
fpr, tpr, thresholds = roc_curve(y_test, scoring)
scoring = -model.negative_outlier_factor_ # the lower, the more normal
fpr, tpr, thresholds = roc_curve(y, scoring)
AUC = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1,
label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
predict_time)))
label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
% (dataset_name, AUC, fit_time)))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
Expand Down

0 comments on commit 7f19dbe

Please sign in to comment.