[MRG+1] Fix LOF and Isolation benchmarks (scikit-learn#9798)

NelleV · Oct 25, 2017 · 7f19dbe · 7f19dbe
1 parent f84581b
commit 7f19dbe
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 56 deletions.
diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================
 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the dataset might lead to a test set containing no outliers. In this
+case, a warning is raised when computing the ROC curve.
 """
 
 from time import time
@@ -12,7 +23,7 @@
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils import shuffle as sh
 
 print(__doc__)
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))
 
 
-np.random.seed(1)
+random_state = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))
 
 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False
 
-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
 # Loop over all datasets for fitting and scoring the estimator:
 for dat in datasets:
@@ -47,15 +57,16 @@ def print_outlier_ratio(y):
     print('====== %s ======' % dat)
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
 
     if dat == 'shuttle':
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=random_state)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
         print('----- ')
 
     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -79,17 +90,17 @@ def print_outlier_ratio(y):
     print('--- Vectorizing data...')
 
     if dat == 'SF':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
 
     if dat == 'SA':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
-        x2 = lb.fit_transform(X[:, 2])
-        x3 = lb.fit_transform(X[:, 3])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]
 
     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart

diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
@@ -5,6 +5,16 @@
 
 A test of LocalOutlierFactor on classical anomaly detection datasets.
 
+Note that LocalOutlierFactor is not meant to predict on a test set and its
+performance is assessed in an outlier detection context:
+1. The model is trained on the whole dataset which is assumed to contain
+outliers.
+2. The ROC curve is computed on the same dataset using the knowledge of the
+labels.
+In this context, there is no need to shuffle the dataset because the model
+is trained and tested on the whole dataset. The randomness of this benchmark
+is caused only by the random selection of anomalies in the SA dataset.
+
 """
 
 from time import time
@@ -14,31 +24,28 @@
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
 from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils import shuffle as sh
 
 print(__doc__)
 
-np.random.seed(2)
+random_state = 2  # to control the random selection of anomalies in SA
 
 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['shuttle']
-
-novelty_detection = True  # if False, training set polluted by outliers
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
 
+plt.figure()
 for dataset_name in datasets:
     # loading and vectorization
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
-        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
-                                 percent10=False)
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
+                                 random_state=random_state)
         X = dataset.data
         y = dataset.target
 
     if dataset_name == 'shuttle':
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -47,7 +54,7 @@
         y = (y != 1).astype(int)
 
     if dataset_name == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype()
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -61,54 +68,34 @@
 
     if dataset_name == 'SF':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     if dataset_name == 'SA':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
 
     if dataset_name == 'http' or dataset_name == 'smtp':
-        y = (y != 'normal.').astype(int)
-
-    n_samples, n_features = np.shape(X)
-    n_samples_train = n_samples // 2
-    n_samples_test = n_samples - n_samples_train
+        y = (y != b'normal.').astype(int)
 
     X = X.astype(float)
-    X_train = X[:n_samples_train, :]
-    X_test = X[n_samples_train:, :]
-    y_train = y[:n_samples_train]
-    y_test = y[n_samples_train:]
-
-    if novelty_detection:
-        X_train = X_train[y_train == 0]
-        y_train = y_train[y_train == 0]
 
     print('LocalOutlierFactor processing...')
     model = LocalOutlierFactor(n_neighbors=20)
     tstart = time()
-    model.fit(X_train)
+    model.fit(X)
     fit_time = time() - tstart
-    tstart = time()
-
-    scoring = -model.decision_function(X_test)  # the lower, the more normal
-    predict_time = time() - tstart
-    fpr, tpr, thresholds = roc_curve(y_test, scoring)
+    scoring = -model.negative_outlier_factor_  # the lower, the more normal
+    fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
-             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
-                    'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
-                                            predict_time)))
+             label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
+                    % (dataset_name, AUC, fit_time)))
 
 plt.xlim([-0.05, 1.05])
 plt.ylim([-0.05, 1.05])