<a href="https://colab.research.google.com/github/Nekokan1500/Machine-Learning/blob/main/Unsupervised_Learning/Example_Anomaly_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import precision_score, recall_score
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [2]:
# Build a synthetic, heavily imbalanced two-class dataset with two features.
# Label 1 is the rare class that the detectors below are scored against.
x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=[0.98,],  # share of samples assigned to class 0
    class_sep=0.5,
    scale=1.0,
    shuffle=True,
    flip_y=0,  # no randomly flipped labels
    random_state=0,  # fixed seed so the dataset is reproducible
)

In [3]:
# Toy 1-D example: which hours carry more traffic than the 95th percentile?
hourly_traffic = [120, 123, 124, 119, 196, 121, 118, 117, 500, 132]
traffic = pd.Series(hourly_traffic)
cutoff = traffic.quantile(0.95)
traffic > cutoff

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
9    False
dtype: bool

In [4]:
class PercentileDetection:
  """Flag values of a 1-D sample that lie above a chosen percentile.

  Parameters
  ----------
  percentile : float, default=0.9
      Quantile (between 0 and 1) handed to ``pandas.Series.quantile``.
      Values strictly greater than that quantile are reported as outliers.
  """

  def __init__(self, percentile=0.9):
    self.percentile = percentile

  def fit(self, x, y=None):
    """Learn the threshold from ``x`` and store it in ``self.threshold``.

    ``y`` is accepted for API symmetry and ignored. Returns ``self`` so
    that calls can be chained, e.g. ``detector.fit(x).predict(x)``.
    """
    self.threshold = pd.Series(x).quantile(self.percentile)
    return self

  def predict(self, x, y=None):
    """Return a boolean NumPy array, True where ``x`` exceeds the threshold.

    ``fit`` must have been called first; ``y`` is ignored.
    """
    return (pd.Series(x) > self.threshold).values

  def fit_predict(self, x, y=None):
    """Fit on ``x`` and return the outlier mask for the same ``x``."""
    return self.fit(x).predict(x)

In [5]:
# Fit the detector on the traffic sample and render the flagged rows in bold.
outlierd = PercentileDetection(percentile=0.95)
traffic_flags = pd.DataFrame({
    'hourly_traffic': hourly_traffic,
    'is_outlier': outlierd.fit_predict(hourly_traffic),
})


def _row_font_weight(row):
  """Return one CSS declaration per cell: bold for outlier rows, normal otherwise."""
  if row['is_outlier']:
    return ['font-weight: bold'] * len(row)
  return ['font-weight: normal'] * len(row)


traffic_flags.style.apply(_row_font_weight, axis=1)

Unnamed: 0,hourly_traffic,is_outlier
0,120,False
1,123,False
2,124,False
3,119,False
4,196,False
5,121,False
6,118,False
7,117,False
8,500,True
9,132,False


In [6]:
# Using percentiles for multi-dimensional data
class PercentileDetection:
  """Flag samples whose value in any feature exceeds that feature's percentile.

  Accepts 2-D input of shape (n_samples, n_features) as well as 1-D input
  (a list, Series or array), which is treated as a single feature column.
  This keeps the class usable for the 1-D examples even though it replaces
  the earlier definition of the same name.

  Parameters
  ----------
  percentile : float, default=0.9
      Quantile (between 0 and 1) handed to ``pandas.Series.quantile`` for
      every feature column.
  """

  def __init__(self, percentile=0.9):
    self.percentile = percentile

  @staticmethod
  def _as_2d(x):
    """Coerce ``x`` to an ndarray; a 1-D input becomes one column."""
    arr = np.asarray(x)
    return arr.reshape(-1, 1) if arr.ndim == 1 else arr

  def fit(self, x, y=None):
    """Compute one threshold per feature column and store them in ``self.thresholds``.

    ``y`` is accepted for API symmetry and ignored. Returns ``self`` so
    that calls can be chained.
    """
    data = self._as_2d(x)
    self.thresholds = [pd.Series(data[:, i]).quantile(self.percentile)
                       for i in range(data.shape[1])]
    return self

  def predict(self, x, y=None):
    """Return a boolean array with one entry per sample.

    A sample is True when at least one of its features is strictly greater
    than the corresponding fitted threshold. ``y`` is ignored.
    """
    # The thresholds list broadcasts across rows; any() replaces max() on booleans.
    return (self._as_2d(x) > self.thresholds).any(axis=1)

  def fit_predict(self, x, y=None):
    """Fit on ``x`` and return the outlier mask for the same ``x``."""
    return self.fit(x).predict(x)

In [7]:
# Flag samples with the per-feature 98th-percentile detector, then score the
# flags against the true labels (label 1 is the positive class).
outlierd = PercentileDetection(percentile=0.98)
y_pred = outlierd.fit_predict(x)

precision = precision_score(y, y_pred, pos_label=1)
recall = recall_score(y, y_pred, pos_label=1)
print('Precision: {:.02%}, Recall: {:.02%} [Percentile Detection]'.format(precision, recall))

Precision: 4.00%, Recall: 5.00% [Percentile Detection]


In [23]:
# Elliptic Envelope outlier detection.
# The predicted labels are compared with -1 to obtain a boolean outlier mask.
ee = EllipticEnvelope(random_state=0)
ee_labels = ee.fit_predict(x)
y_pred = ee_labels == -1

precision = precision_score(y, y_pred, pos_label=1)
recall = recall_score(y, y_pred, pos_label=1)
print('Precision: {:.02%}, Recall: {:.02%} [Elliptic Envelope]'.format(precision, recall))

Precision: 9.00%, Recall: 45.00% [Elliptic Envelope]


In [22]:
# Outlier detection with Local Outlier Factor (LOF), using 50 neighbours.
# Samples labelled -1 by fit_predict are treated as predicted outliers.
lof = LocalOutlierFactor(n_neighbors=50)
lof_labels = lof.fit_predict(x)
y_pred = lof_labels == -1

precision = precision_score(y, y_pred, pos_label=1)
recall = recall_score(y, y_pred, pos_label=1)
print('Precision: {:.02%}, Recall: {:.02%} [Local Outlier Factor]'.format(precision, recall))

Precision: 26.00%, Recall: 65.00% [Local Outlier Factor]


In [16]:
# Compare performance between different outlier factor score thresholds.
# Once the LOF algorithm is fitted, it stores its outlier factor scores in
# negative_outlier_factor_. Inliers score close to -1; the lower (more
# negative) the score, the more likely the sample is an outlier. That is why
# the samples falling below a low quantile of the scores are flagged here.
lof = LocalOutlierFactor(n_neighbors=50)
lof.fit(x)
lof_scores = lof.negative_outlier_factor_

for quantile in [0.01, 0.02, 0.1]:
  # Flag the lowest-scoring `quantile` fraction of samples as outliers.
  y_pred = lof_scores < np.quantile(lof_scores, quantile)
  print('LOF: Precision: {:0.02%}, Recall: {:.02%} [Quantile={:.0%}]'.format(
      precision_score(y, y_pred, pos_label=1),
      recall_score(y, y_pred, pos_label=1), quantile
  ))

LOF: Precision: 80.00%, Recall: 40.00% [Quantile=1%]
LOF: Precision: 50.00%, Recall: 50.00% [Quantile=2%]
LOF: Precision: 14.00%, Recall: 70.00% [Quantile=10%]


In [18]:
# Novelty detection using LOF: train on the samples labelled 0 only, then ask
# the fitted model which samples of the full dataset look novel (-1).
# NOTE(review): x still contains the training inliers; scikit-learn advises
# calling predict() with novelty=True on unseen data only - confirm intended.
inlier_mask = y == 0
x_inliers = x[inlier_mask]

lof = LocalOutlierFactor(n_neighbors=50, novelty=True)
lof.fit(x_inliers)
novelty_labels = lof.predict(x)
y_pred = novelty_labels == -1

precision = precision_score(y, y_pred, pos_label=1)
recall = recall_score(y, y_pred, pos_label=1)
print('Precision: {:.02%}, Recall: {:.02%} [Local Outlier Factor]'.format(precision, recall))

Precision: 26.53%, Recall: 65.00% [Local Outlier Factor]


In [21]:
# Isolation Forest outlier detection: 200 trees, all CPU cores, fixed seed.
# Samples labelled -1 by fit_predict are treated as predicted outliers.
iforest = IsolationForest(n_estimators=200, n_jobs=-1, random_state=10)
iforest_labels = iforest.fit_predict(x)
y_pred = iforest_labels == -1

precision = precision_score(y, y_pred, pos_label=1)
recall = recall_score(y, y_pred, pos_label=1)
print('Precision: {:.02%}, Recall: {:.02%} [Isolation Forest]'.format(precision, recall))

Precision: 6.45%, Recall: 60.00% [Isolation Forest]
