<a href="https://colab.research.google.com/github/Nekokan1500/Machine-Learning/blob/main/Unsupervised_Learning/Example_Anomaly_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import precision_score, recall_score

In [2]:
# Generate sample data: a synthetic two-feature, two-class set in which the
# first class is weighted at 0.98, leaving the second class as the rare one.
# Label noise is disabled (flip_y=0) and the seed is fixed for repeatability.
x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=[0.98],
    class_sep=0.5,
    scale=1.0,
    shuffle=True,
    flip_y=0,
    random_state=0,
)

In [3]:
# Small one-dimensional example containing a single large spike (500).
hourly_traffic = [120, 123, 124, 119, 196, 121, 118, 117, 500, 132]
# Flag every entry that lies above the 95th percentile of the series itself.
pd.Series(hourly_traffic).pipe(lambda counts: counts > counts.quantile(0.95))

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
9    False
dtype: bool

In [10]:
class PercentileDetection:
  """Flag values of a 1-D sequence that exceed a fitted percentile threshold.

  Parameters
  ----------
  percentile : float, default 0.9
    Quantile in [0, 1] passed to ``pandas.Series.quantile`` when fitting.

  Attributes
  ----------
  threshold : float
    Value of the requested quantile in the data seen by ``fit``. Only
    available after ``fit`` (or ``fit_predict``) has been called.
  """

  def __init__(self, percentile=0.9):
    self.percentile = percentile

  def fit(self, x, y=None):
    """Learn the threshold from ``x`` and return the fitted detector.

    ``y`` is accepted for API compatibility and ignored. Returning ``self``
    allows chaining such as ``detector.fit(train).predict(test)``.
    """
    self.threshold = pd.Series(x).quantile(self.percentile)
    return self

  def predict(self, x, y=None):
    """Return a boolean array that is True where ``x`` exceeds the threshold.

    The comparison is strict (``>``), so values equal to the threshold are
    not flagged. ``y`` is ignored.
    """
    return (pd.Series(x) > self.threshold).values

  def fit_predict(self, x, y=None):
    """Fit on ``x`` and return the outlier flags for the same ``x``."""
    return self.fit(x).predict(x)

In [11]:
# Run the detector at the 95th percentile and show the readings next to their
# flags, rendering every flagged row in bold.
outlierd = PercentileDetection(percentile=0.95)
pd.DataFrame(
    {
        'hourly_traffic': hourly_traffic,
        'is_outlier': outlierd.fit_predict(hourly_traffic),
    }
).style.apply(
    # One CSS declaration per cell of the row, chosen by the row's flag.
    lambda row: [
        'font-weight: bold' if row['is_outlier'] else 'font-weight: normal'
    ] * len(row),
    axis=1,
)

Unnamed: 0,hourly_traffic,is_outlier
0,120,False
1,123,False
2,124,False
3,119,False
4,196,False
5,121,False
6,118,False
7,117,False
8,500,True
9,132,False


In [13]:
# Using percentiles for multi-dimensional data
class PercentileDetection:
  """Flag samples in which any feature exceeds its own percentile threshold.

  Parameters
  ----------
  percentile : float, default 0.9
    Quantile in [0, 1] computed independently for every feature column.

  Attributes
  ----------
  thresholds : list of float
    One quantile value per feature column of the data seen by ``fit``. Only
    available after ``fit`` (or ``fit_predict``) has been called.
  """

  def __init__(self, percentile=0.9):
    self.percentile = percentile

  @staticmethod
  def _as_2d(x):
    """Convert ``x`` to an ndarray, treating 1-D input as a single column."""
    arr = np.asarray(x)
    return arr.reshape(-1, 1) if arr.ndim == 1 else arr

  def fit(self, x, y=None):
    """Compute one threshold per feature column and return the detector.

    ``x`` may be any array-like (ndarray, list, list of lists, DataFrame);
    a 1-D sequence is handled as one feature. ``y`` is ignored.
    """
    x = self._as_2d(x)
    self.thresholds = [pd.Series(x[:, i]).quantile(self.percentile)
                       for i in range(x.shape[1])]
    return self

  def predict(self, x, y=None):
    """Return a boolean array with one entry per sample (row) of ``x``.

    A sample is flagged when at least one of its features is strictly
    greater than that feature's fitted threshold. ``y`` is ignored.

    Raises
    ------
    ValueError
      If ``x`` does not have the same number of features as the fitted data.
    """
    x = self._as_2d(x)
    if x.shape[1] != len(self.thresholds):
      # Without this check a single fitted threshold would silently
      # broadcast across every column of a wider input.
      raise ValueError(
          'x has {} feature(s) but the detector was fitted on {}'.format(
              x.shape[1], len(self.thresholds)))
    return (x > np.asarray(self.thresholds)).any(axis=1)

  def fit_predict(self, x, y=None):
    """Fit on ``x`` and return the outlier flags for the same ``x``."""
    return self.fit(x).predict(x)

In [18]:
# Fit the detector at the 98th percentile on the synthetic data and use its
# flags as predictions for class 1.
outlierd = PercentileDetection(percentile=0.98)
y_pred = outlierd.fit_predict(x)

# Score the predictions against the true labels, precision first, then recall.
print('Precision: {:.02%}, Recall: {:.02%} [Percentile Detection]'.format(
    *(metric(y, y_pred, pos_label=1)
      for metric in (precision_score, recall_score))
))

Precision: 4.00%, Recall: 5.00% [Percentile Detection]
