# Outlier analysis and anomaly detection

In [1]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from scipy import stats
# from sklearn.metrics import roc_auc_score
# from sklearn.utils import column_or_1d
# from sklearn.utils import check_consistent_length

from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN
from pyod.utils.utility import precision_n_scores
from pyod.utils.data import generate_data, get_outliers_inliers

import warnings
warnings.filterwarnings('ignore')

## 1 skin
### 1.1 数据集分析
在数据集的**benchmarks**目录下包含1500个**skin_benchmark**文件，每个benchmark文件中包含6000条记录，选取**R\G\B**三个属性以及**ground.truth**进行离群点分析与异常检测。

In [26]:
# Collect every benchmark CSV under the skin benchmarks directory and preview
# the first one.
path = "./skin/benchmarks/"
# os.listdir returns entries in arbitrary order, so sort to keep files[0] (and
# any index-based selection from `files`) stable across runs and machines.
# Only .csv entries are kept so stray items such as a .ipynb_checkpoints
# directory are never handed to pd.read_csv.
files = sorted(
    os.path.join(path, filename)
    for filename in os.listdir(path)
    if filename.lower().endswith(".csv")
)
train_rate = 0.8  # not referenced in this cell
num_files = len(files)
print("Number of benchmark files: " , num_files)
if not files:
    # Fail with an explicit message instead of an IndexError on files[0].
    raise FileNotFoundError("No benchmark .csv files found in " + path)
print("File 0: ", files[0])
df0 = pd.read_csv(files[0])
print("DataFrame shape:", df0.shape)
# Trailing expression so the notebook renders the first rows.
df0.head()

Number of benchmark files:  1500
File 0:  ./skin/benchmarks/skin_benchmark_0001.csv
DataFrame shape: (6000, 9)


Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,R,G,B
0,skin_point_117027,skin,binary,2,0.020405,nominal,-1.864336,-2.110524,-1.587289
1,skin_point_016927,skin,binary,1,0.00025,anomaly,0.384456,0.925785,1.747782
2,skin_point_240995,skin,binary,2,0.001285,nominal,-0.980882,-1.09286,-1.394351
3,skin_point_018861,skin,binary,1,0.000739,anomaly,-1.125447,-0.54232,0.383435
4,skin_point_087883,skin,binary,2,0.001053,nominal,-0.964819,-1.076177,-1.38057


提取特征：

In [15]:
# Pick one benchmark file and assemble its R, G, B columns into a feature
# matrix X with one column per colour channel.
# A locally seeded generator replaces the unseeded np.random.randint call so
# the same index is chosen after every kernel restart. Which file that index
# maps to still depends on the order of `files`.
file_index = np.random.RandomState(0).randint(0, num_files)
df = pd.read_csv(files[file_index])
# Each channel is reshaped to a single-column 2-D array so the three can be
# joined side by side along axis 1.
X1 = df['R'].values.reshape(-1, 1)
X2 = df['G'].values.reshape(-1, 1)
X3 = df['B'].values.reshape(-1, 1)
X = np.concatenate((X1, X2, X3), axis=1)

### 1.2 CBLOF
CBLOF算法基于聚类的局部异常因子（Cluster-Based Local Outlier Factor）计算异常值分数。
CBLOF将数据集和由聚类算法生成的聚类模型作为输入。它使用参数alpha和beta将群集分为小群集和大群集。然后基于该点所属的聚类的大小以及到最近的大聚类的距离来计算异常分数。  
在本次分析中，将异常值比例（contamination）设置为**0.01**。

In [16]:
# Expected share of outliers, handed to the detector as `contamination`
outliers_fraction = 0.01

# Build the CBLOF detector with a fixed random_state, then fit it on X
clf = CBLOF(
    contamination=outliers_fraction,
    check_estimator=False,
    random_state=0,
)
clf.fit(X)

# Negated decision-function scores of the fitted samples
scores_pred = -clf.decision_function(X)

# Predicted labels: entries equal to 1 are tallied as outliers,
# zero entries as inliers
y_pred = clf.predict(X)
n_outliers = np.count_nonzero(y_pred == 1)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)

print('异常值数量:',n_outliers,'正常值数量:',n_inliers)

异常值数量: 60 正常值数量: 5940


### 1.3 HBOS
基于直方图的离群值检测（HBOS）是一种有效的无监督方法。它假设特征独立并通过构建直方图来计算异常程度， 在多变量异常检测中，可以计算每个单个特征的直方图，单独评分并在最后组合。 使用PyOD库时，其代码与CBLOF类似。  
在本次分析中，将异常值比例（contamination）设置为**0.01**。

In [17]:
# Expected share of outliers, handed to the detector as `contamination`
outliers_fraction = 0.01

# Build the histogram-based detector and fit it on the feature matrix X
clf = HBOS(contamination=outliers_fraction)
clf.fit(X)

# Negated decision-function scores of the fitted samples
scores_pred = -clf.decision_function(X)

# Predicted labels: entries equal to 1 are tallied as outliers,
# zero entries as inliers
y_pred = clf.predict(X)
n_outliers = np.count_nonzero(y_pred == 1)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)

print('异常值数量:',n_outliers,'正常值:',n_inliers)

异常值数量: 48 正常值: 5952


### 1.4 KNN
pyod.models.knn.KNN 用于离群点检测：对于每个数据点，它到第k个最近邻的距离可以被视为其异常分数，这也可以看作一种衡量密度的方法。其代码与之前的CBLOF非常相似。

In [18]:
# Expected share of outliers, handed to the detector as `contamination`
outliers_fraction = 0.01

# The unused 100x100 `xx, yy` meshgrid was dropped: nothing in this cell read
# it, and the sibling CBLOF/HBOS cells of this section never built one.

# Build the k-nearest-neighbour detector and fit it on the feature matrix X
clf = KNN(contamination=outliers_fraction)
clf.fit(X)

# Negated decision-function scores of the fitted samples
scores_pred = clf.decision_function(X) * -1

# Predicted labels: entries equal to 1 are tallied as outliers,
# zero entries as inliers
y_pred = clf.predict(X)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)
n_outliers = np.count_nonzero(y_pred == 1)

print('异常值数量: ',n_outliers,'正常值数量: ',n_inliers)

异常值数量:  42 正常值数量:  5958


## 2 abalone
### 2.1 数据集分析
在数据集的**benchmarks**目录下包含1725个**abalone_benchmark**文件，每个benchmark文件中包含1888条记录，选取**V1-V7**七个属性以及**ground.truth**进行离群点分析与异常检测。

In [19]:
# Read one fixed abalone benchmark file, report its dimensions and preview it.
file = './abalone/benchmarks/abalone_benchmark_0010.csv'
df1 = pd.read_csv(filepath_or_buffer=file)
n_rows, n_cols = df1.shape
print("Benchmark:", (n_rows, n_cols))
# Trailing expression so the notebook renders the first rows.
df1.head(n=5)

Benchmark: (1888, 16)


Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,V1,V2,V3,V4,V5,V6,V7,noise..1,noise..2,noise..3
0,abalone_point_1513,abalone,regression,9,0.478184,nominal,1.049253,0.928243,0.489721,1.129222,1.543647,1.125824,0.579509,0.676328,0.424414,-0.579626
1,abalone_point_0781,abalone,regression,13,0.156335,anomaly,0.299834,0.02135,0.609261,0.206485,-0.03995,0.51459,0.439425,0.323648,-1.087076,-0.795781
2,abalone_point_2164,abalone,regression,9,0.67995,nominal,0.216565,0.625945,0.250642,0.139191,0.079439,0.031076,0.367587,-1.087076,-0.079416,0.614324
3,abalone_point_0181,abalone,regression,14,0.190689,anomaly,0.383103,0.726711,0.967881,0.225857,0.178555,0.177043,0.403506,-2.34665,0.02135,-1.060876
4,abalone_point_0457,abalone,regression,15,0.164233,anomaly,0.965985,1.331307,0.848341,0.799891,0.678638,0.715294,1.085964,-0.079416,1.129775,0.469541


进一步提取特征：

In [20]:
# Stack the seven attribute columns V1..V7 of df1 into one feature matrix X,
# one column per attribute, in V1..V7 order.
feature_names = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7']
# Each attribute is reshaped to a single-column 2-D array so the seven can be
# joined side by side along axis 1.
X1, X2, X3, X4, X5, X6, X7 = [
    df1[name].values.reshape(-1, 1) for name in feature_names
]
X = np.concatenate([X1, X2, X3, X4, X5, X6, X7], axis=1)

### 2.2 CBLOF

In [22]:
# Expected share of outliers, handed to the detector as `contamination`
outliers_fraction = 0.01

# The unused 100x100 `xx, yy` meshgrid was dropped: nothing in this cell read
# it, and the CBLOF cell of the skin section never built one.

# Build the CBLOF detector with a fixed random_state
clf = CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=0)

# Fit on the feature matrix X
clf.fit(X)
# Negated decision-function scores of the fitted samples
scores_pred = clf.decision_function(X) * -1

# Predicted labels: entries equal to 1 are tallied as outliers,
# zero entries as inliers
y_pred = clf.predict(X)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)
n_outliers = np.count_nonzero(y_pred == 1)

print('异常值数量:',n_outliers,'正常值数量:',n_inliers)

异常值数量: 19 正常值数量: 1869


### 2.3 HBOS

In [23]:
# Expected share of outliers, handed to the detector as `contamination`
outliers_fraction = 0.01

# Build the HBOS detector and fit it on the feature matrix X
clf = HBOS(contamination=outliers_fraction)
clf.fit(X)

# Negated decision-function scores of the fitted samples
scores_pred = -clf.decision_function(X)

# Predicted labels: entries equal to 1 are tallied as outliers,
# zero entries as inliers
y_pred = clf.predict(X)
n_outliers = np.count_nonzero(y_pred == 1)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)

print('异常值数量:',n_outliers,'正常值:',n_inliers)

异常值数量: 19 正常值: 1869


### 2.4 KNN

In [24]:
# Expected share of outliers, handed to the detector as `contamination`
outliers_fraction = 0.01

# The unused 100x100 `xx, yy` meshgrid was dropped: nothing in this cell read
# it, and the HBOS cell of this section never built one.

# Build the k-nearest-neighbour detector and fit it on the feature matrix X
clf = KNN(contamination=outliers_fraction)
clf.fit(X)

# Negated decision-function scores of the fitted samples
scores_pred = clf.decision_function(X) * -1

# Predicted labels: entries equal to 1 are tallied as outliers,
# zero entries as inliers
y_pred = clf.predict(X)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)
n_outliers = np.count_nonzero(y_pred == 1)

print('异常值数量: ',n_outliers,'正常值数量: ',n_inliers)

异常值数量:  17 正常值数量:  1871


Reference: https://www.analyticsvidhya.com/blog/2019/02/outlier-detection-python-pyod/