In [1]:
# Optional one-time environment setup, intentionally left commented out:
# installs the third-party `sranodec` package into the current environment.
# NOTE(review): presumably required by the 'SR' (Spectral Residual) detector
# configured further down — confirm against main_outlier_detection.
# !pip install sranodec

In [2]:
import pandas as pd
import numpy as np
import main_outlier_detection as mod
import sss

In [3]:
# Case 1. Isolation Forest, KNN imputation, 95 percentile
# The dict is assembled in three steps: top-level choices, detector settings, imputer settings.
config1 = {
    'algorithm': 'IF',    # algorithm used for outlier detection; choose one of {'SR', 'LOF', 'MoG', 'KDE', 'IF'}
    'imputation': 'KNN',  # method used to impute/replace the outliers; choose one of {'KNN', 'Stats'}
}

# Detector (Isolation Forest) settings.
config1['alg_parameter'] = {
    'percentile': 95,            # outlier threshold applied at prediction time, int or float
    'IF_estimators': 100,        # number of models in the ensemble, int (default: 100; set to suit the data size)
    'IF_max_samples': 'auto',    # number of samples drawn for each model (sampling applied), int or float (default: 'auto')
    'IF_contamination': 'auto',  # share of outliers in the training data, 'auto' or float (default: 'auto'; a float must be > 0 and <= 0.5)
    'IF_max_features': 1.0,      # number of features drawn for each model (sampling applied), int or float (default: 1.0)
    'IF_bootstrap': False,       # whether bootstrap sampling is applied, bool (default: False)
}

# Imputer (KNN) settings.
config1['imp_parameter'] = {
    'KNN_missing_values': np.nan,   # marker identifying the entries to replace, int, float, str, np.nan or None (default: np.nan)
    'KNN_neighbors': 5,             # number of neighbours consulted for the replacement, int (default: 5)
    'KNN_weights': 'uniform',       # weighting given to neighbours during prediction, {'uniform', 'distance'} or callable (default: 'uniform')
    'KNN_metric': 'nan_euclidean',  # distance metric used to define neighbours, {'nan_euclidean'} or callable (default: 'nan_euclidean')
}

# Case 2. Kernel Density Estimation, KNN imputation, 95 percentile
# The dict is assembled in three steps: top-level choices, detector settings, imputer settings.
config2 = {
    'algorithm': 'KDE',   # algorithm used for outlier detection; choose one of {'SR', 'LOF', 'MoG', 'KDE', 'IF'}
    'imputation': 'KNN',  # method used to impute/replace the outliers; choose one of {'KNN', 'Stats'}
}

# Detector (Kernel Density Estimation) settings.
config2['alg_parameter'] = {
    'percentile': 95,            # outlier threshold applied at prediction time, int or float
    'KDE_bandwidth': 0.2,        # kernel bandwidth, float (default: 1.0)
    'KDE_algorithm': 'auto',     # tree algorithm to use; one of {'kd_tree', 'ball_tree', 'auto'} (default: 'auto')
    'KDE_kernel': 'gaussian',    # kernel type; one of {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'} (default: 'gaussian')
    'KDE_metric': 'euclidean',   # distance metric to use, str (default: 'euclidean')
    'KDE_breadth_first': True,   # traversal priority, bool; True: breadth-first, False: depth-first
    'KDE_leaf_size': 40,         # leaf size of the tree algorithm, int (default: 40)
}

# Imputer (KNN) settings.
config2['imp_parameter'] = {
    'KNN_missing_values': np.nan,   # marker identifying the entries to replace, int, float, str, np.nan or None (default: np.nan)
    'KNN_neighbors': 5,             # number of neighbours consulted for the replacement, int (default: 5)
    'KNN_weights': 'uniform',       # weighting given to neighbours during prediction, {'uniform', 'distance'} or callable (default: 'uniform')
    'KNN_metric': 'nan_euclidean',  # distance metric used to define neighbours, {'nan_euclidean'} or callable (default: 'nan_euclidean')
}

# Case 3. Local Outlier Factor, Statistics imputation, 95 percentile
# The dict is assembled in three steps: top-level choices, detector settings, imputer settings.
config3 = {
    'algorithm': 'LOF',     # algorithm used for outlier detection; choose one of {'SR', 'LOF', 'MoG', 'KDE', 'IF'}
    'imputation': 'Stats',  # method used to impute/replace the outliers; choose one of {'KNN', 'Stats'}
}

# Detector (Local Outlier Factor) settings.
config3['alg_parameter'] = {
    'percentile': 95,           # outlier threshold applied at prediction time, int or float
    'LOF_neighbors': 5,         # number of nearest neighbours, int (default: 20)
    'LOF_algorithm': 'auto',    # algorithm used to find the nearest neighbours; one of {'auto', 'ball_tree', 'kd_tree', 'brute'} (default: 'auto')
    'LOF_leaf_size': 30,        # leaf size of the tree algorithm, int (default: 30)
    'LOF_metric': 'minkowski',  # distance metric used to define neighbours, str or callable (default: 'minkowski')
}

# Imputer (summary statistic) settings.
config3['imp_parameter'] = {
    'Stats_missing_values': np.nan,  # marker identifying the entries to replace, int, float, str, np.nan or None (default: np.nan)
    'Stats_strategy': 'mean',        # statistic used to compute the replacement; one of {'mean', 'median', 'most_frequent', 'constant'} (default: 'mean')
}

# Case 4. Mixture of Gaussian, Statistics imputation, 90 percentile
# The dict is assembled in three steps: top-level choices, detector settings, imputer settings.
config4 = {
    'algorithm': 'MoG',     # algorithm used for outlier detection; choose one of {'SR', 'LOF', 'MoG', 'KDE', 'IF'}
    'imputation': 'Stats',  # method used to impute/replace the outliers; choose one of {'KNN', 'Stats'}
}

# Detector (Mixture of Gaussians) settings.
config4['alg_parameter'] = {
    'percentile': 90,          # outlier threshold applied at prediction time, int or float
    'MoG_components': 2,       # number of components in the mixture, int (default: 1)
    'MoG_covariance': 'full',  # one of {'full', 'tied', 'diag', 'spherical'} (default: 'full')
    'MoG_max_iter': 100,       # number of EM iterations, int (default: 100)
}

# Imputer (summary statistic) settings.
config4['imp_parameter'] = {
    'Stats_missing_values': np.nan,  # marker identifying the entries to replace, int, float, str, np.nan or None (default: np.nan)
    'Stats_strategy': 'mean',        # statistic used to compute the replacement; one of {'mean', 'median', 'most_frequent', 'constant'} (default: 'mean')
}

# Case 5. Spectral Residual, KNN imputation, 90 percentile
# The dict is assembled in three steps: top-level choices, detector settings, imputer settings.
config5 = {
    'algorithm': 'SR',    # algorithm used for outlier detection; choose one of {'SR', 'LOF', 'MoG', 'KDE', 'IF'}
    'imputation': 'KNN',  # method used to impute/replace the outliers; choose one of {'KNN', 'Stats'}
}

# Detector (Spectral Residual) settings.
config5['alg_parameter'] = {
    'percentile': 90,               # outlier threshold applied at prediction time, int or float
    'SR_series_window_size': 24,    # series window size, int; set to suit the data size
    'SR_spectral_window_size': 24,  # spectral window size, int; set to suit the data size
    'SR_score_window_size': 100,    # score window size, int; set sufficiently larger than the period
}

# Imputer (KNN) settings.
config5['imp_parameter'] = {
    'KNN_missing_values': np.nan,   # marker identifying the entries to replace, int, float, str, np.nan or None (default: np.nan)
    'KNN_neighbors': 5,             # number of neighbours consulted for the replacement, int (default: 5)
    'KNN_weights': 'uniform',       # weighting given to neighbours during prediction, {'uniform', 'distance'} or callable (default: 'uniform')
    'KNN_metric': 'nan_euclidean',  # distance metric used to define neighbours, {'nan_euclidean'} or callable (default: 'nan_euclidean')
}

In [4]:
# Read the demo CSV into a DataFrame and preview its first five rows.
# Despite its name, `data_dir` holds the path of the CSV file itself.
data_dir = './data/uci_har_outlier_data.csv'
raw_data = pd.read_csv(filepath_or_buffer=data_dir)  # shape noted by the author: [10299, 40]
raw_data.head(n=5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"tBodyAcc-arCoeff()-Y,2","tBodyAcc-arCoeff()-Y,3","tBodyAcc-arCoeff()-Y,4","tBodyAcc-arCoeff()-Z,1","tBodyAcc-arCoeff()-Z,2","tBodyAcc-arCoeff()-Z,3","tBodyAcc-arCoeff()-Z,4","tBodyAcc-correlation()-X,Y","tBodyAcc-correlation()-X,Z","tBodyAcc-correlation()-Y,Z"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.224848,0.264106,-0.095246,0.278851,-0.465085,0.491936,-0.190884,0.376314,0.435129,0.66079
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.090963,0.29431,-0.281211,0.085988,-0.022153,-0.016657,-0.220643,-0.013429,-0.072692,0.579382
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.074507,0.342256,-0.332564,0.239281,-0.136204,0.173863,-0.299493,-0.124698,-0.181105,0.6089
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.15532,0.323154,-0.170813,0.294938,-0.306081,0.482148,-0.470129,-0.305693,-0.362654,0.507459
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.272505,0.434728,-0.315375,0.439744,-0.269069,0.179414,-0.088952,-0.155804,-0.189763,0.599213


In [5]:
# Case 1. Isolation Forest, KNN imputation, 95 percentile
# Isolation Forest is a randomised ensemble, so NumPy's global seed is fixed
# right before the run; otherwise a fresh "Restart & Run All" may flag a
# different set of cells than the outputs recorded in this notebook.
# NOTE(review): this assumes main_outlier_detection leaves the estimator's
# random state unset, so it falls back to NumPy's global RNG — confirm in the
# module (if it pins its own seed, this line is simply a no-op for the result).
np.random.seed(42)  # arbitrary fixed value; any constant gives repeatable runs
config = config1
data_outlier = mod.DataOutlier(config, raw_data)
# getResult() returns a pair: the data with outliers replaced, and the list of flagged positions.
replaced_data, index_list = data_outlier.getResult()

100%|██████████| 40/40 [00:15<00:00,  2.56it/s]
100%|██████████| 40/40 [00:06<00:00,  5.79it/s]


In [6]:
# Preview the first five rows after outlier replacement (compare with raw_data.head()).
replaced_data.head(n=5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"tBodyAcc-arCoeff()-Y,2","tBodyAcc-arCoeff()-Y,3","tBodyAcc-arCoeff()-Y,4","tBodyAcc-arCoeff()-Z,1","tBodyAcc-arCoeff()-Z,2","tBodyAcc-arCoeff()-Z,3","tBodyAcc-arCoeff()-Z,4","tBodyAcc-correlation()-X,Y","tBodyAcc-correlation()-X,Z","tBodyAcc-correlation()-Y,Z"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.224848,0.264106,-0.095246,0.278851,0.032428,0.037063,-0.190884,0.376314,0.435129,0.66079
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.090963,0.29431,-0.281211,0.085988,-0.022153,-0.016657,-0.220643,-0.013429,-0.072692,0.579382
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.074507,0.342256,-0.332564,0.239281,-0.136204,0.173863,-0.299493,-0.124698,-0.181105,0.6089
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.15532,0.323154,-0.170813,0.294938,-0.306081,0.037063,-0.470129,-0.305693,-0.362654,0.507459
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.272505,0.434728,-0.315375,0.439744,-0.269069,0.179414,-0.088952,-0.155804,-0.189763,0.599213


In [7]:
# First ten flagged positions.
# NOTE(review): each entry looks like a [row, column] pair — confirm against mod.DataOutlier.
index_list[0:10]

[[31, 0],
 [32, 0],
 [64, 0],
 [66, 0],
 [67, 0],
 [70, 0],
 [71, 0],
 [135, 0],
 [138, 0],
 [142, 0]]

In [8]:
# Case 2: Kernel Density Estimation detector with KNN imputation, 95th-percentile threshold.
# The recorded run below shows this as the slowest case (first progress bar ~3.5 minutes).
config = config2  # select the active scenario
data_outlier = mod.DataOutlier(config, raw_data)
# getResult() returns a pair: the data with outliers replaced, and the list of flagged positions.
(replaced_data, index_list) = data_outlier.getResult()

100%|██████████| 40/40 [03:32<00:00,  5.31s/it]
100%|██████████| 40/40 [00:06<00:00,  6.21it/s]


In [9]:
# Preview the first five rows after outlier replacement (compare with raw_data.head()).
replaced_data.head(n=5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"tBodyAcc-arCoeff()-Y,2","tBodyAcc-arCoeff()-Y,3","tBodyAcc-arCoeff()-Y,4","tBodyAcc-arCoeff()-Z,1","tBodyAcc-arCoeff()-Z,2","tBodyAcc-arCoeff()-Z,3","tBodyAcc-arCoeff()-Z,4","tBodyAcc-correlation()-X,Y","tBodyAcc-correlation()-X,Z","tBodyAcc-correlation()-Y,Z"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.224848,0.264106,-0.095246,0.278851,0.021538,0.491936,-0.190884,0.376314,0.435129,0.66079
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.090963,0.29431,-0.281211,0.085988,-0.022153,-0.016657,-0.220643,-0.013429,-0.072692,0.579382
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.074507,0.342256,-0.332564,0.239281,-0.136204,0.173863,-0.299493,-0.124698,-0.181105,0.6089
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.15532,0.323154,-0.170813,0.294938,-0.306081,0.482148,-0.470129,-0.305693,-0.362654,0.507459
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.272505,0.434728,-0.315375,0.439744,-0.269069,0.179414,-0.088952,-0.155804,-0.189763,0.599213


In [10]:
# First ten flagged positions.
# NOTE(review): each entry looks like a [row, column] pair — confirm against mod.DataOutlier.
index_list[0:10]

[[27, 0],
 [31, 0],
 [32, 0],
 [64, 0],
 [66, 0],
 [67, 0],
 [70, 0],
 [71, 0],
 [135, 0],
 [138, 0]]

In [11]:
# Case 3: Local Outlier Factor detector with statistics-based imputation, 95th-percentile threshold.
config = config3  # select the active scenario
data_outlier = mod.DataOutlier(config, raw_data)
# getResult() returns a pair: the data with outliers replaced, and the list of flagged positions.
(replaced_data, index_list) = data_outlier.getResult()

100%|██████████| 40/40 [00:01<00:00, 30.10it/s]
100%|██████████| 40/40 [00:00<00:00, 300.60it/s]


In [12]:
# Preview the first five rows after outlier replacement (compare with raw_data.head()).
# NOTE(review): the recorded output for this 'Stats' run still contains blank (NaN)
# cells, e.g. in rows 1-3 — the imputation step may not be filling them; verify
# in main_outlier_detection.
replaced_data.head(n=5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"tBodyAcc-arCoeff()-Y,2","tBodyAcc-arCoeff()-Y,3","tBodyAcc-arCoeff()-Y,4","tBodyAcc-arCoeff()-Z,1","tBodyAcc-arCoeff()-Z,2","tBodyAcc-arCoeff()-Z,3","tBodyAcc-arCoeff()-Z,4","tBodyAcc-correlation()-X,Y","tBodyAcc-correlation()-X,Z","tBodyAcc-correlation()-Y,Z"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.224848,0.264106,-0.095246,0.278851,-0.465085,0.491936,-0.190884,0.376314,0.435129,0.66079
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.090963,0.29431,-0.281211,0.085988,-0.022153,-0.016657,-0.220643,,-0.072692,0.579382
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,,-0.938692,...,-0.074507,0.342256,-0.332564,0.239281,,0.173863,-0.299493,-0.124698,-0.181105,0.6089
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,,-0.997099,-0.98275,,-0.938692,...,-0.15532,0.323154,-0.170813,0.294938,-0.306081,0.482148,-0.470129,-0.305693,-0.362654,0.507459
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.272505,0.434728,-0.315375,0.439744,-0.269069,0.179414,-0.088952,-0.155804,-0.189763,0.599213


In [13]:
# First ten flagged positions.
# NOTE(review): each entry looks like a [row, column] pair — confirm against mod.DataOutlier.
index_list[0:10]

[[37, 0],
 [55, 0],
 [63, 0],
 [93, 0],
 [96, 0],
 [112, 0],
 [120, 0],
 [139, 0],
 [158, 0],
 [172, 0]]

In [14]:
# Case 4. Mixture of Gaussian, Statistics imputation, 90 percentile
# Fitting a Gaussian mixture starts from a random initialisation, so NumPy's
# global seed is fixed right before the run; otherwise a fresh
# "Restart & Run All" may flag a different set of cells than recorded here.
# NOTE(review): this assumes main_outlier_detection leaves the estimator's
# random state unset, so it falls back to NumPy's global RNG — confirm in the
# module (if it pins its own seed, this line is simply a no-op for the result).
np.random.seed(42)  # arbitrary fixed value; any constant gives repeatable runs
config = config4
data_outlier = mod.DataOutlier(config, raw_data)
# getResult() returns a pair: the data with outliers replaced, and the list of flagged positions.
replaced_data, index_list = data_outlier.getResult()

100%|██████████| 40/40 [00:01<00:00, 31.64it/s]
100%|██████████| 40/40 [00:00<00:00, 266.30it/s]


In [15]:
# Preview the first five rows after outlier replacement (compare with raw_data.head()).
# NOTE(review): the recorded output for this 'Stats' run still contains blank (NaN)
# cells in every previewed row — the imputation step may not be filling them;
# verify in main_outlier_detection.
replaced_data.head(n=5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"tBodyAcc-arCoeff()-Y,2","tBodyAcc-arCoeff()-Y,3","tBodyAcc-arCoeff()-Y,4","tBodyAcc-arCoeff()-Z,1","tBodyAcc-arCoeff()-Z,2","tBodyAcc-arCoeff()-Z,3","tBodyAcc-arCoeff()-Z,4","tBodyAcc-correlation()-X,Y","tBodyAcc-correlation()-X,Z","tBodyAcc-correlation()-Y,Z"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,,...,-0.224848,0.264106,-0.095246,0.278851,,,-0.190884,0.376314,0.435129,0.66079
1,0.278419,-0.016411,-0.12352,-0.998245,,-0.960322,-0.998807,,-0.957686,-0.943068,...,-0.090963,0.29431,-0.281211,0.085988,-0.022153,-0.016657,-0.220643,-0.013429,-0.072692,0.579382
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,,-0.99652,-0.963668,,-0.938692,...,-0.074507,0.342256,-0.332564,0.239281,-0.136204,0.173863,-0.299493,-0.124698,-0.181105,0.6089
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.15532,0.323154,-0.170813,0.294938,,,,-0.305693,-0.362654,0.507459
4,0.276629,-0.01657,-0.115362,-0.998139,,-0.990482,-0.998321,,-0.990441,-0.942469,...,,0.434728,-0.315375,,,0.179414,-0.088952,-0.155804,-0.189763,0.599213


In [16]:
# First ten flagged positions.
# NOTE(review): each entry looks like a [row, column] pair — confirm against mod.DataOutlier.
index_list[0:10]

[[27, 0],
 [31, 0],
 [32, 0],
 [51, 0],
 [64, 0],
 [66, 0],
 [67, 0],
 [70, 0],
 [71, 0],
 [124, 0]]

In [17]:
# Case 5: Spectral Residual detector with KNN imputation, 90th-percentile threshold.
config = config5  # select the active scenario
data_outlier = mod.DataOutlier(config, raw_data)
# getResult() returns a pair: the data with outliers replaced, and the list of flagged positions.
(replaced_data, index_list) = data_outlier.getResult()

100%|██████████| 40/40 [00:00<00:00, 97.42it/s] 
100%|██████████| 40/40 [00:13<00:00,  2.92it/s]


In [18]:
# Preview the first five rows after outlier replacement (compare with raw_data.head()).
replaced_data.head(n=5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"tBodyAcc-arCoeff()-Y,2","tBodyAcc-arCoeff()-Y,3","tBodyAcc-arCoeff()-Y,4","tBodyAcc-arCoeff()-Z,1","tBodyAcc-arCoeff()-Z,2","tBodyAcc-arCoeff()-Z,3","tBodyAcc-arCoeff()-Z,4","tBodyAcc-correlation()-X,Y","tBodyAcc-correlation()-X,Z","tBodyAcc-correlation()-Y,Z"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.224848,0.264106,-0.095246,0.278851,-0.465085,0.491936,-0.190884,0.376314,0.435129,0.66079
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.090963,0.29431,-0.281211,0.085988,-0.022153,-0.016657,-0.220643,-0.013429,-0.072692,0.579382
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.074507,0.342256,-0.332564,0.239281,-0.136204,0.173863,-0.299493,-0.124698,-0.181105,0.6089
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.15532,0.323154,-0.170813,0.294938,-0.306081,0.482148,-0.080967,-0.305693,-0.362654,0.507459
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.021878,0.434728,-0.315375,0.439744,-0.269069,0.179414,-0.088952,-0.155804,-0.189763,0.599213


In [19]:
# First ten flagged positions.
# NOTE(review): each entry looks like a [row, column] pair — confirm against mod.DataOutlier.
index_list[0:10]

[[31, 0],
 [32, 0],
 [64, 0],
 [65, 0],
 [66, 0],
 [67, 0],
 [70, 0],
 [71, 0],
 [135, 0],
 [194, 0]]