In [None]:
!pip install stumpy

Collecting stumpy
  Downloading stumpy-1.10.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 5.2 MB/s 
Collecting scipy>=1.5
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 239 kB/s 
Installing collected packages: scipy, stumpy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.4.1
    Uninstalling scipy-1.4.1:
      Successfully uninstalled scipy-1.4.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed scipy-1.7.3 stumpy-1.10.0


In [None]:
# An unconditional append loop never terminates and grows `a` until the kernel runs out
# of memory, so "Restart & Run All" could never get past this cell.
# NOTE(review): presumably this was the Colab trick of crashing the runtime to be offered
# a high-RAM session — confirm. It is now opt-in: set the flag to True to get the old
# behaviour back.
exhaust_memory = False
a = []
while exhaust_memory:
    a.append('1')

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import pathlib
import tqdm
import stumpy



In [None]:
# Mount Google Drive at /content/drive; the data paths configured below live under it.
# This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root of the mounted Drive, and the folder whose files are loaded as input time series
rootpath = pathlib.Path('/content/drive/MyDrive')
txt_dirpath = rootpath.joinpath('AICSE Project Data')

In [None]:
import os
# Make the Drive root the working directory and list its contents. Relative paths written
# later (e.g. 'result.csv') resolve against the current working directory set here.
os.chdir("/content/drive/My Drive")
!ls

'AICSE Project Data'	 'Getting started.pdf'
'bank statement.pdf'	  kddcup.data_10_percent_corrected
'Colab Notebooks'	  Pickle_Files
'Dublu Bio-data.docx'	  vMalConv
'financial support.pdf'


In [None]:
# Parameter setting
# Smallest and largest subsequence window length, and the multiplicative step between
# consecutive window sizes (together they define the `ws` ladder).
min_window_size = 40
max_window_size = 800
growth_rate = 1.1
# Fraction of a score's mean used as the floor of the denominator of the *_inv scores;
# its reciprocal is also the upper cap applied to the normalized-profile ratios.
denom_threshold = 0.1
# Fractions of the mean 'orig_p2p': 'coef' ramps linearly from 0 at the lower level to 1
# at the upper level, and is forced to 0 around windows at or below the const level.
upper_threshold = 0.75
lower_threshold = 0.25
const_threshold = 0.05
# A window size is skipped when the mean of 'coef' over the series is below this value.
min_coef = 0.5
# Quantile of |diff|; steps at or below that quantile count as "small" for 'diff_small'.
small_quantile = 0.1
# Length of the score mask's padding, in multiples of the window size.
padding_length = 3
# A window size is skipped when w * train_length exceeds the training split length.
train_length = 10
# Choose stumpy.gpu_stump (True) or stumpy.stump (False) for the matrix profile.
use_gpu = False

In [None]:
# Determine window sizes: a geometric ladder that starts at min_window_size and is
# multiplied by growth_rate at every step, truncated to integers. `size` is the number
# of steps whose (untruncated) value stays within max_window_size.
size = 1 + int(np.log(max_window_size / min_window_size) / np.log(growth_rate))
rates = np.power(growth_rate, np.arange(size))
ws = (rates * min_window_size).astype(int)

In [None]:
# Select stump function: the GPU implementation when use_gpu is set, the CPU one otherwise
stump = stumpy.gpu_stump if use_gpu else stumpy.stump

In [None]:
# Anomaly score names (each must be a column produced by compute_score), grouped by family
names = (
    # peak-to-peak range of the series, its first difference and its second difference
    ['orig_p2p', 'diff_p2p', 'acc_p2p']
    # inverse peak-to-peak range, and share of small steps inside the window
    + ['orig_p2p_inv', 'diff_small']
    # rolling standard deviation of the second difference, and its inverse
    + ['acc_std', 'acc_std_inv']
    # matrix profile (mp) / normalized profile (np) from the AB-join against the train part
    + ['orig_mp_novelty', 'orig_np_novelty']
    # matrix profile (mp) / normalized profile (np) from the self-join of the whole series
    + ['orig_mp_outlier', 'orig_np_outlier']
)


In [None]:
def compute_score(X, number, split, w):
    """Build the anomaly-score table for one time series at a single window size.

    Besides its arguments, the function reads the module-level settings
    ``small_quantile``, ``denom_threshold``, ``upper_threshold``,
    ``lower_threshold``, ``const_threshold`` and ``padding_length``, the list
    ``names`` and the selected matrix-profile function ``stump``.

    Parameters
    ----------
    X : array-like
        Values of the series. It becomes the single column 'orig' and is passed
        to ``stump`` (presumably a 1-D float array -- TODO confirm).
    number
        Identifier of the series. Not used inside this function.
    split : int
        Position where the training prefix ``X[:split]`` ends.
    w : int
        Window (subsequence) length.

    Returns
    -------
    pandas.DataFrame
        One row per sample of ``X`` holding the raw statistics, the 'coef' and
        'mask' columns, and a '<name>_score' column for every entry of ``names``.
    """

    # original time series (orig)
    seq = pd.DataFrame(X, columns=['orig'])

    # velocity (diff) and acceleration (acc)
    seq['diff'] = seq['orig'].diff(1)
    seq['acc'] = seq['diff'].diff(1)

    # standard deviation (std)
    # rolling(w) looks backwards; shift(-w) re-anchors it so that row i describes
    # the w rows that follow it (i+1 .. i+w).
    for name in ['orig', 'acc']:
        seq[f'{name}_std'] = seq[name].rolling(w).std().shift(-w)

    # peak-to-peak (p2p)
    for name in ['orig', 'diff', 'acc']:
        rolling_max = seq[name].rolling(w).max()
        rolling_min = seq[name].rolling(w).min()
        seq[f'{name}_p2p'] = (rolling_max - rolling_min).shift(-w)

    # diff small: share of steps in the window whose |diff| is within the small quantile
    diff_abs = seq['diff'].abs()
    cond = diff_abs <= diff_abs.quantile(small_quantile)
    seq['diff_small'] = cond.rolling(w).mean().shift(-w)

    # inverse (inv): mean / value, with the value floored so the ratio stays bounded
    for name in ['orig_p2p', 'acc_std']:
        numer = seq[name].mean()
        denom = seq[name].clip(lower=numer * denom_threshold)
        seq[f'{name}_inv'] = numer / denom

    # coef for penalizing subsequences with little change
    name = 'orig_p2p'
    mean = seq[name].mean()
    upper = mean * upper_threshold
    lower = mean * lower_threshold
    const = mean * const_threshold
    seq['coef'] = (seq[name] - lower) / (upper - lower)
    # Assign the clipped column back instead of calling clip(inplace=True) on
    # seq['coef']: the in-place form acts on a temporary Series and no longer
    # updates the frame when pandas Copy-on-Write is active.
    seq['coef'] = seq['coef'].clip(upper=1.0, lower=0.0)
    # Zero the coefficient wherever a near-constant window lies within the
    # surrounding 2*w rows.
    cond = (seq[name] <= const).rolling(2 * w).max().shift(-w) == 1
    seq.loc[cond, 'coef'] = 0.0

    # matrix profile value (mpv) and index (mpi)
    mpv = {}
    mpi = {}
    for mode in ['train', 'join', 'all']:
        if mode == 'train':
            mp = stump(X[:split], w)
        elif mode == 'join':
            mp = stump(X[split:], w, X[:split], ignore_trivial=False)
        elif mode == 'all':
            mp = stump(X, w)
        mpv[mode] = mp[:, 0].astype(float)
        mpi[mode] = mp[:, 1].astype(int)

    # matrix profile (mp) and normalized profile (np) for novelty detection (AB-join)
    # denom is the train profile value at each test window's nearest train neighbour.
    numer = mpv['join']
    denom = mpv['train'][mpi['join']]
    begin = split
    end = begin + len(numer) - 1  # .loc slices are inclusive at both ends
    numer *= seq.loc[begin:end, 'coef'].values
    seq.loc[begin:end, 'orig_mp_novelty'] = numer
    with np.errstate(all='ignore'):
        seq.loc[begin:end, 'orig_np_novelty'] = numer / denom
    seq['orig_np_novelty'] = seq['orig_np_novelty'].clip(upper=1 / denom_threshold)

    # matrix profile (mp) and normalized profile (np) for outlier detection (self-join)
    # The fancy-indexed denom is a copy, so scaling numer in place does not alter it.
    numer = mpv['all']
    denom = mpv['all'][mpi['all']]
    begin = 0
    end = begin + len(numer) - 1
    numer *= seq.loc[begin:end, 'coef'].values
    seq.loc[begin:end, 'orig_mp_outlier'] = numer
    with np.errstate(all='ignore'):
        seq.loc[begin:end, 'orig_np_outlier'] = numer / denom
    seq['orig_np_outlier'] = seq['orig_np_outlier'].clip(upper=1 / denom_threshold)

    # Smooth and mask anomaly score
    # The mask is 1 away from both ends of the series and is then averaged over
    # `padding` rows, which turns its edges into ramps.
    padding = w * padding_length
    seq['mask'] = 0.0
    seq.loc[seq.index[w:-w-padding], 'mask'] = 1.0
    seq['mask'] = seq['mask'].rolling(padding, min_periods=1).sum() / padding
    for name in names:
        seq[f'{name}_score'] = seq[name].rolling(w).mean() * seq['mask']

    return seq

In [None]:
# Evaluate anomaly score for each time series
# Every (series, window size, score name) combination whose highest peak lies in the
# test part contributes one row: the ratio of the highest to the second-highest peak.
results = []
for txt_filepath in sorted(txt_dirpath.iterdir()):

    # Load time series
    X = np.loadtxt(txt_filepath)

    # The file stem is split on '_': the first field identifies the series and the last
    # field is the length of the training prefix.
    stem_fields = txt_filepath.stem.split('_')
    number = stem_fields[0]
    split = int(stem_fields[-1])
    print(f'\n{txt_filepath.name} {split}/{len(X)}', flush=True)

    # Evaluate anomaly score for each window size w
    for w in tqdm.tqdm(ws):

        # Skip long subsequence
        if w * train_length > split:
            continue

        # Compute anomaly score
        seq = compute_score(X, number, split, w)

        # Skip if coef is small
        if seq['coef'].mean() < min_coef:
            continue

        # Evaluate anomaly score
        for name in names:

            # Copy anomaly score
            y = seq[f'{name}_score'].copy()

            # Find local maxima
            cond = (y == y.rolling(w, center=True, min_periods=1).max())
            y.loc[~cond] = np.nan

            # Find 1st peak. The height is checked before asking for its position,
            # because idxmax() on an all-NaN series is deprecated in recent pandas
            # and is slated to raise.
            value1 = y.max()

            # Skip if all score is NaN
            if not np.isfinite(value1):
                continue
            index1 = y.idxmax()

            # Skip if train data has 1st peak
            begin = index1 - w
            end = index1 + w
            if begin < split:
                continue

            # Find 2nd peak, after blanking the neighbourhood of the 1st one
            y.iloc[begin:end] = np.nan
            value2 = y.max()

            # Skip if there is no usable 2nd peak (nothing left, or height zero);
            # otherwise the rate would be NaN or a division by zero.
            if not np.isfinite(value2) or value2 == 0:
                continue
            index2 = y.idxmax()

            # Evaluate rate of 1st peak height to 2nd peak height
            rate = value1 / value2
            results.append([number, w, name, rate, begin, end, index1, value1, index2, value2])


# Turn the collected rows into a table
results = pd.DataFrame(
    results,
    columns=[
        'number', 'w', 'name', 'rate',
        'begin', 'end',
        'index1', 'value1', 'index2', 'value2',
    ],
)

# Make submission csv: per series, the position of the 1st peak of the row with the
# highest rate. Rows are renumbered 1..N in group order.
# NOTE(review): a series that produced no result row has no group here, so the
# renumbering would shift every later series by one — confirm all series are present.
submission = results.loc[results.groupby('number')['rate'].idxmax(), 'index1']
submission.name = 'location'
submission.index = np.arange(1, len(submission) + 1)
submission.index.name = 'No.'
submission.to_csv('result.csv')


001_UCR_Anomaly_35000.txt 35000/79795


100%|██████████| 32/32 [17:13<00:00, 32.30s/it]



002_UCR_Anomaly_35000.txt 35000/80001


100%|██████████| 32/32 [17:13<00:00, 32.29s/it]



003_UCR_Anomaly_35000.txt 35000/80000


100%|██████████| 32/32 [17:13<00:00, 32.28s/it]



004_UCR_Anomaly_2500.txt 2500/11000


100%|██████████| 32/32 [00:12<00:00,  2.59it/s]



005_UCR_Anomaly_4000.txt 4000/8184


100%|██████████| 32/32 [00:11<00:00,  2.90it/s]



006_UCR_Anomaly_4000.txt 4000/8184


100%|██████████| 32/32 [00:11<00:00,  2.90it/s]



007_UCR_Anomaly_4000.txt 4000/8184


100%|██████████| 32/32 [00:11<00:00,  2.91it/s]



008_UCR_Anomaly_4000.txt 4000/8184


100%|██████████| 32/32 [00:11<00:00,  2.90it/s]



009_UCR_Anomaly_4000.txt 4000/8184


100%|██████████| 32/32 [00:11<00:00,  2.90it/s]



010_UCR_Anomaly_4000.txt 4000/8184


100%|██████████| 32/32 [00:11<00:00,  2.88it/s]



011_UCR_Anomaly_10000.txt 10000/30000


100%|██████████| 32/32 [02:20<00:00,  4.39s/it]



012_UCR_Anomaly_15000.txt 15000/30000


100%|██████████| 32/32 [02:37<00:00,  4.92s/it]



013_UCR_Anomaly_15000.txt 15000/30000


100%|██████████| 32/32 [02:37<00:00,  4.93s/it]



014_UCR_Anomaly_8000.txt 8000/30000


100%|██████████| 32/32 [02:12<00:00,  4.14s/it]



015_UCR_Anomaly_5000.txt 5000/200000


100%|██████████| 32/32 [58:00<00:00, 108.78s/it]



016_UCR_Anomaly_5000.txt 5000/30000


100%|██████████| 32/32 [01:41<00:00,  3.16s/it]



017_UCR_Anomaly_5000.txt 5000/30000


100%|██████████| 32/32 [01:41<00:00,  3.17s/it]



018_UCR_Anomaly_8000.txt 8000/30000


100%|██████████| 32/32 [02:13<00:00,  4.16s/it]



019_UCR_Anomaly_5000.txt 5000/12000


100%|██████████| 32/32 [00:22<00:00,  1.42it/s]



020_UCR_Anomaly_5000.txt 5000/12000


100%|██████████| 32/32 [00:22<00:00,  1.42it/s]



021_UCR_Anomaly_5000.txt 5000/12000


100%|██████████| 32/32 [00:22<00:00,  1.42it/s]



022_UCR_Anomaly_4000.txt 4000/12000


100%|██████████| 32/32 [00:19<00:00,  1.62it/s]



023_UCR_Anomaly_5000.txt 5000/12000


100%|██████████| 32/32 [00:22<00:00,  1.42it/s]



024_UCR_Anomaly_3200.txt 3200/7501


100%|██████████| 32/32 [00:08<00:00,  3.90it/s]



025_UCR_Anomaly_2800.txt 2800/7501


100%|██████████| 32/32 [00:07<00:00,  4.17it/s]



026_UCR_Anomaly_1700.txt 1700/7601


100%|██████████| 32/32 [00:05<00:00,  6.03it/s]



027_UCR_Anomaly_1200.txt 1200/7501


100%|██████████| 32/32 [00:03<00:00,  8.66it/s]



028_UCR_Anomaly_1600.txt 1600/7500


100%|██████████| 32/32 [00:04<00:00,  6.61it/s]



029_UCR_Anomaly_2300.txt 2300/7500


100%|██████████| 32/32 [00:06<00:00,  4.84it/s]



030_UCR_Anomaly_3000.txt 3000/7500


100%|██████████| 32/32 [00:08<00:00,  3.94it/s]



031_UCR_Anomaly_2700.txt 2700/7500


100%|██████████| 32/32 [00:07<00:00,  4.21it/s]



032_UCR_Anomaly_1000.txt 1000/7321


100%|██████████| 32/32 [00:02<00:00, 10.93it/s]



033_UCR_Anomaly_4000.txt 4000/7415


100%|██████████| 32/32 [00:09<00:00,  3.27it/s]



034_UCR_Anomaly_1500.txt 1500/7654


100%|██████████| 32/32 [00:04<00:00,  6.98it/s]



035_UCR_Anomaly_2500.txt 2500/7501


100%|██████████| 32/32 [00:07<00:00,  4.51it/s]



036_UCR_Anomaly_4200.txt 4200/7501


100%|██████████| 32/32 [00:09<00:00,  3.22it/s]



037_UCR_Anomaly_5000.txt 5000/30001


100%|██████████| 32/32 [01:41<00:00,  3.16s/it]



038_UCR_Anomaly_5000.txt 5000/29950


100%|██████████| 32/32 [01:40<00:00,  3.15s/it]



039_UCR_Anomaly_5000.txt 5000/29950


100%|██████████| 32/32 [01:41<00:00,  3.16s/it]



040_UCR_Anomaly_6000.txt 6000/30066


100%|██████████| 32/32 [01:53<00:00,  3.54s/it]



041_UCR_Anomaly_7000.txt 7000/29826


100%|██████████| 32/32 [02:03<00:00,  3.86s/it]



042_UCR_Anomaly_7000.txt 7000/29859


100%|██████████| 32/32 [02:03<00:00,  3.87s/it]



043_UCR_Anomaly_10000.txt 10000/24667


100%|██████████| 32/32 [01:42<00:00,  3.19s/it]



044_UCR_Anomaly_9000.txt 9000/29931


100%|██████████| 32/32 [02:16<00:00,  4.26s/it]



045_UCR_Anomaly_14000.txt 14000/29931


100%|██████████| 32/32 [02:33<00:00,  4.80s/it]



046_UCR_Anomaly_16000.txt 16000/29931


100%|██████████| 32/32 [02:40<00:00,  5.01s/it]



047_UCR_Anomaly_18000.txt 18000/29931


100%|██████████| 32/32 [02:45<00:00,  5.18s/it]



048_UCR_Anomaly_3500.txt 3500/11334


100%|██████████| 32/32 [00:16<00:00,  1.98it/s]



049_UCR_Anomaly_3500.txt 3500/11406


100%|██████████| 32/32 [00:16<00:00,  1.96it/s]



050_UCR_Anomaly_3500.txt 3500/11308


100%|██████████| 32/32 [00:16<00:00,  1.97it/s]



051_UCR_Anomaly_3500.txt 3500/11388


100%|██████████| 32/32 [00:16<00:00,  1.96it/s]



052_UCR_Anomaly_3500.txt 3500/11308


100%|██████████| 32/32 [00:16<00:00,  1.99it/s]



053_UCR_Anomaly_1500.txt 1500/6684


100%|██████████| 32/32 [00:03<00:00,  8.19it/s]



054_UCR_Anomaly_2700.txt 2700/6684


100%|██████████| 32/32 [00:06<00:00,  4.90it/s]



055_UCR_Anomaly_10000.txt 10000/38501


100%|██████████| 32/32 [03:35<00:00,  6.72s/it]



056_UCR_Anomaly_5000.txt 5000/38379


100%|██████████| 32/32 [02:36<00:00,  4.90s/it]



057_UCR_Anomaly_6000.txt 6000/38269


100%|██████████| 32/32 [02:53<00:00,  5.42s/it]



058_UCR_Anomaly_10000.txt 10000/38501


100%|██████████| 32/32 [03:35<00:00,  6.74s/it]



059_UCR_Anomaly_20000.txt 20000/65000


100%|██████████| 32/32 [10:26<00:00, 19.58s/it]



060_UCR_Anomaly_22000.txt 22000/65000


100%|██████████| 32/32 [10:44<00:00, 20.14s/it]



061_UCR_Anomaly_24500.txt 24500/65000


100%|██████████| 32/32 [11:04<00:00, 20.78s/it]



062_UCR_Anomaly_18500.txt 18500/64000


100%|██████████| 32/32 [09:56<00:00, 18.65s/it]



063_UCR_Anomaly_18500.txt 18500/64000


100%|██████████| 32/32 [09:56<00:00, 18.64s/it]



064_UCR_Anomaly_23400.txt 23400/64000


100%|██████████| 32/32 [10:36<00:00, 19.88s/it]



065_UCR_Anomaly_3000.txt 3000/10001


100%|██████████| 32/32 [00:12<00:00,  2.58it/s]



066_UCR_Anomaly_3700.txt 3700/9998


100%|██████████| 32/32 [00:14<00:00,  2.27it/s]



067_UCR_Anomaly_5200.txt 5200/9998


100%|██████████| 32/32 [00:17<00:00,  1.86it/s]



068_UCR_Anomaly_1300.txt 1300/9998


100%|██████████| 32/32 [00:06<00:00,  5.19it/s]



069_UCR_Anomaly_3200.txt 3200/10028


100%|██████████| 32/32 [00:12<00:00,  2.53it/s]



070_UCR_Anomaly_17555.txt 17555/56123


100%|██████████| 32/32 [07:51<00:00, 14.73s/it]



071_UCR_Anomaly_23000.txt 23000/55000


100%|██████████| 32/32 [08:09<00:00, 15.31s/it]



072_UCR_Anomaly_20000.txt 20000/55000


100%|██████████| 32/32 [07:51<00:00, 14.74s/it]



073_UCR_Anomaly_60000.txt 60000/90000


100%|██████████| 32/32 [24:48<00:00, 46.51s/it]



074_UCR_Anomaly_4000.txt 4000/36001


100%|██████████| 32/32 [02:09<00:00,  4.03s/it]



075_UCR_Anomaly_4000.txt 4000/30000


100%|██████████| 32/32 [01:31<00:00,  2.85s/it]


In [None]:
# Return to the Drive root and list it again to check what the run left behind.
os.chdir("/content/drive/My Drive")
!ls


'AICSE Project Data'   'financial support.pdf'		   result.csv
'bank statement.pdf'   'Getting started.pdf'		   vMalConv
'Colab Notebooks'       kddcup.data_10_percent_corrected
'Dublu Bio-data.docx'   Pickle_Files


In [None]:
# Preview the first rows of the results table.
results.head()

Unnamed: 0,number,w,name,rate,begin,end,index1,value1,index2,value2
0,1,40,orig_p2p,1.126924,66828,66908,66868,1474.047578,9025,1308.027334
1,1,40,acc_p2p,1.006119,61448,61528,61488,1108.742539,25068,1101.999579
2,1,40,diff_small,1.089958,56861,56941,56901,0.325625,56114,0.29875
3,1,40,orig_mp_novelty,1.077604,66357,66437,66397,1.504859,39327,1.396486
4,1,40,orig_np_novelty,1.105841,76724,76804,76764,2.147368,66887,1.941841


In [None]:
# NOTE(review): this cell repeats the table construction and CSV export that already run
# at the end of the evaluation cell. It only works while `results` (and `pd`/`np`) are
# still defined in the kernel; the recorded output of this cell is a NameError.
results = pd.DataFrame(results, columns=['number', 'w', 'name', 'rate', 'begin', 'end', 'index1', 'value1', 'index2', 'value2'])

# Make submission csv
# Per series, keep 'index1' of the row with the highest rate, renumber the rows from 1,
# and write them to result.csv in the current working directory.
submission = results.loc[results.groupby('number')['rate'].idxmax(), 'index1']
submission.index = np.arange(len(submission)) + 1
submission.name = 'location'
submission.index.name = 'No.'
submission.to_csv('result.csv')


NameError: ignored