In [1]:
import sys, getopt
import time, datetime
from typing import Callable
import math

In [7]:
# To store a single anomaly
class Term:
    def __init__(self, first, last, name):
        self._first_timestamp = first
        self._last_timestamp = last
        self._name = name

    def set_time(self, first, last):
        self._first_timestamp = first
        self._last_timestamp = last

    def get_time(self):
        return self._first_timestamp, self._last_timestamp

    def set_name(self, str):
        self._name = str

    def get_name(self):
        return self._name

    def __eq__(self, other):
        return self._first_timestamp == other.get_time()[0] and self._last_timestamp == other.get_time()[1]

In [69]:
class TaPR:
    def __init__(self, label, theta, delta):
        self._predictions = []  # list of Terms
        self._anomalies = []    # list of Terms
        self._ambiguous_inst = [] # list of Terms

        self._set_predictions = False
        self._set_anomalies = False

        assert(len(label) == 2)
        self._normal_lbl = label[0]
        self._anomal_lbl = label[1]

        self._theta = theta
        self._delta = delta
        pass

    def load_predictions(self, filename):
        ntoken = self._check_file_format(filename)

        if ntoken == 1:
            self._predictions = self._load_timeseries_file(filename)
        else:
            self._predictions = self._load_range_file(filename)
        self._set_prediction = True


    def load_anomalies(self, filename):
        ntoken = self._check_file_format(filename)

        if ntoken == 1:
            self._anomalies = self._load_timeseries_file(filename)
        else:
            self._anomalies = self._load_range_file(filename)
        self._set_anomalies = True

        self._gen_ambiguous()


    def _gen_ambiguous(self):
        for i in range(len(self._anomalies)):
            start_id = self._anomalies[i].get_time()[1] + 1
            end_id = start_id + self._delta -1

            #if the next anomaly occurs during the theta, update the end_id
            if i+1 < len(self._anomalies) and end_id > self._anomalies[i+1].get_time()[0]:
                end_id = self._anomalies[i+1].get_time()[0]

            self._ambiguous_inst.append(Term(start_id, end_id, str(i)))


    def _check_file_format(self, filename):
        # check the file's format
        f = open(filename, 'r', encoding='utf-8', newline='')
        line = f.readline()
        token = line.strip().split(',')
        f.close()
        return len(token)

    def _load_range_file(self, filename):
        temp_list = []
        f = open(filename, 'r', encoding='utf-8', newline='')
        for line in f.readlines():
            items = line.strip().split(',')
            if len(items) > 2:
                temp_list.append(Term(int(items[0]), int(items[1]), str(items[2])))
            else:
                temp_list.append(Term(int(items[0]), int(items[1]), 'undefined'))
        f.close()
        return temp_list

    def _load_timeseries_file(self, filename):
        return_list = []
        start_id = -1
        id = 0
        range_id = 1
        #set prev_val as a value different to normal and anomalous labels
        prev_val = self._anomal_lbl-1
        if prev_val == self._normal_lbl:
            prev_val -= 1

        f = open(filename, 'r', encoding='utf-8', newline='')
        for line in f.readlines():
            val = int(line.strip().split()[0])

            if val == self._anomal_lbl and prev_val == self._normal_lbl:
                start_id = id
            elif val == self._normal_lbl and prev_val == self._anomal_lbl:
                return_list.append(Term(start_id, id - 1, str(range_id)))
                range_id += 1
                start_id = 0
            elif start_id == -1 and val == self._anomal_lbl:
                start_id = 0

            id += 1
            prev_val = val
        f.close()
        if start_id != 0:
            return_list.append(Term(start_id, id-1, str(range_id)))

        return return_list


    def get_n_predictions(self):
        return len(self._predictions)

    def get_n_anomalies(self):
        return len(self._anomalies)

    # return a value with the detected anomaly list
    def TaR_d(self) -> {float, list}:
        total_score = 0.0
        detected_anomalies = []
        for anomaly_id in range(len(self._anomalies)):
            anomaly = self._anomalies[anomaly_id]
            ambiguous = self._ambiguous_inst[anomaly_id]

            max_score = self._sum_of_func(anomaly.get_time()[0], anomaly.get_time()[1],
                                          anomaly.get_time()[0], anomaly.get_time()[1], self._uniform_func)

            score = 0.0
            for prediction in self._predictions:
                score += self._overlap_and_subsequent_score(anomaly, ambiguous, prediction)

            if min(1.0, score / max_score) > self._theta:
                total_score += 1.0
                detected_anomalies.append(anomaly)

        if len(self._anomalies) == 0:
            return 0.0, []
        else:
            return total_score / len(self._anomalies), detected_anomalies

    # return a value with the detected prediction lists
    def TaP_d(self) -> {float, list}:
        correct_predictions = []
        total_score = 0.0
        for prediction in self._predictions:
            max_score = prediction.get_time()[1] - prediction.get_time()[0] + 1

            score = 0.0
            for anomaly_id in range(len(self._anomalies)):
                anomaly = self._anomalies[anomaly_id]
                ambiguous = self._ambiguous_inst[anomaly_id]

                score += self._overlap_and_subsequent_score(anomaly, ambiguous, prediction)

            if (score/max_score) > self._theta:
                total_score += 1.0
                correct_predictions.append(prediction)

        if len(self._predictions) == 0:
            return 0.0, []
        else:
            return total_score / len(self._predictions), correct_predictions


    def _detect(self, src_range: Term, ranges: list, theta: int) -> bool:
        rest_len = src_range.get_time()[1] - src_range.get_time()[0] + 1
        for dst_range in ranges:
            len = self._overlapped_len(src_range, dst_range)
            if len != -1:
                rest_len -= len
        return (float)(rest_len) / (src_range.get_time()[1] - src_range.get_time()[0] + 1) <= (1.0 - theta)

    def _overlapped_len(self, range1: Term, range2: Term) -> int:
        detected_start = max(range1.get_time()[0], range2.get_time()[0])
        detected_end = min(range1.get_time()[1], range2.get_time()[1])

        if detected_end < detected_start:
            return 0
        else:
            return detected_end - detected_start + 1

    def _min_max_norm(self, value: int, org_min: int, org_max: int, new_min: int, new_max: int) -> float:
        return (float)(new_min) + (float)(value - org_min) * (new_max - new_min) / (org_max - org_min)

    def _decaying_func(self, val: float) -> float:
        assert (-6 <= val <= 6)
        return 1 / (1 + math.exp(val))

    def _ascending_func(self, val: float) -> float:
        assert (-6 <= val <= 6)
        return 1 / (1 + math.exp(val * -1))

    def _uniform_func(self, val: float) -> float:
        return 1.0

    def _sum_of_func(self, start_time: int, end_time: int, org_start: int, org_end: int,
                     func: Callable[[float], float]) -> float:
        val = 0.0
        for timestamp in range(start_time, end_time + 1):
            val += func(self._min_max_norm(timestamp, org_start, org_end, -6, 6))
        return val

    def _overlap_and_subsequent_score(self, anomaly: Term, ambiguous: Term, prediction: Term) -> float:
        score = 0.0

        detected_start = max(anomaly.get_time()[0], prediction.get_time()[0])
        detected_end = min(anomaly.get_time()[1], prediction.get_time()[1])

        score += self._sum_of_func(detected_start, detected_end,
                                   anomaly.get_time()[0], anomaly.get_time()[1], self._uniform_func)

        detected_start = max(ambiguous.get_time()[0], prediction.get_time()[0])
        detected_end = min(ambiguous.get_time()[1], prediction.get_time()[1])

        score += self._sum_of_func(detected_start, detected_end,
                                   ambiguous.get_time()[0], ambiguous.get_time()[1], self._decaying_func)

        return score

    def TaR_p(self) -> float:
        total_score = 0.0
        for anomaly_id in range(len(self._anomalies)):
            anomaly = self._anomalies[anomaly_id]
            ambiguous = self._ambiguous_inst[anomaly_id]

            max_score = self._sum_of_func(anomaly.get_time()[0], anomaly.get_time()[1],
                                          anomaly.get_time()[0], anomaly.get_time()[1], self._uniform_func)

            score = 0.0
            for prediction in self._predictions:
                score += self._overlap_and_subsequent_score(anomaly, ambiguous, prediction)

            total_score += min(1.0, score/max_score)

        if len(self._anomalies) == 0:
            return 0.0
        else:
            return total_score / len(self._anomalies)

    def TaP_p(self) -> float:
        total_score = 0.0
        for prediction in self._predictions:
            max_score = prediction.get_time()[1] - prediction.get_time()[0] + 1

            score = 0.0
            for anomaly_id in range(len(self._anomalies)):
                anomaly = self._anomalies[anomaly_id]
                ambiguous = self._ambiguous_inst[anomaly_id]

                score += self._overlap_and_subsequent_score(anomaly, ambiguous, prediction)

            total_score += score/max_score

        if len(self._predictions) == 0:
            return 0.0
        else:
            return total_score / len(self._predictions)

In [65]:
anomaly_file = './samples/swat.csv'
predict_file = './samples/ocsvm.csv'



In [115]:
anomaly_tp = pd.read_csv(anomaly_file, header=None)    

In [121]:
ocsvm_predict = pd.read_csv(predict_file, header=None)    

In [105]:
anomaly_tp = anomaly_tp.values

In [107]:
np.unique(anomaly_tp, return_counts=True)

(array([-1,  1]), array([ 54621, 395298]))

In [72]:
import pandas as pd

In [111]:
test_y = pd.read_csv(predict_file, header=None)    

In [113]:
np.unique(test_y, return_counts=True)

(array([-1,  1]), array([269295, 180624]))

In [114]:
from sklearn.metrics import f1_score

In [127]:
tp2 = pd.read_csv('./samples/predict.csv', header=None)    

In [205]:
tp3 = pd.read_csv('./samples/swat.csv', header=None)    

In [206]:
tp3[tp3==1] = 0
tp3[tp3==-1] = 1

In [207]:
np.unique(tp3, return_counts=True)

(array([0, 1]), array([395298,  54621]))

In [208]:
tp3.to_csv('test.csv',index=False, header=None)

In [197]:
tp = pd.read_csv('test.csv', header=None)    

In [198]:
tp

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [209]:
tp2 = pd.read_csv('./samples/predict.csv', header=None)    

In [210]:
tp2[tp2==1] = 0
tp2[tp2==-1] = 1

In [211]:
tp2.to_csv('predict2.csv',index=False, header=None)

In [190]:
np.unique(tp3, return_counts=True)

(array([-1,  1]), array([ 54621, 395298]))

In [128]:
np.unique(tp2, return_counts=True)

(array([-1,  1]), array([ 47858, 402061]))

In [130]:
print(f1_score(anomaly_tp, tp2, pos_label=-1))

0.690190185306258


In [92]:
test_y.to_csv('test.csv',index=False, header=None)

In [93]:
tp = pd.read_csv('test.csv', header=None)    

In [95]:
tp

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [180]:
for prediction in ev._predictions:
    print(prediction.get_time())

(0, 2685)
(2687, 2807)
(2809, 11403)
(11437, 11566)
(11600, 11728)
(11763, 11891)
(11914, 11915)
(11919, 11921)
(11923, 11927)
(11929, 11932)
(11934, 11939)
(11986, 12095)
(12103, 12255)
(12257, 14244)
(14249, 15521)
(16089, 24068)
(24070, 24861)
(24863, 27322)
(27324, 30023)
(30033, 30037)
(30047, 30047)
(30050, 30107)
(30109, 34464)
(34471, 41740)
(41743, 45589)
(45591, 47536)
(47561, 51871)
(51880, 56231)
(56233, 56237)
(56262, 60498)
(60500, 60617)
(60619, 60633)
(60642, 63020)
(63022, 64938)
(64940, 64953)
(64963, 73638)
(73641, 73646)
(73663, 77910)
(77924, 90627)
(90629, 90642)
(90644, 111682)
(111684, 116003)
(116028, 116122)
(116515, 117811)
(117818, 117825)
(117827, 118662)
(118664, 121217)
(121219, 121223)
(121226, 121229)
(121248, 124159)
(124161, 125483)
(125485, 125603)
(125628, 127365)
(127369, 127373)
(127377, 127399)
(127401, 128703)
(128712, 129983)
(130007, 132884)
(133372, 134363)
(134388, 136691)
(136695, 138144)
(138169, 141000)
(141024, 141247)
(141253, 142404)
(

In [175]:
ev._predictions[0]

<__main__.Term at 0x7f167f3aa0b8>

In [199]:
delta = 600
theta = 0.001
alpha = 1.0
label = [0, 1]

In [200]:
ev = TaPR(label, theta, delta)

ev.load_anomalies( './samples/swat.csv')
ev.load_predictions('./samples/predict.csv')

for prediction in ev._predictions:
    print(prediction.get_time())

(2686, 2686)
(2808, 2808)
(11404, 11436)
(11567, 11599)
(11729, 11762)
(11892, 11913)
(11916, 11918)
(11922, 11922)
(11928, 11928)
(11933, 11933)
(11940, 11985)
(12096, 12102)
(12256, 12256)
(14245, 14248)
(15522, 16088)
(24069, 24069)
(24862, 24862)
(27323, 27323)
(30024, 30032)
(30038, 30046)
(30048, 30049)
(30108, 30108)
(34465, 34470)
(41741, 41742)
(45590, 45590)
(47537, 47560)
(51872, 51879)
(56232, 56232)
(56238, 56261)
(60499, 60499)
(60618, 60618)
(60634, 60641)
(63021, 63021)
(64939, 64939)
(64954, 64962)
(73639, 73640)
(73647, 73662)
(77911, 77923)
(90628, 90628)
(90643, 90643)
(111683, 111683)
(116004, 116027)
(116123, 116514)
(117812, 117817)
(117826, 117826)
(118663, 118663)
(121218, 121218)
(121224, 121225)
(121230, 121247)
(124160, 124160)
(125484, 125484)
(125604, 125627)
(127366, 127368)
(127374, 127376)
(127400, 127400)
(128704, 128711)
(129984, 130006)
(132885, 133371)
(134364, 134387)
(136692, 136694)
(138145, 138168)
(141001, 141023)
(141248, 141252)
(142405, 1424

In [201]:
tapd_value, correct_list = ev.TaP_d()
tapp_value = ev.TaP_p()
print("\t* Detection score:", "%0.5f"%tapd_value)
print("\t* Portion score:", "%0.5f"%tapp_value, "\n")

	* Detection score: 0.17399
	* Portion score: 0.15695 



In [160]:

for delta in range(401600,402600, 1000):
    print("delta {}".format(delta))
    ev = TaPR(label, theta, delta)

    ev.load_anomalies(anomaly_file)
    ev.load_predictions('./samples/predict.csv')
    tapd_value, correct_list = ev.TaP_d()
    tapp_value = ev.TaP_p()
    print("\t* Detection score:", "%0.5f"%tapd_value)
    print("\t* Portion score:", "%0.5f"%tapp_value, "\n")

delta 401600
	* Detection score: 1.00000
	* Portion score: 0.60051 



In [154]:

for delta in range(401600,402600, 1000):
    print("delta {}".format(delta))
    ev = TaPR(label, theta, delta)

    ev.load_anomalies(anomaly_file)
    ev.load_predictions(predict_file)
    tapd_value, correct_list = ev.TaP_d()
    tapp_value = ev.TaP_p()
    print("\t* Detection score:", "%0.5f"%tapd_value)
    print("\t* Portion score:", "%0.5f"%tapp_value, "\n")

delta 401600
	* Detection score: 0.66423
	* Portion score: 0.66831 

