In [38]:
import sys

from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from dateutil.parser import parse
from datetime import timedelta
# from torch.utils.data import Dataset, DataLoader

import tqdm
from tqdm.notebook import trange


### Data Import

In [3]:
Path.cwd() # 여기 나오는 경로에 파일 넣어야됨!

WindowsPath('C:/workspaces/python')

In [4]:
TRAIN_DATASET = sorted([x for x in Path("data/HAI 2.0/training").glob("*.csv")])
TRAIN_DATASET

[WindowsPath('data/HAI 2.0/training/train1.csv'),
 WindowsPath('data/HAI 2.0/training/train2.csv'),
 WindowsPath('data/HAI 2.0/training/train3.csv')]

In [5]:
TEST_DATASET = sorted([x for x in Path("data/HAI 2.0/testing").glob("*.csv")])
TEST_DATASET

[WindowsPath('data/HAI 2.0/testing/test1.csv'),
 WindowsPath('data/HAI 2.0/testing/test2.csv'),
 WindowsPath('data/HAI 2.0/testing/test3.csv'),
 WindowsPath('data/HAI 2.0/testing/test4.csv')]

In [6]:
VALIDATION_DATASET = sorted([x for x in Path("data/HAI 2.0/validation").glob("*.csv")])
VALIDATION_DATASET

[WindowsPath('data/HAI 2.0/validation/validation.csv')]

#### Train set 병합

In [7]:
def dataframe_from_csv(target):
    return pd.read_csv(target).rename(columns=lambda x: x.strip())

def dataframe_from_csvs(targets):
    return pd.concat([dataframe_from_csv(x.as_posix()) for x in targets])

In [8]:
TRAIN_DF_RAW = dataframe_from_csvs(TRAIN_DATASET)
TRAIN_DF_RAW

Unnamed: 0,time,C01,C02,C03,C04,C05,C06,C07,C08,C09,...,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79
0,2020-07-11 00:00:00,395.19528,12,10,52.80456,-1.2648,-1.87531,779.59595,28.02645,10832.0,...,808.29620,0.0,1.36810,8.79882,35.43700,12.01782,305.03113,301.35992,33.6555,6.0951
1,2020-07-11 00:00:01,395.14420,12,10,52.78931,-1.3147,-1.88294,780.67328,28.02473,10984.0,...,819.16809,0.0,1.36810,8.78811,35.45227,12.01782,304.27161,297.43567,33.6555,5.9262
2,2020-07-11 00:00:02,395.14420,12,10,52.79694,-1.4032,-1.88294,780.06574,28.02817,11120.0,...,823.51697,0.0,1.36734,8.81787,35.45227,12.01782,303.89179,298.66534,33.6555,5.8101
3,2020-07-11 00:00:03,395.19528,12,10,52.79694,-1.6074,-1.88294,780.15265,28.02301,11256.0,...,823.95172,0.0,1.36734,8.87493,35.43700,12.01782,303.67474,298.06860,33.6555,5.7509
4,2020-07-11 00:00:04,395.34866,12,10,52.79694,-1.7811,-1.88294,781.83160,28.03595,11384.0,...,827.86560,0.0,1.36810,8.83838,35.45227,12.01782,303.22266,296.53137,33.6555,5.8547
5,2020-07-11 00:00:05,395.24640,12,10,52.79694,-1.8713,-1.88294,780.38776,28.02561,11512.0,...,829.60516,0.0,1.36810,8.86826,35.43700,12.01782,302.04718,296.83881,33.6555,6.0244
6,2020-07-11 00:00:06,395.34866,12,10,52.79694,-1.8651,-1.88294,783.65607,28.02301,11624.0,...,831.34473,0.0,1.35895,8.88958,35.43700,12.01782,301.52270,296.24207,33.6555,6.2372
7,2020-07-11 00:00:07,395.39975,12,10,52.79694,-1.7046,-1.88294,779.70721,28.02818,11736.0,...,821.34241,0.0,1.35971,8.91155,35.43700,12.01782,301.21527,295.68140,33.6555,6.4729
8,2020-07-11 00:00:08,395.34866,12,10,52.79694,-1.5038,-1.88294,779.65717,28.02385,11840.0,...,820.90759,0.0,1.35895,8.94309,35.43700,12.01782,300.72699,295.31976,33.6555,6.5049
9,2020-07-11 00:00:09,395.34866,12,10,52.79694,-1.3420,-1.87531,779.99420,28.02909,11936.0,...,821.77741,0.0,1.35895,8.93967,35.43700,12.01782,300.65466,294.88568,33.6555,6.3823


- Train set는 공격을 받지 않은 평상시의 데이터고, 시간=time, C01~C79는 학습 데이터셋에 있는 모든 센서/액추에이터 필드
- 정규화는 C01~C79 만을 해야한다.
- VALID_COLUMNS_IN_TRAIN_DATASET은 학습 데이터셋에 있는 모든 센서/액추에이터 필드를 담고 있다.
- 학습 데이터셋에 존재하지 않는 필드가 테스트 데이터셋에 존재하는 경우가 있다. 
- 학습 시 보지 못했던 필드에 대해서 테스트를 할 수 없으므로 학습 데이터셋을 기준으로 필드 이름을 얻는다.

In [9]:
TIMESTAMP_FIELD = "time"
IDSTAMP_FIELD = 'id'
ATTACK_FIELD = "attack"
VALID_COLUMNS_IN_TRAIN_DATASET = TRAIN_DF_RAW.columns.drop([TIMESTAMP_FIELD])
VALID_COLUMNS_IN_TRAIN_DATASET

Index(['C01', 'C02', 'C03', 'C04', 'C05', 'C06', 'C07', 'C08', 'C09', 'C10',
       'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20',
       'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'C29', 'C30',
       'C31', 'C32', 'C33', 'C34', 'C35', 'C36', 'C37', 'C38', 'C39', 'C40',
       'C41', 'C42', 'C43', 'C44', 'C45', 'C46', 'C47', 'C48', 'C49', 'C50',
       'C51', 'C52', 'C53', 'C54', 'C55', 'C56', 'C57', 'C58', 'C59', 'C60',
       'C61', 'C62', 'C63', 'C64', 'C65', 'C66', 'C67', 'C68', 'C69', 'C70',
       'C71', 'C72', 'C73', 'C74', 'C75', 'C76', 'C77', 'C78', 'C79'],
      dtype='object')

#### Train set normalization

In [10]:
TAG_MIN = TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET].min()
TAG_MAX = TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET].max()

In [11]:
TAG_MIN.head()

C01    355.98859
C02     12.00000
C03     10.00000
C04     46.17462
C05     -2.16030
dtype: float64

In [12]:
# 정규화 함수 생성 - MinMax Scaler
def normalize(df):
    ndf = df.copy()
    for c in df.columns:
        if TAG_MIN[c] == TAG_MAX[c]:
            ndf[c] = df[c] - TAG_MIN[c]
        else:
            ndf[c] = (df[c] - TAG_MIN[c]) / (TAG_MAX[c] - TAG_MIN[c])
    return ndf

#### Train set smoothing

- 센서에서 발생하는 noise를 smoothing시키기 위해 exponential weighted function을 통과시킴
- ewm : 보다 최근의 값에 가중치를 두면서 이동평균 계산
- EWM_t = X_t*EP + EWM_(t-1)*(1-EP), where EP(Exponential Percetage) = 2/(time+1)
-https://m.blog.naver.com/PostView.nhn?blogId=gracekang7&logNo=221232491635&proxyReferer=https:%2F%2Fwww.google.com%2F

In [13]:
TRAIN_DF = normalize(TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET]).ewm(alpha=0.9).mean()
TRAIN_DF.head(30)

Unnamed: 0,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,...,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79
0,0.378953,0.0,0.0,0.227071,0.37238,0.00023,0.386721,0.410567,0.784144,0.508049,...,0.584892,0.0,0.326835,0.254687,0.331076,0.916661,0.269393,0.265017,1.0,0.567254
1,0.378504,0.0,0.0,0.226596,0.353516,0.000161,0.399074,0.364415,0.794139,0.540538,...,0.592044,0.0,0.326835,0.254315,0.337223,0.916661,0.266791,0.251792,1.0,0.512135
2,0.378463,0.0,0.0,0.226789,0.318663,0.000154,0.393283,0.451729,0.803903,0.538802,...,0.595523,0.0,0.326387,0.255304,0.337777,0.916661,0.265266,0.254707,1.0,0.469622
3,0.378904,0.0,0.0,0.226808,0.238782,0.000154,0.393697,0.323289,0.813725,0.459532,...,0.596151,0.0,0.326343,0.257362,0.331746,0.916661,0.264379,0.253005,1.0,0.446285
4,0.380282,0.0,0.0,0.22681,0.165794,0.000154,0.412796,0.654203,0.823039,0.333541,...,0.598763,0.0,0.326786,0.256312,0.337229,0.916661,0.262757,0.247706,1.0,0.477489
5,0.37953,0.0,0.0,0.22681,0.124738,0.000154,0.398317,0.412616,0.832304,0.220924,...,0.600157,0.0,0.32683,0.257233,0.331692,0.916661,0.258607,0.248202,1.0,0.535436
6,0.380345,0.0,0.0,0.22681,0.122953,0.000154,0.433968,0.31939,0.840522,0.141019,...,0.601429,0.0,0.321441,0.258057,0.331138,0.916661,0.256413,0.24626,1.0,0.609982
7,0.38087,0.0,0.0,0.22681,0.182842,0.000154,0.392709,0.447405,0.848636,0.129432,...,0.595042,0.0,0.32135,0.258894,0.331083,0.916661,0.25515,0.244195,1.0,0.693586
8,0.380479,0.0,0.0,0.22681,0.26398,0.000154,0.388015,0.345183,0.856218,0.215276,...,0.59412,0.0,0.320893,0.260061,0.331077,0.916661,0.253368,0.242782,1.0,0.712285
9,0.380439,0.0,0.0,0.22681,0.332648,0.000223,0.391371,0.474158,0.863226,0.342822,...,0.594595,0.0,0.320847,0.26006,0.331077,0.916661,0.252944,0.241193,1.0,0.674545


#### data boundary check

In [14]:
# 결측, 0~1사이 값이 아닌게 있는지 점검
def boundary_check(df):
    x = np.array(df, dtype=np.float32)
    return np.any(x > 1.0), np.any(x < 0), np.any(np.isnan(x))

In [15]:
boundary_check(TRAIN_DF)

(False, False, False)

### RNN Model Fitting(GRU Cells)
- Train set에는 정상데이터만 있으므로 unsupervised learning 해야함
- 모델이 출력하는 예측값과 실제로 들어온 값의 차이가 크면 이상으로 간주함

#### Sliding Window

In [16]:
WINDOW_GIVEN = 0
WINDOW_SIZE = 1

In [33]:
class HaiDataset:
    # 생성자
    def __init__(self, timestamps, df, stride=1, attack=None):
        self.ts = np.array(timestamps)
        self.tag_values = np.array(df, dtype=np.float32)
        self.valid_idxs = []
        for L in trange(len(self.ts) - WINDOW_SIZE + 1):
            R = L + WINDOW_SIZE - 1
            t_diff = dateutil.parser.parse(self.ts[R]) - dateutil.parser.parse(self.ts[L])
            
            # 맨 처음 obs일때 동작 지정
            if t_diff == timedelta(seconds = WINDOW_SIZE -1): 
                self.valid_idxs.append(L)
        self.valid_idxs = np.array(self.valid_idxs, dtype=np.int32)[::stride]
        self.n_idxs = len(self.valid_idxs)
        print("# of valid windows : {}".format(self.n_idxs))
        
        if attacks is not None:
            self.attacks = np.array(attacks, dtype=np.float32)
            self.with_attack = True
        else:
            self.with_attack = False
    
    # 메서드 1
    def __len__(self):
        return self.n_idxs
    
    # 메서드 2
    def __getitem__(self, idx):
        i = self.valid_idxs[idx]
        last = i + WINDOW_SIZE - 1 # slide end point
        item = {"attack" : self.attacks[last]} if self.with_attack else {}
        item["ts"] = self.ts[last] # i + WINDOW_SIZE - 1
        time["given"] = self.tag_values[i : i + WINDOW_GIVEN] # 오류나면 여기 ndarray 객체 만들어보자(np.array)
        item["answer"] = self.tag_values[last]
        return item

In [39]:
HAI_DATASET_TRAIN = HaiDataset(TRAIN_DF_RAW[TIMESTAMP_FIELD], TRAIN_DF, stride=1)
HAI_DATASET_TRAIN[0]
"""
에러나면 anaconda prompt 가상환경 activate해서

conda install -c conda-forge ipywidgets
jupyter nbextension enable --py widgetsnbextension
"""

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html