# Unsupervised outlier detection for Time series data using LSTM + AE

Features: CO, CO2, TVOC, PM Number   
Time range: 2021-12-22 11:00:00 ~ 2022-01-12 10:30:00 (30-minute sampling interval)

## 1. Data Load & Preprocessing

### 필요한 라이브러리 호출

In [10]:
import glob

import pandas as pd
import numpy as np
import datetime
import time

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False   # 한글 폰트 패치.

import warnings
warnings.filterwarnings('ignore')   # 경고문 처리.

### 데이터 불러오기 & 전처리

In [65]:
# 전처리 함수
def preprocessing(df):
    """Clean one raw sensor DataFrame in place and return it.

    Every column after the first is cast to float, missing values are
    back-filled and then forward-filled, and the 'Time' column is parsed
    into datetimes.

    Args:
        df: DataFrame containing a 'Time' column. All columns after the
            first must be convertible to float. The frame is modified
            in place.

    Returns:
        The same DataFrame object that was passed in, after cleaning.
    """
    # Assign by column label instead of ``df.iloc[:, 1:] = ...``: positional
    # assignment writes into the existing columns and can leave their old
    # dtype (e.g. int64) untouched, while label assignment replaces the
    # columns so they really end up as float.
    value_cols = df.columns[1:]
    if len(value_cols) > 0:
        df[value_cols] = df[value_cols].astype(float)

    # Back-fill first, then forward-fill whatever is still missing (gaps at
    # the end of a column have no later value to copy from).
    # ``fillna(method=...)`` is deprecated; ``bfill``/``ffill`` replace it.
    df.bfill(inplace=True)
    df.ffill(inplace=True)

    # Parse the time column into datetime values.
    df['Time'] = pd.to_datetime(df['Time'])

    return df

# 데이터 로드 함수
def data_load(files):
    """Read the given CSV files, preprocess each one and inner-join them.

    Args:
        files: Iterable of CSV file paths. Each file is read with
            ``pd.read_csv`` and passed through ``preprocessing``.

    Returns:
        A single DataFrame. With one file it is that file's preprocessed
        frame; with several, the frames are combined left to right with
        ``pd.merge(..., how='inner')``. No ``on=`` is given, so pandas
        joins on the columns the two frames have in common.

    Raises:
        ValueError: If ``files`` contains no paths.
    """
    paths = list(files)
    if not paths:
        # Fail with a clear message rather than an IndexError further down.
        raise ValueError('data_load() needs at least one CSV file path, but none were given.')

    # Read and preprocess every file.
    frames = [preprocessing(pd.read_csv(path)) for path in paths]

    # Fold the remaining frames into the first one; only rows whose join
    # keys appear in every file survive the inner merge.
    dataset = frames[0]
    for frame in frames[1:]:
        dataset = pd.merge(dataset, frame, how='inner')

    return dataset

In [66]:
# Directory holding the sensor CSV exports; collect every *.csv directly inside it
dir_path = 'C:/myPyCode/AI_DATA/Outlier Detection/'
files = glob.glob(f'{dir_path}*.csv')
# Report how many files matched, then list their paths
print('Total numbers of files :', len(files))
print(files)

Total numbers of files : 3
['C:/myPyCode/AI_DATA/Outlier Detection\\CO, CO2 2022-01-12 10_50_41.csv', 'C:/myPyCode/AI_DATA/Outlier Detection\\PM Number 2022-01-12 10_52_02.csv', 'C:/myPyCode/AI_DATA/Outlier Detection\\TVOC 2022-01-12 10_51_31.csv']


In [69]:
# Load the CSV files listed in `files` and combine them into one DataFrame
dataset = data_load(files)
# Show the (rows, columns) shape of the combined dataset
print(dataset.shape)
# Bare expression: relies on the notebook rendering the cell's last value as a table
dataset

(1000, 5)


Unnamed: 0,Time,CO,CO2,PM 0.5,TVOC
0,2021-12-22 11:00:00,3531.0,1045.0,152.0,594.0
1,2021-12-22 12:00:00,3451.0,1146.0,139.0,629.0
2,2021-12-22 12:30:00,3262.0,831.0,96.0,481.0
3,2021-12-22 13:00:00,3300.0,633.0,92.0,465.0
4,2021-12-22 13:30:00,3744.0,727.0,99.0,403.0
...,...,...,...,...,...
995,2022-01-12 08:30:00,1989.0,400.0,75.0,0.0
996,2022-01-12 09:00:00,1492.0,400.0,72.0,21.0
997,2022-01-12 09:30:00,1851.0,400.0,224.0,99.0
998,2022-01-12 10:00:00,2194.0,400.0,418.0,238.0


In [68]:
# Print each column's dtype and non-null count, plus the frame's memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Time    1000 non-null   datetime64[ns]
 1   CO      1000 non-null   float64       
 2   CO2     1000 non-null   float64       
 3   PM 0.5  1000 non-null   float64       
 4   TVOC    1000 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 46.9 KB


In [70]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
dataset.describe()

Unnamed: 0,CO,CO2,PM 0.5,TVOC
count,1000.0,1000.0,1000.0,1000.0
mean,2284.403,653.997,187.914,483.112
std,829.642689,367.959989,240.599026,531.593261
min,675.0,400.0,6.8,0.0
25%,1560.75,400.0,54.0,182.0
50%,2278.5,545.0,105.5,339.5
75%,2895.25,737.5,213.25,561.25
max,4485.0,2821.0,1619.0,3190.0
