# Unsupervised Outlier Detection for Time-Series Data Using LSTM + AE (Autoencoder)
Features: TMP, HMD, NO2, TVOC, CO2, NH3, PM2.5, PM10 (8 features)  
Period: 2020-10-25 ~ 2021-07-20

## 0. Setting

### 필요한 라이브러리 호출

In [221]:
from glob import glob
import os

import pandas as pd
import numpy as np
import datetime
import time

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

# Plot text uses the 'Malgun Gothic' font family (Korean font patch).
matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False   # Korean font patch.

import warnings
warnings.filterwarnings('ignore')   # Warning handling: ignore all warnings.

from sklearn.preprocessing import StandardScaler

## 1. Data Load

### 데이터 불러오기 & 전처리

In [266]:
def load_dataset(path):
    """Load every ``data/*.xlsx`` workbook under *path* into one time-sorted frame.

    Workbooks are visited in sorted filename order. From each one the columns
    named in ``feature_list`` are selected and renamed positionally to the
    short names in ``new_feature_name``. The eight measurement columns are
    converted to float: a cell whose text starts with an apostrophe is treated
    as missing, and missing values are back-filled from the next valid row of
    the *same* column (values missing at the very end of a column stay NaN).
    The per-file frames are combined with an outer merge and the result is
    sorted by ``time``.

    Args:
        path: Directory that contains the ``data`` sub-directory. A trailing
            path separator is accepted but not required.

    Returns:
        pandas.DataFrame with the columns ``time, tmp, hmd, no2, tvoc, co2,
        nh3, pm_mass_2_5, pm_mass_10`` sorted by ``time``. When no workbook
        matches, the frame has these columns and no rows.

    Raises:
        KeyError: If a workbook lacks one of the ``feature_list`` columns.
        ValueError: If a measurement cell is neither parseable as a float nor
            apostrophe-prefixed.
    """
    feature_list = ['created', 'temperature', 'humidity', 'no2', 'sgp30_tvoc',
                    'sgp30_co2', 'nh3', 'pm_mass_2_5', 'pm_mass_10']
    new_feature_name = ['time', 'tmp', 'hmd', 'no2', 'tvoc',
                        'co2', 'nh3', 'pm_mass_2_5', 'pm_mass_10']

    df = pd.DataFrame(columns=new_feature_name)
    for filename in sorted(glob(os.path.join(path, 'data', '*.xlsx'))):
        print(filename)

        # Keep only the wanted columns and rename them positionally.
        frame = pd.read_excel(filename)[feature_list]
        frame = pd.DataFrame(frame.to_numpy(), columns=new_feature_name)

        # Missing-value handling and float conversion, one column at a time,
        # so a bad cell in one column never triggers a fill of the others
        # (or of the timestamp column).
        for col in new_feature_name[1:]:
            text = frame[col].astype(str)
            # Apostrophe-prefixed cells are treated as missing values.
            text = text.mask(text.str.startswith("'"))
            frame[col] = text.astype(float).bfill()

        # With no explicit key, merge joins on all shared columns; the outer
        # join keeps the rows of both the accumulated frame and this file.
        df = pd.merge(df, frame, how='outer')

    # Sort chronologically.
    return df.sort_values('time')

In [277]:
# Raw string: the Windows path contains `\m` and `\A`, which are not valid
# escape sequences in a normal string literal. The resulting value is unchanged.
path = r'C:\myPyCode\AI_DATA/Outlier Detection/'
df = load_dataset(path)

# Split by position: the first column is kept as df_date, all remaining
# columns as df_data.
df_data = df.iloc[:, 1:]
df_date = df.iloc[:, 0]

print(df_data.shape)
print(df_date.shape)
df.head()

C:\myPyCode\AI_DATA/Outlier Detection/data\G1.xlsx
C:\myPyCode\AI_DATA/Outlier Detection/data\G2.xlsx
C:\myPyCode\AI_DATA/Outlier Detection/data\G3.xlsx
(26011, 8)
(26011,)


Unnamed: 0,time,tmp,hmd,no2,tvoc,co2,nh3,pm_mass_2_5,pm_mass_10
9888,2020-10-25 15:48:18,25.76783,26.448109,152.0,369.0,611.0,65.151512,5.438039,5.438042
9887,2020-10-25 15:58:16,25.802683,26.474508,146.0,383.0,616.0,61.21212,5.008767,5.00877
9886,2020-10-25 16:08:13,25.759787,26.496584,161.0,376.0,631.0,62.42424,5.621708,5.621712
9885,2020-10-25 16:18:13,25.746382,26.515398,161.0,379.0,607.0,62.42424,5.675469,5.675473
9884,2020-10-25 16:28:12,25.703487,26.537476,184.0,384.0,607.0,69.696968,5.497287,5.547377


In [278]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26011 entries, 9888 to 16012
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   time         26011 non-null  datetime64[ns]
 1   tmp          26011 non-null  float64       
 2   hmd          26011 non-null  float64       
 3   no2          26011 non-null  float64       
 4   tvoc         26011 non-null  float64       
 5   co2          26011 non-null  float64       
 6   nh3          26011 non-null  float64       
 7   pm_mass_2_5  26011 non-null  float64       
 8   pm_mass_10   26011 non-null  float64       
dtypes: datetime64[ns](1), float64(8)
memory usage: 2.0 MB


In [279]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,tmp,hmd,no2,tvoc,co2,nh3,pm_mass_2_5,pm_mass_10
count,26011.0,26011.0,26011.0,26011.0,26011.0,26011.0,26011.0,26011.0
mean,26.253754,30.89665,183.417708,657.653262,681.720887,75.23161,10.03276,10.22762
std,2.242109,12.745013,82.178214,889.828525,657.436487,56.855261,9.963495,10.703203
min,18.931358,9.483164,11.0,0.0,400.0,0.0,0.0,0.0
25%,24.966221,20.773549,137.0,301.0,400.0,42.424244,3.960481,3.985829
50%,25.719572,27.544119,161.0,488.0,595.0,57.272728,7.183887,7.223754
75%,27.084187,43.111069,198.0,826.0,854.0,86.060608,13.082449,13.160235
max,35.448811,71.808067,1167.0,60000.0,60000.0,690.30304,161.209671,176.457443


### 표준화 데이터셋 생성

In [280]:
# Fit a StandardScaler on the measurement columns and wrap the transformed
# array in a DataFrame that reuses df_data's column names. No index is passed,
# so the new frame gets a default integer index rather than df_data's index.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_data),
    columns=list(df_data.columns),
)
df_scaled

Unnamed: 0,tmp,hmd,no2,tvoc,co2,nh3,pm_mass_2_5,pm_mass_10
0,-0.216730,-0.349048,-0.382319,-0.324398,-0.107573,-0.177297,-0.461164,-0.447499
1,-0.201185,-0.346977,-0.455333,-0.308665,-0.099967,-0.246587,-0.504250,-0.487606
2,-0.220318,-0.345245,-0.272799,-0.316531,-0.077151,-0.225267,-0.442730,-0.430338
3,-0.226297,-0.343769,-0.272799,-0.313160,-0.113657,-0.225267,-0.437334,-0.425315
4,-0.245428,-0.342036,0.007086,-0.307541,-0.113657,-0.097348,-0.455218,-0.437283
...,...,...,...,...,...,...,...,...
26006,2.741575,0.653477,0.883247,-0.495221,-0.211007,-1.323238,-0.694301,-0.664523
26007,2.429483,0.556620,0.566856,-0.487354,-0.224697,-1.323238,-0.735240,-0.702632
26008,2.148480,0.678702,0.773727,-0.472744,-0.162332,-1.323238,-0.665982,-0.638161
26009,1.763446,0.519805,0.408660,-0.445772,0.099295,-1.317908,-0.544890,-0.525438


### 최종 데이터셋 : df, df_data, df_date, df_scaled

## 2. Data EDA