## Import

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
import os
import datetime
import pickle

from decimal import Decimal

## Info

In [5]:
event_days = {'CircuitBreaker' : [datetime.date(2020,3,13),
                                  datetime.date(2020,3,19)],
              'KoreanSAT' : [datetime.date(2010,11,18),
                             datetime.date(2011,11,10),
                             datetime.date(2012,11,8),
                             datetime.date(2013,11,7),
                             datetime.date(2014,11,13),
                             datetime.date(2015,11,12),
                             datetime.date(2016,11,17),
                             datetime.date(2017,11,16),
                             datetime.date(2017,11,23),
                             datetime.date(2018,11,15),
                             datetime.date(2019,11,14)]
                             }

## Data LOAD

In [20]:
with open('../data/processed/kospi200_preprocessed.pkl', 'rb') as f:
    df = pickle.load(f)

In [4]:
df.head(5)

Unnamed: 0,date,time,open,high,low,close,prevClose,vol
2010-02-16 09:01:00,20100216,901,207.55,207.65,207.5,207.6,207.5,3985.0
2010-02-16 09:02:00,20100216,902,207.6,207.65,207.25,207.55,207.5,5095.0
2010-02-16 09:03:00,20100216,903,207.55,207.8,207.5,207.6,207.5,2175.0
2010-02-16 09:04:00,20100216,904,207.55,207.85,207.55,207.8,207.5,1301.0
2010-02-16 09:05:00,20100216,905,207.8,208.15,207.8,208.05,207.5,3870.0


In [None]:
# 20100716 데이터를 삭제 
df = df[df.index.date != datetime.date(2010, 7, 16)]

## Nan을 삭제하지 않은 4버전

In [None]:
with open('../data/processed/kospi200_clean_version_Nan.pkl', 'wb') as f:
    pickle.dump(df, f)

## 보간하지 않은 버전 -- 2버전

In [35]:
n_ffill_df = df[~df.isnull().any(axis=1)]
n_ffill_df.isnull().sum()

date         0
time         0
open         0
high         0
low          0
close        0
prevClose    0
vol          0
dtype: int64

In [36]:
with open('../data/processed/kospi200_clean_version.pkl', 'wb') as f:
    pickle.dump(n_ffill_df, f)

## ffill한 버전 -- 3

In [22]:
# 서킷브레이크 데이터와 이외 데이터로 나눔 
circuit_breaker_df = df[df.index.normalize().isin(event_days['CircuitBreaker'])]
cleaned_df = df[~df.index.normalize().isin(event_days['CircuitBreaker'])]

  circuit_breaker_df = df[df.index.normalize().isin(event_days['CircuitBreaker'])]
  cleaned_df = df[~df.index.normalize().isin(event_days['CircuitBreaker'])]


In [23]:
# 결측치 채우기 
df_cleaned = cleaned_df.ffill()

In [None]:
df_cleaned.isnull().sum() # 결측치 제거가 제대로 됐는지 확인하기 

date         0
time         0
open         0
high         0
low          0
close        0
prevClose    0
vol          0
dtype: int64

In [28]:
total_df = pd.concat([df_cleaned, circuit_breaker_df], axis=0)
total_df.sort_index()
total_df.head(2)

Unnamed: 0,date,time,open,high,low,close,prevClose,vol
2010-02-16 09:01:00,20100216,901,207.55,207.65,207.5,207.6,207.5,3985.0
2010-02-16 09:02:00,20100216,902,207.6,207.65,207.25,207.55,207.5,5095.0


In [None]:
total_df.isnull().sum() # 서킷 브레이크 30분 동안의 결측치 

date         58
time         58
open         58
high         58
low          58
close        58
prevClose    58
vol          58
dtype: int64

## ffill하지만 서킷브레이크 값 결측치 살리기 

In [39]:
with open('../data/processed/kospi200_ffill_clean_version_Nan.pkl', 'wb') as f:
    pickle.dump(total_df, f)

In [33]:
final_df = total_df[~total_df.isnull().any(axis=1)]
final_df.isnull().sum()

date         0
time         0
open         0
high         0
low          0
close        0
prevClose    0
vol          0
dtype: int64

In [34]:
with open('../data/processed/kospi200_ffill_clean_version.pkl', 'wb') as f:
    pickle.dump(final_df, f)