# Zipline Custom Bundle 생성
기존 Zipline Bundle은 미국 주식으로만 구성되어 있었다. 국내 주식을 이용하기 위해서 한국 주식으로 구성된 custom bundle을 생성해서 사용해 보도록 하자.
### FN Guide 데이터는 유료이므로 유료 구독한 사람들을 위해서 생성했다.

## Import & Settings

In [1]:
from pathlib import Path 
import warnings 
import pandas as pd 

In [2]:
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Directory that holds the input HDF5 files: two levels up, then 'data'.
DATA_DIR = Path('..') / '..' / 'data'

# Shorthand for building MultiIndex slices inside .loc[...].
idx = pd.IndexSlice

In [3]:
def create_split_table():
    """Store a one-row split table under 'kr/splits' in fnguide.h5.

    The table has the columns sid, effective_date and ratio and contains a
    single row: sid 1, 1990-01-01, ratio 1.0. It is written in table format.
    """
    split_rows = pd.DataFrame(
        data=[[1, pd.to_datetime('1990-01-01'), 1.0]],
        columns=['sid', 'effective_date', 'ratio'],
    )
    with pd.HDFStore('fnguide.h5') as h5:
        h5.put('kr/splits', split_rows, format='t')

In [4]:
def load_prices():
    """Load 'fnguide/prices' from fnguide_data.h5 and normalise it.

    Steps, in order:
      * remove the 'adj_' substring from every column name;
      * pivot the 'ticker' index level into columns, sort the remaining
        index and localise it to UTC;
      * move 'ticker' back into the index and swap the two index levels;
      * cast every column to int64.

    Returns:
        The resulting DataFrame.
    """
    raw = pd.read_hdf(DATA_DIR / 'fnguide_data.h5', 'fnguide/prices')
    renamed = raw.rename(columns=lambda col: col.replace('adj_', ''))

    # Wide layout (one column group per ticker) so that the remaining row
    # index can be sorted and made timezone-aware in one step.
    wide = renamed.unstack('ticker').sort_index().tz_localize('UTC')

    # Back to long layout; stack() puts 'ticker' last, swaplevel() moves it first.
    long_format = wide.stack('ticker').swaplevel()
    return long_format.astype('int64')

In [5]:
def load_symbols(tickers):
    """Build the sid / ticker / name table for the given tickers.

    Reads 'fnguide/stocks' from fnguide_data.h5 under DATA_DIR, adds a row
    for the 'kospi' benchmark (name 'KOSPI'), and keeps exactly one row per
    entry of ``tickers``, in the order given. ``sid`` is the 0-based
    position of each ticker in that order.

    Args:
        tickers: Ticker labels to keep. Labels that are not present in the
            stocks table (or the added benchmark row) get a missing name.

    Returns:
        DataFrame with the columns sid, ticker, name and a default
        integer index.
    """
    stocks = pd.read_hdf(DATA_DIR / 'fnguide_data.h5', 'fnguide/stocks')
    benchmark = pd.DataFrame([['kospi', 'KOSPI']], columns=['ticker', 'name'])

    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # result and also works on older pandas versions.
    stocks = pd.concat([stocks, benchmark])

    # Align to the requested tickers; reset_index() then leaves a fresh
    # 0..n-1 index, which the second reset_index() turns into the sid column.
    ordered = stocks.set_index('ticker').reindex(index=tickers).reset_index()
    return (ordered
            .reset_index()
            .rename(columns={'index': 'sid'})[['sid', 'ticker', 'name']]
           )



## 가격 데이터 불러오기

In [6]:
# Load the price table produced by load_prices().
prices = load_prices()

In [7]:
# Distinct values of the 'date' index level of the price table.
date_index = prices.index.get_level_values('date').unique()

## 벤치마크 데이터 추가

In [8]:
import FinanceDataReader as fdr
# Fetch the 'KS11' series; it is labelled 'kospi' and used as the benchmark below.
kospi = fdr.DataReader('KS11')

In [9]:
kospi

Unnamed: 0_level_0,Close,Open,High,Low,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1981-05-01,123.60,123.60,123.60,123.60,3330000.0,0.0098
1981-05-02,123.50,123.50,123.50,123.50,2040000.0,-0.0008
1981-05-04,120.60,120.60,120.60,120.60,1930000.0,-0.0235
1981-05-06,120.70,120.70,120.70,120.70,1690000.0,0.0008
1981-05-07,119.30,119.30,119.30,119.30,1480000.0,-0.0116
...,...,...,...,...,...,...
2021-07-22,3250.21,3235.17,3253.75,3235.17,712980000.0,0.0107
2021-07-23,3254.42,3253.44,3264.01,3246.09,805050000.0,0.0013
2021-07-26,3224.95,3265.99,3265.99,3224.95,826070000.0,-0.0091
2021-07-27,3232.53,3244.43,3252.85,3230.75,991940000.0,0.0024


데이터 전처리 
- timezone UTC처리
- column 이름을 소문자로 변경

In [10]:
# Tag every benchmark row with its ticker and make the index timezone-aware (UTC).
kospi = kospi.assign(ticker='kospi').tz_localize('UTC')

# Keep the UTC index before it is turned into a regular column.
kospi_trading_date = kospi.index

# Move the index into a column and lower-case all column names.
kospi = kospi.reset_index().rename(columns=str.lower)

In [11]:
# Back-fill missing values, then keep only the columns shared with the
# stock price table. DataFrame.bfill() replaces fillna(method='bfill'),
# which is deprecated since pandas 2.1.
kospi = kospi.bfill()[['date', 'ticker', 'high', 'low', 'open', 'close', 'volume']]

In [12]:
# Index the benchmark by (ticker, date).
kospi = kospi.set_index(['ticker', 'date'])

## 주식 장 거래 날짜 수정
- fnguide 데이터에는 휴장일인데도 데이터가 들어 있는 날짜가 있어서, 앞에서 불러온 kospi 벤치마크의 거래 날짜를 이용해서 이 부분을 클리닝한다.

In [13]:
# Benchmark dates, read from the 'date' level of the (ticker, date) index.
# This rebinds the name assigned earlier in the preprocessing cell.
kospi_trading_date = kospi.index.get_level_values('date')

In [14]:
# Distinct dates that appear in the stock price table.
prices_trading_date = prices.index.get_level_values('date').unique()

In [15]:
# Dates present in both the benchmark and the stock price table.
intersact_trading_date = kospi_trading_date.intersection(prices_trading_date)

In [16]:
kospi_trading_date.difference(prices_trading_date)

DatetimeIndex(['1981-05-01 00:00:00+00:00', '1981-05-02 00:00:00+00:00',
               '1981-05-04 00:00:00+00:00', '1981-05-06 00:00:00+00:00',
               '1981-05-07 00:00:00+00:00', '1981-05-08 00:00:00+00:00',
               '1981-05-09 00:00:00+00:00', '1981-05-12 00:00:00+00:00',
               '1981-05-13 00:00:00+00:00', '1981-05-14 00:00:00+00:00',
               ...
               '1998-10-17 00:00:00+00:00', '1998-10-24 00:00:00+00:00',
               '1998-10-31 00:00:00+00:00', '1998-11-07 00:00:00+00:00',
               '1998-11-14 00:00:00+00:00', '1998-11-21 00:00:00+00:00',
               '1998-11-28 00:00:00+00:00', '1998-12-05 00:00:00+00:00',
               '2021-07-27 00:00:00+00:00', '2021-07-28 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='date', length=2979, freq=None)

In [17]:
# Order the price rows by their index.
prices = prices.sort_index()

In [18]:
# Pivot tickers into columns so that each row is one date, keep only the
# dates shared with the benchmark, then go back to long format.
prices = (
    prices
    .unstack('ticker')
    .reindex(index=intersact_trading_date)
    .stack('ticker')
)

In [19]:
# Swap the two index levels so that 'ticker' comes before the date again.
prices = prices.swaplevel()

In [20]:
prices

Unnamed: 0_level_0,Unnamed: 1_level_0,high,low,open,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000010,1990-01-03 00:00:00+00:00,42794.0,39737.0,39737.0,42794.0,195060.0
000020,1990-01-03 00:00:00+00:00,1381.0,1381.0,1381.0,1381.0,10.0
000030,1990-01-03 00:00:00+00:00,1410887.0,1330265.0,1330265.0,1410887.0,208050.0
000040,1990-01-03 00:00:00+00:00,874502.0,812481.0,812481.0,874502.0,21760.0
000050,1990-01-03 00:00:00+00:00,1819.0,1819.0,1819.0,1819.0,0.0
...,...,...,...,...,...,...
380440,2021-07-26 00:00:00+00:00,2250.0,2220.0,2220.0,2245.0,1257100.0
383220,2021-07-26 00:00:00+00:00,626000.0,601000.0,621000.0,617000.0,72589.0
383800,2021-07-26 00:00:00+00:00,10800.0,10600.0,10750.0,10650.0,1434272.0
900140,2021-07-26 00:00:00+00:00,3580.0,3505.0,3560.0,3520.0,356175.0


In [21]:
# Name the two index levels explicitly.
prices.index = prices.index.set_names(['ticker', 'date'])

In [22]:
# Keep only the benchmark rows whose date is one of the shared trading dates
# (all tickers on the first level, selected dates on the second).
all_tickers = slice(None)
kospi = kospi.loc[(all_tickers, intersact_trading_date), :]

### 가격 데이터와 벤치마크 데이터 연결

In [23]:
# Append the benchmark rows to the stock rows and restore index order.
prices = pd.concat(objs=[prices, kospi], axis=0).sort_index()

In [24]:
prices

Unnamed: 0_level_0,Unnamed: 1_level_0,high,low,open,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000010,1990-01-03 00:00:00+00:00,42794.00,39737.00,39737.00,42794.00,195060.0
000010,1990-01-04 00:00:00+00:00,44628.00,42182.00,42794.00,44628.00,415700.0
000010,1990-01-05 00:00:00+00:00,45239.00,43711.00,44628.00,44016.00,336870.0
000010,1990-01-08 00:00:00+00:00,44016.00,43099.00,43711.00,43711.00,214660.0
000010,1990-01-09 00:00:00+00:00,44016.00,43405.00,43711.00,43405.00,205280.0
...,...,...,...,...,...,...
kospi,2021-07-20 00:00:00+00:00,3234.80,3214.42,3223.24,3232.70,979670000.0
kospi,2021-07-21 00:00:00+00:00,3252.25,3215.91,3251.13,3215.91,973470.0
kospi,2021-07-22 00:00:00+00:00,3253.75,3235.17,3235.17,3250.21,712980000.0
kospi,2021-07-23 00:00:00+00:00,3264.01,3246.09,3253.44,3254.42,805050000.0


In [25]:
# Distinct tickers in the combined table (stocks plus the 'kospi' benchmark).
tickers = prices.index.get_level_values('ticker').unique()

## 주식 종목코드랑 이름 데이터 불러오기

In [26]:
# Build the sid / ticker / name table for exactly the tickers that have prices.
symbols = load_symbols(tickers)

In [27]:
symbols

Unnamed: 0,sid,ticker,name
0,0,000010,조흥은행
1,1,000020,동화약품
2,2,000030,우리은행
3,3,000040,KR모터스
4,4,000050,경방
...,...,...,...
1157,1157,950010,평산차업 KDR
1158,1158,950070,중국고섬
1159,1159,950100,SBI모기지
1160,1160,950210,프레스티지바이오파마


## **중요** 거래 정지 데이터(단기 과열 등)가 있어서 전처리
- 주식 개장 날짜는 예를 들어 1월 3일, 4일, 5일인데, 단기 과열로 거래 정지된 종목들은 1월 3일, 5일과 같이 거래 정지된 4일이 비어 있는 경우가 있다.
- zipline은 ingest할 때 실제 거래일과 주식 가격 데이터의 거래일을 비교하므로, 거래 정지 때문에 가격 데이터의 거래일이 비어 있으면 안 된다. 따라서 가격은 ffill을 이용해서 직전 가격으로 결측치를 채우고, 거래 정지이므로 volume은 0으로 넣어서 거래가 없었다는 것을 표시했다.


In [28]:
def cleaning_data(ticker):
    """Return one ticker's price rows with missing trading days filled in.

    Reads the module-level ``prices`` table and ``intersact_trading_date``
    index. The result has one row for every date of
    ``intersact_trading_date`` between the ticker's first and last row in
    ``prices`` (both inclusive). Dates on which the ticker has no row get
    the previous row's high/low/open/close and a volume of 0.

    Args:
        ticker: Label on the first index level of ``prices``.

    Returns:
        DataFrame with the columns high, low, open, close, volume (in that
        order), indexed by date.
    """
    stock_data = prices.loc[ticker]
    first_day = stock_data.index.min()
    last_day = stock_data.index.max()
    in_range = (intersact_trading_date >= first_day) & (intersact_trading_date <= last_day)

    # Reindexing straight onto the trading days inserts an all-NaN row for
    # every missing date. The former asfreq('1d') step only added calendar
    # days that this reindex dropped again, so it is omitted.
    stock_data = stock_data.reindex(index=intersact_trading_date[in_range])

    # .ffill() replaces fillna(method='ffill'), deprecated since pandas 2.1.
    ohlc = stock_data[['high', 'low', 'open', 'close']].ffill()
    # A filled-in day had no trading, so its volume is 0.
    volume = stock_data[['volume']].fillna(0)
    return pd.concat([ohlc, volume], axis=1)

## 데이터들 저장

In [29]:
# Write each ticker's cleaned table under 'kr/<sid>' in table format.
# One HDFStore stays open for the whole loop instead of reopening
# fnguide.h5 on every iteration; it is closed even if a ticker fails.
with pd.HDFStore('fnguide.h5') as store:
    for sid, symbol in symbols.set_index('sid').ticker.items():
        print(symbol)
        p = cleaning_data(symbol)
        store.put('kr/{}'.format(sid), p, format='t')


000010
000020
000030
000040
000050
000060
000070
000080
000090
000100
000110
000120
000130
000140
000150
000160
000170
000180
000200
000210
000220
000230
000240
000270
000280
000300
000310
000320
000330
000360
000370
000390
000400
000420
000430
000450
000470
000480
000490
000500
000510
000520
000540
000570
000590
000600
000610
000640
000650
000660
000670
000680
000700
000720
000730
000760
000810
000830
000840
000850
000860
000870
000880
000890
000900
000910
000930
000950
000970
000990
001020
001030
001040
001060
001070
001080
001090
001120
001130
001140
001150
001160
001170
001190
001200
001210
001230
001250
001270
001280
001290
001300
001340
001350
001360
001370
001380
001390
001400
001420
001430
001440
001450
001460
001470
001500
001510
001520
001530
001550
001560
001570
001580
001600
001610
001620
001630
001670
001680
001690
001700
001720
001740
001750
001770
001780
001790
001800
001820
001830
001880
001910
001920
001930
001940
001950
001980
002000
002010
002020
002030
002050
002070

In [30]:
# Store the sid / ticker / name table. The key is passed by keyword because
# positional arguments after the path are deprecated in DataFrame.to_hdf
# since pandas 2.1.
symbols.to_hdf('fnguide.h5', key='kr/equities', format='t')

In [31]:
# Write the 'kr/splits' table into fnguide.h5.
create_split_table()

In [32]:
# Store the combined price table. The key is passed by keyword because
# positional arguments after the path are deprecated in DataFrame.to_hdf
# since pandas 2.1.
prices.to_hdf('fnguide.h5', key='fnguide/prices/cleaning', format='t')

In [2]:
# Print a summary of every key in the output file. The store is opened
# read-only: inspecting it should neither create fnguide.h5 when it is
# missing nor open the data for writing.
with pd.HDFStore('fnguide.h5', mode='r') as store:
    print(store.info())

<class 'pandas.io.pytables.HDFStore'>
File path: fnguide.h5
/fnguide/prices/cleaning            frame_table  (typ->appendable_multi,nrows->5454200,ncols->7,indexers->[index],dc->[date,ticker])
/kr/0                               frame_table  (typ->appendable,nrows->3550,ncols->5,indexers->[index],dc->[])                    
/kr/1                               frame_table  (typ->appendable,nrows->7776,ncols->5,indexers->[index],dc->[])                    
/kr/10                              frame_table  (typ->appendable,nrows->3751,ncols->5,indexers->[index],dc->[])                    
/kr/100                             frame_table  (typ->appendable,nrows->7488,ncols->5,indexers->[index],dc->[])                    
/kr/1000                            frame_table  (typ->appendable,nrows->3328,ncols->5,indexers->[index],dc->[])                    
/kr/1001                            frame_table  (typ->appendable,nrows->3323,ncols->5,indexers->[index],dc->[])                    
/kr/1002 

## Zipline Root 폴더 찾기
- 컴퓨터마다 다르지만 제 컴퓨터에서는 C:\Users\PC\.zipline 이었다.

파일 이동
.zipline 폴더에 있는 데이터들을 복사해서 백업해 둔다.
이 폴더에 있는 fnguide_kr_stocks.py와 extension.py를 .zipline 폴더로 이동하고, 그 안에 custom_data라는 폴더를 만든다.
fnguide.h5 데이터를 custom_data 폴더 안으로 넣는다.

## 최종 구조 
.zipline 폴더 위치


    |-extension.py
    |-fnguide_kr_stocks.py
    |-custom_data
        |-fnguide.h5

In [3]:
# Read the stored sid / ticker / name table back. The store is opened
# read-only because nothing is written here.
with pd.HDFStore('fnguide.h5', mode='r') as store:
    stocks = store['kr/equities']

In [6]:
stocks.head(50)

Unnamed: 0,sid,ticker,name
0,0,10,조흥은행
1,1,20,동화약품
2,2,30,우리은행
3,3,40,KR모터스
4,4,50,경방
5,5,60,메리츠화재
6,6,70,삼양홀딩스
7,7,80,하이트진로
8,8,90,두산상사
9,9,100,유한양행
