# fbprophet 모델을 활용한 인공지능 비트 트레이더 경진대회 베이스라인 코드

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# !pip install statsmodels==0.11.1
# !pip install fbprophet

# 1. Library Import

In [3]:
!pip install plotly



In [4]:
import numpy as np
import pandas as pd
import gc
import math
import os.path
import time
import matplotlib.pyplot as plt
from datetime import timedelta, datetime
from dateutil import parser
from tqdm import tqdm
import copy
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from fbprophet import Prophet
import warnings
import datetime
from plotly import tools 
import plotly.offline as offline 
import plotly.graph_objs as go 
warnings.filterwarnings("ignore")

# 2. 데이터 불러오기

## 1) read_csv

In [5]:
data_path = os.getenv('HOME') + '/dacon/ModuWay/baseline/data' # 경로 다름! 변경해줘야함
# data_path = os.getenv('HOME') + '/ModuWay/data'
train_x_df = pd.read_csv(data_path  + "/train_x_df.csv")
train_y_df = pd.read_csv(data_path  + "/train_y_df.csv")
test_x_df = pd.read_csv(data_path  + "/test_x_df.csv")

* sample_id : 개별 샘플의 인덱스
* time : x_df는 0분 ~ 1379분, y_df는 0분 ~ 119분의 값을 갖습니다. 동일한 샘플 내 시간 정보
* coin_index : 10가지 종류의 코인에 대한 비식별화 인덱스 (0 ~9)
* open : open price
* high : high price
* low : low price
* close : close price
* volume : 거래량
* quote_av : quote asset volume
* trades : 거래 건 수
* tb_base_av : taker buy base asset volume
* tb_quote_av : taker buy quote asset volume

In [6]:
train_x_df

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,7,1.010004,1.010004,1.009612,1.010004,8.382875e+05,43160.632812,451.157288,7.326834e+05,37725.183594
1,0,1,7,1.009808,1.009808,1.009808,1.009808,1.622420e+05,8352.220703,39.231071,0.000000e+00,0.000000
2,0,2,7,1.009808,1.010200,1.009808,1.010200,1.664967e+04,857.377808,58.846603,1.664967e+04,857.377808
3,0,3,7,1.010200,1.011181,1.010200,1.011181,2.586971e+06,133310.343750,431.541779,2.189147e+06,112811.046875
4,0,4,7,1.010985,1.010985,1.010200,1.010200,1.129996e+06,58216.867188,176.539810,0.000000e+00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
10159555,7361,1375,8,1.000668,1.001669,1.000626,1.001502,9.180907e+00,2203.059082,2.245034,6.229020e+00,1494.727417
10159556,7361,1376,8,1.001627,1.001920,1.000960,1.001294,7.963097e+00,1911.151611,2.211651,3.056139e+00,733.490601
10159557,7361,1377,8,1.001294,1.001461,1.000584,1.000668,3.849893e+00,923.610718,1.260224,2.284546e+00,548.042297
10159558,7361,1378,8,1.000709,1.000751,1.000042,1.000042,1.337402e+00,320.624756,0.826239,5.164965e-01,123.819839


In [7]:
# create 7 mins simple moving average column
train_x_df['7_SMA'] = train_x_df['close'].rolling(window = 7, min_periods=1).mean()
# create 25 mins simple moving average column
train_x_df['25_SMA'] = train_x_df['close'].rolling(window = 25,  min_periods=1).mean()
# display few rows
train_x_df

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,7_SMA,25_SMA
0,0,0,7,1.010004,1.010004,1.009612,1.010004,8.382875e+05,43160.632812,451.157288,7.326834e+05,37725.183594,1.010004,1.010004
1,0,1,7,1.009808,1.009808,1.009808,1.009808,1.622420e+05,8352.220703,39.231071,0.000000e+00,0.000000,1.009906,1.009906
2,0,2,7,1.009808,1.010200,1.009808,1.010200,1.664967e+04,857.377808,58.846603,1.664967e+04,857.377808,1.010004,1.010004
3,0,3,7,1.010200,1.011181,1.010200,1.011181,2.586971e+06,133310.343750,431.541779,2.189147e+06,112811.046875,1.010298,1.010298
4,0,4,7,1.010985,1.010985,1.010200,1.010200,1.129996e+06,58216.867188,176.539810,0.000000e+00,0.000000,1.010279,1.010279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10159555,7361,1375,8,1.000668,1.001669,1.000626,1.001502,9.180907e+00,2203.059082,2.245034,6.229020e+00,1494.727417,0.999350,0.998212
10159556,7361,1376,8,1.001627,1.001920,1.000960,1.001294,7.963097e+00,1911.151611,2.211651,3.056139e+00,733.490601,0.999815,0.998289
10159557,7361,1377,8,1.001294,1.001461,1.000584,1.000668,3.849893e+00,923.610718,1.260224,2.284546e+00,548.042297,1.000101,0.998314
10159558,7361,1378,8,1.000709,1.000751,1.000042,1.000042,1.337402e+00,320.624756,0.826239,5.164965e-01,123.819839,1.000364,0.998307


이삭님 구현하고 싶은 거: sample_id별로 macd 즉, 평균을 구했어야함. 근데 지금 코드에서는 단순히 전체 종가의 평균을 구했음 => 코드 수정 필요

In [9]:
len(train_x_df['sample_id'].unique().tolist())

7362

In [10]:
sample_id_lst = train_x_df['sample_id'].unique().tolist()

In [11]:
train_x_df['sample_id'].value_counts()

0       1380
4919    1380
4913    1380
4912    1380
4915    1380
        ... 
2451    1380
2448    1380
2449    1380
2446    1380
7360    1380
Name: sample_id, Length: 7362, dtype: int64

In [12]:
train_x_df[train_x_df['sample_id']==0]['close'].mean()

1.000142535869626

In [13]:
!pip install tqdm



In [24]:
df = train_x_df

In [25]:
df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,7_SMA,25_SMA,macd
0,0,0,7,1.010004,1.010004,1.009612,1.010004,838287.5,43160.632812,451.157288,732683.4,37725.183594,1.010004,1.010004,0
1,0,1,7,1.009808,1.009808,1.009808,1.009808,162242.0,8352.220703,39.231071,0.0,0.0,1.009906,1.009906,0
2,0,2,7,1.009808,1.0102,1.009808,1.0102,16649.67,857.377808,58.846603,16649.67,857.377808,1.010004,1.010004,0
3,0,3,7,1.0102,1.011181,1.0102,1.011181,2586971.0,133310.34375,431.541779,2189147.0,112811.046875,1.010298,1.010298,0
4,0,4,7,1.010985,1.010985,1.0102,1.0102,1129996.0,58216.867188,176.53981,0.0,0.0,1.010279,1.010279,0


In [26]:
df['macd'] = 0 
df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,7_SMA,25_SMA,macd
0,0,0,7,1.010004,1.010004,1.009612,1.010004,838287.5,43160.632812,451.157288,732683.4,37725.183594,1.010004,1.010004,0
1,0,1,7,1.009808,1.009808,1.009808,1.009808,162242.0,8352.220703,39.231071,0.0,0.0,1.009906,1.009906,0
2,0,2,7,1.009808,1.0102,1.009808,1.0102,16649.67,857.377808,58.846603,16649.67,857.377808,1.010004,1.010004,0
3,0,3,7,1.0102,1.011181,1.0102,1.011181,2586971.0,133310.34375,431.541779,2189147.0,112811.046875,1.010298,1.010298,0
4,0,4,7,1.010985,1.010985,1.0102,1.0102,1129996.0,58216.867188,176.53981,0.0,0.0,1.010279,1.010279,0


In [57]:
close_df = df[['sample_id', 'close']]

In [54]:
tmp_df = df[df['sample_id'] == 0]['close']
tmp_df.head()

0    1.010004
1    1.009808
2    1.010200
3    1.011181
4    1.010200
Name: close, dtype: float64

In [44]:
ma_12 = tmp_df['close'].ewm(span=12).mean()
ma_26 = tmp_df['close'].ewm(span=26).mean()

In [47]:
df['macd'] = 0
df['macd'] = ma_12 - ma_26
df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,7_SMA,25_SMA,macd,ma_12
0,0,0,7,1.010004,1.010004,1.009612,1.010004,838287.5,43160.632812,451.157288,732683.4,37725.183594,1.010004,1.010004,0.0,0
1,0,1,7,1.009808,1.009808,1.009808,1.009808,162242.0,8352.220703,39.231071,0.0,0.0,1.009906,1.009906,-4e-06,0
2,0,2,7,1.009808,1.0102,1.009808,1.0102,16649.67,857.377808,58.846603,16649.67,857.377808,1.010004,1.010004,7e-06,0
3,0,3,7,1.0102,1.011181,1.0102,1.011181,2586971.0,133310.34375,431.541779,2189147.0,112811.046875,1.010298,1.010298,4.7e-05,0
4,0,4,7,1.010985,1.010985,1.0102,1.0102,1129996.0,58216.867188,176.53981,0.0,0.0,1.010279,1.010279,2.9e-05,0


In [35]:
df['macd'].map(lambda x: 2) # lambda 기능 확인 
df

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,7_SMA,25_SMA,macd,ma_12
0,0,0,7,1.010004,1.010004,1.009612,1.010004,8.382875e+05,43160.632812,451.157288,7.326834e+05,37725.183594,1.010004,1.010004,1,0
1,0,1,7,1.009808,1.009808,1.009808,1.009808,1.622420e+05,8352.220703,39.231071,0.000000e+00,0.000000,1.009906,1.009906,1,0
2,0,2,7,1.009808,1.010200,1.009808,1.010200,1.664967e+04,857.377808,58.846603,1.664967e+04,857.377808,1.010004,1.010004,1,0
3,0,3,7,1.010200,1.011181,1.010200,1.011181,2.586971e+06,133310.343750,431.541779,2.189147e+06,112811.046875,1.010298,1.010298,1,0
4,0,4,7,1.010985,1.010985,1.010200,1.010200,1.129996e+06,58216.867188,176.539810,0.000000e+00,0.000000,1.010279,1.010279,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10159555,7361,1375,8,1.000668,1.001669,1.000626,1.001502,9.180907e+00,2203.059082,2.245034,6.229020e+00,1494.727417,0.999350,0.998212,1,0
10159556,7361,1376,8,1.001627,1.001920,1.000960,1.001294,7.963097e+00,1911.151611,2.211651,3.056139e+00,733.490601,0.999815,0.998289,1,0
10159557,7361,1377,8,1.001294,1.001461,1.000584,1.000668,3.849893e+00,923.610718,1.260224,2.284546e+00,548.042297,1.000101,0.998314,1,0
10159558,7361,1378,8,1.000709,1.000751,1.000042,1.000042,1.337402e+00,320.624756,0.826239,5.164965e-01,123.819839,1.000364,0.998307,1,0


In [36]:
df['ma_12'] = 0 
for sample_id_i in tqdm(sample_id_lst): # tqdm으로 진행바 표시 
    df['ma_12'] = df['ma_12'].apply(lambda x: train_x_df[train_x_df['sample_id']==sample_id_i]['close'].ewm(span=12).mean()) # lambda 기능 확인 

  0%|          | 0/7362 [08:29<?, ?it/s]


KeyboardInterrupt: 

In [61]:
close_df['ma_12'] = 0
close_df['ma_26'] = 0
close_df['macd'] = 0
close_df['macds'] = 0
close_df['macdo'] = 0
close_df

Unnamed: 0,sample_id,close,macd,macds,macdo,ma_12,ma_26
0,0,1.010004,0,0,0,0,0
1,0,1.009808,0,0,0,0,0
2,0,1.010200,0,0,0,0,0
3,0,1.011181,0,0,0,0,0
4,0,1.010200,0,0,0,0,0
...,...,...,...,...,...,...,...
10159555,7361,1.001502,0,0,0,0,0
10159556,7361,1.001294,0,0,0,0,0
10159557,7361,1.000668,0,0,0,0,0
10159558,7361,1.000042,0,0,0,0,0


In [67]:
group_close_df = close_df.groupby('sample_id')
group_close_df.head()

Unnamed: 0,sample_id,close,macd,macds,macdo,ma_12,ma_26
0,0,1.010004,0.000000,0.000000,0.000000,,
1,0,1.009808,-0.000004,-0.000002,-0.000002,,
2,0,1.010200,0.000007,0.000001,0.000005,,
3,0,1.011181,0.000047,0.000017,0.000030,,
4,0,1.010200,0.000029,0.000020,0.000008,,
...,...,...,...,...,...,...,...
10158180,7361,0.998456,,,,,
10158181,7361,0.998623,,,,,
10158182,7361,0.997872,,,,,
10158183,7361,0.998414,,,,,


In [None]:
for sample_id_i in tqdm(sample_id_lst): # tqdm으로 진행바 표시 
    # MACD 관련 수식 
    close_df.ma_12 = close_df[close_df['sample_id']==sample_id_i]['close'].ewm(span=12).mean() # 단기(12) EMA(지수이동평균) 


 81%|████████▏ | 5994/7362 [22:21<04:28,  5.09it/s] 

In [68]:
# 나중에 함수로 정리(train_x_df, train_y_df, test_x_df, test_y_df)
from tqdm import tqdm 
import time

if os.path.isfile(data_path + '/train_x_macd_df.csv'):
    print(f'train_x_df의 macd 계산을 완료했습니다.')
    pass 

else:
    for sample_id_i in tqdm(sample_id_lst): # tqdm으로 진행바 표시 
        # MACD 관련 수식 
        close_df.ma_12 = close_df[close_df['sample_id']==sample_id_i]['close'].ewm(span=12).mean() # 단기(12) EMA(지수이동평균) 
        close_df.ma_26 = close_df[close_df['sample_id']==sample_id_i]['close'].ewm(span=26).mean() # 장기(26) EMA 
        close_df.macd = ma_12 - ma_26 # MACD 
        close_df.macds = macd.ewm(span=9).mean() # Signal 
        close_df.macdo = macd - macds # Oscillator 
        
close_df.to_csv(data_path + '/train_x_macd_df.csv')

  1%|          | 88/7362 [02:18<3:11:04,  1.58s/it]


KeyboardInterrupt: 

In [None]:
# create 7 mins simple moving average column
train_y_df['7_SMA'] = train_y_df['close'].rolling(window = 7, min_periods=1).mean()
# create 25 mins simple moving average column
train_y_df['25_SMA'] = train_y_df['close'].rolling(window = 25,  min_periods=1).mean()

train_y_df = get_macd(train_y_df) 

# 데이터 확인 
train_y_df

In [None]:
test_x_df

In [None]:
# create 7 mins simple moving average column
test_x_df['7_SMA'] = test_x_df['close'].rolling(window = 7, min_periods=1).mean()
# create 25 mins simple moving average column
test_x_df['25_SMA'] = test_x_df['close'].rolling(window = 25,  min_periods=1).mean()

test_x_df = get_macd(test_x_df) 

# 데이터 확인 
test_x_df

## 2) numpy arrary로 변환하기

In [None]:
def df2d_to_array3d(df_2d):
    # 입력 받은 2차원 데이터 프레임을 3차원 numpy array로 변경하는 함수
    feature_size = df_2d.iloc[:,2:].shape[1]
    time_size = len(df_2d.time.value_counts())
    sample_size = len(df_2d.sample_id.value_counts())
    sample_index = df_2d.sample_id.value_counts().index
    array_3d = df_2d.iloc[:,2:].values.reshape([sample_size, time_size, feature_size])
    return array_3d

In [None]:
train_x_array = df2d_to_array3d(train_x_df)
train_y_array = df2d_to_array3d(train_y_df)
test_x_array = df2d_to_array3d(test_x_df)

In [None]:
print(f'''
train_x_array {train_x_array.shape}
train_y_array {train_y_array.shape}
test_x_array {test_x_array.shape}
''')

## MACD 시각화 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as ticker
import numpy as np

In [None]:
idx = 1000
index = train_x_df[train_x_df['sample_id']==idx].time.tolist()
type(index), len(index)

In [None]:
# loc 쓰기 귀찮아서 데이터 프레임으로.. 그냥 .. 때려..넣기? => 나중에 수정
sample_x_df = pd.DataFrame()
sample_x_df = train_x_df[['sample_id','7_SMA', '25_SMA','macd','macdo','macds']]
sample_x_df.head()

In [None]:
sma7_lst = sample_x_df['7_SMA'].loc[sample_x_df['sample_id']==idx].tolist()
sma25_lst = sample_x_df['25_SMA'].loc[sample_x_df['sample_id']==idx].tolist()

In [None]:
print(len(sma25_lst) == len(sma7_lst)) # 차원 일치 여부 확인 
print(type(index) == type(sma7_lst)) # 형 일치 여부 확인 

In [None]:
index[0:2], sma7_lst[0:2] # 값 확인

In [None]:
# Main chart - SMA_7 and SMA_25
fig, ax = plt.subplots(figsize=(12,5))

ax.set_title('SMA_7 v.s. SMA_25', fontsize=20)

ax.plot(index, sma7_lst, 'b', lw=1, label='SMA_7')
ax.plot(index, sma25_lst, 'r', lw=1, label='SMA_25')
ax.legend(loc='best')

fig.tight_layout()

In [None]:
macd_lst = sample_x_df['macd'].loc[sample_x_df['sample_id']==idx].tolist()
macds_lst = sample_x_df['macds'].loc[sample_x_df['sample_id']==idx].tolist()

In [None]:
# index chart - MACD and MACDs
fig, ax = plt.subplots(figsize=(12,5))

ax.set_title('MACD v.s. MACD Sinal', fontsize=20)

ax.plot(index, macd_lst, 'b', lw=1, label='macd')
ax.plot(index, macds_lst, 'r', lw=1, label='macds')
ax.legend(loc='best')

fig.tight_layout()

In [None]:
macdo_lst = sample_x_df['macdo'].loc[sample_x_df['sample_id']==idx].tolist()

In [None]:
# index chart - MACDo
fig, ax = plt.subplots(figsize=(12,5))

ax.set_title('MACD Oscilator', fontsize=20)

ax.bar(index, [0 if i > 0 else i for i in macdo_lst], lw=1, label='macdo(+)')
ax.bar(index, [0 if i < 0 else i for i in macdo_lst], lw=1, label='macdo(-)')
ax.legend(loc='best')

fig.tight_layout()

## MACD