## Load Modules

In [None]:
# general
import warnings, sys, os
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm

# data
import pandas as pd
import numpy as np
from datetime import datetime
import re

# viz
import matplotlib.pyplot as plt
import seaborn as sns

# model
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.metrics import mean_squared_error as skmse

## 1 Data Preparation

### 1-1 Data Type, Header and brief Feature Engineering

- add Header
- transform data type : datetime
- FE, raw data -> price data : count the number of transactions per day and merge the count into the price data

In [None]:
# Filename patterns used to tell a token's raw transaction dump apart from
# its price table; both are compiled once and reused for every file.
isRawdata, isPrice = (
    re.compile(pattern)
    for pattern in (r'.*_rawdata.txt', r'.*_price.txt')
)

def dtCnvrt(x):
    """Return midnight of the calendar day that ``x`` falls on.

    Only the ``year``, ``month`` and ``day`` attributes of ``x`` are read;
    the result is a plain ``datetime`` whose time fields are all zero.
    """
    day_fields = {'year': x.year, 'month': x.month, 'day': x.day}
    return datetime(**day_fields)

# Columns kept from the price table when it is merged with the daily counts.
priceCols = ['date', 'amount', 'OPEN', 'HIGH', 'LOW', 'CLOSE',]
# Header names assigned, in order, to the header-less raw transaction files.
rawNewCols =['TXID','date','Timestamp','From','To','amount',]
# NOTE(review): not referenced by the cells visible here; kept for other users.
rawCols = ['date','amount']

# Converted, merged tables are written here ...
writePath = './dataset/'
# ... and the original per-token text files are read from here.
readPath = './dataset/origin/'
# One sub-directory per token, stored with a trailing '/' so the name can be
# concatenated straight into a path. Plain files sitting next to the token
# directories are skipped: listing them as directories later would raise.
tokens = [token+'/' for token in os.listdir(readPath)
          if os.path.isdir(os.path.join(readPath, token))]

# For every token directory: convert the price and raw text files to HDF5,
# then write a merged table (price columns + daily transaction count).
for token in tokens:
    # Start each token with no frames loaded. Without this reset a directory
    # that lacks one of the two files would be merged with the frame left
    # over from the previous token (or fail with NameError on the first one).
    price = None
    raw = None
    for file in os.listdir(readPath+token):

        # transform data type: parse DateTime and truncate it to the day
        if isPrice.match(file):
            price = pd.read_csv(readPath+token+file, sep='\t')
            price['date'] = pd.to_datetime(price['DateTime']).apply(dtCnvrt)
            price = price.rename(columns={'Volume':'amount'}, index=str)
            # token ends with '/', so token[:-1] is the bare token name.
            price.to_hdf(readPath+token+token[:-1]+'_price.h5', key=token[:-1]+'_price')

        # add header: the raw file has none, so name its columns positionally
        elif isRawdata.match(file):
            raw = pd.read_csv(readPath+token+file, sep='\t', header=None)
            raw = raw.rename(columns=dict(zip(raw.columns, rawNewCols)), index=str)
            raw.to_hdf(readPath+token+token[:-1]+'_raw.h5', key=token[:-1]+'_raw')

    # Both inputs are required to build the merged table.
    if price is None or raw is None:
        print('skip '+token[:-1]+': price or raw data file not found')
        continue

    # FE, raw data -> price data: number of raw transactions per date
    rawCnt = raw.groupby('date')['amount'].count().reset_index()
    rawCnt['date'] = pd.to_datetime(rawCnt['date']).apply(dtCnvrt)

    # Save: left-join the counts onto the price rows and drop incomplete rows
    rawCnt = rawCnt.rename(columns={'amount':'count'}, index=str)
    f = pd.merge(price[priceCols], rawCnt, how='left', on='date')
    f = f.dropna()
    f.to_hdf(writePath+token[:-1]+'_price.h5', key=token[:-1]+'_price')

### 1-2 Shape, Type for Model(torch)

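- Extract the data sets that have at least 270 days recorded in total: 90 days for the answer (target) and 180 days for training
- Preprocess the data to match the input size
- Convert the data to the type the model expects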

In [None]:
# Collect every saved '<name>_price.h5' table that has at least 270 rows.
# Each qualifying frame is published under its bare name (file name minus
# the 9-character '_price.h5' suffix) and that name is recorded in trainData.
path = './dataset/'
tokens = os.listdir(path)
isFile = re.compile(r'.*_price.h5')
trainData = []
for token in tokens:
    if not isFile.match(token):
        continue
    price = pd.read_hdf(path + token)
    if len(price) < 270:
        continue
    locals()[token[:-9]] = price
    trainData.append(token[:-9])

# Build, for every data set named in trainData, an input tensor
# '<name>_x' of shape (1, 3, 180, 4) and a target tensor '<name>_y' of
# length 90, both published as variables next to the source frame.
cols = ['HIGH', 'CLOSE', 'OPEN', 'LOW']
# Use the GPU when there is one; otherwise build the tensors on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Iterate over a snapshot: data sets that fail conversion are removed from
# trainData inside the loop, and removing from the list being iterated would
# make the loop skip the entry that follows each removed one.
for file in list(trainData):
    try :
        f = locals()[file]
        for col in cols+['amount', 'count']:
            f[col] = pd.to_numeric(f[col])
        a = f['amount'].values
        b = f[cols].values
        c = f['count'].values

        # Rows 90..269 form the 180-row input window. 'amount' and 'count'
        # are repeated across 4 columns so they stack with the 4 price columns.
        xa = np.repeat(a[90:270, None], 4, axis=1)
        xc = np.repeat(c[90:270, None], 4, axis=1)
        x = np.array([b[90:270], xa, xc]).reshape(1,3,180,4)
        # Rows 0..89 of 'amount' are the 90-value target.
        # requires_grad is kept as before so the tensors behave the same
        # for any code that inspects them.
        locals()[file+'_x'] = torch.tensor(x, dtype=torch.float32, device=device, requires_grad=True)
        locals()[file+'_y'] = torch.tensor(a[:90], dtype=torch.float32, device=device, requires_grad=True)
    except ValueError:
        # Report the data set that could not be converted and drop it.
        print(file)
        trainData.remove(file)

## 2 Modeling
### 2-1 Define Model

In [None]:
class CNN_GRU(nn.Module):
    """Two conv/pool stages, a small linear stack and two deep GRUs.

    ``forward`` expects a single sample of shape ``(1, 3, 180, 4)`` and
    returns a 1-D tensor of 90 values. The module moves itself to the GPU
    when one is available and stays on the CPU otherwise.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 256, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(128, 32, 3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d((2,2))
        self.fc11 = nn.Linear(16,4)
        self.fc12 = nn.Linear(4,4)
        self.fc13 = nn.Linear(4,1)
        self.gru1 = nn.GRU(input_size=180,hidden_size=180,num_layers =100, dropout = 0.3)
        self.fc21 = nn.Linear(180,180)
        self.gru2 = nn.GRU(input_size=180,hidden_size=90,num_layers =100, dropout = 0.3)
        self.fc22 = nn.Linear(90,90)
        # One device move for the whole module instead of a hard-coded
        # .cuda() on every layer, so construction also works without a GPU.
        self.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    def forward(self, x):
        """Map an input of shape (1, 3, 180, 4) to a tensor of shape (90,)."""
        # The transposes swap the channel axis with the 180-long axis so the
        # pooling halves the channel count (256 -> 128, 32 -> 16) instead of
        # the 180-long axis. They are out-of-place: an in-place transpose_
        # would mutate tensors that ReLU / max-pool saved for backward, which
        # autograd's in-place version check can reject.
        ret = self.relu(self.conv1(x))                  # (1, 256, 180, 4)
        ret = self.maxpool(ret.transpose(1,2))          # (1, 180, 128, 2)
        ret = self.relu(self.conv2(ret.transpose(1,2))) # (1, 32, 180, 2)
        ret = self.maxpool(ret.transpose(1,2))          # (1, 180, 16, 1)
        ret = self.fc11(ret.reshape(180,16))
        # The same fc12 layer (shared weights) is applied 8 times in a row.
        for _ in range(8):
            ret = self.fc12(ret)
        ret = self.fc13(ret)
        # The 180 values are fed to the GRU as one step of a 180-wide feature
        # vector (sequence length 1, batch size 1).
        ret, hn1 = self.gru1(ret.reshape(1,1,180))
        for _ in range(5):
            ret = self.fc21(ret.reshape(180,))
        ret, hn2 = self.gru2(ret.reshape(1,1,180))
        for _ in range(5):
            ret = self.fc22(ret.reshape(90,))
        return ret

### 2-2 Train

In [None]:
# Train one CNN_GRU on every prepared data set, one sample per step, with
# RMSE as the loss, and checkpoint the weights every `writeEpoch` epochs.
model = CNN_GRU()
adam = torch.optim.Adam(model.parameters(),lr=0.00001)
mse = nn.MSELoss()

epoches = 2001
writeEpoch = 100
# Create the checkpoint directory up front; torch.save does not create it
# and would otherwise fail at the first checkpoint.
os.makedirs('./deepModel2/', exist_ok=True)
for epoch in tqdm(range(epoches)):
    for file in trainData:
        # Input / target tensors were published under '<name>_x' / '<name>_y'.
        x = locals()[file+'_x']
        y = locals()[file+'_y']
        adam.zero_grad()
        out = model(x)
        loss = torch.sqrt(mse(out, y))
        # Every step builds a fresh graph, so nothing has to be retained
        # after backward(); letting it go frees the buffers immediately.
        loss.backward()
        adam.step()
    if epoch%writeEpoch==0 :
        torch.save(model.state_dict(), './deepModel2/CNN_GRU_'+str(epoch)+'.pth')