### library setttings

In [1]:
# others
import pandas as pd
import numpy as np
import argparse
import datetime
from copy import deepcopy # Add Deepcopy for args
import pickle 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim


print(torch.__version__)
%matplotlib inline
%pylab inline
mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False


# read file
file_path = './data/merged_data.h5'
data = pd.read_hdf(file_path)


1.6.0+cpu
Populating the interactive namespace from numpy and matplotlib


In [2]:
df = data.drop(columns = ['sample_id', 'time', 'coin_index'])
column_indices = {name : i for i, name in enumerate(df.columns)}
print(column_indices)

{'open': 0, 'high': 1, 'low': 2, 'close': 3, 'volume': 4, 'quote_av': 5, 'trades': 6, 'tb_base_av': 7, 'tb_quote_av': 8}


In [3]:
def df2d_to_array3d(df_2d):
    
    # 입력 받은 2차원 데이터 프레임을 3차원 numpy array로 변경하는 함수
    feature_size = df_2d.iloc[:,2:].shape[1]
    time_size = len(df_2d.time.value_counts())
    sample_size = len(df_2d.sample_id.value_counts())
    array_3d = df_2d.iloc[:,2:].values.reshape([sample_size, time_size, feature_size])
    
    return array_3d

raw_array = df2d_to_array3d(data)
print(f'raw array shape is {raw_array.shape}')

raw array shape is (1208, 1500, 10)


In [4]:
def time_split(input_array, split_size = 6):

    # origin size define
    index_size = input_array.shape[0]
    origin_time_size = input_array.shape[1]
    variable_size = input_array.shape[2]

    # new array size define
    new_time_size = int(origin_time_size/split_size) # 1380 / 6
    new_array = np.zeros((index_size, new_time_size, variable_size))

    for idx in range(index_size):
        for time_idx in range(new_time_size):
            

            first_time_idx = time_idx * split_size
            last_time_idx = ((time_idx+1) * split_size) -1

            new_array[idx, time_idx, 0] = input_array[idx, first_time_idx, 0] #coin_num
            new_array[idx, time_idx, 1] = input_array[idx, first_time_idx, 1] #open
            
            new_array[idx, time_idx, 2] = np.max(input_array[idx, first_time_idx:last_time_idx, 2]) #high
            new_array[idx, time_idx, 3] = np.min(input_array[idx, first_time_idx:last_time_idx, 3]) #low

            new_array[idx, time_idx, 4] = input_array[idx, last_time_idx, 4] #close

            new_array[idx, time_idx, 5] = np.sum(input_array[idx, first_time_idx:last_time_idx, 5]) #etc
            new_array[idx, time_idx, 6] = np.sum(input_array[idx, first_time_idx:last_time_idx, 6]) #etc
            new_array[idx, time_idx, 7] = np.sum(input_array[idx, first_time_idx:last_time_idx, 7]) #etc
            new_array[idx, time_idx, 8] = np.sum(input_array[idx, first_time_idx:last_time_idx, 8]) #etc
            new_array[idx, time_idx, 9] = np.sum(input_array[idx, first_time_idx:last_time_idx, 9]) #etc

    return new_array

split_array = time_split(raw_array, split_size = 6)
print(f'split array shape is {split_array.shape}')

split array shape is (1208, 250, 10)


In [4]:
def train_val_test_spliter(arr):
    
    
    n = len(arr)
    num_features = arr.shape[2] - 1
    
    train_arr = arr[0:int(n*0.7), :, 1:]
    val_arr = arr[int(n*0.7):int(n*0.9), :, 1:]
    test_arr = arr[int(n*0.9):, : , 1:]

    
    n2 = len(train_arr) + len(val_arr) + len(test_arr)
    
    print(
    f'''
    ======================================================
    Origin length is {n}, then total split length is {n2}
    ======================================================
    train length is {train_arr.shape},
    val length is {val_arr.shape},
    test length is {test_arr.shape},
    num_features is ({num_features})
    '''
    )
    
    return train_arr, val_arr, test_arr, num_features

train_arr, val_arr, test_arr, num_features = train_val_test_spliter(raw_array)


    Origin length is 1208, then total split length is 1208
    train length is (845, 1500, 9),
    val length is (242, 1500, 9),
    test length is (121, 1500, 9),
    num_features is (9)
    


In [5]:
train_mean = train_arr.mean(axis=(0, 1))
train_std = train_arr.std(axis=(0, 1))

ntrain_arr = (train_arr - train_mean) / train_std
nval_arr = (val_arr - train_mean) / train_std
ntest_arr = (test_arr - train_mean) / train_std

# dataset partitioning

partition = {'train': ntrain_arr, 'val':nval_arr, 'test':ntest_arr}

In [7]:
trainloader = DataLoader(partition['train'], batch_size = args.batch_size, shuffle = True, drop_last = True)

for i, X in enumerate(trainloader):
    print(X[:, :1380, :].transpose(0,1).shape)
    print(X[:, 1380:, 0].shape)
    
    break

torch.Size([1380, 64, 9])
torch.Size([64, 120])


In [6]:
# ====== initialization
parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device is",args.device)

seed = 777
np.random.seed(seed)
torch.manual_seed(seed)


# ====== Model Capacity options ===== #
args.input_dim = 9
args.hidden_dim = 300
args.output_dim = 1
args.n_layers = 2
args.batch_size = 64
args.dropout = 0.2
args.use_bn = True

# ====== Dataset Generating options ====== #
args.x_frames = 1380
args.y_frames = 120

# ====== Model training options ===== #
args.num_epoch = 30
args.learning_rate = 0.01
args.L2_rate = 0.0001



device is cpu


In [8]:
class LSTM(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, batch_size, dropout, use_bn):
        super(LSTM, self).__init__()
        self.input_dim = input_dim 
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        self.batch_size = batch_size
        self.dropout = dropout
        self.use_bn = use_bn 
        
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
        self.hidden = self.init_hidden()
        self.regressor = self.make_regressor()
        
    def init_hidden(self):
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))
    
    def make_regressor(self):
        layers = []
        if self.use_bn:
            layers.append(nn.BatchNorm1d(self.hidden_dim))
        layers.append(nn.Dropout(self.dropout))
        
        layers.append(nn.Linear(self.hidden_dim, self.hidden_dim // 2))
        layers.append(nn.ReLU())
        layers.append(nn.Linear(self.hidden_dim // 2, self.output_dim))
        regressor = nn.Sequential(*layers)
        return regressor
    
    def forward(self, x):
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y_pred = self.regressor(lstm_out[-1].view(self.batch_size, -1))
        return y_pred

In [37]:
def train(model, partition, optimizer, loss_fn, args):
    ''' model training '''
   
    # data load
    trainloader = DataLoader(partition['train'],
                             batch_size = args.batch_size,
                             shuffle = True, drop_last = True)
    
    # model's mode setting
    model.train()
    model.zero_grad()
    
    train_loss = 0.0
    
    for i, data in enumerate(trainloader):
    
        X = data[:, :1380, :].transpose(0, 1).float().to(args.device)
        y_true = data[:, 1380:, 0].float().to(args.device)
        
        print(X.shape, y_true.shape)
        
        # zero the gradient
        optimizer.zero_grad()
        model.hidden = [hidden.to(args.device) for hidden in model.init_hidden()]

        y_pred = model(X)
        print(y_pred.shape)
        
        loss = loss_fn(y_true.view(-1), y_pred.view(-1))
        loss.backward()
        optimizer.step()
        
        # get the batch loss
        train_loss += loss.item()
        
    train_loss = train_loss / len(trainloader)
    
    return model, train_loss

In [38]:
def validate(model, partition, loss_fn, args):
    ''' model validate '''
    
    # data load
    valloader = DataLoader(partition['val'], 
                           batch_size = args.batch_size, 
                           shuffle = False, drop_last = True)
    
    # model's mode setting
    model.eval()
    val_loss = 0.0
    
    # evaluate
    with torch.no_grad():
        for i, data in enumerate(valloader):
            
            X = data[:, :1380, :].transpose(0, 1).float().to(args.device)
            y_true = data[:, 1380:, 0].float().to(args.device)
            model.hidden = [hidden.to(args.device) for hidden in model.init_hidden()]
            
            # en-decoder outputs tensor 
            y_pred = model(X)
            # compute the loss 
            loss = loss_fn(y_true.view(-1), y_pred.view(-1))

            # get the batch loss
            val_loss += loss.item()
            
    val_loss = val_loss / len(valloader)
    return val_loss

In [39]:
def test(model, partition, scaler, args):
    ''' model test '''
    
    # data load
    testloader = DataLoader(partition['test'], 
                            batch_size = args.batch_size, 
                            shuffle = False, drop_last = True)
    
    # model's mode setting
    model.eval()
    test_mae = 0.0
    
    # evaluate
    with torch.no_grad():
        for i, data in enumerate(testloader):
            
            X = data[:, :1380, :].transpose(0, 1).float().to(args.device)
            y_true = data[:,1380:, 0].float().to(args.device)
            model.hidden = [hidden.to(args.device) for hidden in model.init_hidden()]
            
            # en-decoder outputs tensor 
            y_pred = model(X)
            
#             # y values to cpu
#             y_true = y_true.cpu().detach().numpy()
#             y_pred = y_pred.cpu().detach().numpy()

            # get the batch loss
            test_mae += mean_absolute_error(y_true, y_pred)
#             score = r2_score(y_pred = y_pred.transpose(), y_true = y_true.transpose(), multioutput = 'uniform_average')
#             score_list.append(score)
                        
    test_mae /= len(testloader)
#     score /= len(testloader) 
    return test_mae, item_loss_list

In [40]:
def experiment(partition, args):


    model = LSTM(args.input_dim, args.hidden_dim, args.y_frames, args.n_layers, args.batch_size, args.dropout, args.use_bn)
    model.to(args.device)
    
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.L2_rate)
    
    # epoch-wise loss
    train_losses = []
    val_losses = []

    for epoch in range(args.num_epoch):
        
        start_time = time.time()
        model, train_loss = train(model, partition, optimizer, loss_fn, args)
        val_loss = validate(model, partition, loss_fn, args)
        end_time = time.time()
        
        # add epoch loss
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        
        print('Epoch {},Loss(train/val) {:.3f}/{:.3f}. Took {:.2f} sec'.format(epoch+1, train_loss, val_loss, end_time-start_time))
    
    # test part
    # test_mae, item_loss_list = test(model, partition, scaler, args)
    
    # ======= Add Result to Dictionary ======= #
    result = {}
    
    result['train_losses'] = train_losses #epoch 수에 의존
    result['val_losses'] = val_losses 
    
    #result['test_mae'] = test_mae.round(3).item()
    
    # result['r2'] = np.array(score_list).mean().round(3)
    # item_loss = np.array(item_loss_list).mean(axis=0).mean(axis=0).astype(int)
    # item_loss = list([int(x) for x in item_loss])
    # result['item_loss'] = item_loss
     
    return vars(args), result

In [41]:
setting, result = experiment(partition, args)

torch.Size([1380, 64, 9]) torch.Size([64, 120])
torch.Size([64, 120])


KeyboardInterrupt: 