In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy # Add Deepcopy for args

import seaborn as sns 
import matplotlib.pyplot as plt
import os, pickle, joblib, argparse

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from statsmodels.tsa.api import SimpleExpSmoothing
from sklearn.preprocessing import LabelEncoder

# pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import _LRScheduler
 
print(torch.__version__)
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (8, 6)

1.6.0+cpu
Populating the interactive namespace from numpy and matplotlib


In [2]:
# read file
raw_x_df = pd.read_csv('./data/train_x_df.csv')
raw_y_df = pd.read_csv('./data/train_y_df.csv')

print('Read files Complete!')

Read files Complete!


In [13]:
def coindata_preprocessor(coin_num_x_df, coin_num_y_df):
    
    # y dataframe time value에 1380 씩 adding
    coin_num_y_df.time = coin_num_y_df.time.copy() + 1380

    # x,y df merge하고 sample_id와 time 순으로 sorting
    merged_df = pd.concat([coin_num_x_df, coin_num_y_df])
    merged_df = merged_df.sort_values(by = ['sample_id','time']).reset_index(drop=True)

    merged_df.to_hdf('./data/merged_allcoin.h5',  key = 'merged_df')


In [14]:
coindata_preprocessor(raw_x_df, raw_y_df)

In [3]:
# ================================================= #
def df2d_to_array3d(df_2d):
    
    # 입력 받은 2차원 데이터 프레임을 3차원 numpy array로 변경하는 함수
    feature_size = df_2d.iloc[:,2:].shape[1]
    time_size = len(df_2d.time.value_counts())
    sample_size = len(df_2d.sample_id.value_counts())
    array_3d = df_2d.iloc[:,2:].values.reshape([sample_size, time_size, feature_size])
    
    print('DataFrame to array, Complete!')
    
    return array_3d

In [4]:
# ================================================= #

def simple_exponetial_smoothing_fory(arr, alpha=0.3):
    
    y_series = list()

    for temp_arr in arr:
        target_series = temp_arr[:, 1].reshape(-1) # open col is 1 index

        smoother = SimpleExpSmoothing(target_series, initialization_method="heuristic").fit(smoothing_level=alpha,optimized=False)
        smoothing_series = smoother.fittedvalues

        y_series.append(smoothing_series)
            
    return np.array(y_series)

# ================================================= #

def simple_exponetial_smoothing_forX(arr, alpha=0.3):
    
    # initialization
    sample_size = int(arr.shape[0])
    time_size = int(arr.shape[1])
    feature_size = int(arr.shape[2])
    
    # create empty array
    smoothing_arr = np.zeros((sample_size, time_size, feature_size - 1))

    for idx, temp_arr in enumerate(arr):
        for col in range(1, feature_size): # open col is 1 index
            if col < 5:

                temp_series = temp_arr[:, col].reshape(-1) 
                smoother = SimpleExpSmoothing(temp_series, initialization_method="heuristic").fit(smoothing_level=0.3,optimized=False)
                temp_smoothing_series = smoother.fittedvalues
                smoothing_arr[idx, :, col-1] = temp_smoothing_series

            else:
                
                pass_series = temp_arr[:, col].reshape(-1)
                smoothing_arr[idx, :, col-1] = pass_series

    return smoothing_arr

# ================================================= #


def kbindiscreter(input_array):
  kb = KBinsDiscretizer(n_bins=100, strategy='uniform', encode='ordinal')
  processed_data = np.zeros((input_array.shape[0], input_array.shape[1], 1))
  for i in range(input_array.shape[0]):
    # coin_index_export args : (input_array, coin_num)
    processing_array = input_array[i,:,1]
    #globals()['outliery_array{}'.format(i)] = train_y_array[outlier[i],:,1]
    kb.fit(processing_array.reshape(input_array.shape[1],1))
    processed_fit = kb.transform(processing_array.reshape(input_array.shape[1],1))
    #globals()['outliery_fit{}'.format(i)] = kb.transform(globals()['outliery_array{}'.format(i)].reshape(120,1))
    processed_data[i,:,:] = processed_fit
  return processed_data


In [5]:
# df to array 
raw_x_arr = df2d_to_array3d(raw_x_df)
raw_y_arr = df2d_to_array3d(raw_y_df)
# # train set smoothing
# train_x_arr = simple_exponetial_smoothing_forX(train_x_arr)


DataFrame to array, Complete!
DataFrame to array, Complete!


In [6]:

xarray = kbindiscreter(raw_x_arr)
yarray = kbindiscreter(raw_y_arr)

In [10]:
def one_hot_encoder(X_train):
    new_arr = np.zeros((X_train.shape[0],X_train.shape[1],100))

    for i in range(X_train.shape[0]):
        for j in range(X_train.shape[1]):
            for k in range(100):
                if X_train[i,j,:] == k:
                    new_arr[i,j,k] = 1
    
    return new_arr

In [11]:
X_train = one_hot_encoder(xarray)

KeyboardInterrupt: 

In [None]:
X_train.shape

In [9]:
yarray.shape

(7661, 120, 1)

In [12]:
# train & val set
train_X, val_X, train_y, val_y = train_test_split(raw_x_arr[:, :, 1:], y_train, test_size = 0.1, shuffle=False)

print(
f'''
======================================================
Origin length is {len(raw_x_arr)}, then total split length is {len(raw_x_arr)} + {len(val_X)} = {len(train_X)+len(val_X)}
======================================================
train X length is {train_X.shape}, train y length is {train_y.shape},
val X length is {val_X.shape}, val y length is {val_y.shape},
'''
# test X length is {test_X.shape}, test y length is {test_y.shape}
)


Origin length is 7661, then total split length is 7661 + 767 = 7661
train X length is (6894, 1380, 9), train y length is (6894,),
val X length is (767, 1380, 9), val y length is (767,),



In [14]:
def one_hot_encoder(train_y):
    y_one_hot = torch.zeros(train_y.shape[0], 5)
    train_y = torch.LongTensor(train_y)
    train_y = y_one_hot.scatter_(1, train_y.unsqueeze(1), 1)

    return train_y

# train_y = one_hot_encoder(train_y)
# val_y = one_hot_encoder(val_y)

In [15]:
# ====== initialization
parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device is",args.device)

seed = 777
np.random.seed(seed)
torch.manual_seed(seed)

# ====== Model Capacity options ===== #
args.input_dim = 9
args.hidden_dim = 100
args.output_dim = 9
args.n_layers = 1
args.batch_size = 16
args.dropout = 0.2
args.use_bn = True

args.x_frames = 1380

# ====== Model training options ===== #
args.num_epoch = 20
args.learning_rate = 0.0001
args.L2_rate = 0.0001


device is cpu


In [None]:
class WindowGenerator():
    ''' Dataset Generate'''
    def __init__(self, input_width, label_width, stride, data_arr, column_indices = column_indices,
                 shfit = None, label_columns=None):
    
        # Store the raw data
        self.data_arr = data_arr
        # Work out the label column indices.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}
        self.column_indices = column_indices
                
        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = 1
        if shfit is not None:
            self.shift = shfit
        self.stride = stride
        
        self.label_start = self.input_width + self.shift
        self.total_window_size = self.label_start + self.label_width
        
        # input, label indices
        self.input_slice = slice(0, self.input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]
        
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]
        
        self.X_arr, self.y_arr = self.split_windows()
        
    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size -1}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): open'
        ])

    def split_windows(self):

        X, y = list(), list()
        sample_length = int(self.data_arr.shape[0])
        split_length = int((self.data_arr.shape[1] - self.total_window_size)/self.stride) + 1
        
        for temp_id in range(sample_length):
            for i in range(split_length):
                
                X.append(self.data_arr[temp_id, (i*self.stride) : (i*self.stride)+self.input_width])
                y.append(self.data_arr[temp_id, (i*self.stride)+self.label_start : (i*self.stride)+self.total_window_size])

        return np.array(X), np.array(y)

    def __len__(self):
        return len(self.X_arr)

    def __getitem__(self, idx):
        
        X = self.X_arr[idx, :, :]
        y = self.y_arr[idx, :, :]

        return X, y

In [22]:
class WindowGenerator(TensorDataset):
    ''' Dataset Generate'''
    def __init__(self, X_arr, y_arr, x_frames):
    
        self.X_arr = X_arr
        self.y_arr = y_arr
        self.x_frames = x_frames

    def __len__(self):
        return len(self.y_arr)

    def __getitem__(self, idx):
        
        X = self.X_arr[idx, -self.x_frames:, :]
        X = np.log(X + 1) - np.log(X[-1, :] + 1) 


        y = self.y_arr[idx]
        return X, y

In [23]:
class LSTM(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout, use_bn):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        self.dropout = dropout
        self.use_bn = use_bn 
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers, batch_first =True)

        self.fc = self.make_regressor()
        
    def init_hidden(self, batch_size):
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, batch_size, self.hidden_dim))
    
    def make_regressor(self):
        layers = []
        if self.use_bn:
            layers.append(nn.BatchNorm1d(self.hidden_dim))
        layers.append(nn.Dropout(self.dropout))
        
        layers.append(nn.Linear(self.hidden_dim, 200))
        layers.append(nn.ReLU())
        layers.append(nn.Linear(200, self.output_dim))
        regressor = nn.Sequential(*layers)
        return regressor
    
    def forward(self, X):
        lstm_out, self.hidden = self.lstm(X)
        y_pred = self.fc(lstm_out[:, -1, :])
        return y_pred

In [62]:
def train(model, partition, optimizer, loss_fn, args):
    ''' model training '''
   
    # data load
    trainloader = DataLoader(partition['train'],
                             batch_size = args.batch_size,
                             shuffle = True, drop_last = True)
    
    # model's mode setting
    model.train()
    model.zero_grad()
    optimizer.zero_grad()
    
    train_loss = 0.0
    for i, (X, y) in enumerate(trainloader):
    
        X = X.float().to(args.device)
        y_true = y.long().to(args.device)
        
#         print(X.shape, y_true.shape)
        
        # zero the gradient
        model.zero_grad()
        optimizer.zero_grad()
        model.hidden = model.init_hidden(X.shape[1])

        y_pred = model(X)
#         print(y_pred.shape)

        loss = loss_fn(y_pred, y_true)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss = train_loss / len(trainloader)
    return model, train_loss


def validate(model, partition, loss_fn, args):
    ''' model validate '''
    
    # data load
    valloader = DataLoader(partition['val'], 
                           batch_size = args.batch_size, 
                           shuffle = False, drop_last = True)
    
    # model's mode setting
    model.eval()
    val_loss = 0.0
    correct, total = 0, 0
    # evaluate
    with torch.no_grad():
        for i, (X, y) in enumerate(valloader):
            
            X = X.to(args.device)
            y_true = y.to(args.device)
            
            model.hidden = model.init_hidden(X.shape[1])
            # en-decoder outputs tensor 
            y_pred = model(X)
            
            # compute the loss
            loss = loss_fn(y_pred, y_true)
            val_loss += loss.item()
            
            preds = F.log_softmax(y_pred, dim=1).argmax(dim=1)
            total += y_true.size(0)
            correct += (preds == y_val).sum().item()
            
    val_loss = val_loss / len(valloader)
    val_acc = correct / total
    return val_loss, val_acc


In [63]:
def experiment(partition, args):

    model = LSTM(args.input_dim, args.hidden_dim, args.output_dim, args.n_layers, args.dropout, args.use_bn)
    model.to(args.device)
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=args.learning_rate, weight_decay=args.L2_rate)
    
    # epoch-wise loss
    train_losses = []
    val_losses = []
    val_accs = []
    
    for epoch in range(args.num_epoch):
        
        start_time = time.time()
        model, train_loss = train(model, partition, optimizer, loss_fn, args)
        val_loss, val_acc = validate(model, partition, loss_fn, args)
        end_time = time.time()
        
        # add epoch loss
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)
        
        print('Epoch {}, Acc(train/val): {:2.2f}/{:2.2f}, Loss(train/val) {:2.5f}/{:2.5f}. Took {:2.2f} sec'.format(
            epoch+1, val_acc, train_loss, val_loss, end_time-start_time))
    

    # ======= Add Result to Dictionary ======= #
    result = {}
    
    result['train_losses'] = train_losses
    result['val_losses'] = val_losses
        
    return vars(args), result, model

In [64]:
# train_X, val_X, train_y, val_y = train_test_split(raw_x_arr[:, :, 1:], y_train, test_size = 0.1, shuffle=False)
trainset = WindowGenerator(train_X, train_y, args.x_frames)
valset = WindowGenerator(val_X, val_y, args.x_frames)

partition = {'train': trainset, 'val':valset}

In [65]:
print(args)
setting, result, model = experiment(partition, deepcopy(args))

Namespace(L2_rate=0.0001, batch_size=16, device='cpu', dropout=0.2, hidden_dim=100, input_dim=9, learning_rate=0.0001, n_layers=1, num_epoch=20, output_dim=9, use_bn=True, x_frames=1380)


KeyboardInterrupt: 

In [None]:
testloader = DataLoader(partition['train'], batch_size = 1, shuffle = True, drop_last = True)
model.eval()
# evaluate

with torch.no_grad():
    for i, (X, y) in enumerate(testloader):

        X = X.transpose(0, 1).float().to(args.device)
        y_true = y.float().to(args.device)
        model.hidden = model.init_hidden(X.shape[1])

        # en-decoder outputs tensor 
        y_pred = model(X)
        
        # y values to cpu
        y_true = y_true.cpu().detach().numpy().reshape(-1)
        y_pred = y_pred.cpu().detach().numpy().reshape(-1)
        
        print(y_true.shape, y_pred.shape)

        plt.plot(y_true, label = 'True series')
        plt.plot(y_pred, '-', label = 'Prediction series')
#         plt.plot(.max(), '*')
        plt.legend()
        plt.show()

        loss = mean_absolute_error(y_true, y_pred)
        mae =+ loss
        if i == 10:
            print(args, "\nSES 사용")
            print(f'mean absolute error * 10E5 is {(mae/5) * 10E5}')            
            break
