In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy # Add Deepcopy for args

import seaborn as sns 
import matplotlib.pyplot as plt
import os, pickle, joblib, argparse

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from statsmodels.tsa.api import SimpleExpSmoothing

# pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
 
 
print(torch.__version__)
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (8, 6)

1.6.0+cpu
Populating the interactive namespace from numpy and matplotlib


In [2]:
# read file
raw_x_df = pd.read_csv('./data/train_x_df.csv')
raw_y_df = pd.read_csv('./data/train_y_df.csv')
print('Read files Complete!')


Read files Complete!


In [3]:
# ================================================= #
def df2d_to_array3d(df_2d):
    
    # 입력 받은 2차원 데이터 프레임을 3차원 numpy array로 변경하는 함수
    feature_size = df_2d.iloc[:,2:].shape[1]
    time_size = len(df_2d.time.value_counts())
    sample_size = len(df_2d.sample_id.value_counts())
    array_3d = df_2d.iloc[:,2:].values.reshape([sample_size, time_size, feature_size])
    
    print('DataFrame to array, Complete!')
    
    return array_3d
# ================================================= #

def train_tset_spliter(arr):
    n = len(arr)
    num_features = arr.shape[2]
    
    train_arr = arr[0:int(n*0.8), :, :]
    val_arr = arr[int(n*0.8):, :, :]
    
    train_len = len(train_arr) 
    val_len = len(val_arr)
    
    print(
    f'''
    ======================================================
    Origin length is {n}, then total split length is {train_len, val_len}
    ======================================================
    train length is {train_arr.shape},
    val length is {val_arr.shape},
    num_features is ({num_features})
    '''
    )
    return train_arr, val_arr
# ================================================= #

def kbin_discretizer(input_array):

    kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='ordinal')
    processed_data = np.zeros((input_array.shape[0], input_array.shape[1], 1))
    
    for i in range(input_array.shape[0]):
        # coin_index_export args : (input_array, coin_num)
        globals()['processing_array{}'.format(i)] = input_array[i,:,1]
        
        #globals()['outliery_array{}'.format(i)] = train_y_array[outlier[i],:,1]
        kb.fit(globals()['processing_array{}'.format(i)].reshape(input_array.shape[1],1))
        globals()['processed_fit{}'.format(i)] = kb.transform(globals()['processing_array{}'.format(i)].reshape(input_array.shape[1],1))
        
        #globals()['outliery_fit{}'.format(i)] = kb.transform(globals()['outliery_array{}'.format(i)].reshape(120,1))
        processed_data[i,:,:] = globals()['processed_fit{}'.format(i)]
        
    return processed_data

def outlier_detecter(raw_y_arr, outlier_criteria = 0.05):

    open_arr = raw_y_arr[:, :, 1] #open col is 1

    outlier_list = []
    openrange_list = []

    for idx, temp_arr in enumerate(open_arr):
    
        temp_min = temp_arr.min()
        temp_max = temp_arr.max()
        temp_arr_range = temp_max - temp_min
        openrange_list.append(temp_arr_range)

        if temp_arr_range > outlier_criteria:
            outlier_list.append(idx)
            print(f'{idx}번째 open series is outlier sample!')
            print(f'temp array range is {temp_arr_range:.3}\n')
            

    return outlier_list, np.array(openrange_list)

In [47]:
# df to array 
raw_x_arr = df2d_to_array3d(raw_x_df)
raw_y_arr = df2d_to_array3d(raw_y_df)


DataFrame to array, Complete!
DataFrame to array, Complete!


In [5]:
# ================================================= #
def simple_exponetial_smoothing_fory(arr, alpha=0.3):
    
    y_series = list()

    for temp_arr in arr:
        target_series = temp_arr[:, 1].reshape(-1) # open col is 1 index

        smoother = SimpleExpSmoothing(target_series, initialization_method="heuristic").fit(smoothing_level=alpha,optimized=False)
        smoothing_series = smoother.fittedvalues
        y_series.append(smoothing_series)
            
    return np.array(y_series)

# ================================================= #
def simple_exponetial_smoothing_forX(arr, alpha=0.3):
    
    # initialization
    sample_size = int(arr.shape[0])
    time_size = int(arr.shape[1])
    feature_size = int(arr.shape[2])
    
    # create empty array
    smoothing_arr = np.zeros((sample_size, time_size, feature_size - 1))

    for idx, temp_arr in enumerate(arr):
        for col in range(1, feature_size): # open col is 1 index
            if col < 5:

                temp_series = temp_arr[:, col].reshape(-1) 
                smoother = SimpleExpSmoothing(temp_series, initialization_method="heuristic").fit(smoothing_level=0.3,optimized=False)
                temp_smoothing_series = smoother.fittedvalues
                smoothing_arr[idx, :, col-1] = temp_smoothing_series

            else:
                
                pass_series = temp_arr[:, col].reshape(-1)
                smoothing_arr[idx, :, col-1] = pass_series

    return smoothing_arr
# ================================================= #

In [6]:
def plot_series(x_series, y_series):
    #입력 series와 출력 series를 연속적으로 연결하여 시각적으로 보여주는 코드 입니다.
    plt.plot(x_series, label = 'input_series')
    plt.plot(np.arange(len(x_series), len(x_series)+len(y_series)),
             y_series, label = 'output_series')
    plt.axhline(1, c = 'red')
    plt.legend()

In [7]:
# train set smoothing
train_x_arr = simple_exponetial_smoothing_forX(raw_x_arr)
train_y_arr = simple_exponetial_smoothing_fory(raw_y_arr)
print('simple exponetial smoothing Complete!!!')

simple exponetial smoothing Complete!!!


In [8]:
train_data = train_x_arr[0]
y_true = train_y_arr[0]

In [11]:
print(
    f'''
    train data shape is {train_data.shape},
    y true length is {(y_true.shape)},
'''
)


    train data shape is (1380, 9),
    y true length is (120, 9),



In [12]:
X_train = []
y_train = []

n_future = 120 # Number of days we want to predict into the future.
n_past = 360 # Number of past days we want to use to predict future.

for i in range(n_past, train_data.shape[0] - n_future + 1):
        
        X_train.append(train_data[i - n_past:i, :])
        y_train.append(train_data[i:i+n_future, :])
        
X_train, y_train = np.array(X_train), np.array(y_train)
print('X_train shape == {}.'.format(X_train.shape))
print('y_train shape == {}.'.format(y_train.shape))

X_train shape == (901, 360, 9).
y_train shape == (901, 120, 9).


In [14]:
# train & val set
train_X, val_X, train_y, val_y = train_test_split(X_train, y_train, test_size = 0.1, shuffle=True)

print(
f'''
======================================================
Origin length is {len(X_train)}, then total split length is {len(train_X)} + {len(val_X)} = {len(train_X)+len(val_X)}
======================================================
train X length is {train_X.shape}, train y length is {train_y.shape},
val X length is {val_X.shape}, val y length is {val_y.shape},
'''
# test X length is {test_X.shape}, test y length is {test_y.shape}
)

In [16]:
print(f'train X length is {train_X.shape}, train y length is {train_y.shape}')

train X length is (810, 360, 9), train y length is (810, 120, 9)


In [19]:
# ====== initialization
parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device is",args.device)

seed = 777
np.random.seed(seed)
torch.manual_seed(seed)


# ====== Model Capacity options ===== #
args.input_dim = 9
args.hidden_dim = 100
args.output_dim = 9
args.n_layers = 1
args.batch_size = 16
args.dropout = 0.2
args.use_bn = True

args.training_prediction = 'teacher_forcing'
args.teacher_forcing_ratio = 0.6


# ====== Dataset Generating options ====== #
args.x_frames = 360
args.y_frames = 120

# ====== Model training options ===== #
args.num_epoch = 50
args.learning_rate = 0.0001
args.L2_rate = 0.00001


device is cpu


In [18]:
class WindowGenerator():
    ''' Dataset Generate'''
    def __init__(self, train_X, train_y):
    
        self.X_arr = train_X
        self.y_arr = train_y

    def __len__(self):
        return len(self.X_arr)

    def __getitem__(self, idx):
        
        X = self.X_arr[idx, ]
        X = np.log(X + 1) - np.log(X[-1, :] + 1)

        y = self.y_arr[idx, ]
        
        return X, y

In [25]:
class Encoder(nn.Module):
    ''' Encodes time-serise sequence '''
    
    def __init__(self, input_dim, hidden_dim, num_layers):
        
        '''
        : param input_size:     the number of features in the input X
        : param hidden_size:    the number of features in the hidden state h
        : param num_layers:     number of recurrent layers (i.e., 2 means there are 2 stacked LSTMs)
        '''
        
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        
        # define LSTM layer
        self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
    
    def forward(self, X):
        
        '''
        : param x_input:               input of shape (seq_len, # in batch, input_size)
        : return lstm_out, hidden:     lstm_out gives all the hidden states in the sequence;
        :                              hidden gives the hidden state and cell state for the last
        :                              element in the sequence 
        '''
        
        rnn_out, self.hidden = self.rnn(X)
        return rnn_out, self.hidden
    
    def init_hidden(self, batch_size):
        
        '''
        initialize hidden state
        : param batch_size:    x_input.shape[1]
        : return:              zeroed hidden state and cell state 
        '''
        
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, batch_size, self.hidden_dim))

    
class Decoder(nn.Module):
    ''' Decodes hidden state output by encoder '''
    
    def __init__(self, output_dim, hidden_dim, num_layers, dropout, use_bn):
        
        '''
        : param output_dim:     the number of features in the input X
        : param hidden_dim:    the number of features in the hidden state h
        : param num_layers:     number of recurrent layers (i.e., 2 means there are
        :                       2 stacked LSTMs)
        '''
        
        super(Decoder, self).__init__()
        self.output_dim = output_dim 
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.use_bn = use_bn
        
        self.rnn = nn.LSTM(self.output_dim, self.hidden_dim, self.num_layers)
        self.fc_out = self.regressor()
        
    def forward(self, x_input, encoder_hidden_states):
               
        '''        
        : param x_input:                    should be 2D (batch_size, input_size)
        : param encoder_hidden_states:      hidden states
        : return output, hidden:            output gives all the hidden states in the sequence;
        :                                   hidden gives the hidden state and cell state for the last
        :                                   element in the sequence 
 
        '''
        
#         print(f'decoder input size {x_input.shape} to {x_input.reshape(1, -1, self.output_dim).shape}')
        rnn_out, self.hidden = self.rnn(x_input.reshape(1, -1, self.output_dim), encoder_hidden_states)
        output = self.fc_out(rnn_out.squeeze(0)) 
        
        return output, self.hidden
    
    def regressor(self):

        layers = []
        if self.use_bn:
            layers.append(nn.BatchNorm1d(self.hidden_dim))
        layers.append(nn.Dropout(self.dropout))
        
        layers.append(nn.Linear(self.hidden_dim, self.hidden_dim // 2))
        layers.append(nn.ReLU())
        layers.append(nn.Linear(self.hidden_dim // 2, self.output_dim))
        regressor = nn.Sequential(*layers)

        return regressor

class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, device, target_len, training_prediction, teacher_forcing_ratio):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.target_len = target_len
        self.training_prediction = training_prediction
        self.teacher_forcing_ratio = teacher_forcing_ratio
        
        assert encoder.hidden_dim == decoder.hidden_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.num_layers == decoder.num_layers, \
            "Encoder and decoder must have equal number of layers!"

    
    def forward(self, X, y):
            
            if y is None:
                self.training_prediction = 'recursive'
                
            # initialize
            outputs = torch.zeros(self.target_len, X.shape[1], self.decoder.output_dim).to(self.device)
            encoder_hidden = self.encoder.init_hidden(X.shape[1])

            # encoder process
            encoder_output, encoder_hidden = self.encoder(X)  
            
            #decoder process
            ''' last X sequence, shape = [batch_size, open_index = 0] '''    
            decoder_input = X[-1, :, :]
            print(decoder_input.shape)
            decoder_hidden = encoder_hidden  
            

            # ======================================================================================#
            if self.training_prediction == 'recursive':
                # predict recursively
                for t in range(self.target_len): 
                    decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                    
                    outputs[t] = decoder_output 
                    decoder_input = decoder_output
                    
           # ======================================================================================#
            if self.training_prediction == 'teacher_forcing':
                # use teacher forcing
                if random.random() < self.teacher_forcing_ratio:
                    for t in range(self.target_len): 
                        decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                        outputs[t] = decoder_output
                        decoder_input = y[t, :, :]
                        
                # predict recursively 
                else:
                    for t in range(self.target_len): 
                        decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                        outputs[t] = decoder_output
                        decoder_input = decoder_output
                        
            # ======================================================================================#
            if self.training_prediction == 'mixed_teacher_forcing':
                # predict using mixed teacher forcing
                for t in range(self.target_len):

                    decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                    outputs[t] = decoder_output 
                                    
                    # predict with teacher forcing
                    if random.random() < self.teacher_forcing_ratio:
                        decoder_input = y[t, :, :]

                    # predict recursively 
                    else:
                        decoder_input = decoder_output

            # ======================================================================================#      
            return outputs


In [37]:
def metric(y_pred, y_true):
    perc_y_pred = np.exp(y_pred.cpu().detach().numpy())
    perc_y_true = np.exp(y_true.cpu().detach().numpy())
    mae = mean_absolute_error(perc_y_true, perc_y_pred, multioutput='raw_values')
    return mae*100

In [43]:
def train(model, partition, optimizer, loss_fn, args):
    ''' model training '''
    
    # data load
    trainloader = DataLoader(partition['train'],
                             batch_size = args.batch_size,
                             shuffle = True, drop_last = True)
    
    # model's mode setting
    model.train()
    model.zero_grad()
    optimizer.zero_grad()
    
    train_loss = 0.0
    train_acc = 0.0
    for i, (X, y) in enumerate(trainloader):
              
        X = X.transpose(0, 1).float().to(args.device)
        y_true = y.transpose(0, 1).float().to(args.device)

        
        # zero the gradient
        optimizer.zero_grad()

        y_pred = model(X, y_true)

        loss = loss_fn(y_true, y_pred)
        loss.backward()
        optimizer.step()
        
        # get the batch loss
        train_loss += loss.item()
        train_acc += metric(y_pred[:, :, 0], y_true[:, :, 0])[0]
        
    train_loss = train_loss / len(trainloader)
    train_acc = train_acc / len(trainloader)
    return model, train_loss, train_acc


def validate(model, partition, loss_fn, args):
    ''' model validate '''
    
    # data load
    valloader = DataLoader(partition['val'], 
                           batch_size = args.batch_size, 
                           shuffle = False, drop_last = True)
    
    # model's mode setting
    model.eval()
    val_loss = 0.0
    val_acc = 0.0
    # evaluate
    with torch.no_grad():
        for i, (X, y) in enumerate(valloader):
            
            X = X.transpose(0, 1).float().to(args.device)
            y_true = y.transpose(0, 1).float().to(args.device)
     
            # en-decoder outputs tensor 
            y_pred = model(X, None)
            y_pred = y_pred
            
            # compute the loss 
            loss = loss_fn(y_true, y_pred)

            # get the batch loss
            val_loss += loss.item()
            val_acc += metric(y_pred[:, :, 0], y_true[:, :, 0])[0]
            
    val_loss = val_loss / len(valloader)
    val_acc = val_acc * len(valloader)
    return val_loss, val_acc


def experiment(partition, args):
    # Encoder
    enc = Encoder(args.input_dim, args.hidden_dim, args.n_layers)
    
    # Decoder
    dec = Decoder(args.output_dim, args.hidden_dim, args.n_layers, args.dropout, args.use_bn)
    
    # Seq2Seq model
    model = Seq2Seq(enc, dec, args.device, args.y_frames, args.training_prediction, args.teacher_forcing_ratio)
    model.to(args.device)
    
#     model = LSTM(args.input_dim, args.hidden_dim, args.y_frames, args.n_layers, args.dropout, args.use_bn)
#     model.to(args.device)
    
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.L2_rate)
    
    # epoch-wise loss
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    
    for epoch in range(args.num_epoch):
        
        start_time = time.time()
        model, train_loss, train_acc= train(model, partition, optimizer, loss_fn, args)
        val_loss, val_acc = validate(model, partition, loss_fn, args)
        end_time = time.time()
        
        # add epoch loss
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)
        
        print('Epoch {}, Acc(train/val): {:2.2f}/{:2.2f}, Loss(train/val) {:2.5f}/{:2.5f}. Took {:2.2f} sec'.format(
            epoch+1, train_acc, val_acc, train_loss*10E3, val_loss*10E3, end_time-start_time))
    

    # ======= Add Result to Dictionary ======= #
    result = {}
    
    result['train_losses'] = train_losses
    result['val_losses'] = val_losses
    
    result['train_accs'] = train_accs
    result['val_accs'] = val_accs
    
    result['train_acc'] = train_acc
    result['val_acc'] = val_acc
    
    return vars(args), result, model

In [44]:
trainset = WindowGenerator(train_X, train_y)
valset = WindowGenerator(val_X, val_y)
# testset = WindowGenerator(test_X, test_y, x_frames = args.x_frames)

partition = {'train': trainset, 'val':valset}

In [46]:
print(args)
setting, result, model = experiment(partition, deepcopy(args))

Namespace(L2_rate=1e-05, batch_size=16, device='cpu', dropout=0.2, hidden_dim=100, input_dim=9, learning_rate=0.0001, n_layers=1, num_epoch=50, output_dim=9, teacher_forcing_ratio=0.6, training_prediction='teacher_forcing', use_bn=True, x_frames=360, y_frames=120)
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16, 9])
torch.Size([16,

KeyboardInterrupt: 

In [None]:
testloader = DataLoader(partition['train'], batch_size = 1, shuffle = True, drop_last = True)
model.eval()
# evaluate

with torch.no_grad():
    for i, (X, y) in enumerate(testloader):

        X = X.transpose(0, 1).float().to(args.device)
        y_true = y.transpose(0, 1).float().to(args.device)
                    
        
        # en-decoder outputs tensor 
        y_pred = model(X, None)
        y_pred = y_pred.squeeze(2)
        
        # y values to cpu
        y_true = y_true.cpu().detach().numpy().reshape(-1)
        y_pred = y_pred.cpu().detach().numpy().reshape(-1)
        
        print(y_true.shape, y_pred.shape)

        plt.plot(y_true, label = 'True series')
        plt.legend()
        plt.show()
        
        smoother = SimpleExpSmoothing(y_pred, initialization_method="heuristic").fit(smoothing_level=0.3,optimized=False)
        y_pred = smoother.fittedvalues

        plt.plot(y_pred, 'red', label = 'Prediction series')
#         plt.plot(.max(), '*')
        plt.legend()
        plt.show()
        
        print("=============================================")
        loss = mean_absolute_error(y_true, y_pred)
        mae =+ loss
        if i == 10:
            print(args, "\nSES 사용")
            print(f'mean absolute error * 10E5 is {(mae/5) * 10E5}')            
            break
