## Simple RNN
- dataset : https://www.kaggle.com/datasets/iveeaten3223times/massive-yahoo-finance-dataset/
- RNN은 순차(시퀀스) 데이터(시계열, 문장 등) 처리에 유리

### 데이터 관련

In [1]:
# Load the full multi-company price table from the Kaggle input directory.
import pandas as pd

stock_csv_path = '/kaggle/input/massive-yahoo-finance-dataset/stock_details_5_years.csv'
dfStockAll = pd.read_csv(stock_csv_path)

In [2]:
# Extract the time series of a single company as an independent copy.

selCom = 'AAPL'
is_selected_company = dfStockAll['Company'] == selCom
dfStock = dfStockAll.loc[is_selected_company].copy()

### 데이터 전처리

In [3]:
# Parse the Date column into timezone-aware (UTC) datetimes.
date_utc = pd.to_datetime(dfStock['Date'], utc=True)
dfStock['Date'] = date_utc
dfStock['Date'].dtype

datetime64[ns, UTC]

In [4]:
# dfStock.info()

In [5]:
# Standardize the OHLC price columns with a StandardScaler fitted on this company's rows.
from sklearn.preprocessing import StandardScaler

price_columns = ['Open', 'High', 'Low', 'Close']
scaler = StandardScaler()
scaled_prices = scaler.fit_transform(dfStock[price_columns])
dfStock[price_columns] = scaled_prices
dfStock.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 05:00:00+00:00,-1.598199,-1.611253,-1.611061,-1.615618,167080000,0.0,0.0,AAPL
458,2018-11-30 05:00:00+00:00,-1.61031,-1.623753,-1.614516,-1.620572,158126000,0.0,0.0,AAPL
916,2018-12-03 05:00:00+00:00,-1.589,-1.600423,-1.592958,-1.588704,163210000,0.0,0.0,AAPL


In [6]:
# Build sliding-window sequences from the time series
import numpy as np
import torch

sequence_length = 5 # window length: 5 consecutive rows (days) per input sequence

from tqdm import tqdm

def create_sequences(data, seq_length):
    """Slice a 1-D series into overlapping windows and their next-step targets.

    For every start index ``i`` in ``range(len(data) - seq_length)`` the window
    ``data[i:i + seq_length]`` becomes one feature row and the element right
    after it, ``data[i + seq_length]``, becomes its label. A tqdm progress bar
    is shown while the windows are generated.

    Args:
        data: Indexable, sliceable sequence of values (e.g. a NumPy array).
        seq_length: Number of consecutive elements in each window.

    Returns:
        Tuple ``(features, label)`` of NumPy arrays built from the collected
        windows and targets.
    """
    windows, targets = [], []
    window_count = len(data) - seq_length
    progress = tqdm(range(window_count), desc='Generating Sequences')
    for start in progress:
        stop = start + seq_length
        windows.append(data[start:stop])
        # The target is the single value immediately following the window.
        targets.append(data[stop])
    return np.array(windows), np.array(targets)

# Only the (scaled) Close column is used as the series to window.
close_prices = dfStock['Close'].values
features, label = create_sequences(close_prices, sequence_length)
features.shape, label.shape


# def create_sequences(data, seq_length):
#     xs = []
#     ys = []
#     # data를 numpy 배열로 변환
#     data_array = data.values
    
#     for i in tqdm(range(len(data_array) - seq_length), desc='Generating Sequences'):
#         x = data_array[i:i+seq_length]
#         y = data_array[i+seq_length]
#         xs.append(x)
#         ys.append(y)
    
#     return np.array(xs), np.array(ys)


Generating Sequences: 100%|██████████| 1253/1253 [00:00<00:00, 1217387.75it/s]


((1253, 5), (1253,))

In [7]:
# Convert the NumPy arrays to float32 tensors and append a trailing size-1 axis,
# so each time step carries exactly one feature and each label is a 1-element row.
features_tensor = torch.unsqueeze(torch.tensor(features, dtype=torch.float32), dim=-1)

label_tensor = torch.unsqueeze(torch.tensor(label, dtype=torch.float32), dim=-1)

features_tensor.shape, label_tensor.shape

(torch.Size([1253, 5, 1]), torch.Size([1253, 1]))

### 모델 관련

In [8]:
# RNN 모델 정의
import torch.nn as nn
class SimpleRNN(nn.Module):
    """Single-layer ``nn.RNN`` followed by a linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the RNN hidden state.
        output_size: Number of values predicted per input sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Keep the hidden width on the instance so forward() does not depend on
        # a module-level variable that happens to share the name.
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project the final time step.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Zero initial hidden state of shape (num_layers=1, batch, hidden_size),
        # created with the input's dtype and device so the model also works
        # after .double()/.to(device).
        h0 = torch.zeros(
            1, x.size(0), self.hidden_size, dtype=x.dtype, device=x.device
        )
        out, _ = self.rnn(x, h0)
        # Only the output of the last time step feeds the linear head.
        out = self.fc(out[:, -1, :])
        return out

input_size = 1 # features per time step (each step carries a single value)
hidden_size = 5 # width of the RNN hidden state (number of units, not number of layers)
output_size = 1 # one predicted value per input sequence

model = SimpleRNN(input_size, hidden_size, output_size)
model

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [9]:
# Train the model: every epoch is one full-batch step over all sequences.
import torch.optim as optim

epochs = 100
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01)

# Training mode is set once up front; nothing inside the loop switches it off.
model.train()
progress = tqdm(range(epochs), desc='%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
for epoch in progress:
    # Clear gradients from the previous step before computing new ones.
    optimizer.zero_grad()
    outputs = model(features_tensor)
    loss = criterion(outputs, label_tensor) # mean squared error against the targets
    loss.backward()
    optimizer.step()

    # Report only on every 10th epoch.
    if (epoch + 1) % 10 != 0:
        continue
    print(f'Epoch [{epoch+1} / {epochs}], Loss : {loss.item():.5f}')

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%:  87%|████████▋ | 87/100 [00:00<00:00, 278.79it/s]

Epoch [10 / 100], Loss : 0.54460
Epoch [20 / 100], Loss : 0.10599
Epoch [30 / 100], Loss : 0.03367
Epoch [40 / 100], Loss : 0.02318
Epoch [50 / 100], Loss : 0.01802
Epoch [60 / 100], Loss : 0.00952
Epoch [70 / 100], Loss : 0.00731
Epoch [80 / 100], Loss : 0.00610
Epoch [90 / 100], Loss : 0.00502


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%: 100%|██████████| 100/100 [00:00<00:00, 236.32it/s]

Epoch [100 / 100], Loss : 0.00435





In [10]:
# Inspect the learned parameters: RNN input/recurrent weights and biases plus the linear head.
model.state_dict()

OrderedDict([('rnn.weight_ih_l0',
              tensor([[-0.6148],
                      [-0.1985],
                      [ 0.4659],
                      [-0.2230],
                      [ 0.1555]])),
             ('rnn.weight_hh_l0',
              tensor([[-0.0086, -0.2849, -0.2264,  0.0088, -0.1651],
                      [-0.0652,  0.4410, -0.3904,  0.6278,  0.2228],
                      [-0.1677,  0.0452,  0.4396,  0.1392,  0.5979],
                      [ 0.3211,  0.2931, -0.3857,  0.2118,  0.3912],
                      [-0.2197,  0.2903,  0.2204,  0.6185, -0.4114]])),
             ('rnn.bias_ih_l0',
              tensor([ 0.1577, -0.0856,  0.0236,  0.1585,  0.4550])),
             ('rnn.bias_hh_l0',
              tensor([ 0.3669,  0.0918, -0.0698,  0.3423,  0.5219])),
             ('fc.weight',
              tensor([[-0.6481, -0.5865,  0.6862, -0.1350, -0.0336]])),
             ('fc.bias', tensor([0.0582]))])

### 평가 관련

In [11]:
# Switch the model to evaluation mode before running predictions.
model.eval()

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [12]:
# Predict the first five sequences without tracking gradients and show them
# next to their targets. NOTE: these samples were also part of the training
# data, so this is a sanity check rather than a held-out evaluation.
first_inputs = features_tensor[:5]
with torch.no_grad():
    predicted = model(first_inputs)
predicted, label_tensor[:5]

(tensor([[-1.5485],
         [-1.5627],
         [-1.5658],
         [-1.5683],
         [-1.5683]]),
 tensor([[-1.6721],
         [-1.6664],
         [-1.6714],
         [-1.6690],
         [-1.6595]]))