## simple RNN
- dataset : massive-yahoo-finance-dataset
- 연속형(시계열, 문장 등) 데이터 처리에 유리

### 데이터 처리

In [9]:
# 데이터 로드
import pandas as pd
file_path = f'/kaggle/input/massive-yahoo-finance-dataset/stock_details_5_years.csv'
stock_df = pd.read_csv(file_path)
stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
1,2018-11-29 00:00:00-05:00,104.769074,105.519257,103.534595,104.636131,28123200,0.0,0.0,MSFT
2,2018-11-29 00:00:00-05:00,54.176498,55.0075,54.099998,54.729,31004000,0.0,0.0,GOOGL
3,2018-11-29 00:00:00-05:00,83.749496,84.499496,82.616501,83.678497,132264000,0.0,0.0,AMZN
4,2018-11-29 00:00:00-05:00,39.692784,40.064904,38.735195,39.037853,54917200,0.04,0.0,NVDA


In [10]:
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602962 entries, 0 to 602961
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Date          602962 non-null  object 
 1   Open          602962 non-null  float64
 2   High          602962 non-null  float64
 3   Low           602962 non-null  float64
 4   Close         602962 non-null  float64
 5   Volume        602962 non-null  int64  
 6   Dividends     602962 non-null  float64
 7   Stock Splits  602962 non-null  float64
 8   Company       602962 non-null  object 
dtypes: float64(6), int64(1), object(2)
memory usage: 41.4+ MB


In [11]:
# 날짜를 datetime 으로 변환
stock_df['Date'] = pd.to_datetime(stock_df['Date'], utc=True)
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602962 entries, 0 to 602961
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   Date          602962 non-null  datetime64[ns, UTC]
 1   Open          602962 non-null  float64            
 2   High          602962 non-null  float64            
 3   Low           602962 non-null  float64            
 4   Close         602962 non-null  float64            
 5   Volume        602962 non-null  int64              
 6   Dividends     602962 non-null  float64            
 7   Stock Splits  602962 non-null  float64            
 8   Company       602962 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(6), int64(1), object(1)
memory usage: 41.4+ MB


In [12]:
# 스케일링
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
stock_df[['Open','High','Low','Close','Volume']] = scaler.fit_transform(
    stock_df[['Open','High','Low','Close','Volume']])
stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 05:00:00+00:00,-0.349471,-0.351215,-0.351741,-0.352158,11.666544,0.0,0.0,AAPL
1,2018-11-29 05:00:00+00:00,-0.128197,-0.130229,-0.127776,-0.128718,1.608836,0.0,0.0,MSFT
2,2018-11-29 05:00:00+00:00,-0.311902,-0.311273,-0.309591,-0.309884,1.817349,0.0,0.0,GOOGL
3,2018-11-29 05:00:00+00:00,-0.20452,-0.205568,-0.204711,-0.204796,9.146559,0.0,0.0,AMZN
4,2018-11-29 05:00:00+00:00,-0.364493,-0.36483,-0.366101,-0.366844,3.548189,0.04,0.0,NVDA


In [16]:
# 시계열 데이터 생성
import numpy as np
import torch
sequence_length = 5 # 5일 단위

from tqdm import tqdm

def create_sequences(data, seq_length):
    xs = []
    ys = []
    # for i in range(len(data) - seq_length):
    for i in tqdm(range(len(data) - seq_length), desc='Generating Sequences'):
        x = data[i:i+seq_length]
        y = data[i+seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys) # featurs, label

In [18]:
features, label = create_sequences(stock_df['Close'], sequence_length)
features.shape, label.shape

Generating Sequences: 100%|██████████| 602957/602957 [00:20<00:00, 28731.49it/s]


((602957, 5), (602957,))

In [20]:
# to tensor 
features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(-1)
label_tensor = torch.tensor(label, dtype=torch.float32).unsqueeze(-1)
features_tensor.shape, label_tensor.shape

(torch.Size([602957, 5, 1]), torch.Size([602957, 1]))

### 모델 학습 

In [21]:
# RNN 모델 정의
import torch.nn as nn
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True) # batch_first epoch에 대한 batch size 
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), hidden_size)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [23]:
input_size = 1 # row 단위 갯수
hidden_size = 5 # RNN의 은닉층 갯수
output_size = 1 # 최종 예측값
model = SimpleRNN(input_size,hidden_size,output_size)
model

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [26]:
# 모델 학습
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01)

epochs = 100

for epoch in tqdm(range(epochs),desc = 'learning Model'):
    model.train()
    outputs = model(features_tensor)
    optimizer.zero_grad()
    loss = criterion(outputs, label_tensor) # error 율 확인 
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0 :
        print(f'Epoch [{epoch} / {epochs}] : loss {loss.item():.4f}')
    

learning Model:   1%|          | 1/100 [00:00<01:04,  1.53it/s]

Epoch [0 / 100] : loss 1.1011


learning Model:  11%|█         | 11/100 [00:03<00:28,  3.09it/s]

Epoch [10 / 100] : loss 1.0184


learning Model:  21%|██        | 21/100 [00:06<00:18,  4.20it/s]

Epoch [20 / 100] : loss 1.0031


learning Model:  31%|███       | 31/100 [00:09<00:18,  3.78it/s]

Epoch [30 / 100] : loss 1.0003


learning Model:  41%|████      | 41/100 [00:12<00:16,  3.60it/s]

Epoch [40 / 100] : loss 0.9996


learning Model:  51%|█████     | 51/100 [00:14<00:13,  3.74it/s]

Epoch [50 / 100] : loss 0.9988


learning Model:  61%|██████    | 61/100 [00:17<00:11,  3.38it/s]

Epoch [60 / 100] : loss 0.9969


learning Model:  71%|███████   | 71/100 [00:20<00:09,  3.18it/s]

Epoch [70 / 100] : loss 0.9942


learning Model:  81%|████████  | 81/100 [00:23<00:04,  3.91it/s]

Epoch [80 / 100] : loss 0.9912


learning Model:  91%|█████████ | 91/100 [00:26<00:02,  3.62it/s]

Epoch [90 / 100] : loss 0.9888


learning Model: 100%|██████████| 100/100 [00:28<00:00,  3.45it/s]


In [27]:
model.state_dict()

OrderedDict([('rnn.weight_ih_l0',
              tensor([[ 0.0587],
                      [-0.7942],
                      [-0.0992],
                      [ 0.3269],
                      [-0.6788]])),
             ('rnn.weight_hh_l0',
              tensor([[-0.2565, -0.2472,  0.1901, -0.1152, -0.2109],
                      [-0.0104,  0.2416,  0.3131,  0.5152,  0.1434],
                      [ 0.1107, -0.2041, -0.1362, -0.1480, -0.4522],
                      [-0.1981,  0.3356,  0.2375,  0.3218, -0.3265],
                      [ 0.2591,  0.4350, -0.2074, -0.1003, -0.0229]])),
             ('rnn.bias_ih_l0',
              tensor([ 0.2949,  0.0353, -0.3336, -0.1935, -0.0683])),
             ('rnn.bias_hh_l0',
              tensor([ 0.4835,  0.1470, -0.2498, -0.4018,  0.1587])),
             ('fc.weight',
              tensor([[-0.1999, -0.0868,  0.4812, -0.0701, -0.2487]])),
             ('fc.bias', tensor([0.2912]))])

### 모델 평가