## simple RNN
- dataset : massive-yahoo-finance-dataset
- 순차(sequence) 데이터(시계열, 문장 등) 처리에 유리

### 데이터 처리

In [1]:
# Load the data
import pandas as pd
# Plain string literal: the path contains no placeholders, so no f-string prefix is needed.
file_path = '/kaggle/input/massive-yahoo-finance-dataset/stock_details_5_years.csv'
stock_df_all = pd.read_csv(file_path)
stock_df_all.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
1,2018-11-29 00:00:00-05:00,104.769074,105.519257,103.534595,104.636131,28123200,0.0,0.0,MSFT
2,2018-11-29 00:00:00-05:00,54.176498,55.0075,54.099998,54.729,31004000,0.0,0.0,GOOGL
3,2018-11-29 00:00:00-05:00,83.749496,84.499496,82.616501,83.678497,132264000,0.0,0.0,AMZN
4,2018-11-29 00:00:00-05:00,39.692784,40.064904,38.735195,39.037853,54917200,0.04,0.0,NVDA


In [2]:
# Keep only the rows of one ticker. The explicit copy gives stock_df its own
# data, so the column assignments in later cells do not act on a view of
# stock_df_all.
selected_company = 'AAPL'
stock_df = stock_df_all.loc[stock_df_all['Company'] == selected_company].copy()
stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
458,2018-11-30 00:00:00-05:00,43.261071,43.270671,42.478826,42.850754,158126000,0.0,0.0,AAPL
916,2018-12-03 00:00:00-05:00,44.261681,44.376858,43.481835,44.348064,163210000,0.0,0.0,AAPL
1374,2018-12-04 00:00:00-05:00,43.419445,43.764977,42.296468,42.397247,165377200,0.0,0.0,AAPL
1832,2018-12-06 00:00:00-05:00,41.21428,41.938938,40.892744,41.924541,172393600,0.0,0.0,AAPL


In [3]:
# Print column dtypes and non-null counts for the selected ticker's rows.
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1258 entries, 0 to 602471
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          1258 non-null   object 
 1   Open          1258 non-null   float64
 2   High          1258 non-null   float64
 3   Low           1258 non-null   float64
 4   Close         1258 non-null   float64
 5   Volume        1258 non-null   int64  
 6   Dividends     1258 non-null   float64
 7   Stock Splits  1258 non-null   float64
 8   Company       1258 non-null   object 
dtypes: float64(6), int64(1), object(2)
memory usage: 98.3+ KB


In [4]:
# Convert the Date column from strings to datetime.
# utc=True makes the result a single timezone-aware (UTC) datetime column.
stock_df['Date'] = pd.to_datetime(stock_df['Date'], utc=True)
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1258 entries, 0 to 602471
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   Date          1258 non-null   datetime64[ns, UTC]
 1   Open          1258 non-null   float64            
 2   High          1258 non-null   float64            
 3   Low           1258 non-null   float64            
 4   Close         1258 non-null   float64            
 5   Volume        1258 non-null   int64              
 6   Dividends     1258 non-null   float64            
 7   Stock Splits  1258 non-null   float64            
 8   Company       1258 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(6), int64(1), object(1)
memory usage: 98.3+ KB


In [5]:
# Standardize the price and volume columns in place with StandardScaler.
# NOTE(review): the scaler is fit on the full series; if a train/test split is
# introduced later, fit it on the training portion only to avoid leakage.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
stock_df[scaled_columns] = scaler.fit_transform(stock_df[scaled_columns])
stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 05:00:00+00:00,-1.598199,-1.611253,-1.611061,-1.615618,1.176835,0.0,0.0,AAPL
458,2018-11-30 05:00:00+00:00,-1.61031,-1.623753,-1.614516,-1.620572,1.010309,0.0,0.0,AAPL
916,2018-12-03 05:00:00+00:00,-1.589,-1.600423,-1.592958,-1.588704,1.104861,0.0,0.0,AAPL
1374,2018-12-04 05:00:00+00:00,-1.606937,-1.613328,-1.618436,-1.630224,1.145166,0.0,0.0,AAPL
1832,2018-12-06 05:00:00+00:00,-1.6539,-1.65184,-1.648607,-1.640285,1.275657,0.0,0.0,AAPL


In [6]:
# Build the time-series (sliding-window) dataset
import numpy as np
import torch
sequence_length = 5 # window length: 5 consecutive rows (days) per input sequence

from tqdm import tqdm

def create_sequences(data, seq_length):
    """Slice ``data`` into sliding windows and their next-step targets.

    For every start index ``i`` in ``range(len(data) - seq_length)`` the window
    ``data[i:i + seq_length]`` becomes one feature row and the element right
    after it, ``data[i + seq_length]``, becomes its label. A tqdm progress bar
    labelled 'Generating Sequences' is shown while iterating.

    Args:
        data: Indexable, sliceable sequence of values.
        seq_length: Number of consecutive elements in each window.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays.
    """
    windows = []
    targets = []
    n_windows = len(data) - seq_length
    for start in tqdm(range(n_windows), desc='Generating Sequences'):
        stop = start + seq_length
        windows.append(data[start:stop])
        targets.append(data[stop])
    return np.array(windows), np.array(targets)  # features, labels

In [7]:
# Build windows of sequence_length values from the scaled Close column;
# each label is the Close value that immediately follows its window.
features, label = create_sequences(stock_df['Close'].values, sequence_length)
features.shape, label.shape

Generating Sequences: 100%|██████████| 1253/1253 [00:00<00:00, 989915.79it/s]


((1253, 5), (1253,))

In [8]:
# Convert to float32 tensors. unsqueeze(-1) appends a trailing size-1 axis,
# so features become (samples, seq_length, 1) and labels become (samples, 1).
features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(-1)
label_tensor = torch.tensor(label, dtype=torch.float32).unsqueeze(-1)
features_tensor.shape, label_tensor.shape

(torch.Size([1253, 5, 1]), torch.Size([1253, 1]))

### 모델 학습 

In [9]:
# RNN model definition
import torch.nn as nn
class SimpleRNN(nn.Module):
    """Single-layer RNN followed by a linear head applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the RNN hidden state.
        output_size: Number of values predicted per input sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: input and output tensors are laid out as
        # (batch, seq, feature) instead of (seq, batch, feature).
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project the final time step.

        Args:
            x: Tensor of shape (batch, seq, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Zero initial hidden state. Its shape comes from the module itself
        # (not from a module-level global), and it follows the input's
        # device/dtype so the model also works after .to(device) or .double().
        h0 = torch.zeros(
            self.rnn.num_layers,
            x.size(0),
            self.rnn.hidden_size,
            dtype=x.dtype,
            device=x.device,
        )
        out, _ = self.rnn(x, h0)
        # Only the output at the last time step feeds the prediction head.
        out = self.fc(out[:, -1, :])
        return out

In [10]:
input_size = 1 # number of features per time step
hidden_size = 5 # size of the RNN hidden state (units), not the number of layers
output_size = 1 # one predicted value per sequence
model = SimpleRNN(input_size,hidden_size,output_size)
model

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [11]:
# Train the model. Each epoch is one optimizer step over the whole of
# features_tensor (no mini-batching).
import torch.optim as optim

epochs = 100
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01)

for epoch in tqdm(range(epochs), desc='learning Model'):
    model.train()
    # Clear the gradients of the previous step before computing new ones.
    optimizer.zero_grad()
    outputs = model(features_tensor)
    loss = criterion(outputs, label_tensor)  # mean squared error vs. the next-step labels
    loss.backward()
    optimizer.step()

    # Report the loss on every 10th epoch (epoch index is zero-based).
    if epoch % 10 == 0:
        print(f'Epoch [{epoch} / {epochs}] : loss {loss.item():.4f}')
    

learning Model:  36%|███▌      | 36/100 [00:00<00:00, 149.26it/s]

Epoch [0 / 100] : loss 0.9929
Epoch [10 / 100] : loss 0.2885
Epoch [20 / 100] : loss 0.0649
Epoch [30 / 100] : loss 0.0579
Epoch [40 / 100] : loss 0.0338
Epoch [50 / 100] : loss 0.0256
Epoch [60 / 100] : loss 0.0209


learning Model: 100%|██████████| 100/100 [00:00<00:00, 208.56it/s]

Epoch [70 / 100] : loss 0.0164
Epoch [80 / 100] : loss 0.0129
Epoch [90 / 100] : loss 0.0105





In [12]:
# Show the learned parameters (RNN weights/biases and the linear head).
model.state_dict()

OrderedDict([('rnn.weight_ih_l0',
              tensor([[-0.3796],
                      [-0.6286],
                      [-0.2233],
                      [-0.1975],
                      [-0.0118]])),
             ('rnn.weight_hh_l0',
              tensor([[ 0.2179,  0.2530,  0.3912, -0.3470, -0.2947],
                      [ 0.4701,  0.1236, -0.0085, -0.1504, -0.3889],
                      [ 0.0303,  0.2889, -0.1892,  0.2184, -0.3195],
                      [ 0.5114,  0.5527,  0.4162,  0.4732, -0.6567],
                      [ 0.3639,  0.3226, -0.1947,  0.0449,  0.2820]])),
             ('rnn.bias_ih_l0',
              tensor([ 0.2188,  0.4771, -0.1065, -0.1263,  0.2354])),
             ('rnn.bias_hh_l0',
              tensor([-0.3539,  0.1683,  0.2818, -0.0516,  0.5566])),
             ('fc.weight',
              tensor([[-0.3887, -0.7016, -0.2013, -0.5400, -0.2351]])),
             ('fc.bias', tensor([0.0172]))])

### 모델 평가

In [13]:
# Switch the model to evaluation mode before running predictions.
model.eval()

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [14]:
# Predict the first five windows without tracking gradients and show the
# predictions next to their labels.
# NOTE(review): these windows were also fed to the training loop (it trains on
# all of features_tensor), so this is not a held-out evaluation.
with torch.no_grad():
    predicted = model(features_tensor[:5])
predicted, label_tensor[:5]

(tensor([[-1.5017],
         [-1.5133],
         [-1.5167],
         [-1.5180],
         [-1.5178]]),
 tensor([[-1.6721],
         [-1.6664],
         [-1.6714],
         [-1.6690],
         [-1.6595]]))