# 삼성전자 주가 데이터 다운로드
- Yahoo Finance 에서 주가 데이터 다운로드 (https://finance.yahoo.com/)
    - 검색 키워드 '005930.KS' 입력
- 검색 후 Historical Data 선택

![yahoo finance](figures/rnn/21_yahoo_stock1.png)

- `Start Date: 2000년 1월 4일 End Date: 오늘날짜` 선택
- 다운로드
  
![yahoo finance](figures/rnn/22_yahoo_stock2.png)

In [2]:
pip install torchinfo

Collecting torchinfo
  Using cached torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Using cached torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torchinfo

from sklearn.preprocessing import MinMaxScaler, StandardScaler # MM: y, SS: x
from sklearn.model_selection import train_test_split  

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

# DataLoading

In [28]:
df = pd.read_csv("datasets/005930.KS.csv")
df.shape

(6122, 7)

In [29]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2000-01-04,6000.0,6110.0,5660.0,6110.0,4449.711426,74195000
1,2000-01-05,5800.0,6060.0,5520.0,5580.0,4063.72876,74680000
2,2000-01-06,5750.0,5780.0,5580.0,5620.0,4092.860107,54390000
3,2000-01-07,5560.0,5670.0,5360.0,5540.0,4034.597656,40305000
4,2000-01-10,5600.0,5770.0,5580.0,5770.0,4202.100586,46880000


In [30]:
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
6117,2024-05-30,74800.0,75200.0,73500.0,73500.0,73500.0,28551273
6118,2024-05-31,74500.0,74700.0,73500.0,73500.0,73500.0,26198776
6119,2024-06-03,74400.0,76400.0,74200.0,75700.0,75700.0,15706268
6120,2024-06-04,74900.0,76100.0,74900.0,75300.0,75300.0,14098053
6121,2024-06-05,78000.0,78000.0,76800.0,77400.0,77400.0,23177968


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6122 entries, 0 to 6121
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       6122 non-null   object 
 1   Open       6122 non-null   float64
 2   High       6122 non-null   float64
 3   Low        6122 non-null   float64
 4   Close      6122 non-null   float64
 5   Adj Close  6122 non-null   float64
 6   Volume     6122 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 334.9+ KB


In [32]:
### 시계열 데이터 -> 날짜(일시)를 index로 만든다.
df["Date"] = pd.to_datetime(df["Date"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6122 entries, 0 to 6121
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6122 non-null   datetime64[ns]
 1   Open       6122 non-null   float64       
 2   High       6122 non-null   float64       
 3   Low        6122 non-null   float64       
 4   Close      6122 non-null   float64       
 5   Adj Close  6122 non-null   float64       
 6   Volume     6122 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 334.9 KB


In [34]:
df.set_index("Date", inplace=True)  # 컬럼->index name

In [35]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-04,6000.0,6110.0,5660.0,6110.0,4449.711426,74195000
2000-01-05,5800.0,6060.0,5520.0,5580.0,4063.72876,74680000
2000-01-06,5750.0,5780.0,5580.0,5620.0,4092.860107,54390000
2000-01-07,5560.0,5670.0,5360.0,5540.0,4034.597656,40305000
2000-01-10,5600.0,5770.0,5580.0,5770.0,4202.100586,46880000


In [36]:
# 결측치 확인
df.isna().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [37]:
# Adj Close 컬럼을 제거 (Close와 같은 값을 표현하는 컬럼이므로 제거.)
df.drop(columns="Adj Close", inplace=True)

In [38]:
df.head(52)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-04,6000.0,6110.0,5660.0,6110.0,74195000
2000-01-05,5800.0,6060.0,5520.0,5580.0,74680000
2000-01-06,5750.0,5780.0,5580.0,5620.0,54390000
2000-01-07,5560.0,5670.0,5360.0,5540.0,40305000
2000-01-10,5600.0,5770.0,5580.0,5770.0,46880000
2000-01-11,5820.0,6100.0,5770.0,5770.0,59745000
2000-01-12,5610.0,5740.0,5600.0,5720.0,29220000
2000-01-13,5600.0,5740.0,5560.0,5710.0,41190000
2000-01-14,5720.0,5880.0,5680.0,5830.0,49375000
2000-01-17,6000.0,6180.0,5920.0,6100.0,63505000


# Dataset 구성
## input, output data
- input (X)) feature 구성: \[Open, High, Low, Close, Volumn  (Adj Close 제외)\] 50일치
- output (y) : Close - input 다음날 Close가격

In [39]:
# X(input), y(output) 분리
X_df = df
y_df = df['Close'].to_frame()  # Series -> DataFrame
X_df.shape, y_df.shape

((6122, 5), (6122, 1))

In [40]:
X_df.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-04,6000.0,6110.0,5660.0,6110.0,74195000
2000-01-05,5800.0,6060.0,5520.0,5580.0,74680000
2000-01-06,5750.0,5780.0,5580.0,5620.0,54390000


In [41]:
y_df.head(3)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2000-01-04,6110.0
2000-01-05,5580.0
2000-01-06,5620.0


## 전처리
- feature scaling
    - feature 간의 scaling(단위)을 맞추는 작업.
- X: Standard Scaling (평균: 0, 표준편차: 1)
- y: MinMax Scaling (최소: 0, 최대: 1)  => X의 scale과 비슷한 값으로 변환.

In [42]:
x_scaler = StandardScaler()
y_scaler = MinMaxScaler()

X = x_scaler.fit_transform(X_df) # 입력: DataFrame -> 출력: ndarray
y = y_scaler.fit_transform(y_df)
X.shape, y.shape

((6122, 5), (6122, 1))

In [43]:
X[:5]

array([[-0.99007851, -0.98965518, -1.00112858, -0.98571511,  3.51992614],
       [-0.99894601, -0.99185497, -1.00738983, -1.00923676,  3.55212512],
       [-1.00116289, -1.00417382, -1.00470644, -1.00746154,  2.20507915],
       [-1.00958702, -1.00901336, -1.01454554, -1.01101197,  1.26998095],
       [-1.00781352, -1.00461378, -1.00470644, -1.00080447,  1.70649289]])

In [44]:
y[:5]

array([[0.03829161],
       [0.0322873 ],
       [0.03274046],
       [0.03183415],
       [0.03443979]])

## Input Sequential Data 구성
- X: 50일치 데이터(ex:1일 ~ 50일), y: 51일째 주가. (ex: 51일)
    - 50일의 연속된 주식데이터를 학습하여 51일째 주가를 예측한다.
    - X의 한개의 데이터가 50일치 주가데이터가 된다.

![img](figures/rnn/20_stock_dataset.png)

[연속된 날짜가 5인 경우]

In [47]:
y.size - 50

6072

In [49]:
time_steps = 50  # seq_length. 몇일 치를 하나로 묶어서 X(input)을 만들지.
data_X = []   #input data들을 모을 리스트. X 1개 shape [time_steps, 5]
data_y = []   # output data을 모을 리스트

for idx in range(0, y.size - time_steps): # 행이 51행 이 남을 때까지 반복.
    # X1: 0 ~ 50-1, y1: 50
    # X2: 1 ~ 51-1, y2: 51
    # X3: 2 ~ 52-1, y3: 52
    ### idx: 시작 index 
    _X = X[idx : time_steps+idx]
    _y = y[time_steps+idx]
    data_X.append(_X)
    data_y.append(_y)

In [52]:
print(np.shape(data_X))  # (data개수-batch,  seq_length,  features수-input_size)
np.shape(data_y)

((6072, 50, 5), (6072, 1))

In [53]:
data_X[0]

array([[-0.99007851, -0.98965518, -1.00112858, -0.98571511,  3.51992614],
       [-0.99894601, -0.99185497, -1.00738983, -1.00923676,  3.55212512],
       [-1.00116289, -1.00417382, -1.00470644, -1.00746154,  2.20507915],
       [-1.00958702, -1.00901336, -1.01454554, -1.01101197,  1.26998095],
       [-1.00781352, -1.00461378, -1.00470644, -1.00080447,  1.70649289],
       [-0.99805926, -0.99009514, -0.99620903, -1.00080447,  2.56059572],
       [-1.00737014, -1.00593365, -1.00381197, -1.00302349,  0.5340517 ],
       [-1.00781352, -1.00593365, -1.0056009 , -1.00346729,  1.32873579],
       [-1.00249302, -0.99977423, -1.00023412, -0.99814164,  1.87213506],
       [-0.99007851, -0.98657547, -0.98950056, -0.98615891,  2.8102208 ],
       [-0.98298451, -0.98745538, -0.98681716, -0.98615891,  1.59894166],
       [-0.99007851, -0.99273489, -0.98771163, -0.99237218,  1.26334199],
       [-0.99628576, -0.99273489, -0.99397287, -0.98882174,  1.0714759 ],
       [-0.99229539, -0.99537464, -0.9

In [58]:
data_y[0]
# y_scaler.inverse_transform([data_y[0]])

array([0.02809562])

## Train / test set 분리

In [62]:
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2)

In [64]:
np.shape(X_train), np.shape(X_test)

((4857, 50, 5), (1215, 50, 5))

## Dataset, DataLoader 구성

In [65]:
# 메모리있는 torch.Tensor를 Dataset으로 만들때 -> TensorDataset
### 1. ndarray -> torch.Tensor
X_train_tensor = torch.tensor(np.array(X_train), dtype=torch.float32)
X_test_tensor = torch.tensor(np.array(X_test), dtype=torch.float32)
y_train_tensor = torch.tensor(np.array(y_train), dtype=torch.float32)
y_test_tensor = torch.tensor(np.array(y_test), dtype=torch.float32)

In [66]:
#Dataset 생성
train_set = TensorDataset(X_train_tensor, y_train_tensor)
test_set = TensorDataset(X_test_tensor, y_test_tensor)

# 데이터개수 확인
len(train_set), len(test_set)

(4857, 1215)

In [67]:
# DataLoader 생성
train_loader = DataLoader(train_set, batch_size=200, shuffle=True, drop_last=True)
test_loader = DataLoader(test_set, batch_size=200)

# 에폭당 step수
len(train_loader), len(test_loader)

(24, 7)

In [92]:
################# 테스트 ##################
x, y = next(iter(train_loader))  # 한 배치 조회
print(x.shape, y.shape)

torch.Size([200, 50, 5]) torch.Size([200, 1])


In [93]:
### LSTM Layer 에 입력
a = nn.LSTM(input_size=5, hidden_size=10, num_layers=1, bidirectional=True) #batch_first=False : 입력 (seq_len, batch, feature)
x = x.permute(1, 0, 2)
o1, (h1, c1) = a(x)

In [82]:
print(o1.shape)  # 200->50: seq_length, 50->200: batch, 10: hidden_state개수

torch.Size([50, 200, 20])


In [83]:
print(h1.shape) # 마지막 시점(time step) hidden state (1*양방향(2)*layer수, batch, hidden_size)

torch.Size([2, 200, 10])


In [84]:
print(c1.shape)

torch.Size([2, 200, 10])


In [96]:
l = nn.Linear(in_features=20, out_features=1)  
l(o1[-1:,:])

tensor([[[ 1.1334e-01],
         [ 1.6159e-02],
         [ 9.9562e-03],
         [ 7.1878e-03],
         [ 4.6831e-02],
         [-1.4433e-02],
         [-9.2685e-05],
         [-7.2210e-03],
         [-1.2359e-02],
         [-9.2408e-03],
         [ 1.2247e-01],
         [-7.9339e-03],
         [ 6.1760e-02],
         [ 4.5215e-02],
         [ 5.2282e-02],
         [-1.9956e-04],
         [ 2.1049e-04],
         [ 8.4394e-02],
         [-4.0778e-03],
         [ 4.0183e-02],
         [ 1.2405e-01],
         [ 1.0306e-01],
         [ 2.0993e-02],
         [ 9.4145e-02],
         [ 4.0804e-02],
         [ 9.4189e-02],
         [-1.6689e-02],
         [-8.9063e-03],
         [ 3.7374e-02],
         [-1.7950e-02],
         [ 1.0662e-01],
         [ 3.6316e-02],
         [-8.7883e-03],
         [ 1.1946e-01],
         [ 1.1044e-01],
         [ 9.1873e-02],
         [-5.1382e-02],
         [ 5.0368e-02],
         [ 1.6995e-02],
         [ 9.8671e-03],
         [-5.8143e-03],
         [ 1.114

# 모델 정의
LSTM => Linear(추론기) => output(다음날주식가격)

In [98]:
class StockPriceModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, 
                         bidirectional=False, dropout_rate=0.0):
        super().__init__()
        ## LSTM, Linear(주가 예측 출력 Layer함수)
        """
        input_size: LSTM input_size, 
        hidden_size: LSTM hidden_size
        num_layers: LSTM stack layer 개수
        bidirectional: 양방향 여부
        dropout_rate: LSTM의 dropout 비율.
        """
        self.lstm = nn.LSTM(
            input_size=input_size,     # X: [seq_len,  batch, input_size]
            hidden_size=hidden_size, # 출력값의 shape과 관련.
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout_rate
        )
        # o, (h, c) = lstm(x)   # 마지막 시점 처리결과 => Linear
        # o의 마지막 time step 것을 사용. o[-1, :, :] => [batch, hidden * 단:1, 양:2]
        D = 2 if bidirectional else 1
        self.dropout = nn.Dropout(dropout_rate)
        self.output = nn.Linear(in_features=hidden_size * D, 
                                     out_features=1)  # 출력: 주식가격 1개.
        self.sigmoid = nn.Sigmoid()  # y가 0 ~ 1(minmax scaling) 범위이므로 sigmoid로 범위를 맞춘다.

    def forward(self, X):
        # Layer 흐름 처리.
        # X -> lstm -마지막 시점(timestep) 처리결과-> output
        # X shape: (batch_size, seq_len, input_size) -lstm입력 shape변경->bach_size<->seq_len
        X = X.permute(1, 0, 2)
        out, (hidden, cell) = self.lstm(X)  # out: [seq_len, batch_size, hidden * D]
        last_out = out[-1, :, :] # (50, 200, hidden*2)  -> (200, hidden*2)
        last_out = self.dropout(last_out)
        last_out = self.output(last_out)  # W*last_out + b
        last_out = self.sigmoid(last_out)  # 0 ~ 1 값 1개 
        return last_out

In [99]:
sample_model = StockPriceModel(5, 50, 2, True, 0.2)
a, b = next(iter(train_loader))
c = sample_model(a)
c.shape

torch.Size([200, 1])

In [100]:
# torch.square(c - b).mean()

In [103]:
torchinfo.summary(sample_model, (200, 50, 5))

Layer (type:depth-idx)                   Output Shape              Param #
StockPriceModel                          [200, 1]                  --
├─LSTM: 1-1                              [50, 200, 100]            83,600
├─Dropout: 1-2                           [200, 100]                --
├─Linear: 1-3                            [200, 1]                  101
├─Sigmoid: 1-4                           [200, 1]                  --
Total params: 83,701
Trainable params: 83,701
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 836.02
Input size (MB): 0.20
Forward/backward pass size (MB): 8.00
Params size (MB): 0.33
Estimated Total Size (MB): 8.54

### train

In [106]:
# 하이퍼파라미터
EPOCHS = 100
LR = 0.0001

INPUT_SIZE = 5
HIDDEN_SIZE = 30
NUM_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT_RATE = 0.2
# 모델 생성
model = StockPriceModel(
    input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS, bidirectional=BIDIRECTIONAL, 
    dropout_rate=DROPOUT_RATE
)
model = model.to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LR) # 회귀: RMSprop()
# loss 함수 - 회귀 : MSE
loss_fn = nn.MSELoss()


In [107]:
##### 학습 (train)
train_loss_list = []
test_loss_list = []
for epoch in range(EPOCHS):
    ########## 학습 ##########
    model.train()
    train_loss = 0.0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item()
    train_loss = train_loss / len(train_loader)
    train_loss_list.append(train_loss)  # 한 epoch train loss저장.

    ##########검증################
    model.eval()
    test_loss = 0.0
    for X_test, y_test in test_loader:
        X_test, y_test = X_test.to(device), y_test.to(device)
        with torch.no_grad():
            pred_test = model(X_test)
            loss_test = loss_fn(pred_test, y_test)
            test_loss += loss_test.item()
    test_loss = test_loss / len(test_loader)
    test_loss_list.append(test_loss)
    ## 1 에폭 학습+검증 완료 -> 로그 출력
    if epoch % 10 == 0 or epoch == EPOCHS-1: # 10번 한번, 마지막 epoch
        print(f"[{epoch+1}/{EPOCHS}] train loss: {train_loss}, validation loss: {test_loss}")

[1/100] train loss: 0.10242029745131731, validation loss: 0.10089500567742757
[11/100] train loss: 0.034974180006732546, validation loss: 0.0288802015462092
[21/100] train loss: 0.00632212085959812, validation loss: 0.005394279358110258
[31/100] train loss: 0.0033581403355735042, validation loss: 0.0026848701859957407
[41/100] train loss: 0.0017574866263506312, validation loss: 0.0012316480112661207
[51/100] train loss: 0.0011382010125089437, validation loss: 0.000668877174445827
[61/100] train loss: 0.0008564630552427843, validation loss: 0.0004524742066028661
[71/100] train loss: 0.0007322882399118195, validation loss: 0.0003251341903316123
[81/100] train loss: 0.000617283799025851, validation loss: 0.0002533595647297001
[91/100] train loss: 0.000574664900341304, validation loss: 0.00021843561574184735
[100/100] train loss: 0.0005384110445447732, validation loss: 0.00019269799564166794


# 마지막 데이터로 다음날 주식가격 추론

In [110]:
np.shape(data_X)
np.shape(data_X[-1])  # 가장 마지막 seq data ( ~ 2024.6.5) ==> 이용해서 추론.

(50, 5)

In [111]:
type(data_X[-1])

numpy.ndarray

In [None]:
### 50일치 주가 정보를 수집
### Standard Scaling 해서 입력 데이터로 만들어야 한다.

In [114]:
new_X = torch.tensor(data_X[-1], dtype=torch.float32).unsqueeze(dim=0)
new_X.shape

torch.Size([1, 50, 5])

In [115]:
with torch.no_grad():
    pred = model(new_X)
print(pred)

tensor([[0.8424]])


In [116]:
y_scaler.inverse_transform(pred)

array([[77085.48722863]])