In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import sys
import tensorflow as tf

from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [2]:
# 예측할 종목은 한양증권(001750) 입니다
# 기간은 00-01-04 ~ 21-02-05 입니다
# Yahoo Finanace에서 다운로드 받아 null값을 제거한 데이터를 사용합니다

df = pd.read_csv('C:\Jupyter_Project\Hanyang_Securities_F.csv')
df = df.dropna()
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2000-01-04,6300,7100,6300,7000,1619.266357,56800
1,2000-01-05,6700,7100,6610,6700,1549.868774,52100
2,2000-01-06,7000,7000,6250,6300,1457.339844,64900
3,2000-01-07,6350,6600,6300,6370,1473.532349,61800
4,2000-01-10,6610,6700,6300,6500,1503.603882,56100


In [3]:
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
5181,2021-02-01,9200,9480,9100,9380,9380.0,81355
5182,2021-02-02,9460,9810,9460,9700,9700.0,105755
5183,2021-02-03,9850,10200,9800,9990,9990.0,170966
5184,2021-02-04,10100,10200,9940,10150,10150.0,133504
5185,2021-02-05,10200,10800,10150,10650,10650.0,247224


In [4]:
# OHLC를 Adj OHLC로 바꾸기 위한 비율입니다
# Adj OHLC는 과거의 절대가격을 현재 가격의 시점으로 보기위한 수정된 가격입니다
# 과거 발생한 액면분할과 현금배당을 반영한 Adj Close를 기준으로 조정합니다

ratio = df['Adj Close']/df['Close']
ratio

0       0.231324
1       0.231324
2       0.231324
3       0.231324
4       0.231324
          ...   
5181    1.000000
5182    1.000000
5183    1.000000
5184    1.000000
5185    1.000000
Length: 5186, dtype: float64

In [5]:
df['Adj Open'] = df['Open']*ratio
df['Adj High'] = df['High']*ratio
df['Adj Low'] = df['Low']*ratio

In [6]:
df.drop(['Open','High','Low','Close'], axis=1, inplace=True)

In [7]:
df.rename(columns={'Adj Open':'Open', 'Adj High':'High', 'Adj Low':'Low', 'Adj Close':'Close'}, inplace=True)

In [8]:
df = df[['Open', 'High', 'Low', 'Close', 'Volume']]

df

Unnamed: 0,Open,High,Low,Close,Volume
0,1457.339721,1642.398734,1457.339721,1619.266357,56800
1,1549.868774,1642.398253,1529.049641,1549.868774,52100
2,1619.266493,1619.266493,1445.773655,1457.339844,64900
3,1468.905874,1526.736814,1457.339686,1473.532349,61800
4,1529.049486,1549.868617,1457.339147,1503.603882,56100
...,...,...,...,...,...
5181,9200.000000,9480.000000,9100.000000,9380.000000,81355
5182,9460.000000,9810.000000,9460.000000,9700.000000,105755
5183,9850.000000,10200.000000,9800.000000,9990.000000,170966
5184,10100.000000,10200.000000,9940.000000,10150.000000,133504


In [9]:
# MinMaxScaler 클래스를 사용하여 데이터를 스케일링 합니다.
# MinMaxScaler는 데이터의 최대값이 1, 최소값이 0이 되도록 변환합니다

scaler = MinMaxScaler()
scale_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
df_scaled = scaler.fit_transform(df[scale_cols])

# 정규화가 완료된 데이터들은 pandas dataframe으로 변환합니다
# pandas는 시계열 자료에 대한 다양한 기능을 제공하여 LSTM에서 사용하는 window를 만들때 유용합니다

df_scaled = pd.DataFrame(df_scaled)
df_scaled.columns = scale_cols

print(df_scaled)

          Open      High       Low     Close    Volume
0     0.069093  0.078420  0.072692  0.082280  0.020301
1     0.076891  0.078420  0.078924  0.076473  0.018620
2     0.082740  0.076587  0.071686  0.068730  0.023197
3     0.070068  0.069256  0.072692  0.070085  0.022088
4     0.075136  0.071089  0.072692  0.072601  0.020050
...        ...       ...       ...       ...       ...
5181  0.721622  0.699387  0.736878  0.731697  0.029080
5182  0.743534  0.725532  0.768164  0.758474  0.037804
5183  0.776402  0.756432  0.797711  0.782742  0.061119
5184  0.797472  0.756432  0.809878  0.796130  0.047725
5185  0.805899  0.803969  0.828128  0.837970  0.088383

[5186 rows x 5 columns]


In [16]:
feature_cols = ['Open', 'High', 'Low', 'Volume']
label_cols = ['Close']

data = df_scaled[feature_cols]
target = df_scaled[label_cols]

data.shape, target.shape

((5186, 4), (5186, 1))

In [25]:
from sklearn.metrics import mean_squared_error

def calc_train_error(x_train, y_train, model):
    '''returns in-sample error for already fit model.'''
    prediction = model.predict(x_train)
    mse = mean_squared_error(y_train, prediction)
    rmse = np.sqrt(mse)
    return mse
    
def calc_validation_error(x_test, y_test, model):
    '''returns out-of-sample error for already fit model.'''
    prediction = model.predict(x_test)
    mse = mean_squared_error(y_test, prediction)
    rmse = np.sqrt(mse)
    return mse
    
def calc_metrics(x_train, y_train, x_test, y_test, model):
    '''fits model and returns the RMSE for in-sample error and out-of-sample error'''
    model.fit(x_train, y_train)
    train_error = calc_train_error(x_train, y_train, model)
    validation_error = calc_validation_error(x_test, y_test, model)
    return train_error, validation_error

In [26]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data, 
                                                    target, 
                                                    shuffle=True,
                                                    test_size=0.2, 
                                                    random_state=15)

In [27]:
# train/validation split (gives us train and validation sets)
x_train, x_validation, y_train, y_validation = train_test_split(x_train,
                                                                y_train,
                                                                shuffle=False,
                                                                test_size=0.25,
                                                                random_state=2018)

In [28]:
# print proportions
print('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(target),2),
                                                       round(len(y_validation)/len(target),2),
                                                       round(len(y_test)/len(target),2)))

train: 0.6% | validation: 0.2% | test 0.2%


In [36]:
from sklearn.linear_model import Ridge, LinearRegression

alphas = [0.0001, 0.001, 0.01, 0.05, 0.1, 1, 10]
print('All errors are RMSE')
print('-'*76)
for alpha in alphas:
    # instantiate and fit model
    ridge = Ridge(alpha=alpha, fit_intercept=True, random_state=99)
    ridge.fit(x_train, y_train)

    # calculate errors
    new_train_error = mean_squared_error(y_train, ridge.predict(x_train))
    new_validation_error = mean_squared_error(y_validation, ridge.predict(x_validation))
    new_test_error = mean_squared_error(y_test, ridge.predict(x_test))
    
    # print errors as report
    print('alpha: {:7} | train error: {:5} | val error: {:6} | test error: {}'.
          format(alpha,
                 round(new_train_error,10),
                 round(new_validation_error,10),
                 round(new_test_error,10)))

All errors are RMSE
----------------------------------------------------------------------------
alpha:  0.0001 | train error: 2.48077e-05 | val error: 2.86232e-05 | test error: 3.02e-05
alpha:   0.001 | train error: 2.48108e-05 | val error: 2.85639e-05 | test error: 3.02684e-05
alpha:    0.01 | train error: 2.50627e-05 | val error: 2.82646e-05 | test error: 3.1134e-05
alpha:    0.05 | train error: 2.80965e-05 | val error: 2.97657e-05 | test error: 3.61573e-05
alpha:     0.1 | train error: 3.20287e-05 | val error: 3.26877e-05 | test error: 4.15391e-05
alpha:       1 | train error: 4.78429e-05 | val error: 4.5947e-05 | test error: 6.14102e-05
alpha:      10 | train error: 7.56015e-05 | val error: 7.38593e-05 | test error: 9.70885e-05


In [38]:
# instantiate model
ridge = Ridge(alpha=0.11, fit_intercept=True, random_state=99)

# fit and calculate errors
new_train_error, new_test_error = calc_metrics(x_train, y_train, x_test, y_test, ridge)
new_train_error, new_test_error = round(new_train_error, 3), round(new_test_error, 3)

In [41]:
lr = LinearRegression(fit_intercept=True)

train_error, test_error = calc_metrics(x_train, y_train, x_test, y_test, lr)
train_error, test_error = round(train_error, 3), round(test_error, 3)

print('train error: {} | test error: {}'.format(train_error, test_error))
print('train/test: {}'.format(round(test_error/train_error, 1)))

train error: 0.0 | test error: 0.0
train/test: nan


  print('train/test: {}'.format(round(test_error/train_error, 1)))


In [42]:
print('ORIGINAL ERROR')
print('-' * 40)
print('train error: {} | test error: {}\n'.format(train_error, test_error))
print('ERROR w/REGULARIZATION')
print('-' * 40)
print('train error: {} | test error: {}'.format(new_train_error, new_test_error))

ORIGINAL ERROR
----------------------------------------
train error: 0.0 | test error: 0.0

ERROR w/REGULARIZATION
----------------------------------------
train error: 0.0 | test error: 0.0
