In [1]:
# Import the necessary libraries first
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.feature_selection import chi2,r_regression
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import yfinance as yf
import talib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.layers import LSTM,Dropout, BatchNormalization, Dense, Conv1D, MaxPool1D
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
from keras.models import Sequential
from sklearn.ensemble import RandomForestRegressor
from keras.optimizer_v2.adam import Adam

In [2]:
# Download daily price bars for the ticker 0005.hk between the given start and
# end dates (presumably HSBC Holdings on the Hong Kong exchange -- verify).
data = yf.download('0005.hk', start="2017-09-28", end="2021-09-24")
# Only the unadjusted OHLCV columns are used below. errors="ignore" keeps the
# cell from raising KeyError when the downloaded frame has no "Adj Close" column.
data = data.drop(columns="Adj Close", errors="ignore")
data.tail(15)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-09-02,41.299999,41.549999,41.049999,41.349998,20502999
2021-09-03,41.150002,41.799999,41.150002,41.450001,29400411
2021-09-06,41.450001,41.700001,41.25,41.400002,24207987
2021-09-07,41.299999,41.400002,41.0,41.0,23797121
2021-09-08,40.5,40.849998,40.299999,40.450001,21636170
2021-09-09,40.700001,41.0,40.25,40.400002,16569193
2021-09-10,40.349998,40.950001,40.25,40.900002,7986921
2021-09-13,40.5,40.849998,40.150002,40.700001,8576584
2021-09-14,40.700001,41.099998,40.099998,40.200001,19188693
2021-09-15,40.599998,40.599998,40.0,40.099998,14418368


In [3]:
# Derive the model features and the next-row target in one chained expression.
# The moving averages and the rolling std are computed on Close shifted by one
# row, so they exclude the current row's close. "Predictions" is the following
# row's close. Rows with any NaN (rolling warm-up, last row) are dropped.
# Note that "O-C" is computed as Close - Open.
# RSI / Williams %R (talib) were tried here previously and are disabled.
data = (
    data
    .assign(**{
        'H-L': lambda d: d['High'] - d['Low'],
        'O-C': lambda d: d['Close'] - d['Open'],
        '% Change': lambda d: d['Close'].pct_change(7),
        '7day MA': lambda d: d['Close'].shift(1).rolling(window=7).mean(),
        '14day MA': lambda d: d['Close'].shift(1).rolling(window=14).mean(),
        '21day MA': lambda d: d['Close'].shift(1).rolling(window=21).mean(),
        'Std_dev': lambda d: d['Close'].shift(1).rolling(7).std(),
        'Predictions': lambda d: d['Close'].shift(-1),
    })
    .dropna()
)
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,H-L,O-C,% Change,7day MA,14day MA,21day MA,Std_dev,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-10-31,76.250000,76.599998,76.050003,76.199997,32077681,0.549995,-0.050003,-0.003922,76.721428,76.935714,77.076190,0.237797,76.650002
2017-11-01,76.199997,76.800003,76.199997,76.650002,19417713,0.600006,0.450005,0.001961,76.678570,76.800000,77.076190,0.302569,75.849998
2017-11-02,76.150002,76.199997,75.800003,75.849998,26844827,0.399994,-0.300003,-0.008497,76.699999,76.753571,77.080952,0.292974,75.599998
2017-11-03,75.550003,75.849998,75.500000,75.599998,20538341,0.349998,0.049995,-0.014342,76.607142,76.657143,77.004761,0.435344,75.300003
2017-11-06,75.500000,75.500000,74.849998,75.300003,23272416,0.650002,-0.199997,-0.020806,76.449999,76.535714,76.897619,0.573004,75.800003
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-15,40.599998,40.599998,40.000000,40.099998,14418368,0.599998,-0.500000,-0.031401,40.721430,41.207144,41.652381,0.412166,40.000000
2021-09-16,40.599998,40.599998,39.599998,40.000000,19553563,1.000000,-0.599998,-0.024390,40.535715,41.060715,41.473810,0.342436,40.250000
2021-09-17,39.700001,40.400002,39.099998,40.250000,28266370,1.300003,0.549999,-0.004944,40.392858,40.917858,41.311905,0.324588,39.000000
2021-09-20,39.900002,39.950001,38.950001,39.000000,23796781,1.000000,-0.900002,-0.034654,40.364286,40.817857,41.211905,0.327509,38.799999


In [4]:
def trim_dataset(mat, batch_size):
    """Drop trailing rows so that the row count is a multiple of ``batch_size``.

    Args:
        mat: array-like exposing ``.shape`` and supporting slicing along axis 0.
        batch_size: batch length the row count should be divisible by.

    Returns:
        ``mat`` itself when no rows need to be removed, otherwise a slice of
        ``mat`` without the surplus rows at the end.
    """
    total_rows = mat.shape[0]
    surplus = total_rows % batch_size

    # Nothing to cut: hand back the input object unchanged.
    if surplus <= 0:
        return mat

    return mat[:total_rows - surplus]

def build_timeseries(mat, target, time_steps=None):
    """Turn a 2-D feature matrix into overlapping windows with one label each.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``mat`` and is
    labelled with ``target[time_steps + i]``, i.e. the target entry one row
    past the window's last row.

    Args:
        mat: 2-D array-like of shape (rows, features).
        target: label values indexed by row; either a 1-D vector or an array
            holding exactly one value per row, such as an (n, 1) column.
        time_steps: window length. Defaults to the notebook-level
            ``TIME_STEPS`` constant, looked up when the function is called.

    Returns:
        ``(x, y)`` where ``x`` has shape (rows - time_steps, time_steps,
        features) and ``y`` has shape (rows - time_steps,).

    Raises:
        ValueError: if ``mat`` has fewer rows than ``time_steps`` or ``target``
            holds more than one value per row.
    """
    if time_steps is None:
        time_steps = TIME_STEPS

    mat = np.asarray(mat)
    target = np.asarray(target)
    if target.ndim > 1:
        if target.size != target.shape[0]:
            raise ValueError("target must hold exactly one value per row")
        # Flatten (n, 1) columns so each label is a true scalar; assigning a
        # 1-element array into y[i] is deprecated in recent NumPy releases.
        target = target.reshape(-1)

    dim_0 = mat.shape[0] - time_steps
    dim_1 = mat.shape[1]
    if dim_0 < 0:
        raise ValueError(
            "need at least %d rows to build a window, got %d"
            % (time_steps, mat.shape[0])
        )

    x = np.zeros((dim_0, time_steps, dim_1))
    y = np.zeros((dim_0,))

    print("Length of inputs", dim_0)

    for i in range(dim_0):
        x[i] = mat[i:time_steps + i]
        y[i] = target[time_steps + i]

    print("length of time-series - inputs", x.shape)
    print("length of time-series - outputs", y.shape)

    return x, y

# Feature columns fed to the model; the label is the "Predictions" column.
train_cols = ["Close","H-L","O-C","7day MA","14day MA","21day MA","% Change"]
params = {
    "batch_size": 20,  # 20<16<10, 25 was a bust
    "epochs": 100,
    "lr": 0.00010000,
    "time_steps": 10
}
TIME_STEPS = params["time_steps"]
BATCH_SIZE = params["batch_size"]

# shuffle=False keeps row order: the first 80% of rows train the model and the
# last 20% are held out.
df_train, df_test = train_test_split(data, train_size=0.8, test_size=0.2, shuffle=False)
print("Train--Test size", len(df_train), len(df_test))

x = df_train.loc[:, train_cols].values
y = df_train.loc[:, ["Predictions"]].values

# Both scalers are fitted on the training rows only and then re-used, unchanged,
# on the held-out rows.
min_max_scaler = MinMaxScaler()
min_max_scaler_y = MinMaxScaler()
x_train = min_max_scaler.fit_transform(x)
y = min_max_scaler_y.fit_transform(y)

# Pass plain arrays here as well: the scalers were fitted on ndarrays, and
# handing them DataFrames makes scikit-learn warn about unexpected feature names.
x_test = min_max_scaler.transform(df_test.loc[:, train_cols].values)
y_test = min_max_scaler_y.transform(df_test.loc[:, ["Predictions"]].values)

# NOTE(review): build_timeseries labels each window with the target row just
# past the window, and "Predictions" is already Close shifted by -1, so each
# label is the close two rows after the window's last row -- confirm that this
# horizon is intended.
x_t, y_t = build_timeseries(x_train, y)

# Trim to a whole number of batches.
x_t = trim_dataset(x_t, BATCH_SIZE)
y_t = trim_dataset(y_t, BATCH_SIZE)
print("Batch trimmed size", x_t.shape, y_t.shape)

Train--Test size 768 193
Length of inputs 758
length of time-series - inputs (758, 10, 7)
length of time-series - outputs (758,)
Batch trimmed size (740, 10, 7) (740,)




In [5]:
# Window the scaled held-out rows, trim them to a whole number of batches, and
# cut the result into two equal halves: the first half is used for validation,
# the second half for testing.
x_temp, y_temp = build_timeseries(x_test, y_test)
(x_val, x_test_t), (y_val, y_test_t) = (
    np.split(trim_dataset(windowed, BATCH_SIZE), 2)
    for windowed in (x_temp, y_temp)
)

Length of inputs 183
length of time-series - inputs (183, 10, 7)
length of time-series - outputs (183,)


In [6]:
# Conv1D -> LSTM -> Dense regressor over windows shaped (time_steps, n_features).
model = Sequential()
model.add(Conv1D(32, kernel_size=1, input_shape=(x_t.shape[1], x_t.shape[2])))
# A pool size of 1 leaves the sequence length unchanged.
model.add(MaxPool1D(1))
model.add(LSTM(64, activation='relu'))
# Single output unit: one scaled target value per window.
model.add(Dense(1))
model.compile(
    loss="mean_squared_error",
    # Use the learning rate configured in params. The string 'Adam' builds the
    # optimizer with its default rate, which left params["lr"] (and the imported
    # Adam class) unused.
    optimizer=Adam(learning_rate=params["lr"]),
)

Metal device set to: Apple M1 Pro


2022-01-01 00:33:57.360740: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-01-01 00:33:57.361364: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [7]:
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 10, 32)            256       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 10, 32)           0         
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 25,153
Trainable params: 25,153
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Stop once the validation loss has not improved for 40 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=40, min_delta=0)
# batch_size is passed explicitly so training uses the same BATCH_SIZE the data
# were trimmed to and that prediction uses; without it fit() falls back to its
# own default. callbacks is given as a list, as the Keras API documents.
# NOTE(review): params["epochs"] is not used here; the hard-coded 300 is kept --
# confirm which value is intended.
history = model.fit(
    x_t,
    y_t,
    epochs=300,
    batch_size=BATCH_SIZE,
    validation_data=(x_val, y_val),
    shuffle=False,
    callbacks=[es],
)

Epoch 1/300


2022-01-01 00:33:57.593257: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-01-01 00:33:57.977651: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [None]:
# Training vs. validation loss per epoch. val_loss is computed on the
# validation_data given to fit(), so it is labelled "validation" rather than
# "test".
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.title('Training history')
plt.legend();

In [None]:
from sklearn.metrics import mean_squared_error

# Score the model on the test windows, trimmed to a whole number of batches.
y_test_t = trim_dataset(y_test_t, BATCH_SIZE)
y_pred = model.predict(
    trim_dataset(x_test_t, BATCH_SIZE), batch_size=BATCH_SIZE
).flatten()

# Mean squared error on the scaled values.
error = mean_squared_error(y_test_t, y_pred)
print("Error is", error, y_pred.shape, y_test_t.shape)
print(y_pred[:15])
print(y_test_t[:15])

# Undo the target scaling so predictions and actual values are back in the
# target scaler's original range.
y_pred_org, y_test_t_org = (
    min_max_scaler_y.inverse_transform(scaled.reshape(-1, 1))
    for scaled in (y_pred, y_test_t)
)
print(y_pred_org[:15])
print(y_test_t_org[:15])

In [None]:
# Overlay the un-scaled predictions on the un-scaled actual values for the test windows.
plt.figure()
plt.plot(y_pred_org, label='Prediction')
plt.plot(y_test_t_org, label='Real')
plt.xlabel('Days')
plt.ylabel('Price')
plt.title('Prediction vs Real Stock Price')
plt.legend(loc='upper left')