Import 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn import metrics
import pandas_datareader.data as web
from time import time
from sklearn.model_selection import KFold

Data preparation

In [2]:
# Get Data
# Read stock data use pandas_datareader.data from web
# Get Stock Data

stk_tickers = ['MSFT', 'IBM', 'GOOGL']
ccy_tickers = ['DEXJPUS', 'DEXUSUK']
idx_tickers = ['SP500', 'DJIA', 'VIXCLS']
stk_data = web.DataReader(stk_tickers, 'yahoo')
ccy_data = web.DataReader(ccy_tickers, 'fred')
idx_data = web.DataReader(idx_tickers, 'fred')
# Select columns
base = stk_data.loc[:, ('Adj Close', 'MSFT')]
X1 = stk_data.loc[:, ('Adj Close', ('GOOGL', 'IBM'))]
X2 = ccy_data
X3 = idx_data

# Standardize Data
# Standardized data (X1, X2, X3) with kept index (date)
standard_scaler = preprocessing.StandardScaler()
std = standard_scaler.fit_transform(X1.values)
X1 = pd.DataFrame(data=std,index=X1.index,columns=X1.columns)
X1

Attributes,Adj Close,Adj Close
Symbols,GOOGL,IBM
Date,Unnamed: 1_level_2,Unnamed: 2_level_2
2017-09-22,-1.144386,-0.561522
2017-09-25,-1.158605,-0.510042
2017-09-26,-1.153617,-0.462041
2017-09-27,-1.118037,-0.524650
2017-09-28,-1.110262,-0.524650
...,...,...
2022-09-16,0.617595,1.089341
2022-09-19,0.626146,1.131800
2022-09-20,0.565024,0.999809
2022-09-21,0.506119,0.873356


In [3]:
# Calculate ความแตกต่างของค่า ราคา 'Adj Close', 'MSFT’)ย้อนหลัง backHisotry วัน
backHistory = [30, 45, 60, 90, 180, 240] # -> ทดลองหยิบ 3 ค่า 3 รูปแบบ เพื่อดูระยะเวลาการดูค่าข้อมูลย้อนหลงัหลายๆแบบและเปรียบเทียบ MSE
BH1, BH2, BH3 = backHistory[1], backHistory[3], backHistory[4]
return_period = 2
Y = base.shift(-return_period)
X4_BH1 = base.diff( BH1).shift( - BH1)
X4_BH2 = base.diff( BH2).shift( - BH2)
X4_BH3 = base.diff( BH3).shift( - BH3)
X4 = pd.concat([X4_BH1, X4_BH2, X4_BH3], axis=1)
X4.columns = ['MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT']
X4 = pd.DataFrame(standard_scaler.fit_transform(X4.values), index = X4.index,columns=X4.columns)

In [4]:
# Forming Dataset
X = pd.concat([X1, X2, X3, X4], axis=1)
data = pd.concat([Y, X], axis=1)
data

Unnamed: 0,"(Adj Close, MSFT)","(Adj Close, GOOGL)","(Adj Close, IBM)",DEXJPUS,DEXUSUK,SP500,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
2017-09-22,68.838737,-1.144386,-0.561522,,,,,,0.121200,0.195597,-0.167643
2017-09-25,69.393135,-1.158605,-0.510042,111.64,1.3457,2496.66,22296.09,10.21,0.238260,0.144334,-0.149099
2017-09-26,69.411926,-1.153617,-0.462041,112.16,1.3422,2496.84,22284.32,10.17,0.154579,-0.000734,-0.133418
2017-09-27,69.994499,-1.118037,-0.524650,112.76,1.3401,2507.04,22340.71,9.87,0.167780,0.104534,-0.184915
2017-09-28,70.107262,-1.110262,-0.524650,112.66,1.3435,2510.06,22381.20,9.55,0.171590,0.037761,-0.165374
...,...,...,...,...,...,...,...,...,...,...,...
2022-09-16,242.449997,0.617595,1.089341,143.04,1.1419,3873.33,30822.42,26.30,,,
2022-09-19,238.949997,0.626146,1.131800,,,3899.89,31019.68,25.76,,,
2022-09-20,239.750000,0.565024,0.999809,,,3855.93,30706.23,27.16,,,
2022-09-21,,0.506119,0.873356,,,3789.93,30183.78,27.99,,,


In [5]:
# Drop NA
data.dropna(inplace=True)
# View Statistics
data.describe()

Unnamed: 0,"(Adj Close, MSFT)","(Adj Close, GOOGL)","(Adj Close, IBM)",DEXJPUS,DEXUSUK,SP500,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
count,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0
mean,165.971046,-0.204629,-0.22275,109.257163,1.318698,3246.016249,27821.397191,19.621942,0.126927,0.151284,0.001492
std,73.329605,0.921308,0.874401,2.745018,0.053724,641.190353,3918.622111,8.977025,0.890099,0.876205,0.994523
min,69.393135,-1.158605,-3.287868,102.52,1.1492,2237.4,18591.93,9.14,-3.833531,-3.764914,-3.678735
25%,102.842384,-0.861342,-0.69863,107.3,1.2839,2761.63,25017.44,13.74,-0.269216,-0.232075,-0.412111
50%,140.505402,-0.64691,-0.248764,109.35,1.3141,2976.0,26478.02,17.36,0.091063,0.087661,0.053793
75%,214.686005,0.140023,0.261696,111.23,1.3631,3669.01,30015.51,22.52,0.532601,0.590284,0.626036
max,340.882782,2.107268,1.898265,116.12,1.4332,4796.56,36799.65,82.69,2.578314,2.211248,2.422419


In [6]:
# Assign X, Y (drop datetime index)
Y = data[data.columns[0]]
X = data[data.columns[1:]]
print(Y)
print(X)
# feature selection (correlation)
# Calculate correlation between variables for only continuous data columns
corr_data = X.corr()
# Reduce Corr() to Lower Matrix
lower_tri = corr_data.where(np.tril(np.ones(corr_data.shape),k=-1).astype(np.bool))
lower_tri.fillna(0, inplace=True)
# Drop columns if |correlation value| > 0.9
to_drop = [column for column in lower_tri.columns if any(lower_tri[column] > 0.9)]

X.drop(columns=to_drop, inplace=True)
X

2017-09-25     69.393135
2017-09-26     69.411926
2017-09-27     69.994499
2017-09-28     70.107262
2017-09-29     69.778397
                 ...    
2021-12-28    337.117401
2021-12-29    334.136902
2021-12-30    332.577057
2022-01-03    314.326324
2022-01-04    311.842529
Name: (Adj Close, MSFT), Length: 1061, dtype: float64
            (Adj Close, GOOGL)  (Adj Close, IBM)  DEXJPUS  DEXUSUK    SP500  \
2017-09-25           -1.158605         -0.510042   111.64   1.3457  2496.66   
2017-09-26           -1.153617         -0.462041   112.16   1.3422  2496.84   
2017-09-27           -1.118037         -0.524650   112.76   1.3401  2507.04   
2017-09-28           -1.110262         -0.524650   112.66   1.3435  2510.06   
2017-09-29           -1.096153         -0.565000   112.64   1.3402  2519.36   
...                        ...               ...      ...      ...      ...   
2021-12-28            2.007462          1.144692   114.75   1.3432  4786.35   
2021-12-29            2.006449         

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  lower_tri = corr_data.where(np.tril(np.ones(corr_data.shape),k=-1).astype(np.bool))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=to_drop, inplace=True)


Unnamed: 0,"(Adj Close, IBM)",DEXJPUS,DEXUSUK,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
2017-09-25,-0.510042,111.64,1.3457,22296.09,10.21,0.238260,0.144334,-0.149099
2017-09-26,-0.462041,112.16,1.3422,22284.32,10.17,0.154579,-0.000734,-0.133418
2017-09-27,-0.524650,112.76,1.3401,22340.71,9.87,0.167780,0.104534,-0.184915
2017-09-28,-0.524650,112.66,1.3435,22381.20,9.55,0.171590,0.037761,-0.165374
2017-09-29,-0.565000,112.64,1.3402,22405.09,9.51,-0.034732,-0.162453,-0.182193
...,...,...,...,...,...,...,...,...
2021-12-28,1.144692,114.75,1.3432,36398.21,17.54,-2.969445,-3.243270,-3.640177
2021-12-29,1.208765,114.97,1.3475,36488.63,16.95,-3.356613,-3.682187,-3.678735
2021-12-30,1.258597,115.17,1.3500,36398.08,17.33,-3.833531,-3.376943,-3.609646
2022-01-03,1.448141,115.27,1.3469,36585.06,16.60,-3.022915,-3.764914,-3.639374


In [7]:
# Train / Test Preparation (try 2 Option)
# Option#1
test_size = (0.3 * len( X )).__round__()
train_size = (0.7 * len( X )).__round__()
X_train, X_test = X[0:train_size], X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(X)]
# Option #2
seed = 4
# X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=seed)
X_train

Unnamed: 0,"(Adj Close, IBM)",DEXJPUS,DEXUSUK,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
2017-09-25,-0.510042,111.64,1.3457,22296.09,10.21,0.238260,0.144334,-0.149099
2017-09-26,-0.462041,112.16,1.3422,22284.32,10.17,0.154579,-0.000734,-0.133418
2017-09-27,-0.524650,112.76,1.3401,22340.71,9.87,0.167780,0.104534,-0.184915
2017-09-28,-0.524650,112.66,1.3435,22381.20,9.55,0.171590,0.037761,-0.165374
2017-09-29,-0.565000,112.64,1.3402,22405.09,9.51,-0.034732,-0.162453,-0.182193
...,...,...,...,...,...,...,...,...
2020-09-14,-0.909953,105.68,1.2876,27993.33,25.85,0.255291,0.263408,0.303105
2020-09-15,-0.882008,105.50,1.2866,27995.60,25.59,-0.091887,0.272168,0.162057
2020-09-16,-0.739890,104.94,1.2980,28032.38,26.04,-0.040850,0.533234,0.413052
2020-09-17,-0.684000,104.77,1.2955,27901.98,26.46,0.156122,0.641516,0.560070


GridSearchCV() 

In [8]:
# Create Model List
regression = { 'LR': LinearRegression(), 'SVR': SVR(), }

# Create Parameter Dictionary for Linear Regression
fit_intercept = [True, False]
normalize = [True, False]
params_LR = dict( fit_intercept = fit_intercept, normalize = normalize)

# Create Parameter Dictionary for SVR
kernel = ['linear', 'rbf', 'poly']
C_list = [10, 100]
ep_list = [0.1, 1, 5]
gamma = [0.01, 0.1]
degree = [2, 3]
params_SVR = dict( kernel = kernel, C = C_list, epsilon = ep_list, gamma = gamma, degree = degree )

# Cross Validation Model
# set k-fold crossvalidation with shuffle
num_fold = 3
seed = 500
kfold = KFold(n_splits=num_fold, shuffle = True, random_state=seed)
import warnings
warnings.filterwarnings('ignore')

In [101]:
# GridSearchCV() -> (a) 
for EST in regression:
    model = regression[EST]
    if (EST == 'LR'):
        params = params_LR
    else:
        params = params_SVR
    grid = GridSearchCV(estimator=model, 
                        n_jobs = 1, 
                        verbose = 10,
                        cv = kfold,
                        scoring = 'neg_mean_squared_error',
                        param_grid = params )
    grid_result = grid.fit(X_train, Y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3; 1/4] START fit_intercept=True, normalize=True..........................
[CV 1/3; 1/4] END fit_intercept=True, normalize=True;, score=-302.990 total time=   0.0s
[CV 2/3; 1/4] START fit_intercept=True, normalize=True..........................
[CV 2/3; 1/4] END fit_intercept=True, normalize=True;, score=-258.983 total time=   0.0s
[CV 3/3; 1/4] START fit_intercept=True, normalize=True..........................
[CV 3/3; 1/4] END fit_intercept=True, normalize=True;, score=-260.005 total time=   0.0s
[CV 1/3; 2/4] START fit_intercept=True, normalize=False.........................
[CV 1/3; 2/4] END fit_intercept=True, normalize=False;, score=-302.990 total time=   0.0s
[CV 2/3; 2/4] START fit_intercept=True, normalize=False.........................
[CV 2/3; 2/4] END fit_intercept=True, normalize=False;, score=-258.983 total time=   0.0s
[CV 3/3; 2/4] START fit_intercept=True, normalize=False.........................
[CV 3/3

In [None]:
# Show Best Parameters for both models
print('Best params: ',grid_result.best_params_)
print('Best score: ', grid_result.best_score_)

In [None]:
# Show Score for each parameter combination for both model
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
 # Display Mean, std, params
grid_result.Bar()

RandomizedSearchCV()

In [None]:
# Create Model List
regression = { 'LR': LinearRegression(), 'SVR': SVR(), }

In [None]:
# Create Parameter Dictionary for Linear Regression
fit_intercept = [True, False]
normalize = [True, False]
params_LR = dict( fit_intercept = fit_intercept, normalize = normalize)

In [None]:
# Create Parameter Dictionary for SVR
kernel = ['linear', 'rbf', 'poly']
C_list = list(np.linspace(0.1, 150, 5, dtype = float))
ep_list = list(np.linspace(0.1, 1, 5, dtype = float))
gamma = list(np.linspace(0.01, 0.1, 5, dtype = float))
degree = [2, 3]
params_SVR = dict( kernel = kernel, C = C_list, epsilon = ep_list, gamma = gamma, degree = degree )

# Cross Validation Model
# set k-fold crossvalidation with shuffle
num_fold = 3
seed = 500
kfold = KFold(n_splits=num_fold, shuffle = True, random_state=seed)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Show scatter plot compare y_test vs each model prediction
plt.scatter()

In [None]:
# GridSearchCV() -> (a) 
for EST in regression:
    model = regression[EST]
    if (EST == 'LR'):
        params = params_LR
    else:
        params = params_SVR
    grid = GridSearchCV(estimator=model, 
                        n_jobs = 1, 
                        verbose = 10,
                        cv = kfold,
                        scoring = 'neg_mean_squared_error',
                        param_grid = params )
    grid_result = grid.fit(X_train, Y_train)

In [None]:
# Show Best Parameters for both models
print('Best params: ',grid_result.best_params_)
print('Best score: ', grid_result.best_score_)

In [None]:
# Show Score for each parameter combination for both model
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Display Mean, std, params
grid_result.Bar()

In [None]:
# Show scatter plot compare y_test vs each model prediction
plt.scatter()