Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
import pandas_datareader.data as web
from sklearn.preprocessing import StandardScaler

Data Preparation

In [3]:
#Read stock data use pandas_datareader.data from web
# Get Stock Data
stk_tickers = ['MSFT', 'IBM', 'GOOGL']
ccy_tickers = ['DEXJPUS', 'DEXUSUK']
idx_tickers = ['SP500', 'DJIA', 'VIXCLS']
stk_data = web.DataReader(stk_tickers, 'yahoo')
ccy_data = web.DataReader(ccy_tickers, 'fred')
idx_data = web.DataReader(idx_tickers, 'fred')

In [4]:
# Select columns
Base = stk_data.loc[:, ('Adj Close', 'MSFT')]
X1 = stk_data.loc[:, ('Adj Close', ('GOOGL', 'IBM'))]
X2 = ccy_data
X3 = idx_data

In [5]:
#Standardized data (X1, X2, X3) with kept index (date)
standard = StandardScaler()
X1_standard = standard.fit_transform(X1.values)
X2_standard = standard.fit_transform(X2.values)
X3_standard = standard.fit_transform(X3.values)

In [6]:
#Make standardize to dataframe
X1 = pd.DataFrame(index=X1.index, data=X1_standard, columns=X1.columns)
X2 = pd.DataFrame(index=X2.index, data=X2_standard, columns=X2.columns)
X3 = pd.DataFrame(index=X3.index, data=X3_standard, columns=X3.columns)

In [7]:
#Calculate ความแตกต่างของค่า ราคา 'Adj Close', 'MSFT’)ย้อนหลัง return_period วัน 
return_period = 7
Y = Base.shift(return_period)
X4_3DT = Base.diff(3*return_period).shift(-3*return_period)
X4_6DT = Base.diff(6*return_period).shift(-6*return_period)
X4_12DT = Base.diff(12*return_period).shift(-12*return_period)
X4 = pd.concat([X4_3DT, X4_6DT, X4_12DT], axis=1)
X4.columns = ['MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT']
X4 = pd.DataFrame(standard.fit_transform(X4.values), index = X4.index,columns=X4.columns)
X4

Unnamed: 0_level_0,MSFT_3DT,MSFT_6DT,MSFT_12DT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-09-18,-0.078815,0.054751,0.042181
2017-09-19,-0.097244,0.051460,0.026954
2017-09-20,-0.040538,0.034466,0.111717
2017-09-21,0.075002,0.082574,0.152595
2017-09-22,0.062243,0.138246,0.141373
...,...,...,...
2022-09-09,,,
2022-09-12,,,
2022-09-13,,,
2022-09-14,,,


In [8]:
#Forming Dataset
X = pd.concat([X1, X2, X3, X4], axis=1)
dataset = pd.concat([Y, X], axis=1)
dataset

Unnamed: 0,"(Adj Close, MSFT)","(Adj Close, GOOGL)","(Adj Close, IBM)",DEXJPUS,DEXUSUK,SP500,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
2017-09-18,,-1.157746,-0.596743,-0.034125,0.632440,-1.266457,-1.503384,-1.182579,-0.078815,0.054751,0.042181
2017-09-19,,-1.146509,-0.607887,-0.032778,0.687607,-1.262425,-1.493851,-1.179130,-0.097244,0.051460,0.026954
2017-09-20,,-1.129628,-0.504112,-0.031431,0.768632,-1.260119,-1.483753,-1.225117,-0.040538,0.034466,0.111717
2017-09-21,,-1.129612,-0.547295,0.076321,0.789320,-1.271199,-1.496647,-1.237764,0.075002,0.082574,0.152595
2017-09-22,,-1.136393,-0.556349,0.037261,0.711742,-1.268850,-1.498977,-1.246961,0.062243,0.138246,0.141373
...,...,...,...,...,...,...,...,...,...,...,...
2022-09-09,262.970001,0.870519,1.273789,4.135893,-2.617214,1.000920,0.869685,0.270620,,,
2022-09-12,261.470001,0.877157,1.409625,,,1.063351,0.925175,0.394786,,,
2022-09-13,260.399994,0.670419,1.094523,,,0.805621,0.616743,0.785678,,,
2022-09-14,256.059998,0.691915,1.135181,,,0.824938,0.624021,0.658064,,,


In [9]:
# Drop NA
dataset.dropna(inplace=True)
# View Statistics
dataset.describe()

Unnamed: 0,"(Adj Close, MSFT)","(Adj Close, GOOGL)","(Adj Close, IBM)",DEXJPUS,DEXUSUK,SP500,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
count,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0
mean,174.605312,-0.059991,-0.111706,-0.217478,0.11584,-0.056654,-0.05046,-0.033907,0.003854,-0.009991,-0.000191
std,78.337805,1.000899,0.920901,0.578203,0.911471,1.001267,1.003905,1.021889,0.965643,1.002853,1.002489
min,68.838737,-1.110092,-3.285759,-1.240952,-2.803401,-1.652892,-2.407008,-1.298697,-3.775279,-3.499621,-3.278044
25%,103.177277,-0.836808,-0.637526,-0.567499,-0.460978,-0.862209,-0.815948,-0.713796,-0.407847,-0.404829,-0.335159
50%,152.207962,-0.552317,-0.185133,-0.289363,0.044572,-0.515412,-0.427149,-0.280653,0.045792,0.032269,0.015398
75%,235.8344,0.674252,0.505295,0.027159,0.831557,0.816624,0.946595,0.329254,0.429946,0.478855,0.52687
max,340.882812,2.109322,1.94508,2.586952,2.092629,2.058407,1.992851,7.157222,3.490883,2.735476,2.605375


In [10]:
# Assign X, Y (drop datetime index)
Y = dataset[dataset.columns[0]]
X = dataset[dataset.columns[1:]]
print(Y)
print(X)

2017-09-27     70.624077
2017-09-28     70.887177
2017-09-29     70.417351
2017-10-02     69.731407
2017-10-03     69.919342
                 ...    
2022-05-10    276.288666
2022-05-11    283.207855
2022-05-12    280.529785
2022-05-13    288.693390
2022-05-16    276.119446
Name: (Adj Close, MSFT), Length: 1150, dtype: float64
            (Adj Close, GOOGL)  (Adj Close, IBM)   DEXJPUS   DEXUSUK  \
2017-09-27           -1.110092         -0.519436  0.138279  0.487628   
2017-09-28           -1.102332         -0.519436  0.124810  0.546242   
2017-09-29           -1.088249         -0.559830  0.122116  0.489352   
2017-10-02           -1.098127         -0.449791  0.116729  0.258342   
2017-10-03           -1.090841         -0.441433  0.143667  0.229035   
...                        ...               ...       ...       ...   
2022-05-10            0.988903          1.119777  2.488628 -1.388034   
2022-05-11            0.963852          1.267610  2.487281 -1.381139   
2022-05-12            0

In [11]:
# feature selection (correlation)
# Calculate correlation between variables for only continuous data columns
dataset.reset_index(drop=True, inplace=True)
corr_data = X.corr()
# Reduce Corr() to Lower Matrix
lower_tri = corr_data.where(np.tril(np.ones(corr_data.shape),k=-1).astype(np.bool))
lower_tri.fillna(0, inplace=True)
# Drop columns if |correlation value| > 0.9
to_drop = [column for column in lower_tri.columns if any(lower_tri[column] > 0.9)]

X.drop(columns=to_drop, inplace=True)
X

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  lower_tri = corr_data.where(np.tril(np.ones(corr_data.shape),k=-1).astype(np.bool))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=to_drop, inplace=True)


Unnamed: 0,"(Adj Close, IBM)",DEXJPUS,DEXUSUK,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
2017-09-27,-0.519436,0.138279,0.487628,-1.501123,-1.214770,0.096975,0.177975,0.248244
2017-09-28,-0.519436,0.124810,0.546242,-1.491338,-1.251560,0.453518,0.233598,0.199985
2017-09-29,-0.559830,0.122116,0.489352,-1.485565,-1.256159,0.415240,0.112427,0.266472
2017-10-02,-0.449791,0.116729,0.258342,-1.448712,-1.263057,0.356407,0.152348,0.231506
2017-10-03,-0.441433,0.143667,0.229035,-1.428396,-1.256159,0.381215,0.176967,0.145767
...,...,...,...,...,...,...,...,...
2022-05-10,1.119777,2.488628,-1.388034,0.871867,1.443297,-0.558466,-1.308170,-0.737528
2022-05-11,1.267610,2.487281,-1.381139,0.792938,1.393861,-0.774566,-0.834514,-0.264778
2022-05-12,1.463809,2.204431,-1.586289,0.767853,1.303036,-1.191754,-0.445824,-0.668653
2022-05-13,1.527689,2.356632,-1.510435,0.880547,0.969628,-1.457219,-0.630829,-0.903505


In [12]:
# Train / Test Preparation
Test_size = (0.3 * len( X )).__round__()
Train_size = (0.7 * len( X )).__round__()
X_train, X_test = X[0:Train_size], X[Train_size:len(X)]
Y_train, Y_test = Y[0:Train_size], Y[Train_size:len(X)]
# print(Test_size)
# print(Train_size)

Model Training and Cross Validation

In [13]:
#Set number of fold / Seed value
Num_fold = 5
Seed = 11

In [14]:
# Cross Validation Model
# set k-fold crossvalidation with shuffle
kfold = model_selection.KFold(n_splits=Num_fold, shuffle = True, random_state=Seed)
kfold

KFold(n_splits=5, random_state=11, shuffle=True)

In [15]:
# Model selection
Model_LM = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
# cval [0.1, 1, 10, 100] #ลองอย่างน้อย 3 ค่า
c_val = 100
svr_lin  = SVR(kernel='linear', C=c_val)
svr_rbf  = SVR(kernel='rbf',    C=c_val, gamma=0.01)
svr_poly = SVR(kernel='poly',   C=c_val, degree=2)

    # Calculate accuracy score for each model
score_LM = model_selection.cross_val_score(Model_LM, X_train, Y_train, cv=kfold)
score_lin = model_selection.cross_val_score(svr_lin, X_train, Y_train, cv=kfold)
score_rbf = model_selection.cross_val_score(svr_rbf, X_train, Y_train, cv=kfold)
score_poly = model_selection.cross_val_score(svr_poly, X_train, Y_train, cv=kfold)

# View score k-fold 
# Valication score comparison
score = pd.DataFrame({'Linear Model':score_LM,'SVR_linear':score_lin, 'SVR_rbf': score_rbf, 'SVR_poly': 
score_poly})
score_mean = pd.DataFrame({'AVG Linear Model':[score_LM.mean()],'AVG SVR_linear':[score_lin.mean()], 
'AVG SVR_rbf': [score_rbf.mean()], 'AVG SVC_poly': [score_poly.mean()]})

print(score)
print(score_mean)




   Linear Model  SVR_linear   SVR_rbf  SVR_poly
0      0.901325    0.897019  0.935747  0.912373
1      0.899901    0.889847  0.933921  0.904850
2      0.839329    0.829535  0.907623  0.895205
3      0.885015    0.878133  0.920113  0.910098
4      0.861561    0.830345  0.887130  0.905503
   AVG Linear Model  AVG SVR_linear  AVG SVR_rbf  AVG SVC_poly
0          0.877426        0.864976     0.916907      0.905606




Model Evaluation

In [16]:
# Predict all models ( LM, SVR_linear, SVR_rbf, SVR Poly )
LM_pred = Model_LM.fit(X_train, Y_train).predict(X_test)
LM_pred



array([216.20112848, 218.29892187, 215.30816959, 218.28946984,
       224.6208148 , 221.22879923, 225.08753606, 223.56250618,
       224.14234935, 229.59008563, 225.34351782, 224.16262275,
       224.57718339, 223.82606603, 222.57332818, 223.68374369,
       226.7428769 , 228.00195615, 226.53041662, 244.92612771,
       234.47731913, 231.34541587, 231.17898736, 232.33581434,
       225.67241529, 226.07297737, 225.48277388, 230.12884441,
       230.39514415, 231.67581216, 229.64924314, 227.88513762,
       229.09304037, 230.60436351, 229.50593221, 227.94141874,
       228.58317949, 228.7888833 , 228.1201951 , 233.86909606,
       224.27270598, 222.75215503, 221.11712724, 221.17537598,
       214.60464928, 214.89238102, 217.97365448, 217.63547848,
       218.02449897, 219.75932172, 217.50381854, 219.31796442,
       218.86478535, 218.7619606 , 220.78857782, 217.15440821,
       215.1685637 , 212.77331001, 214.92882684, 213.05546191,
       215.23919603, 220.27661517, 216.15995329, 213.02

In [21]:
# Scatter Plot ( X_test, Predict ) for all model ( LM, SVR_linear, SVR_rbf, SVR Poly )
# plt.scatter(X_test['(Adj Close, IBM)'], LM_pred,c='magenta')
X_test

Unnamed: 0,"(Adj Close, IBM)",DEXJPUS,DEXUSUK,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
2020-12-28,-0.542284,-1.063160,0.566930,0.447348,0.145305,0.787227,0.328469,0.606614
2020-12-29,-0.624997,-1.108955,0.678987,0.430843,0.303961,0.329402,0.195598,0.626724
2020-12-30,-0.581208,-1.134546,0.839314,0.448698,0.268321,1.084522,-0.030314,0.716910
2020-12-31,-0.456329,-1.150709,0.937580,0.496284,0.266022,1.019117,-0.122444,0.514686
2021-01-04,-0.613643,-1.150709,0.746221,0.403832,0.751188,1.630050,0.440866,0.657643
...,...,...,...,...,...,...,...,...
2022-05-10,1.119777,2.488628,-1.388034,0.871867,1.443297,-0.558466,-1.308170,-0.737528
2022-05-11,1.267610,2.487281,-1.381139,0.792938,1.393861,-0.774566,-0.834514,-0.264778
2022-05-12,1.463809,2.204431,-1.586289,0.767853,1.303036,-1.191754,-0.445824,-0.668653
2022-05-13,1.527689,2.356632,-1.510435,0.880547,0.969628,-1.457219,-0.630829,-0.903505


In [None]:
# Model prediction performance evaluation for all model ( LM, SVR_linear, SVR_rbf, SVR Poly )
# MSE
LM_MSE = metrics.mean_squared_error(Y_test, LM_pred)
# R2
LM_r2 = metrics.r2_score(Y_test, LM_pred)
print(LM_MSE)
print(LM_r2)

In [None]:
# Display Prediction MSE, R2 for all models
plt.bar(["LM_r2","svr_lin_r2","svr_rbf_r2","svr_poly_r2"],["LM_r2","svr_lin_r2","svr_rbf_r2","svr_poly_r2"])
plt.show()