In [217]:
# Import Initial Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import yfinance as yf

In [218]:
# Download the last six months of daily AAPL price history via yfinance
dataset = yf.download(
    tickers='AAPL',
    period='6mo',
    interval='1d',
)

[*********************100%***********************]  1 of 1 completed


In [219]:
# Preprocess the data
# Move the date index into an ordinary 'Date' column. The guard keeps this
# cell idempotent: a second in-place reset_index would add a spurious 'index'
# column, which the rename would then turn into a duplicate 'Date' column.
if 'Date' not in dataset.columns:
    dataset = dataset.reset_index().rename(columns={'index': 'Date'})

In [220]:
# 20-row and 50-row simple moving averages of the closing price.
# The first 19 (resp. 49) values are NaN until the window fills.
SMA_20 = dataset['Close'].rolling(window=20).mean()
SMA_50 = dataset['Close'].rolling(window=50).mean()

In [221]:
# Attach both moving averages to the price frame as new columns
dataset = dataset.assign(SMA_20=SMA_20, SMA_50=SMA_50)

In [222]:
# Discard the first 50 rows so that no remaining row lacks an SMA value.
# rolling(50) leaves only rows 0-48 as NaN, so row 49 (the first row with a
# valid SMA_50) is discarded as well.
dataset = dataset.iloc[50:]

In [223]:
# Modelling frame: six candidate predictors followed by the target (Close)
aapl = dataset.loc[
    :,
    ['Volume', 'SMA_20', 'SMA_50', 'Open', 'High', 'Low', 'Close'],
]

In [224]:
# Features are every column except the last; the target is the last column (Close)
X = aapl.iloc[:, :-1].to_numpy()
y = aapl.iloc[:, -1].to_numpy()

In [225]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the printed test targets are not in date order, so the split
# appears to shuffle rows -- confirm that is acceptable for time-series data.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [226]:
# Train a linear regression model on all six predictors
from sklearn.linear_model import LinearRegression
# Create an instance of the LinearRegression class
regressor = LinearRegression()
# Fit on the training split; as the cell's last expression, the result of fit() is echoed as the cell output
regressor.fit(X_train, y_train)

LinearRegression()

In [227]:
# Predict closing prices for the held-out test rows
y_pred = regressor.predict(X_test)

In [228]:
# Print NumPy floats with two decimals for nicer output (applies to every later print of an array)
np.set_printoptions(precision=2)

In [229]:
# Show predictions (left column) next to the actual closes (right column)
print(np.column_stack((y_pred, y_test)))

[[138.06 139.07]
 [126.01 125.57]
 [130.6  132.05]
 [127.17 127.9 ]
 [122.88 123.99]
 [135.35 134.99]
 [122.13 125.86]
 [135.26 135.13]
 [135.1  135.37]
 [123.21 122.54]
 [126.22 127.79]
 [120.11 120.13]
 [135.71 136.69]
 [120.1  121.03]
 [134.25 133.94]]


The predictions track the actual closing prices closely. Let's use backward elimination to see whether any of the predictors can be excluded.

In [230]:
# Tune the model using backward elimination
import statsmodels.api as sm

# Prepend the intercept column. add_constant sizes the column from X itself
# (no hard-coded row count), and with has_constant='skip' it returns X
# unchanged when a constant column is already present, so re-running this
# cell does not stack a second column of ones.
X = sm.add_constant(X, prepend=True, has_constant='skip')

# After the prepend the columns of X are:
#   0 const, 1 Volume, 2 SMA_20, 3 SMA_50, 4 Open, 5 High, 6 Low
# NOTE(review): the starting model uses columns 0-5 only, so Low (column 6)
# never enters the elimination -- confirm that this is intended.
X_opt = X[:, [0, 1, 2, 3, 4, 5]].astype(np.float64)
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.972
Model:,OLS,Adj. R-squared:,0.97
Method:,Least Squares,F-statistic:,470.5
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,2.69e-51
Time:,20:59:38,Log-Likelihood:,-109.59
No. Observations:,74,AIC:,231.2
Df Residuals:,68,BIC:,245.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.4171,6.724,0.508,0.613,-10.001,16.835
x1,-3.032e-08,4.81e-09,-6.309,0.000,-3.99e-08,-2.07e-08
x2,-0.0319,0.043,-0.744,0.459,-0.117,0.054
x3,0.0346,0.048,0.722,0.473,-0.061,0.130
x4,-0.3616,0.115,-3.144,0.002,-0.591,-0.132
x5,1.3414,0.117,11.463,0.000,1.108,1.575

0,1,2,3
Omnibus:,2.38,Durbin-Watson:,2.195
Prob(Omnibus):,0.304,Jarque-Bera (JB):,2.052
Skew:,-0.408,Prob(JB):,0.359
Kurtosis:,2.985,Cond. No.,5770000000.0


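The largest P-value is 0.613, on the constant, so let's drop the constant and see how the model does. (Without an intercept, statsmodels reports the uncentered R-squared, so the values below are not directly comparable to the 0.972 above.)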

In [231]:
# Refit without the intercept. Columns 1-5 of the constant-augmented X are
# Volume, SMA_20, SMA_50, Open and High.
X_opt = X[:, [1, 2, 3, 4, 5]].astype(np.float64)
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,1.0
Model:,OLS,Adj. R-squared (uncentered):,1.0
Method:,Least Squares,F-statistic:,201800.0
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,3.22e-142
Time:,20:59:39,Log-Likelihood:,-109.73
No. Observations:,74,AIC:,229.5
Df Residuals:,69,BIC:,241.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-3.014e-08,4.77e-09,-6.323,0.000,-3.96e-08,-2.06e-08
x2,-0.0369,0.042,-0.888,0.378,-0.120,0.046
x3,0.0551,0.026,2.150,0.035,0.004,0.106
x4,-0.3652,0.114,-3.200,0.002,-0.593,-0.138
x5,1.3559,0.113,12.012,0.000,1.131,1.581

0,1,2,3
Omnibus:,2.939,Durbin-Watson:,2.199
Prob(Omnibus):,0.23,Jarque-Bera (JB):,2.478
Skew:,-0.447,Prob(JB):,0.29
Kurtosis:,3.07,Cond. No.,138000000.0


The largest P-value is now 0.378, on coefficient x2 (SMA_20), so let's drop x2 and see how the model performs.

In [232]:
# Refit after removing column 2 (SMA_20). The remaining columns 1, 3, 4, 5 of
# the constant-augmented X are Volume, SMA_50, Open and High.
X_opt = X[:, [1, 3, 4, 5]].astype(np.float64)
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,1.0
Model:,OLS,Adj. R-squared (uncentered):,1.0
Method:,Least Squares,F-statistic:,253100.0
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,8.88e-145
Time:,20:59:39,Log-Likelihood:,-110.15
No. Observations:,74,AIC:,228.3
Df Residuals:,70,BIC:,237.5
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-3.057e-08,4.73e-09,-6.456,0.000,-4e-08,-2.11e-08
x2,0.0375,0.016,2.315,0.024,0.005,0.070
x3,-0.3746,0.113,-3.300,0.002,-0.601,-0.148
x4,1.3463,0.112,12.000,0.000,1.123,1.570

0,1,2,3
Omnibus:,4.218,Durbin-Watson:,2.147
Prob(Omnibus):,0.121,Jarque-Bera (JB):,3.615
Skew:,-0.534,Prob(JB):,0.164
Kurtosis:,3.18,Cond. No.,138000000.0


All P-values are now below the 0.05 threshold, so every remaining coefficient is statistically significant. Let's retrain the model on this reduced feature set and look at its predictions.

In [233]:
# Split the reduced feature matrix into training and test sets.
# X_opt was last built from columns [1, 3, 4, 5] of the constant-augmented X
# (0 const, 1 Volume, 2 SMA_20, 3 SMA_50, 4 Open, 5 High, 6 Low), so it holds
# Volume, SMA_50, Open and High.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_opt,
    y,
    test_size=0.2,
    random_state=1,
)

In [234]:
# Train a fresh linear regression model on the reduced feature set (X_opt)
from sklearn.linear_model import LinearRegression
# Create an instance of the LinearRegression class (replaces the earlier regressor)
regressor = LinearRegression()
# Fit on the training split; as the cell's last expression, the result of fit() is echoed as the cell output
regressor.fit(X_train, y_train)

LinearRegression()

In [235]:
# Predict closing prices for the held-out test rows with the reduced model
y_pred = regressor.predict(X_test)

In [236]:
# Set two-decimal precision for printed arrays.
# The same precision was already set by an earlier cell, so this only matters if the cell is run on its own.
np.set_printoptions(precision=2)

In [237]:
# Show predictions (left column) next to the actual closes (right column)
print(np.column_stack((y_pred, y_test)))

[[138.06 139.07]
 [125.41 125.57]
 [130.41 132.05]
 [127.18 127.9 ]
 [123.32 123.99]
 [134.84 134.99]
 [123.98 125.86]
 [135.55 135.13]
 [135.05 135.37]
 [122.96 122.54]
 [126.84 127.79]
 [120.08 120.13]
 [135.17 136.69]
 [120.23 121.03]
 [133.96 133.94]]


In [255]:
# Try a model that uses only the two moving averages and Open: High and Low
# are not going to be available immediately, so they are left out here.
aapl = dataset.loc[:, ['SMA_20', 'SMA_50', 'Open', 'Close']]

In [256]:
# Features are every column except the last; the target is the last column (Close)
X = aapl.iloc[:, :-1].to_numpy()
y = aapl.iloc[:, -1].to_numpy()

In [257]:
# Hold out 20% of the rows as a test set, using the same seed as the earlier splits
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [258]:
# Train a fresh linear regression model on SMA_20, SMA_50 and Open
from sklearn.linear_model import LinearRegression
# Create an instance of the LinearRegression class (replaces the earlier regressor)
regressor = LinearRegression()
# Fit on the training split; as the cell's last expression, the result of fit() is echoed as the cell output
regressor.fit(X_train, y_train)

LinearRegression()

In [259]:
# Predict closing prices for the held-out test rows
y_pred = regressor.predict(X_test)

In [243]:
# Set two-decimal precision for printed arrays.
# The same precision was already set by an earlier cell, so this only matters if the cell is run on its own.
np.set_printoptions(precision=2)

In [260]:
# Show predictions (left column) next to the actual closes (right column)
print(np.column_stack((y_pred, y_test)))

[[135.56 139.07]
 [125.27 125.57]
 [132.24 132.05]
 [125.64 127.9 ]
 [121.24 123.99]
 [134.79 134.99]
 [123.24 125.86]
 [134.7  135.13]
 [133.23 135.37]
 [123.12 122.54]
 [123.24 127.79]
 [121.4  120.13]
 [133.93 136.69]
 [120.27 121.03]
 [134.79 133.94]]


In [277]:
# Download the AAPL history again so that the following cells start from an
# unmodified frame (the earlier `dataset` was re-indexed and truncated).
dataset = yf.download(
    tickers='AAPL',
    period='6mo',
    interval='1d',
)

[*********************100%***********************]  1 of 1 completed


In [278]:
# 20-row and 50-row simple moving averages of the closing price (unshifted)
SMA_20 = dataset['Close'].rolling(window=20).mean()
SMA_50 = dataset['Close'].rolling(window=50).mean()

In [279]:
# Attach both moving averages to the price frame as new columns
dataset = dataset.assign(SMA_20=SMA_20, SMA_50=SMA_50)

In [280]:
# Lag both moving averages by one row so each row only carries averages of
# closes from earlier rows (the row's own Close no longer feeds its features).
# The lag is derived from the unshifted SMA_20 / SMA_50 Series rather than from
# the dataset columns themselves: shifting the columns in place would move
# them one further row every time this cell is re-run.
dataset['SMA_20'] = SMA_20.shift(1)
dataset['SMA_50'] = SMA_50.shift(1)

In [281]:
# Inspect the frame; the SMA columns are NaN in the leading rows shown below
dataset

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_20,SMA_50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-10-13,125.269997,125.389999,119.650002,121.099998,120.711044,262330500,,
2020-10-14,121.000000,123.029999,119.620003,121.190002,120.800766,151062300,,
2020-10-15,118.720001,121.199997,118.150002,120.709999,120.322304,112559200,,
2020-10-16,121.279999,121.550003,118.809998,119.019997,118.637726,115393800,,
2020-10-19,119.959999,120.419998,115.660004,115.980003,115.607498,120639300,,
...,...,...,...,...,...,...,...,...
2021-04-06,126.500000,127.129997,125.650002,126.209999,126.209999,80171300,121.771000,127.7460
2021-04-07,125.830002,127.919998,125.139999,127.900002,127.900002,83466700,122.263499,127.4888
2021-04-08,128.949997,130.389999,128.520004,130.360001,130.360001,88844600,122.604000,127.1884
2021-04-09,129.800003,133.039993,129.470001,133.000000,133.000000,106513800,123.123000,126.9324


In [282]:
# Discard the first 50 rows: rolling(50) followed by shift(1) leaves exactly
# rows 0-49 without an SMA_50 value.
dataset = dataset.iloc[50:]

In [283]:
# Modelling frame: six candidate predictors followed by the target (Close)
aapl = dataset.loc[
    :,
    ['Volume', 'SMA_20', 'SMA_50', 'Open', 'High', 'Low', 'Close'],
]

In [284]:
# Features are every column except the last; the target is the last column (Close)
X = aapl.iloc[:, :-1].to_numpy()
y = aapl.iloc[:, -1].to_numpy()

In [285]:
# X has no intercept column here (0 Volume, 1 SMA_20, 2 SMA_50, 3 Open,
# 4 High, 5 Low), so indices 1, 3, 4, 5 select SMA_20, Open, High and Low.
# NOTE(review): in the OLS cells the same indices were applied to a
# constant-augmented X and meant Volume, SMA_50, Open, High -- confirm which
# feature set is intended.
X_opt = X[:, [1, 3, 4, 5]].astype(np.float64)

In [286]:
# Hold out 20% of the rows as a test set (same seed as the earlier splits).
# X_opt holds SMA_20, Open, High and Low.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_opt,
    y,
    test_size=0.2,
    random_state=1,
)

In [287]:
# Train a fresh linear regression model on X_opt (SMA_20, Open, High, Low)
from sklearn.linear_model import LinearRegression
# Create an instance of the LinearRegression class (replaces the earlier regressor)
regressor = LinearRegression()
# Fit on the training split; as the cell's last expression, the result of fit() is echoed as the cell output
regressor.fit(X_train, y_train)

LinearRegression()

In [288]:
# Predict closing prices for the held-out test rows
y_pred = regressor.predict(X_test)

In [289]:
# Set two-decimal precision for printed arrays.
# The same precision was already set by an earlier cell, so this only matters if the cell is run on its own.
np.set_printoptions(precision=2)

In [290]:
# Show predictions (left column) next to the actual closes (right column)
print(np.column_stack((y_pred, y_test)))

[[137.96 139.07]
 [126.03 125.57]
 [131.04 132.05]
 [126.8  127.9 ]
 [122.35 123.99]
 [135.38 134.99]
 [121.67 125.86]
 [134.72 135.13]
 [134.66 135.37]
 [123.06 122.54]
 [125.74 127.79]
 [120.55 120.13]
 [136.14 136.69]
 [119.86 121.03]
 [134.27 133.94]]


In [293]:
# Copy the unshifted 20-row moving average into a plain NumPy array
SMA_20_arr = SMA_20.to_numpy(copy=True)

In [296]:
# Take the most recent moving-average value. SMA_20 is the unshifted Series
# (only the dataset columns were shifted), so this is the mean of the last 20
# closes, including the final row's Close.
test_SMA_20 = SMA_20_arr[-1]

In [299]:
# Display the modelling frame without its last five rows (a negative n makes head() drop rows from the end)
aapl.head(-5)

Unnamed: 0_level_0,Volume,SMA_20,SMA_50,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-12-23,88223700,123.316500,119.3242,132.160004,132.429993,130.779999,130.960007
2020-12-24,54930100,124.106000,119.5214,131.320007,133.460007,131.100006,131.970001
2020-12-28,124486200,124.903000,119.7370,133.990005,137.339996,133.509995,136.690002
2020-12-29,121047300,125.908001,120.0566,138.050003,138.789993,134.339996,134.869995
2020-12-30,96452100,126.699000,120.3736,135.580002,135.990005,133.399994,133.720001
...,...,...,...,...,...,...,...
2021-03-29,80819200,121.979999,128.5548,121.650002,122.580002,120.730003,121.389999
2021-03-30,85671900,121.659999,128.4044,120.110001,120.400002,118.860001,119.900002
2021-03-31,118323800,121.398999,128.2596,121.650002,123.519997,121.150002,122.150002
2021-04-01,74957400,121.403499,128.1460,123.660004,124.180000,122.489998,123.000000


In [300]:
# Predict the close for one hand-entered day. The values must follow the
# column order the model was trained on: SMA_20, Open, High, Low.
single_day_features = [[test_SMA_20, 125.90, 127.00, 125.00]]
print(regressor.predict(single_day_features))

[126.]
