In [None]:
# A program to predict stock prices by using different machine learning models

import quandl
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR 
from sklearn.model_selection import train_test_split

In [40]:
quandl.ApiConfig.api_key = '#PUT THE QUANDL API KEY HERE'
# Open the stock data
df = quandl.get("WIKI/AMZN")
print(df.head())

             Open   High    Low  Close     Volume  Ex-Dividend  Split Ratio  \
Date                                                                          
1997-05-16  22.38  23.75  20.50  20.75  1225000.0          0.0          1.0   
1997-05-19  20.50  21.25  19.50  20.50   508900.0          0.0          1.0   
1997-05-20  20.75  21.00  19.63  19.63   455600.0          0.0          1.0   
1997-05-21  19.25  19.75  16.50  17.13  1571100.0          0.0          1.0   
1997-05-22  17.25  17.38  15.75  16.75   981400.0          0.0          1.0   

            Adj. Open  Adj. High  Adj. Low  Adj. Close  Adj. Volume  
Date                                                                 
1997-05-16   1.865000   1.979167  1.708333    1.729167   14700000.0  
1997-05-19   1.708333   1.770833  1.625000    1.708333    6106800.0  
1997-05-20   1.729167   1.750000  1.635833    1.635833    5467200.0  
1997-05-21   1.604167   1.645833  1.375000    1.427500   18853200.0  
1997-05-22   1.437500   1.

In [41]:
# Use the adj. close
df = df[['Adj. Close']]
print(df.head())

            Adj. Close
Date                  
1997-05-16    1.729167
1997-05-19    1.708333
1997-05-20    1.635833
1997-05-21    1.427500
1997-05-22    1.395833


In [42]:
# n number of days in future to predict the price
n = 30
forecast_days = n

# forecasted variable shift n units
df['Prediction'] = df[['Adj. Close']].shift(-n)
print(df.tail())

            Adj. Close  Prediction
Date                              
2018-03-21     1581.86         NaN
2018-03-22     1544.10         NaN
2018-03-23     1495.56         NaN
2018-03-26     1555.86         NaN
2018-03-27     1497.05         NaN


In [43]:
# create independent dataset (X)
# convert the df to numpy list
X = np.array(df.drop(['Prediction'], 1))

# remove the last 'n' rows 
X = X[:-forecast_days]
print(X)

[[   1.72916667]
 [   1.70833333]
 [   1.63583333]
 ...
 [1350.47      ]
 [1338.99      ]
 [1386.23      ]]


In [44]:
# create dependent dataset (Y)
# Convert the df to numpy list (all values including NaNs)
Y = np.array(df['Prediction'])

# get all y values except last n rows

Y = Y[:-forecast_days]
print(Y)

[1.54166667e+00 1.51583333e+00 1.58833333e+00 ... 1.49556000e+03
 1.55586000e+03 1.49705000e+03]


In [45]:
# splitting the data into 80% training and 20% testing
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [46]:
# Create and train the SVM (Support Vector Machine Regressor)
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_rbf.fit(x_train, y_train)

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [47]:
# Testing Model: Score returns the coefficient of the determinaiton of R^2 of the prediction
# best 1.0, the higher the better
svm_confidence = svr_rbf.score(x_test, y_test)
print("svm confidence: ", svm_confidence)

svm confidence:  0.935557482815682


In [48]:
# create and train a Linear Regression Model
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [49]:
# Test the Linear Regression Model
lr_confidence = lr.score(x_test, y_test)
print("lr condfidence: ", lr_confidence)

lr condfidence:  0.9877402123033054


In [50]:
# Set x_forecast equal to the last 30 rows of the original dataset from Adj. Close cloumn
x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_days:]
print(x_forecast)

[[1414.51]
 [1451.05]
 [1461.76]
 [1448.69]
 [1468.35]
 [1482.92]
 [1484.76]
 [1500.  ]
 [1521.95]
 [1511.98]
 [1512.45]
 [1493.45]
 [1500.25]
 [1523.61]
 [1537.64]
 [1545.  ]
 [1551.86]
 [1578.89]
 [1598.39]
 [1588.18]
 [1591.  ]
 [1582.32]
 [1571.68]
 [1544.93]
 [1586.51]
 [1581.86]
 [1544.1 ]
 [1495.56]
 [1555.86]
 [1497.05]]


In [51]:
# n days forecast using Linear Regression Model
lr_prediction = lr.predict(x_forecast)
print(lr_prediction)

print("\n\n")

# n days forecast using SVRM
svm_prediction = svr_rbf.predict(x_forecast)
print(svm_prediction)

[1497.6034403  1536.37054573 1547.73331801 1533.86670422 1554.72497714
 1570.18301562 1572.13516324 1588.30403808 1611.59188602 1601.01421659
 1601.51286299 1581.35481693 1588.56927552 1613.35306268 1628.23818827
 1636.04677875 1643.32489432 1672.00236722 1692.69088817 1681.85859079
 1684.85046921 1675.64142501 1664.35291921 1635.97251226 1680.08680464
 1675.1533881  1635.09192393 1583.593421   1647.56869349 1585.17423619]



[1052.41962451  668.64110929  667.64100192  695.25914024  667.64100192
  667.64100192  667.64100192  667.64100192  667.64100192  667.64100192
  667.64100192  667.64100192  667.64100192  667.64100192  667.64100192
  667.64100192  667.64100192  667.64100192  667.64100192  667.64100192
  667.64100192  667.64100192  667.64100192  667.64100192  667.64100192
  667.64100192  667.64100192  667.64100192  667.64100192  667.64100192]
