# Stock Price Prediction using Support Vector Machines

In [1]:
import quandl
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

#### Getting Facebook stock prices from quandl

In [2]:
# Download the 'WIKI/FB' (Facebook) price table from Quandl; this is a network call.
# The result is used as a date-indexed DataFrame in the cells below.
df = quandl.get('WIKI/FB')

In [3]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2012-05-18,42.05,45.0,38.0,38.2318,573576400.0,0.0,1.0,42.05,45.0,38.0,38.2318,573576400.0
2012-05-21,36.53,36.66,33.0,34.03,168192700.0,0.0,1.0,36.53,36.66,33.0,34.03,168192700.0
2012-05-22,32.61,33.59,30.94,31.0,101786600.0,0.0,1.0,32.61,33.59,30.94,31.0,101786600.0
2012-05-23,31.37,32.5,31.36,32.0,73600000.0,0.0,1.0,31.37,32.5,31.36,32.0,73600000.0
2012-05-24,32.95,33.21,31.77,33.03,50237200.0,0.0,1.0,32.95,33.21,31.77,33.03,50237200.0


In [4]:
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-03-21,164.8,173.4,163.3,169.39,105350867.0,0.0,1.0,164.8,173.4,163.3,169.39,105350867.0
2018-03-22,166.13,170.27,163.72,164.89,73389988.0,0.0,1.0,166.13,170.27,163.72,164.89,73389988.0
2018-03-23,165.44,167.1,159.02,159.39,52306891.0,0.0,1.0,165.44,167.1,159.02,159.39,52306891.0
2018-03-26,160.82,161.1,149.02,160.06,125438294.0,0.0,1.0,160.82,161.1,149.02,160.06,125438294.0
2018-03-27,156.31,162.85,150.75,152.19,76787884.0,0.0,1.0,156.31,162.85,150.75,152.19,76787884.0


In [5]:
# Keep only the adjusted closing price. Take an explicit copy so that adding
# the 'Prediction' column later writes to an independent frame instead of a
# slice of the downloaded data (avoids pandas' SettingWithCopyWarning).
df = df[['Adj. Close']].copy()

In [6]:
df

Unnamed: 0_level_0,Adj. Close
Date,Unnamed: 1_level_1
2012-05-18,38.2318
2012-05-21,34.0300
2012-05-22,31.0000
2012-05-23,32.0000
2012-05-24,33.0300
...,...
2018-03-21,169.3900
2018-03-22,164.8900
2018-03-23,159.3900
2018-03-26,160.0600


In [7]:
# Forecast horizon: how many rows ahead of each price the target value is taken from
forecast = 1

In [8]:
# Create the target column: 'Adj. Close' shifted `forecast` rows up, so each row
# holds the price observed `forecast` rows later (the last `forecast` rows become NaN).
# Use the `forecast` variable rather than a hard-coded 1 so the horizon is set in one place.
# `assign` builds a new frame instead of writing into `df` in place, so re-running is safe.
df = df.assign(Prediction=df['Adj. Close'].shift(-forecast))

In [9]:
df

Unnamed: 0_level_0,Adj. Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-05-18,38.2318,34.03
2012-05-21,34.0300,31.00
2012-05-22,31.0000,32.00
2012-05-23,32.0000,33.03
2012-05-24,33.0300,31.91
...,...,...
2018-03-21,169.3900,164.89
2018-03-22,164.8900,159.39
2018-03-23,159.3900,160.06
2018-03-26,160.0600,152.19


In [10]:
# Recompute the target column from the current forecast horizon
df['Prediction'] = df['Adj. Close'].shift(periods=-forecast)

In [11]:
df

Unnamed: 0_level_0,Adj. Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-05-18,38.2318,34.03
2012-05-21,34.0300,31.00
2012-05-22,31.0000,32.00
2012-05-23,32.0000,33.03
2012-05-24,33.0300,31.91
...,...,...
2018-03-21,169.3900,164.89
2018-03-22,164.8900,159.39
2018-03-23,159.3900,160.06
2018-03-26,160.0600,152.19


In [12]:
# Widen the horizon to 30 rows: the target for each row is now the
# adjusted close found 30 rows further down the frame
forecast = 30
df['Prediction'] = df['Adj. Close'].shift(periods=-forecast)

In [13]:
df

Unnamed: 0_level_0,Adj. Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-05-18,38.2318,30.771
2012-05-21,34.0300,31.200
2012-05-22,31.0000,31.470
2012-05-23,32.0000,31.730
2012-05-24,33.0300,32.170
...,...,...
2018-03-21,169.3900,
2018-03-22,164.8900,
2018-03-23,159.3900,
2018-03-26,160.0600,


### Create the independent dataset X

In [14]:
# Feature matrix: the 'Adj. Close' column as a 2-D numpy array (one column)
X = np.array(df.loc[:, ['Adj. Close']])

In [15]:
len(X)

1472

In [16]:
X

array([[ 38.2318],
       [ 34.03  ],
       [ 31.    ],
       ...,
       [159.39  ],
       [160.06  ],
       [152.19  ]])

In [17]:
# Drop the last `forecast` rows of the feature array: those rows have no target
# value (their 'Prediction' is NaN after the shift).
# Rebuilt from `df` instead of slicing X in place, so re-running this cell does
# not trim a further `forecast` rows each time.
# Requires forecast >= 1 (a horizon of 0 would slice to an empty array).
X = np.array(df[['Adj. Close']])[:-forecast]

In [18]:
X

array([[ 38.2318],
       [ 34.03  ],
       [ 31.    ],
       ...,
       [171.5499],
       [175.98  ],
       [176.41  ]])

In [19]:
len(X)

1442

### Create the dependent dataset y

In [20]:
# Target vector: the shifted prices as a 1-D numpy array (still includes trailing NaNs)
y = np.array(df.loc[:, 'Prediction'])

In [21]:
len(y)

1472

In [22]:
y

array([30.771, 31.2  , 31.47 , ...,    nan,    nan,    nan])

In [23]:
# Keep every target value except the last `forecast` rows, which are NaN after the shift.
# Rebuilt from `df` instead of slicing y in place, so re-running this cell does
# not trim a further `forecast` rows each time and X and y stay the same length.
# Requires forecast >= 1 (a horizon of 0 would slice to an empty array).
y = np.array(df['Prediction'])[:-forecast]

In [24]:
len(y)

1442

In [25]:
y

array([ 30.771,  31.2  ,  31.47 , ..., 159.39 , 160.06 , 152.19 ])

### Splitting the data into training and testing data

In [26]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore the scores reported below, reproducible across kernel restarts.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows; for a time series a chronological
# split (shuffle=False) gives a more honest estimate -- confirm which is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Create and train our model

#### Support Vector Machine (Regressor)

In [27]:
# Support vector regressor with an RBF kernel (C = 1000, gamma = 0.1)
svr = SVR(C=1e3, gamma=0.1, kernel='rbf')

In [28]:
svr.fit(X_train, y_train)

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

#### Testing the model
#### `score` returns the coefficient of determination (R^2) of the prediction
#### The best possible score is 1.0

In [29]:
# R^2 of the SVR on the held-out test split (1.0 is the best possible score).
# Despite the variable name, this is a goodness-of-fit score, not a probability.
svm_confidence = svr.score(X_test, y_test)

In [30]:
svm_confidence

0.9851463090929409

#### Create the Linear Regression model

In [31]:
lr = LinearRegression()

#### Train the model

In [32]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

#### Measuring the model performance

In [33]:
# R^2 of the linear regression on the held-out test split (1.0 is the best possible score).
# Despite the variable name, this is a goodness-of-fit score, not a probability.
lr_confidence = lr.score(X_test, y_test)

In [34]:
lr_confidence

0.9814246737967903

#### Set x_forecast equal to the last n rows of the "Adj. Close" column of the original data set

In [35]:
# Inputs for the forecast: the final `forecast` rows of 'Adj. Close' as a 2-D array
x_forecast = np.array(df[['Adj. Close']].iloc[-forecast:])

In [36]:
x_forecast

array([[173.15],
       [179.52],
       [179.96],
       [177.36],
       [176.01],
       [177.91],
       [178.99],
       [183.29],
       [184.93],
       [181.46],
       [178.32],
       [175.94],
       [176.62],
       [180.4 ],
       [179.78],
       [183.71],
       [182.34],
       [185.23],
       [184.76],
       [181.88],
       [184.19],
       [183.86],
       [185.09],
       [172.56],
       [168.15],
       [169.39],
       [164.89],
       [159.39],
       [160.06],
       [152.19]])

In [37]:
len(x_forecast)

30

In [38]:
# Linear-regression predictions for the rows in x_forecast (the last `forecast`
# rows of 'Adj. Close'). Because the target was the price `forecast` rows ahead,
# each output is the model's estimate for a date beyond the end of the data.
# forecast = 30 at this point in the notebook.

lr_prediction = lr.predict(x_forecast)
lr_prediction

array([177.12904033, 183.56224086, 184.00660636, 181.38081022,
       180.01741608, 181.9362671 , 183.02698241, 187.36964525,
       189.02591666, 185.52148874, 182.35033495, 179.94672157,
       180.63346825, 184.45097186, 183.82482047, 187.79381232,
       186.41021974, 189.32889314, 188.85422999, 185.94565581,
       188.27857468, 187.94530056, 189.18750411, 176.53318659,
       172.07943238, 173.33173515, 168.787088  , 163.23251925,
       163.90916672, 155.96108381])

In [39]:
# Support-vector-regression predictions for the rows in x_forecast (the last
# `forecast` rows of 'Adj. Close'). Because the target was the price `forecast`
# rows ahead, each output is the model's estimate for a date beyond the end of the data.
# forecast = 30 at this point in the notebook.

svr_prediction = svr.predict(x_forecast)
svr_prediction

array([174.6717569 , 179.39122576, 178.31857071, 178.83300896,
       174.94930442, 180.28371738, 180.61098157, 187.10842463,
       179.36680421, 181.49167489, 180.90795537, 174.77245196,
       176.60776749, 177.87090387, 178.71833805, 185.15758087,
       186.67025426, 179.13401666, 179.7697387 , 184.1613489 ,
       182.34324364, 184.29229332, 179.16741586, 176.77064872,
       172.00149415, 172.27249123, 172.16610482, 167.63320194,
       166.21510732, 161.72092548])