# Predicting Price Returns

[Forecasting Financial Time Series - Part I](https://www.quantstart.com/articles/Forecasting-Financial-Time-Series-Part-1)

## Other references

http://francescopochetti.com/stock-market-prediction-part-introduction/


https://www.cs.princeton.edu/sites/default/files/uploads/saahil_madge.pdf

https://www.quantstart.com/articles



In [1]:
import datetime
import numpy as np
import pandas as pd
import sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.qda import QDA

In [2]:
symbol = 'SPY'
start_date = '2012-01-01'
end_date = '2016-12-31'
lags = 5

start_test = '2016-06-01'

In [3]:
ts = get_pricing('SPY', start_date, end_date)
ts.head()

Unnamed: 0,open_price,high,low,close_price,volume,price
2012-01-03 00:00:00+00:00,116.08,116.643,115.78,115.925,145314347.0,115.925
2012-01-04 00:00:00+00:00,115.571,116.125,115.126,116.007,105644656.0,116.007
2012-01-05 00:00:00+00:00,115.398,116.506,114.871,116.352,143200079.0,116.352
2012-01-06 00:00:00+00:00,116.478,116.497,115.653,116.107,119860034.0,116.107
2012-01-09 00:00:00+00:00,116.298,116.461,115.762,116.298,85232476.0,116.298


In [4]:
tslag = ts[['price']].copy()
tslag

Unnamed: 0,price
2012-01-03 00:00:00+00:00,115.925
2012-01-04 00:00:00+00:00,116.007
2012-01-05 00:00:00+00:00,116.352
2012-01-06 00:00:00+00:00,116.107
2012-01-09 00:00:00+00:00,116.298
2012-01-10 00:00:00+00:00,117.352
2012-01-11 00:00:00+00:00,117.424
2012-01-12 00:00:00+00:00,117.688
2012-01-13 00:00:00+00:00,117.160
2012-01-17 00:00:00+00:00,117.506


In [5]:
for i in xrange(0,lags):
    tslag["Lag_" + str(i+1)] = tslag["price"].shift(i+1)
tslag["returns"] = tslag["price"].pct_change()
tslag.head(10)    

Unnamed: 0,price,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,returns
2012-01-03 00:00:00+00:00,115.925,,,,,,
2012-01-04 00:00:00+00:00,116.007,115.925,,,,,0.000707
2012-01-05 00:00:00+00:00,116.352,116.007,115.925,,,,0.002974
2012-01-06 00:00:00+00:00,116.107,116.352,116.007,115.925,,,-0.002106
2012-01-09 00:00:00+00:00,116.298,116.107,116.352,116.007,115.925,,0.001645
2012-01-10 00:00:00+00:00,117.352,116.298,116.107,116.352,116.007,115.925,0.009063
2012-01-11 00:00:00+00:00,117.424,117.352,116.298,116.107,116.352,116.007,0.000614
2012-01-12 00:00:00+00:00,117.688,117.424,117.352,116.298,116.107,116.352,0.002248
2012-01-13 00:00:00+00:00,117.16,117.688,117.424,117.352,116.298,116.107,-0.004486
2012-01-17 00:00:00+00:00,117.506,117.16,117.688,117.424,117.352,116.298,0.002953


In [6]:
# If any of the values of percentage returns equal zero, set them to
# a small number (stops issues with QDA model in scikit-learn)
for k, v in enumerate(tslag["returns"]):
    if (abs(v) < 0.0001):
        tslag["returns"][k] = 0.0001

In [7]:
# Create the lagged percentage returns columns
for i in xrange(0, lags):
    tslag["Lag_returns_" + str(i+1)] = tslag["Lag_" + str(i+1)].pct_change()

In [8]:
tslag.head(7)

Unnamed: 0,price,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,returns,Lag_returns_1,Lag_returns_2,Lag_returns_3,Lag_returns_4,Lag_returns_5
2012-01-03 00:00:00+00:00,115.925,,,,,,,,,,,
2012-01-04 00:00:00+00:00,116.007,115.925,,,,,0.000707,,,,,
2012-01-05 00:00:00+00:00,116.352,116.007,115.925,,,,0.002974,0.000707,,,,
2012-01-06 00:00:00+00:00,116.107,116.352,116.007,115.925,,,-0.002106,0.002974,0.000707,,,
2012-01-09 00:00:00+00:00,116.298,116.107,116.352,116.007,115.925,,0.001645,-0.002106,0.002974,0.000707,,
2012-01-10 00:00:00+00:00,117.352,116.298,116.107,116.352,116.007,115.925,0.009063,0.001645,-0.002106,0.002974,0.000707,
2012-01-11 00:00:00+00:00,117.424,117.352,116.298,116.107,116.352,116.007,0.000614,0.009063,0.001645,-0.002106,0.002974,0.000707


In [9]:
# Create the "Direction" column (+1 or -1) indicating an up/down day
tslag["Direction"] = np.sign(tslag["returns"])
#tslag = tslag[tslag.index >= start_date]
tslag.dropna(inplace=True)
tslag.head()

Unnamed: 0,price,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,returns,Lag_returns_1,Lag_returns_2,Lag_returns_3,Lag_returns_4,Lag_returns_5,Direction
2012-01-11 00:00:00+00:00,117.424,117.352,116.298,116.107,116.352,116.007,0.000614,0.009063,0.001645,-0.002106,0.002974,0.000707,1.0
2012-01-12 00:00:00+00:00,117.688,117.424,117.352,116.298,116.107,116.352,0.002248,0.000614,0.009063,0.001645,-0.002106,0.002974,1.0
2012-01-13 00:00:00+00:00,117.16,117.688,117.424,117.352,116.298,116.107,-0.004486,0.002248,0.000614,0.009063,0.001645,-0.002106,-1.0
2012-01-17 00:00:00+00:00,117.506,117.16,117.688,117.424,117.352,116.298,0.002953,-0.004486,0.002248,0.000614,0.009063,0.001645,1.0
2012-01-18 00:00:00+00:00,118.823,117.506,117.16,117.688,117.424,117.352,0.011208,0.002953,-0.004486,0.002248,0.000614,0.009063,1.0


In [10]:
# Use the prior two days of returns as predictor values, with direction as the response
X = tslag[["Lag_1","Lag_2"]]
y = tslag["Direction"]

In [11]:
# Create training and test sets
X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
y_train = y[y.index < start_test]
y_test = y[y.index >= start_test]

In [12]:
# Create prediction DataFrame
pred = pd.DataFrame(index=y_test.index)

In [13]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
pred = (1.0 + y_pred * y_test)/2.0
hit_rate = np.mean(pred)
print('Logistic Regresstion {:.3f}').format(hit_rate)

Logistic Regresstion 0.523


In [14]:
lda = LDA()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
pred = (1.0 + y_pred * y_test)/2.0
hit_rate = np.mean(pred)
print('LDA {:.3f}').format(hit_rate)

LDA 0.523


In [15]:
models = QDA()
models.fit(X_train, y_train)
y_pred = models.predict(X_test)
pred = (1.0 + y_pred * y_test)/2.0
hit_rate = np.mean(pred)
print('QDA {:.3f}').format(hit_rate)

QDA 0.570


***