In [1]:
%matplotlib inline

import common
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import pand

from datetime import datetime
from pylab import rcParams
rcParams['figure.figsize'] = 12, 6

In [2]:
DJIA = pd.read_csv('../data/DowJonesIA.csv')
prices = common.load_quandl_absolute_prices(DJIA, datetime(2000, 1, 1))

Loading prices from cache


In [45]:
def weighted_ma(prices):
    n = len(prices)
    coeff = np.arange(1, n + 1)
    weighted = prices * coeff
    return weighted.sum() / coeff.sum()

def momentum(prices):
    return (prices[-1] - prices[0]) / prices[0]

def bollinger(prices):
    m = prices.mean()
    s = prices.std()
    return (prices[-1] - m) / s

def RSI(series):
    delta = series.diff()
    ups, dns = delta.copy(), delta.copy()
    ups.loc[ups < 0] = 0
    dns.loc[ups > 0] = 0
    rs = pd.rolling_mean(ups, 10).values + pd.rolling_mean(dns, 10).values
    rsi = 100 - 100 / (1 + rs)
    return rsi

def stochastic(prices):
    low = prices.min()
    high = prices.max()
    return (prices[-1] - low) / (high - low)

Items in rolling apply are in the same order they are in the source series

In [47]:
dataset = pd.DataFrame()

for ticker in prices['Ticker'].unique():
    ticker_prices = prices[prices['Ticker'] == ticker] 

    ticker_prices.loc[:,'wma'] = pd.rolling_apply(ticker_prices['Close'], 10, weighted_ma)
    ticker_prices.loc[:,'mom'] = pd.rolling_apply(ticker_prices['Close'], 10, momentum)
    ticker_prices.loc[:,'bol'] = pd.rolling_apply(ticker_prices['Close'], 10, bollinger)  
    ticker_prices.loc[:,'rsi'] = RSI(ticker_prices['Close'])
    
    sto_k = pd.rolling_apply(ticker_prices['Close'], 10, stochastic) 
    sto_d = pd.rolling_mean(sto_k, 3)
    ticker_prices.loc[:,'sto'] = sto_k - sto_d    
        
    features = ['wma', 'mom', 'bol', 'rsi', 'sto']
    ticker_data = ticker_prices[features]
    ticker_data = (ticker_data - ticker_data.mean()) / ticker_data.std()

    # dependent variables
    ticker_prices.loc[:, 'y'] = common.rdiff(ticker_prices.shift(-1)['Close'], 2)
    ticker_prices.loc[:, 'yClass'] = 'Neutral'
    ticker_data.loc[ticker_prices['y'] > 0, 'yClass'] = 'Gain'
    ticker_data.loc[ticker_prices['y'] < 0, 'yClass'] = 'Loss'
    
    dataset = dataset.append(ticker_data)

In [52]:
dataset.count()

wma       112178
mom       112178
bol       112178
rsi        96861
yClass    110824
dtype: int64

In [66]:
dataset = dataset.dropna()

# Error Estimation

## LogisticRegression 

In [73]:
X = dataset.drop(['yClass'], axis=1)
y = dataset['yClass']

pipe = Pipeline([
    ('classifier', LogisticRegression())
])

scores = cross_val_score(pipe, X, y, cv = 10)
scores.mean()

0.51520531596382058

## GaussianNB

In [70]:
X = dataset.drop(['yClass'], axis=1)
y = dataset['yClass']

pipe = Pipeline([
    ('selector', SelectKBest(f_classif)),
    ('classifier', GaussianNB())
])

hyperparams = {
    'selector__k': np.arange(1, X.shape[1]),
}

gs = GridSearchCV(pipe, hyperparams, cv = 10)
scores = cross_val_score(gs, X, y, cv = 10)
scores.mean()

0.51343296619428802