In [25]:
%matplotlib inline

import common
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from datetime import datetime
from pylab import rcParams
rcParams['figure.figsize'] = 12, 6

In [3]:
DJIA = pd.read_csv('../data/DowJonesIA.csv')
prices = common.load_quandl_absolute_prices(DJIA, datetime(2000, 1, 1))

Loading prices from cache


In [4]:
def weighted_ma(prices):
    n = len(prices)
    coeff = np.arange(1, n + 1)
    weighted = prices * coeff
    return weighted.sum() / coeff.sum()

def momentum(prices):
    return (prices[-1] - prices[0]) / prices[0]

def bollinger(prices):
    m = prices.mean()
    s = prices.std()
    return (prices[-1] - m) / s

def RSI(series):
    delta = series.diff()
    ups, dns = delta.copy(), delta.copy()
    ups.loc[ups < 0] = 0
    dns.loc[ups > 0] = 0
    rs = pd.rolling_mean(ups, 10).values + pd.rolling_mean(dns, 10).values
    rsi = 100 - 100 / (1 + rs)
    return rsi

def stochastic(prices):
    low = prices.min()
    high = prices.max()
    return (prices[-1] - low) / (high - low)

Items in rolling apply are in the same order they are in the source series

In [5]:
dataset = pd.DataFrame()

for ticker in prices['Ticker'].unique():
    ticker_prices = prices[prices['Ticker'] == ticker] 

    ticker_prices.loc[:,'wma'] = pd.rolling_apply(ticker_prices['Close'], 10, weighted_ma)
    ticker_prices.loc[:,'mom'] = pd.rolling_apply(ticker_prices['Close'], 10, momentum)
    ticker_prices.loc[:,'bol'] = pd.rolling_apply(ticker_prices['Close'], 10, bollinger)  
    ticker_prices.loc[:,'rsi'] = RSI(ticker_prices['Close'])
    
    sto_k = pd.rolling_apply(ticker_prices['Close'], 10, stochastic) 
    sto_d = pd.rolling_mean(sto_k, 3)
    ticker_prices.loc[:,'sto'] = sto_k - sto_d    
        
    features = ['wma', 'mom', 'bol', 'rsi', 'sto']
    ticker_data = ticker_prices[features]
    ticker_data = (ticker_data - ticker_data.mean()) / ticker_data.std()

    # dependent variables
    ticker_prices.loc[:, 'y'] = common.rdiff(ticker_prices.shift(-1)['Close'], 2)
    ticker_prices.loc[:, 'yClass'] = 'Neutral'
    ticker_data.loc[ticker_prices['y'] > 0, 'yClass'] = 'Gain'
    ticker_data.loc[ticker_prices['y'] < 0, 'yClass'] = 'Loss'
    
    dataset = dataset.append(ticker_data)

Try using .loc[row_index,col_indexer] = value instead
  self.obj[item] = s


In [6]:
dataset = dataset.dropna()

In [8]:
len(dataset)

95407

# Error Estimation

## LogisticRegression 

In [12]:
X = dataset.drop(['yClass'], axis=1)
y = dataset['yClass']

pipe = Pipeline([
    ('classifier', LogisticRegression())
])

hyperparams = {
    'classifier__penalty': ['l1', 'l2'],
}

gs = GridSearchCV(pipe, hyperparams)
scores = cross_val_score(gs, X, y, cv = 10)
scores.mean()

0.51438575917638474

0.51439624355396996

## GaussianNB

In [23]:
X = dataset.drop(['yClass'], axis=1)
y = dataset['yClass']

pipe = Pipeline([
    ('selector', SelectKBest(f_classif)),
    ('classifier', GaussianNB())
])

hyperparams = {
    'selector__k': np.arange(1, X.shape[1]),
}

gs = GridSearchCV(pipe, hyperparams, verbose=2)
scores = cross_val_score(gs, X, y)
scores.mean()

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=3 ...................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    3.6s finished



Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=3 ...................................................
[CV] ...........

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    3.7s finished



Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=1 ...................................................
[CV] .......................................... selector__k=1 -   0.2s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=2 ...................................................
[CV] .......................................... selector__k=2 -   0.3s
[CV] selector__k=3 ...................................................
[CV] ...........

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    3.7s finished





0.51314893850442778

## kNN

In [19]:
X = dataset.drop(['yClass'], axis=1)
y = dataset['yClass']

pipe = Pipeline([
    ('classifier', KNeighborsClassifier(3, weights='distance'))
])

hyperparams = {
    'classifier__n_neighbors': [6, 8, 10, 12, 14],
    'classifier__weights': ['uniform', 'distance']
}

# gs = GridSearchCV(pipe, hyperparams, verbose=10)
scores = cross_val_score(pipe, X, y, cv = 10)
scores.mean()

0.50347442143660248

## SVM

In [None]:
X = dataset.drop(['yClass'], axis=1)
y = dataset['yClass']

pipe = Pipeline([
    ('classifier', LinearSVC())
])

hyperparams = {
    'classifier__n_neighbors': [6, 8, 10, 12, 14],
    'classifier__weights': ['uniform', 'distance']
}

# gs = GridSearchCV(pipe, hyperparams, verbose=10)
scores = cross_val_score(pipe, X, y)
scores.mean()