In [2]:
%matplotlib inline

# import talib
import common
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from datetime import datetime
from pylab import rcParams
rcParams['figure.figsize'] = 12, 6

In [3]:
DJIA = pd.read_csv('../data/DowJonesIA.csv')

In [4]:
prices = common.load_quandl_absolute_prices(DJIA, datetime(2000, 1, 1))

Loading prices from cache


In [4]:
def sign(num):
    if num < 0:
        return -1
    if num == 0:
        return 0
    if num > 0:
        return 1

def crossover(df, col1, col2):
    def find_crossover(vals):
        if sign(vals[0]) == sign(vals[1]):
            return 0
        else:
            return sign(vals[0])
    
    diff = df[col2] - df[col1]
    return pd.rolling_apply(diff, 2, find_crossover)

In [5]:
def kappa(cm):
    num_classes = len(cm)
    sum_all = 0
    sum_diag = 0
    sum_rands = 0
    for i in range(0, num_classes):
        sum_diag = sum_diag + cm[i, i]
        sum_col = 0
        sum_row = 0
        for j in range(0, num_classes):
            sum_col = sum_col + cm[j, i]
            sum_row = sum_row + cm[i, j]
            sum_all = sum_all + cm[i, j]
        sum_rands = sum_rands + sum_row * sum_col
    acc = sum_diag * 1.0 / sum_all
    rand = sum_rands * 1.0 / (sum_all * sum_all)
    return (acc - rand) / (1 - rand)

kappa_scorer = make_scorer(lambda y_true, y_pred: kappa(confusion_matrix(y_true, y_pred)), greater_is_better=True)

In [6]:
dataset = pd.DataFrame()

def augmented_crossover(df, col1, col2):
    def find_crossover(vals):
        if sign(vals[0]) == sign(vals[1]):
            return 0
        else:
            return vals[0] - vals[1]
    
    diff = df[col2] - df[col1]
    return pd.rolling_apply(diff, 2, find_crossover)

for ticker in prices['Ticker'].unique():
    ticker_prices = prices[prices['Ticker'] == ticker] 
        
    #Moving averages(1, 51):
    for term in range(1, 51):
        ticker_prices[term] = pd.rolling_mean(ticker_prices['Close'], term)
#     ticker_prices['Crossover'] = crossover(ticker_prices, '10MA', '20MA')
    #ticker_prices['momentum'] = talib.MOM(ticker_prices['Close'].values)
    
    #dependent variables
    ticker_prices['y'] = common.rdiff(ticker_prices.shift(-1)['Close'], 2)
    ticker_prices['yClass'] = 'Neutral'
    ticker_prices.loc[ticker_prices['y'] > 0, 'yClass'] = 'Gain'
    ticker_prices.loc[ticker_prices['y'] < 0, 'yClass'] = 'Loss'
    
    dataset = dataset.append(ticker_prices)

Try using .loc[row_index,col_indexer] = value instead
  self.obj[item] = s


In [None]:
sample = dataset[:80]
sample['n'] = np.arange(len(sample))

plt.plot(sample['Close'], label='Price', linewidth=2)

shortMA = sample[10]
longMA = sample[20]
plt.fill_between(np.arange(len(sample)), shortMA, longMA, where=shortMA>=longMA, facecolor='green', alpha=0.3)
plt.fill_between(np.arange(len(sample)), shortMA, longMA, where=shortMA<longMA, facecolor='red', alpha=0.3)
plt.plot(shortMA, color='g', label='Short Moving Average')
plt.plot(longMA, color='r', label='Long Moving Average')
plt.legend()
    
plt.twinx().plot(sample['Crossover'])    
    
plt.ylabel('Price')    
plt.xlabel('Time')
plt.title('Plot displaying elements of a moving average crossover strategy')

Should decide 
* cross val methodology
* score metric
* features

# Simple Predictor

## Split data into train and test sets by company

In [7]:
train_tickers = np.random.choice(dataset['Ticker'].unique(), 20, replace = False)
test_tickers = list(set(dataset['Ticker'].unique()) - set(train_tickers))

train_dataset = dataset[dataset['Ticker'].isin(train_tickers)]
test_dataset = dataset[dataset['Ticker'].isin(test_tickers)]

In [10]:
len(test_dataset)

38315

Can use NaN zones between tickers as buffer zones for the crossover points

In [None]:
best_score    = 0
best_short_ma = 0
best_long_ma  = 0

for short_ma in range(1, 51):
    print 'short_ma: ', short_ma
    for long_ma in range(short_ma+1, 51):
        train_dataset['Crossover'] = crossover(train_dataset, short_ma, long_ma)
        train_dataset.dropna(subset=['Crossover'], inplace=True)

        sample = train_dataset[train_dataset['Crossover'] != 0]
        sample.loc[sample['Crossover'] == 1, 'y_pred'] = 'Gain'
        sample.loc[sample['Crossover'] == -1, 'y_pred'] = 'Loss'

        score = accuracy_score(sample['yClass'], sample['y_pred'].astype(str))  
            
        if score > best_score:
            best_score = score
            best_short_ma = short_ma
            best_long_ma = long_ma
            print best_score, best_short_ma, best_long_ma
        

Results

In [27]:
"""
0.501616031028 2 36
0.507306434024 2 37
0.51274581209 8 33
0.521982116244 8 34
0.522935779817 8 35
"""

'\n0.501616031028 2 36\n0.507306434024 2 37\n0.51274581209 8 33\n0.521982116244 8 34\n0.522935779817 8 35\n'

# Cross Validation

In [20]:
best_pair = (8, 35)

short_ma, long_ma = best_pair

test_dataset['Crossover'] = crossover(test_dataset, short_ma, long_ma)
test_dataset.dropna(subset=['Crossover'], inplace=True)

sample = test_dataset[test_dataset['Crossover'] != 0]
sample.loc[sample['Crossover'] == 1, 'y_pred'] = 'Gain'
sample.loc[sample['Crossover'] == -1, 'y_pred'] = 'Loss'

score = accuracy_score(sample['yClass'], sample['y_pred'].astype(str))              
print score



print kappa(confusion_matrix(sample['yClass'], sample['y_pred'].astype(str), labels=['Gain', 'Loss']))

0.515759312321
0.0427171433815


In [19]:
(363+333) / 1381.0


0.503982621288921