In [1]:
#BORIS CHANGE
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt

% matplotlib inline

def get_prices(ticks, st, ed):
    """Download Adjusted Close prices from Yahoo Finance.

    Parameters
    ----------
    ticks : iterable
        Tickers to download. Each element is handed unchanged to
        ``web.DataReader``, so an element may itself be a list of symbols
        (as the caller in this notebook does).
    st : start date forwarded to ``web.DataReader``.
    ed : end date forwarded to ``web.DataReader``.

    Returns
    -------
    pandas.DataFrame
        The 'Adj Close' data of every element of ``ticks``, joined
        column-wise. An empty DataFrame is returned when ``ticks`` is empty.
    """
    frames = []
    for ticker in ticks:
        print(ticker)
        prices = web.DataReader(ticker, 'yahoo', st, ed)['Adj Close']
        if isinstance(prices, pd.Series):
            # A single symbol yields a Series; name it after its ticker so the
            # column is identifiable after the concat.
            prices = prices.rename(ticker)
        frames.append(prices)

    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    # Return only once every ticker has been fetched. Previously the return
    # statement sat inside the loop, so only the first element was downloaded.
    return pd.concat(frames, axis=1)


In [2]:
# Symbols to download. They are wrapped in an outer list, so get_prices
# receives a single element that holds all four tickers.
tickers = [['SPY', 'AAPL', 'TLT', 'GLD']]

# Sample window: 1 Jan 2010 up to the moment this cell is executed
start, end = dt.datetime(2010, 1, 1), dt.datetime.today()

# Price table used by the rest of the notebook
df = get_prices(tickers, start, end)
print(df.shape)

['SPY', 'AAPL', 'TLT', 'GLD']
(1741, 4)


In [3]:
# Features: same-day percentage change of each asset. The first row has no
# previous price, so its NaN is replaced with 0.
X = df[['AAPL', 'TLT', 'GLD', 'SPY']].pct_change().fillna(0)

# Optional diagnostic (np.corrcoef expects variables in rows, hence the transpose):
# print(np.corrcoef(X.T))

print(X.head())

# Target: SPY price change from day t+1 to day t+2, stored on row t.
# The shifts leave NaN at the end of each series; forward-filling them makes
# the last two differences exactly 0, so those rows are labelled 0 below.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# NOTE(review): the original comment called this "1-day ahead returns", but the
# shifts (-2 and -1) skip the t -> t+1 move - confirm this offset is intended.
spy = df['SPY']
y = spy.shift(-2).ffill() - spy.shift(-1).ffill()

print(spy.tail())
print(y.tail())

y = np.where(y > 0, 1, 0)  # dummy 1 if the change is > 0, else 0
pos_share = y.sum() / len(y)
print('Positive %.3f, negative %.3f' % (pos_share, 1 - pos_share))

print(y.shape)
print(type(X), type(y))

                AAPL       TLT       GLD       SPY
Date                                              
2010-01-04  0.000000  0.000000  0.000000  0.000000
2010-01-05  0.001729  0.006458 -0.000911  0.002647
2010-01-06 -0.015906 -0.013386  0.016500  0.000704
2010-01-07 -0.001849  0.001682 -0.006188  0.004221
2010-01-08  0.006648 -0.000448  0.004963  0.003328
Date
2016-11-23    220.699997
2016-11-25    221.520004
2016-11-28    220.479996
2016-11-29    220.910004
2016-11-30    220.380005
Name: SPY, dtype: float64
Date
2016-11-23   -1.040008
2016-11-25    0.430008
2016-11-28   -0.529999
2016-11-29    0.000000
2016-11-30    0.000000
Name: SPY, dtype: float64
Positive 0.550, negative 0.450
(1741,)
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>


In [4]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on the label and seeded for repeatability.
# NOTE(review): rows are dated observations; confirm that a split which does
# not respect time order is acceptable for this study.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=0, stratify=y)

# Share of positive labels in the training partition
train_pos_share = y_train.sum() / len(y_train)
print('Positive %.3f, negative %.3f' % (train_pos_share, 1 - train_pos_share))

print(type(X_train), type(y_train))

Positive 0.550, negative 0.450
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>


In [10]:
from sklearn.neural_network import MLPClassifier

# One hidden layer of 100 logistic units, trained with SGD for at most 30
# iterations; seeded so repeated runs give the same fit.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='logistic',  # 'tanh' is the alternative noted by the author
    solver='sgd',
    learning_rate_init=.1,
    alpha=1e-4,
    max_iter=30,
    tol=1e-5,
    random_state=1,
    verbose=False,
)
mlp.fit(X_train, y_train)

# Accuracy on the data the model was fit on versus the held-out data
mlp_train_score = mlp.score(X_train, y_train)
mlp_test_score = mlp.score(X_test, y_test)
print("Training set score: %f" % mlp_train_score)
print("Test set score: %f" % mlp_test_score)


Training set score: 0.550082
Test set score: 0.550669


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Standardise the features, then classify with the `mlp` estimator.
# A ('pca', PCA(n_components=0.75)) step between the two was left disabled.
pipeline_steps = [
    ('scl', StandardScaler()),
    ('clf', mlp),
]
pipe_lr = Pipeline(pipeline_steps)
pipe_lr.fit(X_train, y_train)

pipe_test_accuracy = pipe_lr.score(X_test, y_test)
print('Test Accuracy: %.3f' % pipe_test_accuracy)


Test Accuracy: 0.551


In [12]:
from sklearn.model_selection import cross_val_score # doees stratified k-fold cross validation 

# 10-fold cross-validation of the scaler + MLP pipeline on the training split
scores = cross_val_score(pipe_lr, X_train, y_train, cv=10)

print('k-fold stratified cross validation accuracy scores: %s' % scores)
# Mean and (population) standard deviation of the per-fold accuracies
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

k-fold stratified cross validation accuracy scores: [ 0.54918033  0.54918033  0.54918033  0.54918033  0.54918033  0.54918033
  0.54918033  0.54918033  0.55371901  0.55371901]
CV accuracy: 0.550 +/- 0.002


In [13]:
# Grid search over the hyper-parameters of the scaled MLP pipeline
import warnings
# Silence every warning raised from this point on
warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV

pipe_mlp = Pipeline([('scl', StandardScaler()), ('clf', mlp)])

# Candidate values for the L2 regularization strength
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Keys use the "<step>__<parameter>" form to address the 'clf' pipeline step
param_grid = [{
    'clf__alpha': param_range,
    'clf__activation': ['relu', 'logistic', 'tanh'],
    'clf__solver': ['sgd'],  # 'lbfgs' and 'adam' are not searched
    'clf__hidden_layer_sizes': [(100, 100), (100, 100, 100)],
}]

gs = GridSearchCV(
    estimator=pipe_mlp,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

# Fit the winning configuration on the training split and score it on the test split
best_mlp = gs.best_estimator_
best_mlp.fit(X_train, y_train)
best_mlp_test_accuracy = best_mlp.score(X_test, y_test)
print('Test accuracy: %.3f' % best_mlp_test_accuracy)


0.550082101806
{'clf__activation': 'relu', 'clf__solver': 'sgd', 'clf__hidden_layer_sizes': (100, 100), 'clf__alpha': 1.0}
Test accuracy: 0.551


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest. Seeded (random_state) so that, like the other models in
# this notebook, re-running the cell reproduces the same scores.
rf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=0)

rf.fit(X_train, y_train)

print("Training set score: %f" % rf.score(X_train, y_train))
print("Test set score: %f" % rf.score(X_test, y_test))


# Single-step pipeline; the scaler step is left out for the forest
pipe_rf = Pipeline([('clf', rf)])

# Candidate numbers of trees in the forest
param_range = [100, 200, 300]

param_grid = [{'clf__n_estimators': param_range,
               'clf__criterion': ['gini', 'entropy']}]

gs = GridSearchCV(estimator=pipe_rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

# With the default refit=True, GridSearchCV has already refit the best
# configuration on all of X_train, so no further fit call is needed here.
best_rf = gs.best_estimator_
print('Test accuracy: %.3f' % best_rf.score(X_test, y_test))



Training set score: 1.000000
Test set score: 0.500956
0.50328407225
{'clf__n_estimators': 100, 'clf__criterion': 'gini'}
Test accuracy: 0.520


In [None]:
from sklearn.svm import SVC

# Support-vector classifier; the kernel can be 'linear' or 'rbf'
svc = SVC(kernel='rbf', C=1.0)
svc.fit(X_train, y_train)

# Accuracy on the fitting data versus the held-out data
svc_train_score = svc.score(X_train, y_train)
svc_test_score = svc.score(X_test, y_test)
print("Training set score: %f" % svc_train_score)
print("Test set score: %f" % svc_test_score)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Sanity check: container types, then shapes, of the features and the labels
print(*(type(obj) for obj in (X, y)))
print(*(obj.shape for obj in (X, y)))



In [None]:
h = .02  # step size in the mesh

# Panel titles, in the same order as `classifiers` (the two are paired by zip)
names = ["Perceptron",
         "LogisticRegression",
         "Linear SVM",
         "Decision Tree",
         "Random Forest",
         "AdaBoost",
         "RBF SVM",
         "Neural Net",
         "Naive Bayes",
         "Nearest Neighbors"]

classifiers = [
    Perceptron(),
    LogisticRegression(),
    SVC(kernel="linear", C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(),
    SVC(kernel='rbf', gamma=2, C=1),
    MLPClassifier(hidden_layer_sizes=(100,1000), alpha=1),
    GaussianNB(),
    KNeighborsClassifier(5)]

datasets = [(X, y)]

figure = plt.figure(figsize=(30, 10))
i = 1  # 1-based index of the next subplot to draw
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    # (labels get their own name so the meshgrid `yy` below cannot overwrite them)
    XX, labels = ds
    XX = StandardScaler().fit_transform(XX)
    # NOTE: this rebinds the notebook-level X_train/X_test/y_train/y_test to
    # standardised arrays produced by a different split (random_state=42).
    X_train, X_test, y_train, y_test = \
        train_test_split(XX, labels, test_size=.3, random_state=42)

    # The plots show the first two feature columns only
    x_min, x_max = XX[:, 0].min() - .5, XX[:, 0].max() + .5
    y_min, y_max = XX[:, 1].min() - .5, XX[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # The classifiers are fit on every column of XX, so the mesh must have the
    # same number of columns. The two plotted features vary over the grid; the
    # remaining ones are held at 0, which is their mean after standardisation.
    # With exactly two features this equals np.c_[xx.ravel(), yy.ravel()].
    mesh_points = np.zeros((xx.size, XX.shape[1]))
    mesh_points[:, 0] = xx.ravel()
    mesh_points[:, 1] = yy.ravel()

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(mesh_points)
        else:
            Z = clf.predict_proba(mesh_points)[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        # Test accuracy in the lower-right corner of the panel
        ax.text(xx.max() - .3, yy.min() + .3, ('%.3f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1



In [None]:
# Deep learning has caught the attention, and stoked the fears, of many investors. Applications of neural networks
# have been employed to successfully best humans in chess, Jeopardy, and recently Go. Does this mean that
# it could be possible to create an artificially intelligent machine that successfully trades against humans?
# In this study we attempt to gain an understanding of how powerful the popular machine learning algorithms
# that are readily available even to the layperson really are.
# We ask the question, can we

# Neural networks are built by stacking single-layer perceptrons. In our model we use a neural network with two
# hidden layers, also known as a deep learning neural network. We impose L2 regularization to mitigate high variance
# in our model.

# Table
# Algorithm, 10-k cross validation score +/- std, best hyper-parameters, and regularization (L1, L2, or Elastic Net)
# 

In [None]:
# Display the estimator currently bound to `clf` - presumably the last
# classifier fitted by the comparison loop; depends on cell execution order.
clf


In [None]:
# Display the container type of the feature matrix X
type(X)

In [None]:
# Sum of the labels; y holds 0/1 dummies, so this counts the positive rows
sum(y)

In [None]:
# Total number of labelled rows
len(y)