In [1]:
import pandas as pd
import numpy as np
from pandas_datareader import data, wb
import datetime as dt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import matplotlib.pylab as plt
from sklearn.metrics import f1_score

In [2]:
def get_prices(tickers_list, start, end, what_price):
    df = pd.DataFrame()
    for ticker in tickers_list:
        tmp = data.DataReader(ticker, 'yahoo', start, end)
        df[ticker] = tmp[what_price]
    return df 

In [None]:
# Is pct_change good enough? Maybe make it ln(x2/x1)?
#df.plot()
#plt.legend(loc='lower left')
#plt.show()
#print(np.corrcoef(X.T)) # cross-correlation careful here need to transpose "X" 


In [None]:
# DATA PREPROCESSING 
# For first try, assign SPY as dependent variable an lag 1 day returns as explanatory variables
import warnings
warnings.filterwarnings("ignore")

#######################
tickers = ['SPY', 'TLT', 'JNK', 'IYR', 'FXE', 'GSG', 'EEM', 'IWM', 'HYG', 'UNG', 'LQD']
st = dt.datetime(2006, 1, 1)
ed = dt.datetime(2016, 1, 1)
prior_days_ago    = 1    # nos of days ago -- pct_change ago
next_days_forward = [1, 5, 10, 20, 40, 60, 90] # nos of days forward -- pct_change
#######################

df = get_prices(tickers, st, ed, 'Adj Close')
in_ = True

for idx, tick in enumerate(tickers):
    
    for n_fwd in next_days_forward:
        
        prior_n = n_fwd
        
        X = df[ tickers ].pct_change(prior_n).fillna(0) # 1 day ago returns, 2-days ago return
        
        ## add dummies for days of the week ##
        X['wk_day'] = X.index.weekday_name;   X = pd.get_dummies(X)
        X['month']  = X.index.strftime('%b');   X = pd.get_dummies(X)
        ## add dummies for days of the week ##
        
        y = df[tick].shift(-n_fwd).fillna(method='ffill') / df[tick].shift(0).fillna(method='ffill') - 1
        y = np.where(y>0, 1, 0) # dummy 1 if return > 0, else 0

        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                    train_size = 0.7, random_state = 0)

        sc = StandardScaler()

        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        ### Classifier ### 
        mlp = MLPClassifier()

        #param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
        param_range = [0.01, 10.0]

        param_grid = [{'alpha'     : param_range, # regularization strength on L2
                       'activation': ['relu'], #,'logistic', 'tanh'],
                       'solver'    : ['sgd'],
                       'hidden_layer_sizes' : [(100,100), (100,100,100)]}] # 'lbfgs','adam'

        gs = GridSearchCV(estimator = mlp,
                          param_grid = param_grid,
                          scoring='accuracy',
                          cv = 3,
                          n_jobs = -1)

        gs = gs.fit(X_train, y_train)

    #    print(gs.best_score_)
    #    print(gs.best_params_)

        best_mlp = gs.best_estimator_
        best_mlp.fit(X_train, y_train)

        train_score = best_mlp.score(X_train, y_train)
        test_score  = best_mlp.score(X_test, y_test)

        train_score = f1_score(y_train, best_mlp.predict(X_train) ) 
        test_score  = f1_score(y_test , best_mlp.predict(X_test)  )
        
        
        if in_:
            print('Ticker, Base%, TrainScore, TestScore, prior_days_ago, next_days_forward ')
            print(tick,',%.3f, %.3f, %.3f, %.f, %.f' % (y.sum()/len(y), train_score, test_score, prior_n, n_fwd)) 
            in_ = False
        else:
            print(tick,',%.3f, %.3f, %.3f, %.f, %.f' % (y.sum()/len(y), train_score, test_score, prior_n, n_fwd))        


Ticker, Base%, TrainScore, TestScore, prior_days_ago, next_days_forward 
SPY ,0.548, 0.553, 0.541, 1, 1
SPY ,0.578, 0.652, 0.593, 5, 5
SPY ,0.611, 0.675, 0.628, 10, 10
SPY ,0.646, 0.725, 0.688, 20, 20
SPY ,0.670, 0.774, 0.734, 40, 40
SPY ,0.684, 0.862, 0.783, 60, 60
SPY ,0.698, 0.846, 0.820, 90, 90
TLT ,0.524, 0.518, 0.537, 1, 1
TLT ,0.540, 0.652, 0.574, 5, 5
TLT ,0.539, 0.674, 0.607, 10, 10
TLT ,0.543, 0.702, 0.656, 20, 20
TLT ,0.597, 0.776, 0.697, 40, 40
TLT ,0.600, 0.810, 0.720, 60, 60
TLT ,0.600, 0.842, 0.813, 90, 90
JNK ,0.419, 0.587, 0.569, 1, 1
JNK ,0.470, 0.652, 0.577, 5, 5
JNK ,0.484, 0.684, 0.615, 10, 10
JNK ,0.493, 0.734, 0.664, 20, 20
JNK ,0.520, 0.872, 0.815, 40, 40
JNK ,0.539, 0.918, 0.873, 60, 60
JNK ,0.528, 0.943, 0.901, 90, 90
IYR ,0.522, 0.547, 0.493, 1, 1
IYR ,0.566, 0.639, 0.582, 5, 5
IYR ,0.577, 0.671, 0.615, 10, 10
IYR ,0.608, 0.735, 0.684, 20, 20
IYR ,0.614, 0.808, 0.762, 40, 40
IYR ,0.647, 0.823, 0.782, 60, 60
IYR ,0.674, 0.908, 0.870, 90, 90
FXE ,0.504, 0.508, 

In [None]:
X.head()

In [None]:
X['month'] = X.index.strftime('%b')
X = pd.get_dummies(X)
    
print(X.head())

In [None]:
import warnings
warnings.filterwarnings("ignore")

mlp = MLPClassifier()

#param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_range = [0.01, 10.0]


param_grid = [{'alpha'     : param_range, # regularization strength on L2
           'activation': ['relu'], #,'logistic', 'tanh'],
           'solver'    : ['sgd'],
           'hidden_layer_sizes' : [(100,100), (100,100,100)]}] # 'lbfgs','adam'

gs = GridSearchCV(estimator = mlp,
              param_grid = param_grid,
              scoring='accuracy',
              cv = 2,
              n_jobs = -1)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

best_mlp = gs.best_estimator_
best_mlp.fit(X_train, y_train)

print('Train accuracy: %.3f' % best_mlp.score(X_train, y_train))
print('Test  accuracy: %.3f' % best_mlp.score(X_test, y_test))

In [None]:
    qqq

In [None]:
adsf


In [None]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt

% matplotlib inline

def get_prices(ticks, st, ed):
# This function gets Adjusted Closing prices from Yahoo Finance
# returns a DataFrame. Inputs are ticks (list of tickers), st (start date), ed (end date)
    for idx, ticker in enumerate(ticks):
        print(ticker)
        f = web.DataReader(ticker, 'yahoo', st, ed)['Adj Close']
        f.name = ticker
        if idx==0:
            df = f
        else:
            df = pd.concat([df, f], axis=1)
        return df


In [None]:
Jimtickers = [['SPY', 'AAPL','TLT','GLD']]

start = dt.datetime(2010, 1, 1)
end   = dt.datetime.today()

df = get_prices(tickers, start, end)
print(df.shape)

In [None]:
#print(df.head())
X = df[['AAPL', 'TLT','GLD','SPY']].pct_change().fillna(0)

#print('Cross Correlation')
#print(np.corrcoef(X.T)) # careful here need to transpose "X" 

print(X.head())
# pull back prices by one day, fill NA forward, 1-day ahead returns
y = df['SPY'].shift(-2).fillna(method='ffill') - df['SPY'].shift(-1).fillna(method='ffill') 

print(df['SPY'].tail())
print(y.tail())
    
y = np.where(y>0, 1, 0) # dummy 1 if return > 0, else 0
print('Positive %.3f, negative %.3f' % (y.sum()/len(y), 1-y.sum()/len(y)))

print(y.shape)
print(type(X), type(y))

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                train_size = 0.7, random_state = 0, 
                stratify = y)

print('Positive %.3f, negative %.3f' % (y_train.sum()/len(y_train), 
                                        1-y_train.sum()/len(y_train)))

print(type(X_train), type(y_train))

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=30, alpha=1e-4,
                    solver='sgd', verbose=False, tol=1e-5, random_state=1,
                    learning_rate_init=.1, activation='logistic') #tanh')

mlp.fit(X_train, y_train)

print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pipe_lr = Pipeline([('scl', StandardScaler()),
              #      ('pca', PCA(n_components=0.75)),
                    ('clf', mlp)])
pipe_lr.fit(X_train, y_train)
print('Test Accuracy: %.3f' % (pipe_lr.score(X_test, y_test)))


In [None]:
from sklearn.model_selection import cross_val_score # doees stratified k-fold cross validation 

scores = cross_val_score(estimator = pipe_lr,
                        X=X_train, 
                        y=y_train,
                        cv = 10)

print('k-fold stratified cross validation accuracy scores: %s' %scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# find best hyper-parameters for Deep Learning Neural Network
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV

pipe_mlp = Pipeline([('scl', StandardScaler()),
                     ('clf', mlp)])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]


param_grid = [{'clf__alpha'     : param_range, # regularization strength on L2
               'clf__activation': ['relu','logistic', 'tanh'],
               'clf__solver'    : ['sgd'],
               'clf__hidden_layer_sizes' : [(100,100), (100,100,100)]}] # 'lbfgs','adam'
    
gs = GridSearchCV(estimator = pipe_mlp,
                  param_grid = param_grid,
                  scoring='accuracy',
                  cv = 2,
                  n_jobs = -1)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

best_mlp = gs.best_estimator_
best_mlp.fit(X_train, y_train)
print('Test accuracy: %.6f' % best_mlp.score(X_test, y_test))


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, criterion='gini')

rf.fit(X_train, y_train)

print("Training set score: %f" % rf.score(X_train, y_train))
print("Test set score: %f" % rf.score(X_test, y_test))


pipe_rf = Pipeline([    #('scl', StandardScaler()),
                     ('clf', rf)])

param_range = [100,200,300]

param_grid = [{'clf__n_estimators' : [100,200,300], # nos of trees in the forest
               'clf__criterion': ['gini', 'entropy']}]
    
gs = GridSearchCV(estimator = pipe_rf,
                  param_grid = param_grid,
                  scoring='accuracy',
                  cv = 2,
                  n_jobs = -1)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

best_rf = gs.best_estimator_
best_rf.fit(X_train, y_train)

print('Test accuracy: %.3f' % best_rf.score(X_test, y_test))



In [None]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel='rbf') # ‘linear’ or 'rbf'
        
svc.fit(X_train, y_train)

print("Training set score: %f" % svc.score(X_train, y_train))
print("Test set score: %f" % svc.score(X_test, y_test))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
print(type(X), type(y))
print(X.shape, y.shape)



In [None]:
h = .02  # step size in the mesh

names = ["Perceptron",
         "LogisticRegression",
         "Linear SVM", 
         "Decision Tree", 
         "Random Forest", 
         "AdaBoost",
         "RBF SVM",          
         "Neural Net", 
         "Naive Bayes",  
         "Nearest Neighbors"] 

classifiers = [
    Perceptron(),
    LogisticRegression(),
    SVC(kernel="linear", C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(),
    SVC(kernel='rbf', gamma=2, C=1),
    MLPClassifier(hidden_layer_sizes=(100,1000), alpha=1),
    GaussianNB(),
    KNeighborsClassifier(5)]

datasets = [(X, y)]             

figure = plt.figure(figsize=(30, 10))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    XX, yy = ds
    XX = StandardScaler().fit_transform(XX)
    X_train, X_test, y_train, y_test = \
        train_test_split(XX, yy, test_size=.3, random_state=42)

    x_min, x_max = XX[:, 0].min() - .5, XX[:, 0].max() + .5
    y_min, y_max = XX[:, 1].min() - .5, XX[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.3f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1



In [None]:
# Deep learning has caught the attention and fear of many investors. Applications of using neural networks
# has be employed to successfully best humans in chess, Jeporedy, and recently Go. Does this mean that 
# it could be possible to create an artificial intelligent machine to succesfully trade against human?
# In this study we attempt to gain an understanding of how powerful the popular machine learning algos are
# that are readily available to even the lay person. 
# We ask the question, can we

# Neural networks are built from Single Layer Perceptrons by stacking the perceptrons. In our model we use a two hidden 
# neural network also know as a deep learning neural network. We impose L2 regularization to mitigate high variance
# in our model. 

# Table
# Algorithm, 10-k cross validation score +/- std, best hyper-parameters, and regularization (L1, L2, or Elastic Net)
# 

In [None]:
clf


In [None]:
type(X)

In [None]:
sum(y)

In [None]:
len(y)

In [None]:
# 1) What is Machine Learning? 

# 2) Label each below with either a "S" supervised learning algo, "U" unsupervised learning algo or "Neither" 
AdaBoost # "S" Classifier(), 
Decision Tree # "S" Classifier(max_depth=5),
Density-based Spatial Clustering of Applications with Noise (DBSCAN) , # Unsupervised-Classifier
Flask # "Neither" 
IRIS # "Neither" 
K-Fold Cross Validation # "Neither" 
K-Nearest Neighbors # "S" Classifier(5)]
K-means++ # Unsupervised Classifier 
Linear Discriminant Analysis # "S" supervised 
Linear Regression # "S" Regression 
Logistic Regression # "S" Classifier (),
Multi-Layer Perceptron # "S" Classifier(hidden_layer_sizes=(100,1000), alpha=1), and Regression
MNIST # "Neither"
Normalization # "Neither"
Pandas # "Neither" 
Perceptron # "S", Classifier (),
Principal Component Analysis # "U"
Random Forest # "S", Classifier(n_estimators=100), Regressor
RANdom SAmple Consensus (RANSAC) # "S",  Supervised Regression
SciKit-Learn  # "Neither" 
Standardization # "Neither" 
Stochastic Gradient Descent # "Neither"
Support Vector Machine # "S", Classifier 

# 3) 
Match the graphs below

# 4) Given the Confusion Matrix compute:
# precision, recall, and f1-score = 2*(PRExREC)/(PRE+REC) 

# 5) Write the whole equations for regularized linear regressions:
# a) Ridge Regression = 
# b) Least Absolute Shrinkage and Selection Ooperator
    