In [4]:
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC

from create_lagged_series import create_lagged_series


def run_models(X_train, X_test, y_train, y_test):
    # Define the models
    models = {
        "Logistic Regression": LogisticRegression(),
        "LDA": LDA(),
        "QDA": QDA(),
        "Linear SVC": LinearSVC(),
        "SVM (RBF Kernel)": SVC(
            C=1000000.0, gamma=0.0001, kernel='rbf'
        ),
        "Random Forest": RandomForestClassifier(
            n_estimators=1000, max_features='sqrt'
        )
    }

    # Iterate through the models
    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        pred = model.predict(X_test)

        # Output the hit-rate and the confusion matrix for each model
        print(f"{name}:\n{model.score(X_test, y_test):.3f}")
        print(f"{confusion_matrix(pred, y_test)}\n")


if __name__ == "__main__":
    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series(
        "^GSPC", datetime.datetime(2001, 1, 10),
        datetime.datetime(2005, 12, 31), lags=5
    )

    # Use the prior two days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=42
    )

    # Run models and print results
    print("Hit Rates/Confusion Matrices:\n")
    run_models(X_train, X_test, y_train, y_test)



[*********************100%%**********************]  1 of 1 completed


Hit Rates/Confusion Matrices:

Logistic Regression:
0.513
[[ 69  67]
 [420 444]]

LDA:
0.513
[[ 69  67]
 [420 444]]

QDA:
0.503
[[ 83  91]
 [406 420]]





Linear SVC:
0.513
[[ 69  67]
 [420 444]]

SVM (RBF Kernel):
0.508
[[ 13  16]
 [476 495]]

Random Forest:
0.489
[[201 223]
 [288 288]]



In [5]:

from __future__ import print_function
import datetime
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from create_lagged_series import create_lagged_series

if __name__ == "__main__":
    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series(
        "^GSPC", datetime.datetime(2001, 1, 10),
        datetime.datetime(2005, 12, 31), lags=5
    )

    # Use the prior two days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    # Create a k-fold cross-validation object
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # Use the kf object to create index arrays that state which elements have been retained for training
    # and which elements have been retained for testing for each k-element iteration
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # In this instance only use the Radial Support Vector Machine (SVM)
        print("Hit Rate/Confusion Matrix:")
        model = SVC(
            C=1000000.0, gamma=0.0001, kernel='rbf',
            tol=0.001, verbose=False
        )

        # Train the model on the retained training data
        model.fit(X_train, y_train)

        # Make an array of predictions on the test set
        pred = model.predict(X_test)

        # Output the hit-rate and the confusion matrix for each model
        print(f"{model.score(X_test, y_test):.3f}")
        print(f"{confusion_matrix(pred, y_test)}\n")


[*********************100%%**********************]  1 of 1 completed


Hit Rate/Confusion Matrix:
0.544
[[10  7]
 [50 58]]

Hit Rate/Confusion Matrix:
0.408
[[ 2  4]
 [70 49]]

Hit Rate/Confusion Matrix:
0.528
[[ 4  4]
 [55 62]]

Hit Rate/Confusion Matrix:
0.520
[[ 4  3]
 [57 61]]

Hit Rate/Confusion Matrix:
0.512
[[ 6  4]
 [57 58]]

Hit Rate/Confusion Matrix:
0.480
[[ 7  7]
 [58 53]]

Hit Rate/Confusion Matrix:
0.624
[[ 8  7]
 [40 70]]

Hit Rate/Confusion Matrix:
0.464
[[ 7 13]
 [54 51]]

Hit Rate/Confusion Matrix:
0.560
[[ 5  4]
 [51 65]]

Hit Rate/Confusion Matrix:
0.544
[[ 5  5]
 [52 63]]



Grid Search

In [8]:
import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from create_lagged_series import create_lagged_series

if __name__ == "__main__":
    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series(
        "^GSPC", datetime.datetime(2010, 1, 10),
        datetime.datetime(2152, 12, 31), lags=5
    )

    # Use the prior two days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42
    )

    # Set the parameters by cross-validation
    tuned_parameters = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
    ]

    # Perform the grid search on the tuned parameters
    model = GridSearchCV(SVC(), tuned_parameters, cv=10)
    model.fit(X_train, y_train)

    print("Optimized parameters found on the training set:")
    print(model.best_estimator_, "\n")
    
    print("Grid scores calculated on training set:")
    means = model.cv_results_['mean_test_score']
    stds = model.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, model.cv_results_['params']):
        print(f"{mean:.3f} (+/-{std * 2:.03f}) for {params}")


[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['^GSPC']: ConnectionError(MaxRetryError("HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Max retries exceeded with url: /v8/finance/chart/%5EGSPC?period1=1389330000&period2=1703998800&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=K2mhbswVpvr (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f6c94542e50>: Failed to establish a new connection: [Errno 101] Network is unreachable'))"))


ValueError: With n_samples=0, test_size=0.5 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.