In [3]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 321 kB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 2.4 MB/s eta 0:00:01
[?25hInstalling collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.3.2 threadpoolctl-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
from __future__ import print_function
import datetime
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix

def create_lagged_series(symbol, start_date, end_date, lags=5):
    """
    This creates a Pandas DataFrame that stores the percentage returns of the adjusted closing value of
    a stock obtained from Yahoo Finance, along with a number of lagged returns from the prior trading days
    (lags defaults to 5 days). Trading volume, as well as the Direction from the previous day, are also included.
    """
    # Obtain stock information from Yahoo Finance
    ts = yf.download(symbol, start=start_date - datetime.timedelta(days=365), end=end_date)
    
    # Create the new lagged DataFrame
    tslag = pd.DataFrame(index=ts.index)
    tslag["Today"] = ts["Adj Close"]
    tslag["Volume"] = ts["Volume"]
    
    # Create the shifted lag series of prior trading period close values
    for i in range(0, lags):
        tslag[f"Lag{i+1}"] = ts["Adj Close"].shift(i+1)
    
    # Create the returns DataFrame
    tsret = pd.DataFrame(index=tslag.index)
    tsret["Volume"] = tslag["Volume"]
    tsret["Today"] = tslag["Today"].pct_change() * 100.0
    
    # If any of the values of percentage returns equal zero, set them to
    # a small number (stops issues with QDA model in Scikit-Learn)
    tsret["Today"].replace(to_replace=0, value=0.0001, inplace=True)
    
    # Create the lagged percentage returns columns
    for i in range(0, lags):
        tsret[f"Lag{i+1}"] = tslag[f"Lag{i+1}"].pct_change() * 100.0
    
    # Create the "Direction" column (+1 or -1) indicating an up/down day
    tsret["Direction"] = np.sign(tsret["Today"])
    tsret = tsret[tsret.index >= start_date]
    
    return tsret

if __name__ == "__main__":
    # Create a lagged series of the S&P 500 US stock market index
    snpret = create_lagged_series(
        "^GSPC", datetime.datetime(2001, 1, 10),
        datetime.datetime(2005, 12, 31), lags=5
    )
    
    # Use the prior two days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]
    
    # The test data is split into two parts: Before and after 1st Jan 2005.
    start_test = datetime.datetime(2005, 1, 1)
    
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]
    
    # Create the (parametrized) models
    print("Hit Rates/Confusion Matrices:\n")
    models = [
        ("LR", LogisticRegression(max_iter=1000)),
        ("LDA", LDA()),
        ("QDA", QDA()),
        ("LSVC", LinearSVC(max_iter=10000)),
        ("RSVM", SVC(
            C=1000000.0, cache_size=200, coef0=0.0,
            degree=3, gamma=0.0001, kernel='rbf',
            max_iter=-1, probability=False, shrinking=True, tol=0.001, verbose=False)
        ),
        ("RF", RandomForestClassifier(
            n_estimators=1000, criterion='gini', max_depth=None, 
            min_samples_split=2, min_samples_leaf=1, max_features='sqrt',  # Updated max_features
            bootstrap=True, oob_score=False, n_jobs=-1, random_state=None, verbose=0)
        )
    ]
    
    # Iterate through the models
    for name, model in models:
        # Train each of the models on the training set
        model.fit(X_train, y_train)
        
        # Make an array of predictions on the test set
        pred = model.predict(X_test)
        
        # Output the hit-rate and the confusion matrix for each model
        print(f"{name}:\n{model.score(X_test, y_test):.3f}")
        print(f"{confusion_matrix(pred, y_test)}\n")


[*********************100%%**********************]  1 of 1 completed


Hit Rates/Confusion Matrices:

LR:
0.560
[[ 35  35]
 [ 76 106]]

LDA:
0.560
[[ 35  35]
 [ 76 106]]

QDA:
0.599
[[ 30  20]
 [ 81 121]]

LSVC:
0.560
[[ 35  35]
 [ 76 106]]

RSVM:
0.567
[[ 10   8]
 [101 133]]

RF:
0.508
[[49 62]
 [62 79]]

