# SPY daily change prediction with XGBoost Classifier
by Harry Ho

This project first uses logistic regression to find out which predictors work in predicting the daily change of SPY (SPDR S&P 500 ETF Trust), then uses an XGBoost classifier to predict whether SPY will go up or down on the next trading day, using only the predictors that proved effective in the logistic regression.

## Installing packages

In [102]:
!pip install yfinance
!pip install statsmodels
!pip install xgboost

import pandas as pd
import numpy as np
import yfinance as yf
import statsmodels.api as sm
from datetime import datetime, timedelta
from pytz import timezone, utc
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
from sklearn.linear_model import LogisticRegression



## Download SPY historical prices from Yahoo Finance

In [103]:
# Date setup: take the current instant in UTC, express it in US/Pacific
# time, and move one calendar day forward before formatting as YYYY-MM-DD.
# NOTE(review): the extra day presumably makes the download include the
# latest session (assumes the `end` bound is exclusive) -- confirm.
date = datetime.now(tz=utc)
today = date.astimezone(timezone('US/Pacific')) + timedelta(days=1)
today_date = today.strftime("%Y-%m-%d")

# Data Retrieval: fetch the SPY history and keep only the closing price,
# with the former index turned back into an ordinary column.
ticker = "SPY"
start_date = "2003-01-01"
end_date = today_date
yfdata = yf.download(ticker, start_date, end_date)
SPY_df = yfdata[["Close"]].reset_index()




[*********************100%%**********************]  1 of 1 completed


## Set up predictors


In [104]:
# Candidate predictors offered to the backward-elimination step. This must
# stay a plain mutable list: the elimination cell removes entries in place.
features = [
    '1W_diff', '3W_diff', '10W_diff',
    '1diff', '2diff', '3diff', '4diff',
    'dir', '2dir', '3dir', '4dir',
    'signal',
    "MACD10", "MACD20", "MACD30",
    "SO10", "SO20", "SO30",
]

close = SPY_df['Close']

# k-row simple return: (Close_t - Close_{t-k}) / Close_{t-k}, for k = 1..5.
# ('5diff' is the same quantity as '1W_diff' below; only 1..4 are predictors.)
for i in range(1, 6):
    SPY_df[f'{i}diff'] = (close - close.shift(i)) / close.shift(i)

# Direction flags: 1 when the one-row return is >= 0, else 0 (NaN counts as 0).
# '2dir'..'4dir' are the same flag for the return 1, 2 and 3 rows earlier.
SPY_df['dir'] = np.where(SPY_df['1diff'] >= 0, 1, 0)
for lag in range(1, 4):
    SPY_df[f'{lag + 1}dir'] = np.where(SPY_df['1diff'].shift(lag) >= 0, 1, 0)

# Streak flags: '{i}down' / '{i}up' is 1 when the current row and the i-1
# rows before it all have dir == 0 / dir == 1. Rows whose window reaches
# before the start of the data compare as False and therefore get 0.
for suffix, wanted in (('down', 0), ('up', 1)):
    for i in range(2, 5):
        streak = np.all([SPY_df['dir'].shift(j) == wanted for j in range(i)], axis=0)
        SPY_df[f'{i}{suffix}'] = np.where(streak, 1, 0)

# Returns over 5, 15 and 50 rows, labelled as 1, 3 and 10 weeks.
for weeks in (1, 3, 10):
    rows_back = 5 * weeks
    SPY_df[f'{weeks}W_diff'] = (close - close.shift(rows_back)) / close.shift(rows_back)

# Most recent local turning points of the close.
#   bottom: previous close, recorded when a negative return is followed by a
#           positive one; carried forward until the next such reversal.
#   top:    previous close, recorded when a positive return is followed by a
#           negative one; carried forward likewise.
# The forward fill is assigned back to the column instead of calling
# fillna(method='ffill', inplace=True) on a column selection: that form is
# deprecated and does not update the frame under pandas Copy-on-Write.
prev_close = close.shift(1)
ret = SPY_df['1diff']
prev_ret = ret.shift(1)
SPY_df['bottom'] = prev_close.where((prev_ret < 0) & (ret > 0)).ffill()
SPY_df['top'] = prev_close.where((prev_ret > 0) & (ret < 0)).ffill()

# signal: 1 when the close is at/above the last top, 0 when it is at/below
# the last bottom, otherwise the previous value is carried forward. Rows
# before the first hit stay NaN.
SPY_df['signal'] = np.where(
    close >= SPY_df['top'], 1,
    np.where(close <= SPY_df['bottom'], 0, np.nan),
)
SPY_df['signal'] = SPY_df['signal'].ffill()

# 'MACD{n}' in this notebook is the relative distance of the close from its
# n-row simple moving average: (Close - mean_n) / mean_n.
for window in (10, 20, 30):
    rolling_mean = close.rolling(window=window).mean()
    SPY_df[f'Mean{window}'] = rolling_mean
    SPY_df[f'MACD{window}'] = (close - rolling_mean) / rolling_mean

# 'SO{n}' is the position of the close inside its n-row closing range:
# 0 at the rolling minimum, 1 at the rolling maximum.
for window in (10, 20, 30):
    rolling_max = close.rolling(window=window).max()
    rolling_min = close.rolling(window=window).min()
    SPY_df[f'Max{window}'] = rolling_max
    SPY_df[f'Min{window}'] = rolling_min
    SPY_df[f'SO{window}'] = (close - rolling_min) / (rolling_max - rolling_min)

# Target: the next row's direction flag (NaN on the final row).
SPY_df['fut_dir'] = SPY_df['dir'].shift(-1)

## Splitting data into training and testing dataset

In [105]:

# Splitting Data: positional, chronological hold-out (no shuffling).
# iloc stops are exclusive, so training covers rows 100..3999 and testing
# covers rows 4001..5099; row 4000 belongs to neither set.
train_rows = slice(100, 4000)
test_rows = slice(4001, 5100)
train_df = SPY_df.iloc[train_rows]
test_df = SPY_df.iloc[test_rows]
X_train, y_train = train_df[features], train_df['fut_dir']
X_test, y_test = test_df[features], test_df['fut_dir']


## Use logistic regression to find out which predictor is useful (backward elimination with AIC)

In [106]:
# Backward Elimination using AIC.
# Each pass fits the logistic model on the current predictors, then refits
# it once per predictor with that predictor left out. The drop that yields
# the lowest AIC is applied only if it beats the current model's AIC;
# otherwise the search stops. `features` is pruned in place, so the cells
# below see the reduced list.
X_train_with_const = sm.add_constant(X_train)
while features:
    model = sm.Logit(y_train, X_train_with_const[['const'] + features]).fit(disp=0)

    lowest_aic = float("inf")
    drop_candidate = None
    for candidate in features:
        kept = [name for name in features if name != candidate]
        reduced_fit = sm.Logit(y_train, X_train_with_const[['const'] + kept]).fit(disp=0)
        if reduced_fit.aic < lowest_aic:
            lowest_aic, drop_candidate = reduced_fit.aic, candidate

    # No single removal improves on the current model: keep what is left.
    if not lowest_aic < model.aic:
        break
    features.remove(drop_candidate)

## Build the model with XGBoost Classifier

In [110]:

# Refitting the model using XGBoost: train a small boosted-tree classifier
# on the predictors that survived elimination, then predict the test rows.
xgb_params = {"objective": "binary:logistic", "n_estimators": 10, "max_depth": 4}
xgb_model = xgb.XGBClassifier(**xgb_params)
selected_train = X_train[features]
xgb_model.fit(selected_train, y_train)
y_pred = xgb_model.predict(X_test[features])


## Result of training

In [111]:
# Training evaluation: score the fitted model on the rows it was trained on.
y_train_pred = xgb_model.predict(X_train[features])
accuracy = accuracy_score(y_train, y_train_pred)
train_confusion = confusion_matrix(y_train, y_train_pred)

print(f"XGBoost Model Accuracy of training dataset after backward elimination (AIC): {accuracy*100:.2f}%")
print("Selected features:", features)
print("Confusion Matrix of training dataset:")
print(train_confusion)

XGBoost Model Accuracy of training dataset after backward elimination (AIC): 60.87%
Selected features: ['3W_diff', '1diff', '3diff', '3dir', '4dir', 'SO10']
Confusion Matrix of training dataset:
[[ 422 1333]
 [ 193 1952]]


## Evaluation of applying the model to the testing dataset

In [112]:
# Evaluation: score the held-out predictions made in the fitting cell.
accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print(f"XGBoost Model Accuracy after backward elimination (AIC): {accuracy*100:.2f}%")
print("Selected features:", features)
print("Confusion Matrix:")
print(test_confusion)



XGBoost Model Accuracy after backward elimination (AIC): 54.96%
Selected features: ['3W_diff', '1diff', '3diff', '3dir', '4dir', 'SO10']
Confusion Matrix:
[[126 376]
 [119 478]]


This model achieved a precision (positive predictive value) of 478/(376+478) = 55.97%.