In [39]:
import yfinance as yf
from sklearn.metrics import precision_score
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [26]:
sp500_ticker = yf.Ticker('^GSPC')
sp500 = sp500_ticker.history(period='1mo', interval='2m')

In [27]:
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-11-05 09:30:00-05:00,5722.430176,5732.859863,5722.100098,5732.669922,14460545,0.0,0.0
2024-11-05 09:32:00-05:00,5732.750000,5737.229980,5732.750000,5737.060059,24244527,0.0,0.0
2024-11-05 09:34:00-05:00,5737.060059,5738.660156,5735.620117,5736.759766,19642099,0.0,0.0
2024-11-05 09:36:00-05:00,5736.580078,5740.200195,5735.040039,5739.890137,16773801,0.0,0.0
2024-11-05 09:38:00-05:00,5739.810059,5741.729980,5739.240234,5741.490234,17344373,0.0,0.0
...,...,...,...,...,...,...,...
2024-12-04 15:50:00-05:00,6087.799805,6089.839844,6086.979980,6087.350098,27270000,0.0,0.0
2024-12-04 15:52:00-05:00,6087.509766,6087.560059,6086.029785,6086.209961,24201000,0.0,0.0
2024-12-04 15:54:00-05:00,6086.319824,6087.399902,6084.959961,6085.689941,40616000,0.0,0.0
2024-12-04 15:56:00-05:00,6085.649902,6087.049805,6085.279785,6087.049805,44184000,0.0,0.0


In [28]:
sp500 = sp500.drop(['Dividends', 'Stock Splits'], axis=1)

In [29]:
sp500 = sp500.sort_index(ascending=True)

In [30]:
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-11-05 09:30:00-05:00,5722.430176,5732.859863,5722.100098,5732.669922,14460545
2024-11-05 09:32:00-05:00,5732.750000,5737.229980,5732.750000,5737.060059,24244527
2024-11-05 09:34:00-05:00,5737.060059,5738.660156,5735.620117,5736.759766,19642099
2024-11-05 09:36:00-05:00,5736.580078,5740.200195,5735.040039,5739.890137,16773801
2024-11-05 09:38:00-05:00,5739.810059,5741.729980,5739.240234,5741.490234,17344373
...,...,...,...,...,...
2024-12-04 15:50:00-05:00,6087.799805,6089.839844,6086.979980,6087.350098,27270000
2024-12-04 15:52:00-05:00,6087.509766,6087.560059,6086.029785,6086.209961,24201000
2024-12-04 15:54:00-05:00,6086.319824,6087.399902,6084.959961,6085.689941,40616000
2024-12-04 15:56:00-05:00,6085.649902,6087.049805,6085.279785,6087.049805,44184000


In [31]:
def create_obv(df: pd.DataFrame) -> pd.Series:
    temp = df.copy()
    
    temp['Change'] = temp['Close'].diff()
    temp['Direction'] = temp['Change'].apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    temp['Volume_adj'] = temp['Volume'] * temp['Direction']
    temp['OBV'] = temp['Volume_adj'].cumsum()
    
    return temp['OBV']

In [32]:
sp500['OBV'] = create_obv(sp500)

sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,OBV
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-11-05 09:30:00-05:00,5722.430176,5732.859863,5722.100098,5732.669922,14460545,0
2024-11-05 09:32:00-05:00,5732.750000,5737.229980,5732.750000,5737.060059,24244527,24244527
2024-11-05 09:34:00-05:00,5737.060059,5738.660156,5735.620117,5736.759766,19642099,4602428
2024-11-05 09:36:00-05:00,5736.580078,5740.200195,5735.040039,5739.890137,16773801,21376229
2024-11-05 09:38:00-05:00,5739.810059,5741.729980,5739.240234,5741.490234,17344373,38720602
...,...,...,...,...,...,...
2024-12-04 15:50:00-05:00,6087.799805,6089.839844,6086.979980,6087.350098,27270000,1205360598
2024-12-04 15:52:00-05:00,6087.509766,6087.560059,6086.029785,6086.209961,24201000,1181159598
2024-12-04 15:54:00-05:00,6086.319824,6087.399902,6084.959961,6085.689941,40616000,1140543598
2024-12-04 15:56:00-05:00,6085.649902,6087.049805,6085.279785,6087.049805,44184000,1184727598


In [33]:
def create_rsi(df: pd.DataFrame, n: int) -> int:
    
    delta = df['Close'].diff()
    
    gains = delta.where(delta > 0, 0)
    losses = delta.where(delta < 0, 0)
    
    avg_gain = gains.rolling(window=n, min_periods=n).mean()
    avg_loss = losses.rolling(window=n, min_periods=n).mean()
    
    rs = avg_gain / avg_loss
    
    rsi = 100 - (100 / (1 + rs))
    
    return rsi

In [34]:
sp500['RSI'] = create_rsi(sp500, n=14)

sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,OBV,RSI
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-11-05 09:30:00-05:00,5722.430176,5732.859863,5722.100098,5732.669922,14460545,0,
2024-11-05 09:32:00-05:00,5732.750000,5737.229980,5732.750000,5737.060059,24244527,24244527,
2024-11-05 09:34:00-05:00,5737.060059,5738.660156,5735.620117,5736.759766,19642099,4602428,
2024-11-05 09:36:00-05:00,5736.580078,5740.200195,5735.040039,5739.890137,16773801,21376229,
2024-11-05 09:38:00-05:00,5739.810059,5741.729980,5739.240234,5741.490234,17344373,38720602,
...,...,...,...,...,...,...,...
2024-12-04 15:50:00-05:00,6087.799805,6089.839844,6086.979980,6087.350098,27270000,1205360598,161.692899
2024-12-04 15:52:00-05:00,6087.509766,6087.560059,6086.029785,6086.209961,24201000,1181159598,239.481213
2024-12-04 15:54:00-05:00,6086.319824,6087.399902,6084.959961,6085.689941,40616000,1140543598,340.677966
2024-12-04 15:56:00-05:00,6085.649902,6087.049805,6085.279785,6087.049805,44184000,1184727598,187.191614


In [35]:
sp500['Next Close'] = sp500['Close'].shift(-1)

In [36]:
sp500['Target'] = (sp500['Next Close'] > sp500['Close']).astype(int)

In [37]:
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,OBV,RSI,Next Close,Target
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-11-05 09:30:00-05:00,5722.430176,5732.859863,5722.100098,5732.669922,14460545,0,,5737.060059,1
2024-11-05 09:32:00-05:00,5732.750000,5737.229980,5732.750000,5737.060059,24244527,24244527,,5736.759766,0
2024-11-05 09:34:00-05:00,5737.060059,5738.660156,5735.620117,5736.759766,19642099,4602428,,5739.890137,1
2024-11-05 09:36:00-05:00,5736.580078,5740.200195,5735.040039,5739.890137,16773801,21376229,,5741.490234,1
2024-11-05 09:38:00-05:00,5739.810059,5741.729980,5739.240234,5741.490234,17344373,38720602,,5739.129883,0
...,...,...,...,...,...,...,...,...,...
2024-12-04 15:50:00-05:00,6087.799805,6089.839844,6086.979980,6087.350098,27270000,1205360598,161.692899,6086.209961,0
2024-12-04 15:52:00-05:00,6087.509766,6087.560059,6086.029785,6086.209961,24201000,1181159598,239.481213,6085.689941,0
2024-12-04 15:54:00-05:00,6086.319824,6087.399902,6084.959961,6085.689941,40616000,1140543598,340.677966,6087.049805,1
2024-12-04 15:56:00-05:00,6085.649902,6087.049805,6085.279785,6087.049805,44184000,1184727598,187.191614,6086.229980,0


In [38]:
sp500 = sp500.dropna()

In [43]:
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,OBV,RSI,Next Close,Target
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-11-05 09:56:00-05:00,5754.529785,5755.000000,5752.939941,5753.029785,12616130,49366762,131.194091,5756.299805,1
2024-11-05 09:58:00-05:00,5752.970215,5756.330078,5752.930176,5756.299805,11834727,61201489,126.877299,5759.490234,1
2024-11-05 10:00:00-05:00,5756.479980,5760.500000,5756.479980,5759.490234,16892629,78094118,128.314866,5759.830078,1
2024-11-05 10:02:00-05:00,5759.529785,5761.629883,5757.899902,5759.830078,13622793,91716911,126.227565,5758.689941,0
2024-11-05 10:04:00-05:00,5759.939941,5760.250000,5757.509766,5758.689941,11194895,80522016,138.249961,5757.569824,0
...,...,...,...,...,...,...,...,...,...
2024-12-04 15:48:00-05:00,6086.169922,6087.850098,6086.169922,6087.770020,15813000,1232630598,175.010060,6087.350098,0
2024-12-04 15:50:00-05:00,6087.799805,6089.839844,6086.979980,6087.350098,27270000,1205360598,161.692899,6086.209961,0
2024-12-04 15:52:00-05:00,6087.509766,6087.560059,6086.029785,6086.209961,24201000,1181159598,239.481213,6085.689941,0
2024-12-04 15:54:00-05:00,6086.319824,6087.399902,6084.959961,6085.689941,40616000,1140543598,340.677966,6087.049805,1


In [59]:
predictors = ['High', 'Low', 'Close', 'Volume', 'OBV', 'RSI']

X_train, X_test, y_train, y_test = train_test_split(sp500[predictors], sp500['Target'], test_size=0.2)

model = xgb.XGBClassifier(
    n_estimators=100, 
    max_depth=6, 
    learning_rate=0.2, 
    objective='binary:logistic'
)

In [60]:
print(X_train.isna().sum())  # Check for NaN values
print((X_train == float('inf')).sum())  # Check for inf values
print((X_train == float('-inf')).sum())  # Check for -inf values

High      0
Low       0
Close     0
Volume    0
OBV       0
RSI       0
dtype: int64
High      0
Low       0
Close     0
Volume    0
OBV       0
RSI       0
dtype: int64
High      0
Low       0
Close     0
Volume    0
OBV       0
RSI       1
dtype: int64


In [61]:
X_train = X_train.replace([float('inf'), float('-inf')], float('nan'))
X_train = X_train.dropna()

In [62]:
print(X_test.isna().sum())  # Check for NaN values
print((X_test == float('inf')).sum())  # Check for inf values
print((X_test == float('-inf')).sum())  # Check for -inf values

High      0
Low       0
Close     0
Volume    0
OBV       0
RSI       0
dtype: int64
High      0
Low       0
Close     0
Volume    0
OBV       0
RSI       0
dtype: int64
High      0
Low       0
Close     0
Volume    0
OBV       0
RSI       0
dtype: int64


In [63]:
valid_indices = X_train.index.intersection(y_train.index)
X_train = X_train.loc[valid_indices]
y_train = y_train.loc[valid_indices]

In [64]:
print(X_train.shape)
print(y_train.shape)

(3186, 6)
(3186,)


In [65]:
model.fit(X_train, y_train)

In [66]:
prediction = model.predict(X_test)

In [67]:
prediction = pd.Series(prediction, index=y_test.index)

In [68]:
prediction

Datetime
2024-11-25 14:44:00-05:00    1
2024-12-03 12:36:00-05:00    1
2024-11-25 15:16:00-05:00    0
2024-11-12 13:14:00-05:00    0
2024-11-11 12:40:00-05:00    0
                            ..
2024-12-04 14:42:00-05:00    0
2024-11-26 09:30:00-05:00    1
2024-11-26 11:52:00-05:00    1
2024-11-14 09:40:00-05:00    0
2024-11-21 14:50:00-05:00    1
Length: 797, dtype: int32

In [69]:
precision = precision_score(y_test, prediction)

print(precision)

0.4988610478359909
