This is an initial testing ground for any idea to expand or test during this process of learning:

In [1]:
import yfinance as yf
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot

In [2]:
sp500 = yf.Ticker('^GSPC') #load the data
sp500 = sp500.history(period="max") #collect the historical data from creation of s&p500
sp500 #give us a panda dataframe of every trading day in sp500 history

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,0.0,0.0
1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0,0.0,0.0
1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0,0.0,0.0
1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0,0.0,0.0
1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,0.0,0.0
...,...,...,...,...,...,...,...
2024-08-26 00:00:00-04:00,5639.660156,5651.620117,5602.339844,5616.839844,2938570000,0.0,0.0
2024-08-27 00:00:00-04:00,5602.890137,5631.180176,5593.479980,5625.799805,2798990000,0.0,0.0
2024-08-28 00:00:00-04:00,5624.509766,5627.029785,5560.950195,5592.180176,3053450000,0.0,0.0
2024-08-29 00:00:00-04:00,5607.299805,5646.950195,5583.709961,5591.959961,3065640000,0.0,0.0


In [3]:
# Safely remove the 'Dividends' column if it exists
if 'Dividends' in sp500.columns:
    del sp500['Dividends']

# Safely remove the 'Stock Splits' column if it exists
if 'Stock Splits' in sp500.columns:
    del sp500['Stock Splits']

sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0
1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0
1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0
1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0
1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0
...,...,...,...,...,...
2024-08-26 00:00:00-04:00,5639.660156,5651.620117,5602.339844,5616.839844,2938570000
2024-08-27 00:00:00-04:00,5602.890137,5631.180176,5593.479980,5625.799805,2798990000
2024-08-28 00:00:00-04:00,5624.509766,5627.029785,5560.950195,5592.180176,3053450000
2024-08-29 00:00:00-04:00,5607.299805,5646.950195,5583.709961,5591.959961,3065640000


### 1. Basic infos
Understand the structure of your dataset, including the data types and summary statistics.

In [4]:
# Display the first few rows of the dataset
print(sp500.head())

                                Open       High        Low      Close  Volume
Date                                                                         
1927-12-30 00:00:00-05:00  17.660000  17.660000  17.660000  17.660000       0
1928-01-03 00:00:00-05:00  17.760000  17.760000  17.760000  17.760000       0
1928-01-04 00:00:00-05:00  17.719999  17.719999  17.719999  17.719999       0
1928-01-05 00:00:00-05:00  17.549999  17.549999  17.549999  17.549999       0
1928-01-06 00:00:00-05:00  17.660000  17.660000  17.660000  17.660000       0


In [5]:
# Get a summary of the DataFrame, including non-null counts and data types
print(sp500.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 24283 entries, 1927-12-30 00:00:00-05:00 to 2024-08-30 00:00:00-04:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    24283 non-null  float64
 1   High    24283 non-null  float64
 2   Low     24283 non-null  float64
 3   Close   24283 non-null  float64
 4   Volume  24283 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 1.1 MB
None


In [6]:
# Summary statistics for numerical columns
print(sp500.describe())

               Open          High           Low         Close        Volume
count  24283.000000  24283.000000  24283.000000  24283.000000  2.428300e+04
mean     620.541381    644.079926    636.370648    640.470631  9.087460e+08
std     1054.323533   1049.074774   1037.278240   1043.551767  1.619754e+09
min        0.000000      4.400000      4.400000      4.400000  0.000000e+00
25%        9.700000     24.690001     24.690001     24.690001  1.525000e+06
50%       42.799999    103.150002    101.449997    102.290001  2.041000e+07
75%     1031.375000   1039.390015   1024.989990   1031.385010  9.871500e+08
max     5644.089844   5669.669922   5639.020020   5667.200195  1.145623e+10


### 2. Prediction SetUp

In [7]:
sp500["Tomorrow"] = sp500["Close"].shift(-1)
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,17.760000
1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0,17.719999
1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0,17.549999
1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0,17.660000
1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,17.500000
...,...,...,...,...,...,...
2024-08-26 00:00:00-04:00,5639.660156,5651.620117,5602.339844,5616.839844,2938570000,5625.799805
2024-08-27 00:00:00-04:00,5602.890137,5631.180176,5593.479980,5625.799805,2798990000,5592.180176
2024-08-28 00:00:00-04:00,5624.509766,5627.029785,5560.950195,5592.180176,3053450000,5591.959961
2024-08-29 00:00:00-04:00,5607.299805,5646.950195,5583.709961,5591.959961,3065640000,5648.399902


In [8]:
sp500["Target"]= (sp500["Tomorrow"]>sp500["Close"].astype(int))
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,17.760000,True
1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0,17.719999,True
1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0,17.549999,True
1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0,17.660000,True
1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,17.500000,True
...,...,...,...,...,...,...,...
2024-08-26 00:00:00-04:00,5639.660156,5651.620117,5602.339844,5616.839844,2938570000,5625.799805,True
2024-08-27 00:00:00-04:00,5602.890137,5631.180176,5593.479980,5625.799805,2798990000,5592.180176,False
2024-08-28 00:00:00-04:00,5624.509766,5627.029785,5560.950195,5592.180176,3053450000,5591.959961,False
2024-08-29 00:00:00-04:00,5607.299805,5646.950195,5583.709961,5591.959961,3065640000,5648.399902,True
