In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
from pathlib import Path

# --- Load and Clean the Data ---
DATA_PATH = Path("../data/SPY_data.csv")

df = pd.read_csv(
    DATA_PATH, 
    index_col=0, 
    parse_dates=True, 
    date_format='%Y-%m-%d'
)

# --- Data Cleaning Step ---
# 1. Try to convert the 'Close' column to numbers.
#    errors='coerce' will replace any non-numeric values (like 'SPY') with NaN (Not a Number).
df['Close'] = pd.to_numeric(df['Close'], errors='coerce')

# 2. Drop any rows that have NaN in the 'Close' column.
rows_before = len(df)
df.dropna(subset=['Close'], inplace=True)
rows_after = len(df)

print(f"Removed {rows_before - rows_after} corrupted row(s).")

# --- Initial Inspection ---
print("\nFirst 5 rows of the cleaned dataset:")
display(df.head())

# Display a summary of the cleaned dataframe
print("\nDataset Info:")
df.info()

Removed 2 corrupted row(s).

First 5 rows of the cleaned dataset:


Unnamed: 0_level_0,Close,High,Low,Open,Volume
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,299.406464,299.4249140098017,297.24987779194856,298.18072230420165,59151200
2020-01-03,297.139282,298.2728840920659,295.93196319223426,295.98725810025013,77709700
2020-01-06,298.272919,298.3558610728196,295.2499800424061,295.3697950655392,55653900
2020-01-07,297.434174,298.180682883535,296.98256105203933,297.7014229007024,40496400
2020-01-08,299.01947,300.2452118245496,297.3789885629929,297.62781571292214,68296000



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1258 entries, 2020-01-02 to 2024-12-31
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   1258 non-null   float64
 1   High    1258 non-null   object 
 2   Low     1258 non-null   object 
 3   Open    1258 non-null   object 
 4   Volume  1258 non-null   object 
dtypes: float64(1), object(4)
memory usage: 59.0+ KB


In [2]:
# --- Visualize the Closing Price ---

# This creates an interactive line chart of the 'Close' column.
# The index (our Date column) is automatically used for the x-axis.
fig = px.line(df, y='Close', title='SPY Closing Price Over Time')

# Add range slider for better navigation
fig.update_xaxes(rangeslider_visible=True)

fig.show()

In [3]:
import statsmodels.api as sm
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Decompose the Time Series ---

# We'll use an additive model, which assumes that:
# Observed Value = Trend + Seasonality + Residual
# We'll look for a yearly seasonal pattern, so we set the period to 365 days.
decomposition = sm.tsa.seasonal_decompose(df['Close'], model='additive', period=365)

# --- Plot the Decomposition ---

# Create a figure with 4 stacked subplots
fig = make_subplots(
    rows=4, cols=1,
    shared_xaxes=True,
    subplot_titles=("Observed", "Trend", "Seasonal", "Residuals")
)

# Add each component to its own subplot
fig.add_trace(go.Scatter(x=df.index, y=decomposition.observed, mode='lines', name='Observed'), row=1, col=1)
fig.add_trace(go.Scatter(x=df.index, y=decomposition.trend, mode='lines', name='Trend'), row=2, col=1)
fig.add_trace(go.Scatter(x=df.index, y=decomposition.seasonal, mode='lines', name='Seasonal'), row=3, col=1)
fig.add_trace(go.Scatter(x=df.index, y=decomposition.resid, mode='lines', name='Residuals'), row=4, col=1)

# Update layout for a cleaner look
fig.update_layout(height=700, title_text="Time Series Decomposition")
fig.show()