# Import Libraries

In [None]:
import yfinance as yf
import marketobserve as mo
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import seaborn as sns
import datetime as dt
from matplotlib.ticker import PercentFormatter
import os
from fredapi import Fred



# Bull and Bear Markets

In [None]:
# Pull the index price history from Yahoo Finance, keep only the adjusted
# close, and expose it as a one-column frame named "Close".
# Tickers used with this notebook: ^GSPC, ^HSI, ^HSCE, ^N225, 000300.SS
"""
If failed, try:
    pip uninstall yfinance
    pip install yfinance --upgrade --no-cache-dir
"""
data = (
    yf.download('^HSCE', start='1900-01-01', auto_adjust=False)[["Adj Close"]]
    .set_axis(['Close'], axis=1)  # single column, whatever header yfinance returned
)


# Alternative source: a local Excel file
# data = pd.read_excel("spx.xlsx",index_col="Date")
# data.columns = ["Close"]
# data = data.sort_index(ascending=True)

# Alternative source: Bloomberg via xbbg (HSI, NKY, SPX)
# from xbbg import blp
# data = blp.bdh("SPX Index","PX_LAST","1900-01-01")
# data.columns = ["Close"]
# data = data.sort_index(ascending=True)

In [None]:
# Show the downloaded price frame as this cell's output.
# The commented lines are optional helpers: export to Excel / summary statistics.
# data.to_excel("spx_wrangle.xlsx")
data
# data.describe()

In [None]:
# test
# mo.ChangeDistPlot(data, time_windows=['1Y', ('20250101', '20250401'), '100Y'], frequencies = ['W', 'M'])

In [None]:
# Build sample inputs for experimenting with different `time_window` values.
# A copy of the Close column is used so experiments never modify `data`.
data_test = data.loc[:, "Close"].copy()

time_window = ['1Y']
# mo.BullBearPlot(data_test, time_window)

# Real GDP QoQ Growth and SP500 Quarterly Return

In [None]:
# Initialise the FRED client.
# The API key is read from the FRED_API_KEY environment variable instead of
# being hard-coded, so the secret never lives in the notebook or its history.
# NOTE(review): a key was previously committed in this cell; it should be
# revoked and replaced.
# Any failure (missing variable, rejected key, ...) is re-raised as ValueError.
try:
    FRED_API_KEY = os.environ['FRED_API_KEY']  # KeyError if the variable is not set
    fred = Fred(api_key=FRED_API_KEY)
    print("Successfully obtained the FRED API key")

except Exception as e:
    raise ValueError("FRED API初始化失败，请检查API密钥有效性") from e


In [None]:
# --- Quarterly real GDP (FRED series GDPC1) -------------------------------
# Naming the Series while converting it keeps the values intact even if the
# Series already carries a name (DataFrame(series, columns=[...]) would then
# reindex to an all-NaN column).
real_qgdp = fred.get_series('GDPC1').to_frame(name='RealGDP')

# --- S&P 500 quarter-end price --------------------------------------------
# sp500_qend = yf.download('^GSPC', start='1900-01-01', interval='3mo')['Close']
sp500_daily = yf.download('^GSPC', start='1900-01-01')['Close']
sp500_daily.columns = ["SP500_QEnd"]

# Take the last available close of each calendar quarter and label the row
# with the first day of that quarter, so it can be aligned with the GDP index
# (presumably quarter-start dated -- TODO confirm against the FRED output).
# resample('QS') does the bucketing and the relabelling in one step, avoids
# the deprecated 'Q' alias, and does not discard the first quarter.
sp500_qend = sp500_daily.resample('QS').last()

# --- Combine both series on their shared quarterly index -------------------
quarterly_gdp_sp500_analysis_df = pd.concat([real_qgdp, sp500_qend], axis=1)

# Quarter-over-quarter percentage changes.
# fill_method=None: never forward-fill missing quarters before differencing,
# so a quarter without data yields NaN instead of a spurious 0% change.
quarterly_gdp_sp500_analysis_df['RealGDP_QoQ_Growth'] = (
    quarterly_gdp_sp500_analysis_df['RealGDP'].pct_change(fill_method=None) * 100
)
quarterly_gdp_sp500_analysis_df['SP500_QoQ_Return'] = (
    quarterly_gdp_sp500_analysis_df['SP500_QEnd'].pct_change(fill_method=None) * 100
)

# Direction signals: +1 when the change is strictly positive, otherwise -1.
# NaN changes also map to -1 here; those rows are removed by the dropna()
# at the end of this cell because the underlying change columns are NaN too.
quarterly_gdp_sp500_analysis_df['RealGDP_QoQ_Growth_Signal'] = np.where(
    quarterly_gdp_sp500_analysis_df['RealGDP_QoQ_Growth'] > 0, 1, -1
)
quarterly_gdp_sp500_analysis_df['SP500_QoQ_Return_Signal'] = np.where(
    quarterly_gdp_sp500_analysis_df['SP500_QoQ_Return'] > 0, 1, -1
)

# Lagged S&P 500 returns and signals, 1 to 4 quarters back.
# Two separate loops keep the column order (all return lags first, then all
# signal lags); a later cell selects columns by position.
lags = range(1, 5)
for i in lags:
    quarterly_gdp_sp500_analysis_df[f'SP500_QoQ_Return_Lag{i}Q'] = (
        quarterly_gdp_sp500_analysis_df['SP500_QoQ_Return'].shift(i)
    )
for i in lags:
    quarterly_gdp_sp500_analysis_df[f'SP500_QoQ_Return_Signal_Lag{i}Q'] = (
        quarterly_gdp_sp500_analysis_df['SP500_QoQ_Return_Signal'].shift(i)
    )

# Optionally drop the level columns if only changes/signals are needed:
# quarterly_gdp_sp500_analysis_df = quarterly_gdp_sp500_analysis_df.drop(columns=['SP500_QEnd', 'RealGDP'])

# Discard the most recent row, then every row with a missing value.
# Chained (non in-place) so the cell does not mutate a slice.
quarterly_gdp_sp500_analysis_df = (
    quarterly_gdp_sp500_analysis_df
    .iloc[:-1]
    .dropna()
)




In [None]:
quarterly_gdp_sp500_analysis_df.head()

## Correlation Analysis

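This section uses the lagged quarter-over-quarter (QoQ) S&P 500 return as the explanatory variable. \
Possible extensions of the analysis:
- lagged 90-day S&P 500 returns whose window crosses a quarter boundary;
- lagged S&P 500 returns over a variable-length day range that crosses a quarter boundary;
- a variable rolling-window length.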

Correlation Matrix

In [None]:
# Pairwise correlations between the first eight columns of the analysis frame.
# NOTE(review): the positional slice `:8` stops after the Lag2Q return column,
# so Lag3Q/Lag4Q and all lagged signals are excluded -- confirm this is intended.
correlation_matrix = quarterly_gdp_sp500_analysis_df.iloc[:, :8].corr()

# Annotated heatmap of the matrix. seaborn (sns) and matplotlib (plt) come
# from the notebook's import cell, so they are not re-imported here.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Quarterly GDP and SP500 Analysis')
plt.show()


Rolling Correlation Dynamics

In [None]:
# Rolling correlation between each lagged S&P 500 return and real GDP growth.
# One figure row per lag: left panel = correlation over time, right panel =
# distribution of the same correlation values.

# Loop-invariant settings, defined once.
window_sizes = [8, 20]  # rolling window lengths, in rows (quarters)
colors = plt.cm.viridis(np.linspace(0, 1, len(window_sizes)))
depend_col = 'RealGDP_QoQ_Growth'

# squeeze=False keeps `axes` two-dimensional even when there is a single lag.
fig, axes = plt.subplots(len(lags), 2, figsize=(18, 5 * len(lags)), squeeze=False)

for i, lag in enumerate(lags):
    independ_col = f'SP500_QoQ_Return_Lag{lag}Q'
    line_ax, hist_ax = axes[i]

    for color, window in zip(colors, window_sizes):
        # Compute the rolling correlation once and reuse it for both panels.
        # .corr() on a two-column rolling window returns a matrix per date;
        # .xs(...)[depend_col] extracts the single off-diagonal series.
        rolling_corr = (
            quarterly_gdp_sp500_analysis_df[[independ_col, depend_col]]
            .rolling(window=window)
            .corr()
            .xs(independ_col, level=1)[depend_col]
        )

        # Time series of the correlation (leading NaNs are simply not drawn).
        line_ax.plot(rolling_corr.index, rolling_corr, label=f'Window: {window}', color=color)

        # Histogram + KDE of the defined correlation values, same colour.
        sns.histplot(rolling_corr.dropna(), ax=hist_ax, kde=True, label=f'Window: {window}', color=color)

    line_ax.set_title(f'Rolling Correlation between {independ_col} and {depend_col}')
    line_ax.set_xlabel('Date')
    line_ax.set_ylabel('Correlation')
    line_ax.legend()
    line_ax.grid(True)

    hist_ax.set_title(f'Distribution of Rolling Correlation between {independ_col} and {depend_col}')
    hist_ax.set_xlabel('Correlation')
    hist_ax.set_ylabel('Frequency')
    hist_ax.legend()

plt.tight_layout()
plt.show()

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Direction agreement between each lagged S&P 500 signal and the GDP growth
# signal. One figure row per lag: left panel = confusion matrix in percent of
# all observations, right panel = rolling hit rate over several windows.

# Loop-invariant settings, defined once.
signal_labels = [-1, 1]        # the two signal values, in display order
rolling_windows = [8, 20, 40]  # rolling accuracy window lengths, in rows
depend_col = 'RealGDP_QoQ_Growth_Signal'

# squeeze=False keeps `axes` two-dimensional even when there is a single lag.
fig, axes = plt.subplots(len(lags), 2, figsize=(18, 5 * len(lags)), squeeze=False)

for i, lag in enumerate(lags):
    independ_col = f'SP500_QoQ_Return_Signal_Lag{lag}Q'
    cm_ax, acc_ax = axes[i]

    valid_data = quarterly_gdp_sp500_analysis_df[[depend_col, independ_col]].dropna()
    y_true = valid_data[depend_col]
    y_scores = valid_data[independ_col]

    # Fixing `labels` guarantees a 2x2 matrix in a known order even if one
    # signal value never occurs in the sample.
    cm = confusion_matrix(y_true, y_scores, labels=signal_labels)

    # Each cell as a percentage of all observations.
    cm_percentage = cm / cm.sum() * 100

    # Tick labels show the actual signal values rather than positions 0/1.
    sns.heatmap(
        cm_percentage, annot=True, fmt='.0f', ax=cm_ax, cmap='Blues',
        xticklabels=signal_labels, yticklabels=signal_labels,
    )
    cm_ax.set_title(f'Confusion Matrix for Lag {lag}Q')
    cm_ax.set_xlabel('Predicted Label [%]')
    cm_ax.set_ylabel('True Label [%]')

    # Rolling accuracy: share of rows in each window where both signals agree.
    hits = (y_true == y_scores).astype(float)
    for rolling_window in rolling_windows:
        rolling_accuracy = hits.rolling(window=rolling_window).mean() * 100
        acc_ax.plot(rolling_accuracy, label=f'Rolling Accuracy ({rolling_window} periods)')

    acc_ax.set_xlabel('Period')
    acc_ax.set_ylabel('Rolling Accuracy (%)')
    acc_ax.set_title(f'Rolling Accuracy for Lag {lag}Q')
    acc_ax.legend()

# Render the whole grid once, after all rows are drawn.
plt.tight_layout()
plt.show()