Notebook used to inspect results of Granger Causality analysis

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import math
import sys
from pathlib import Path

# Get the project root: notebooks/AI_narrative_index
root_dir = Path.cwd().parent

# Add src/scripts to the Python modules search path
sys.path.append(str(root_dir / "src" / "scripts"))

# import custom functions
from plot_granger_causality import plot_aini_lags_by_year, plot_aini_lags_for_year

In [7]:
# Get the project root: notebooks/AI_narrative_index
root_dir = Path.cwd().parent

# set variable path
var_path = root_dir / "data" / "processed" / "variables"

# load data
gc_df = pd.read_csv(var_path / "gc_bootstrap_F_all_lags_groupwise_AINI_to_return.csv")
gc_stat = pd.read_csv(var_path / "gc_with_heteroskedasticity_tests_standard.csv")
#rev_gc_df = pd.read_csv(var_path / "gc_reversed_results_with_tstats.csv")
gc_df = gc_df.drop(columns=["coef_const"])

# get financial data path
project_root = Path().resolve().parents[0] 
fin_path = project_root / "data" / "raw" / "financial" 

# load financial data
fin_data = pd.read_csv(fin_path / "full_daily_2023_2025.csv")

# ensure date is in right format
fin_data['Date'] = pd.to_datetime(fin_data['Date'])

# rename columns to improve clarity
gc_df.rename(columns=
    {
    'AINI_variant':'Variable',
}
,inplace=True)

# rename AINI_variant for readability in plots
gc_df["Variable"] = gc_df["Variable"].replace({
    "normalized_AINI": "AINInorm",
    "normalized_AINI_growth": "AINIgrowth",
    "relative_AINI_month": "AINI_rm",
    "relative_AINI_weekly": "AINI_rw",
})

gc_df["Year"] = gc_df["Year"].replace({
    "2023_24": "2023-2024",
    "2023_24_25": "2023-2025",
    "2024_25": "2024-2025",
})


# Sort by ticker, year
gc_df_sort = gc_df.sort_values(by=["Ticker", "Year"])
gc_df


Unnamed: 0,Ticker,Variable,Year,Lag,Original_F_p,Empirical_F_p,AIC,BIC,N_boot,Reverse,...,coef_x3,coef_x4,coef_x5,coef_x6,coef_x7,coef_x8,coef_x9,coef_x10,BH_reject_F,BH_corr_F
0,AAPL,AINInorm,2023,1,0.847089,0.866,-1120.646727,-1114.195234,1000,False,...,,,,,,,,,False,0.998
1,AAPL,AINInorm,2023,2,0.763285,0.758,-1112.706237,-1103.045170,1000,False,...,,,,,,,,,False,0.998
2,AAPL,AINInorm,2023,3,0.841075,0.845,-1104.657238,-1091.797495,1000,False,...,0.064862,,,,,,,,False,0.998
3,AAPL,AINInorm,2023,4,0.361583,0.380,-1097.826385,-1081.778954,1000,False,...,0.062818,-0.000329,,,,,,,False,0.998
4,AAPL,AINInorm,2023,5,0.406673,0.405,-1089.856961,-1070.632921,1000,False,...,0.057920,0.004425,-0.065424,,,,,,False,0.998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,TSM,AINI_rm,2024-2025,6,0.875417,0.852,-1474.262571,-1447.439393,1000,False,...,-0.106633,-0.041839,-0.019928,-0.023816,,,,,False,0.989
8996,TSM,AINI_rm,2024-2025,7,0.937276,0.932,-1467.260784,-1436.629219,1000,False,...,-0.105799,-0.040778,-0.020854,-0.023497,0.016928,,,,False,0.989
8997,TSM,AINI_rm,2024-2025,8,0.931991,0.928,-1459.995848,-1425.561847,1000,False,...,-0.105829,-0.040121,-0.019022,-0.023242,0.018867,0.015003,,,False,0.989
8998,TSM,AINI_rm,2024-2025,9,0.956713,0.961,-1452.685415,-1414.454956,1000,False,...,-0.105562,-0.039862,-0.018493,-0.021967,0.018856,0.016231,0.011912,,False,0.989


In [10]:
te = pd.read_csv(var_path / "combined_te_results.csv")
te

EmptyDataError: No columns to parse from file

In [5]:
# Ensure p-values are float
gc_df["p_value"] = gc_df["p_value"].astype(float)
rev_gc_df["p_value"] = gc_df["p_value"].astype(float)  # Assuming same p-values for reverse

# Filter for significant Granger causality results ---
gc_df_sign = gc_df[gc_df["p_value"] < 0.05].copy()
rev_gc_df_sign = rev_gc_df[rev_gc_df["p_value"] < 0.05].copy()

# --- Identify index of best AIC/BIC model within each (Ticker, Year) group ---
idx_best_aic = gc_df_sign.groupby(["Ticker", "Year"])["AIC"].idxmin()
idx_best_bic = gc_df_sign.groupby(["Ticker", "Year"])["BIC"].idxmin()

# --- Mark best models in original DataFrame ---
gc_df_sign["best_aic"] = gc_df_sign.index.isin(idx_best_aic)
gc_df_sign["best_bic"] = gc_df_sign.index.isin(idx_best_bic)

rev_idx_best_aic = rev_gc_df_sign.groupby(["Ticker", "Year"])["AIC"].idxmin()
rev_idx_best_bic = rev_gc_df_sign.groupby(["Ticker", "Year"])["BIC"].idxmin()

rev_gc_df_sign["best_aic"] = rev_gc_df_sign.index.isin(rev_idx_best_aic)
rev_gc_df_sign["best_bic"] = rev_gc_df_sign.index.isin(rev_idx_best_bic)

# --- Identify all t-test p-value columns related to AINI lag coefficients ---
# (We exclude intercepts like 'const')
aini_p_cols = [col for col in gc_df_sign.columns if col.startswith("p_") and "const" not in col]

# --- Filter: best AIC and at least one significant AINI lag ---
gc_aic = gc_df_sign[
    gc_df_sign["best_aic"] &
    (gc_df_sign[aini_p_cols] < 0.05).any(axis=1)
]

# --- Filter: best BIC and at least one significant AINI lag ---
gc_bic = gc_df_sign[
    gc_df_sign["best_bic"] &
    (gc_df_sign[aini_p_cols] < 0.05).any(axis=1)
]

# --- Reverse causality: best AIC with at least one significant lag ---
rev_gc_aic = rev_gc_df_sign[
    rev_gc_df_sign["best_aic"] &
    (rev_gc_df_sign[aini_p_cols] < 0.05).any(axis=1)
]

# --- Reverse causality: best BIC with at least one significant lag ---
rev_gc_bic = rev_gc_df_sign[
    rev_gc_df_sign["best_bic"] &
    (rev_gc_df_sign[aini_p_cols] < 0.05).any(axis=1)
]


KeyError: 'p_value'

Create html files for online appendix

In [None]:
# Define output path for HTML tables
table_path = root_dir / "reports" / "tables"

# --- Drop unnecessary columns ---
drop_cols = ["Reverse", "Lag"]

# Clean: all significant Granger test results
gc_df_sign = gc_df_sign.drop(columns=drop_cols)
rev_gc_df_sign = rev_gc_df_sign.drop(columns=drop_cols)

# Clean: best models (with at least one significant lag)
gc_aic = gc_aic.drop(columns=drop_cols)
gc_bic = gc_bic.drop(columns=drop_cols)
rev_gc_aic = rev_gc_aic.drop(columns=drop_cols)
rev_gc_bic = rev_gc_bic.drop(columns=drop_cols)

# --- Export to HTML (full filtered sets) ---
gc_df_sign.to_html(table_path / "granger_causality_all_preds.html", na_rep='')
rev_gc_df_sign.to_html(table_path / "reverse_granger_causality_all_preds.html", na_rep='')

# --- Export to HTML (best AIC/BIC models, with at least one significant AINI lag) ---
gc_aic.to_html(table_path / "granger_causality_best_aic.html", na_rep='')
gc_bic.to_html(table_path / "granger_causality_best_bic.html", na_rep='')
rev_gc_aic.to_html(table_path / "reverse_granger_causality_best_aic.html", na_rep='')
rev_gc_bic.to_html(table_path / "reverse_granger_causality_best_bic.html", na_rep='')


Create reporting tables for thesis

In [None]:
# --- Find best AIC/BIC per (Ticker, Variable, Year) ---
idx_best_aic = gc_df_sign.groupby(["Ticker", "Variable", "Year"])["AIC"].idxmin()
idx_best_bic = gc_df_sign.groupby(["Ticker", "Variable", "Year"])["BIC"].idxmin()

gc_df_sign["best_aic_ticker_variable_year"] = gc_df_sign.index.isin(idx_best_aic)
gc_df_sign["best_bic_ticker_variable_year"] = gc_df_sign.index.isin(idx_best_bic)

# --- Find best AIC/BIC per (Ticker, Year) only (aggregate across all variables) ---
idx_best_aic_ty = gc_df_sign.groupby(["Ticker", "Year"])["AIC"].idxmin()
idx_best_bic_ty = gc_df_sign.groupby(["Ticker", "Year"])["BIC"].idxmin()

gc_df_sign["best_aic_ticker_year"] = gc_df_sign.index.isin(idx_best_aic_ty)
gc_df_sign["best_bic_ticker_year"] = gc_df_sign.index.isin(idx_best_bic_ty)

# --- Identify all p-value columns of AINI lags (excluding constant/intercept) ---
aini_p_cols = [col for col in gc_df_sign.columns if col.startswith("p_") and "const" not in col]

# --- Filter 1: best AIC or BIC by (Ticker, Variable, Year) + significant AINI lag ---
gc_df_best_tv_y = gc_df_sign[
    (gc_df_sign["best_aic_ticker_variable_year"] | gc_df_sign["best_bic_ticker_variable_year"]) &
    (gc_df_sign[aini_p_cols] < 0.05).any(axis=1)
]

# --- Filter 2: best AIC or BIC by (Ticker, Year) + significant AINI lag ---
gc_df_best_t_y = gc_df_sign[
    (gc_df_sign["best_aic_ticker_year"] | gc_df_sign["best_bic_ticker_year"]) &
    (gc_df_sign[aini_p_cols] < 0.05).any(axis=1)
]

# --- Export Filter 1 (ticker-variable-year) to HTML/CSV ---
gc_df_best_tv_y.to_html(
    table_path / "granger_causality_best_aic_or_bic_ticker_variable_year.html", na_rep=''
)

# --- Export Filter 2 (ticker-year) to HTML/CSV ---
gc_df_best_t_y.to_html(
    table_path / "granger_causality_best_aic_or_bic_ticker_year.html", na_rep=''
)
gc_df_best_t_y.to_csv(
    var_path / "granger_causality_best_aic_or_bic_ticker_year.csv", index=False
)

# Optional preview
gc_df_best_t_y


re-perform Granger Causality analysis to include t-stats for lowest AIC/BIC variables

In [None]:
# group by year
gc_df_index = gc_df.set_index('Year')
gc_df.columns
gc_df_sort

In [None]:
# plot for different betas
lag_list = [f't-{i}' for i in range(1,21)]
for lag in lag_list:
    plot_aini_lags_by_year(gc_df_index, lag = lag)

In [None]:
for Year in set(gc_df.Year.values):
    plot_aini_lags_for_year(gc_df, Year)