Notebook used to inspect results of Granger Causality analysis

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import math
import sys
from pathlib import Path
import re

# Project root = parent of the current working directory (the notebook is
# expected to be launched from a direct subfolder of the project, e.g. notebooks/).
root_dir = Path.cwd().parent

# Make src/scripts importable. The membership check keeps this cell idempotent:
# re-running it does not append the same entry to sys.path again.
scripts_dir = str(root_dir / "src" / "scripts")
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Custom plotting helpers (import currently disabled). The plotting cells at the
# end of this notebook call both functions, so re-enable this line before running them:
# from plot_granger_causality import plot_aini_lags_by_year, plot_aini_lags_for_year

In [7]:
# Project root = parent of the current working directory
root_dir = Path.cwd().parent

# Directory holding the Granger-causality result CSVs
var_path = root_dir / "data" / "processed" / "variables"

# Load the Granger-causality result tables. The displayed frames contain both
# test directions (AINI_to_RET and RET_to_AINI) in the `Direction` column.
# NOTE(review): gc_c presumably is the model controlling for the number of
# articles, and w0/w1/w2 presumably denote context-window variants — confirm.
gc_c = pd.read_csv(var_path / "granger_causality_n_articles_w2.csv")
gc_w0 = pd.read_csv(var_path / "granger_causality_w0.csv")
gc_w1 = pd.read_csv(var_path / "granger_causality_w1.csv")
gc_w2 = pd.read_csv(var_path / "granger_causality_w2.csv")

# Output directory for report tables; created up front so the exports below do
# not fail on a fresh checkout where reports/tables does not exist yet.
table_path = root_dir / "reports" / "tables"
table_path.mkdir(parents=True, exist_ok=True)

# Export as HTML for the online appendix (one file per result table)
html_exports = {
    "granger_causality_custom_model.html": gc_c,
    "granger_causality_w0.html": gc_w0,
    "granger_causality_w1.html": gc_w1,
    "granger_causality_w2.html": gc_w2,
}
for html_name, result_frame in html_exports.items():
    result_frame.to_html(table_path / html_name, index=False)

# Preview the custom-model results
gc_c

Unnamed: 0,Ticker,AINI_variant,Year,Direction,β₀,A2R_beta_ret_1,β_ctrl_n_articles1,A2R_beta_x_1,p_x,N_obs,...,A2R_beta_x_2,R2A_beta_x_2,β_ctrl_n_articles2.1,R2A_beta_ret_2,A2R_beta_ret_3,β_ctrl_n_articles3,A2R_beta_x_3,R2A_beta_x_3,β_ctrl_n_articles3.1,R2A_beta_ret_3
0,AAPL,EMA_02,2023,AINI_to_RET,-0.002009,0.059581,0.000196,0.087792,1,186,...,,,,,,,,,,
1,AAPL,EMA_02,2023,RET_to_AINI,,,,,1,186,...,,,,,,,,,,
2,AAPL,EMA_08,2023,AINI_to_RET,-0.002118,0.064950,0.000166,0.000358,1,186,...,,,,,,,,,,
3,AAPL,EMA_08,2023,RET_to_AINI,,,,,1,186,...,,,,,,,,,,
4,AAPL,normalized_AINI,2023,AINI_to_RET,-0.002028,0.065729,0.000159,-0.008010,1,186,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,TSM,EMA_08,2024_25,RET_to_AINI,,,,,3,342,...,,0.166748,0.000390,-0.099557,,,,0.143909,-0.000079,-0.024210
2156,TSM,normalized_AINI,2024_25,AINI_to_RET,-0.000405,-0.097876,0.000079,0.016597,3,344,...,-0.076812,,,,-0.094885,0.000068,0.002028,,,
2157,TSM,normalized_AINI,2024_25,RET_to_AINI,,,,,3,342,...,,0.219659,0.000501,-0.142661,,,,0.133416,-0.000071,-0.047112
2158,TSM,normalized_AINI_z,2024_25,AINI_to_RET,-0.000310,-0.097876,0.000079,0.000866,3,344,...,-0.004006,,,,-0.094885,0.000068,0.000106,,,


In [3]:
# Keep only the rows whose corrected F-test p-value is below the significance
# level, using either the `BH_corr_F_pval` or the `BH_corr_F_pval_HC3` column.
alpha = 0.05


def _significant_rows(results, alpha):
    """Return a copy of the rows of ``results`` that are significant at ``alpha``.

    A row is kept when ``BH_corr_F_pval`` or ``BH_corr_F_pval_HC3`` is strictly
    below ``alpha``. Rows where both values are NaN are dropped, because NaN
    comparisons evaluate to False.

    Parameters
    ----------
    results : pandas.DataFrame
        Result table containing both p-value columns.
    alpha : float
        Significance threshold.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the selected rows (original index labels kept).

    Raises
    ------
    KeyError
        If either p-value column is missing from ``results``.
    """
    is_significant = (results["BH_corr_F_pval"] < alpha) | (
        results["BH_corr_F_pval_HC3"] < alpha
    )
    return results[is_significant].copy()


gc_c_sub = _significant_rows(gc_c, alpha)
gc_w0_sub = _significant_rows(gc_w0, alpha)
gc_w1_sub = _significant_rows(gc_w1, alpha)
gc_w2_sub = _significant_rows(gc_w2, alpha)

# Order matters: later cells pair this list positionally with model labels
dfs = [gc_c_sub, gc_w0_sub, gc_w1_sub, gc_w2_sub]
gc_w1_sub

Unnamed: 0,Ticker,AINI_variant,Year,Direction,A2R_beta_const,A2R_beta_ret_lag1,A2R_beta_x_lag1,A2R_beta_x_lag2,A2R_beta_x_lag3,p_x,...,adj_r2_u,R2A_beta_const,R2A_beta_x_lag1,R2A_beta_x_lag2,R2A_beta_x_lag3,R2A_beta_ret_lag1,BH_reject_F,BH_corr_F_pval,BH_reject_F_HC3,BH_corr_F_pval_HC3
67,IRBO,EMA_08,2023,RET_to_AINI,,,,,,3,...,0.01546,-0.002024,0.118091,-0.051857,-0.048519,-0.355267,True,0.043862,True,0.034952
69,IRBO,normalized_AINI,2023,RET_to_AINI,,,,,,3,...,0.005089,-0.002338,0.007664,-0.049963,-0.057269,-0.468953,True,0.036396,True,0.034433
71,IRBO,normalized_AINI_z,2023,RET_to_AINI,,,,,,3,...,0.005089,0.006219,0.007664,-0.049963,-0.057269,-11.204443,True,0.036396,True,0.034433
193,META,EMA_02,2024,RET_to_AINI,,,,,,3,...,0.712997,-0.000691,0.796842,-0.004912,0.065924,-0.055914,True,0.074493,True,0.039652
195,META,EMA_08,2024,RET_to_AINI,,,,,,3,...,0.136715,-0.002507,0.26639,0.075379,0.130951,-0.205806,True,0.058394,True,0.039652
197,META,normalized_AINI,2024,RET_to_AINI,,,,,,3,...,0.065348,-0.003302,0.139316,0.104273,0.11681,-0.247523,True,0.058394,True,0.039652
199,META,normalized_AINI_z,2024,RET_to_AINI,,,,,,3,...,0.065348,0.01426,0.139316,0.104273,0.11681,-6.311515,True,0.058394,True,0.039652
202,MSFT,EMA_08,2024,AINI_to_RET,0.000452,-0.002896,0.040013,-0.074013,0.010912,3,...,0.0154,,,,,,True,0.094924,True,0.040364
204,MSFT,normalized_AINI,2024,AINI_to_RET,0.000443,-0.003908,0.028778,-0.053099,0.001327,3,...,0.01288,,,,,,True,0.094924,True,0.040364
206,MSFT,normalized_AINI_z,2024,AINI_to_RET,0.000581,-0.003908,0.001129,-0.002082,5.2e-05,3,...,0.01288,,,,,,True,0.094924,True,0.040364


In [4]:
labels = ["custom", "w0", "w1", "w2"]  # same order as dfs

# Columns to drop in the thesis-ready table
drop_cols = [
    "p_x","N_boot","N_obs","N_boot_valid","F_stat","df_den",
    "Original_F_pval","Empirical_F_pval","r2_u","BH_reject_F","BH_reject_F_HC3"
]

cleaned = []

# For every model: drop the diagnostic columns, then build one frame per test
# direction that keeps the identifier columns plus that direction's betas.
for model_tag, frame in zip(labels, dfs):
    # errors="ignore" skips any drop_cols entry that this frame does not have
    trimmed = frame.drop(columns=drop_cols, errors="ignore")
    key_cols = [
        key for key in ("Ticker", "AINI_variant", "Year", "Direction")
        if key in trimmed.columns
    ]

    per_direction = {}
    for short_name, direction, prefix in (
        ("A2R", "AINI_to_RET", "A2R_beta_"),
        ("R2A", "RET_to_AINI", "R2A_beta_"),
    ):
        beta_cols = [col for col in trimmed.columns if col.startswith(prefix)]
        in_direction = trimmed["Direction"] == direction
        # .assign returns a new frame with the source model tag appended as last column
        per_direction[short_name] = trimmed.loc[in_direction, key_cols + beta_cols].assign(
            Model=model_tag
        )

    cleaned.append({"Model": model_tag, **per_direction})

# Combined frames across all models, still carrying the Model tag
a2r_all = pd.concat([entry["A2R"] for entry in cleaned], ignore_index=True)
r2a_all = pd.concat([entry["R2A"] for entry in cleaned], ignore_index=True)
a2r_all_sort = a2r_all.sort_values(by=["Ticker", "Year"])
a2r_all_sort

Unnamed: 0,Ticker,AINI_variant,Year,Direction,A2R_beta_const,A2R_beta_ret_lag1,A2R_beta_x_lag1,A2R_beta_x_lag2,A2R_beta_x_lag3,Model
4,AVGO,EMA_02,2023,AINI_to_RET,0.002681,0.03077,0.218301,-0.104089,-0.263429,w0
5,AVGO,EMA_02,2023_24,AINI_to_RET,0.00258,0.062223,0.162475,-0.060556,-0.322539,w0
6,AVGO,EMA_08,2023_24,AINI_to_RET,0.002994,0.07227,0.011157,-0.002122,-0.139067,w0
7,AVGO,normalized_AINI,2023_24,AINI_to_RET,0.00307,0.073981,-0.005333,-0.003751,-0.117296,w0
8,AVGO,normalized_AINI_z,2023_24,AINI_to_RET,0.002835,0.073981,-0.000197,-0.000138,-0.004329,w0
37,AVGO,EMA_02,2024_25,AINI_to_RET,0.001645,-0.020683,0.197126,-0.762745,0.529676,w2
38,AVGO,EMA_08,2024_25,AINI_to_RET,0.001619,-0.027177,0.068604,-0.133326,0.019298,w2
39,AVGO,normalized_AINI,2024_25,AINI_to_RET,0.001626,-0.02931,0.053,-0.089164,-0.008647,w2
40,AVGO,normalized_AINI_z,2024_25,AINI_to_RET,0.001699,-0.02931,0.002764,-0.00465,-0.000451,w2
20,IRBO,EMA_02,2024_25,AINI_to_RET,-0.000437,-0.074439,0.020509,-0.2156,0.126075,w1


Inspect results 

In [5]:
# Build a mapping from raw coefficient column names to compact labels for reporting:
#   A2R_beta_const      -> β₀
#   A2R_beta_x_lag<k>   -> β_x<k>
#   A2R_beta_ret_lag<k> -> β_ret<k>
# Columns that match neither form (and non-string labels) keep their names.
lag_pattern = re.compile(r"^A2R_beta_(x|ret)_lag(\d+)$")

cols = a2r_all_sort.columns
rename_map = {}
for label in cols:
    if isinstance(label, str):
        if label == "A2R_beta_const":
            rename_map[label] = "β₀"
        else:
            hit = lag_pattern.match(label)
            if hit is not None:
                rename_map[label] = f"β_{hit.group(1)}{hit.group(2)}"

a2r_pretty = a2r_all_sort.rename(columns=rename_map)
a2r_pretty

Unnamed: 0,Ticker,AINI_variant,Year,Direction,β₀,β_ret1,β_x1,β_x2,β_x3,Model
4,AVGO,EMA_02,2023,AINI_to_RET,0.002681,0.03077,0.218301,-0.104089,-0.263429,w0
5,AVGO,EMA_02,2023_24,AINI_to_RET,0.00258,0.062223,0.162475,-0.060556,-0.322539,w0
6,AVGO,EMA_08,2023_24,AINI_to_RET,0.002994,0.07227,0.011157,-0.002122,-0.139067,w0
7,AVGO,normalized_AINI,2023_24,AINI_to_RET,0.00307,0.073981,-0.005333,-0.003751,-0.117296,w0
8,AVGO,normalized_AINI_z,2023_24,AINI_to_RET,0.002835,0.073981,-0.000197,-0.000138,-0.004329,w0
37,AVGO,EMA_02,2024_25,AINI_to_RET,0.001645,-0.020683,0.197126,-0.762745,0.529676,w2
38,AVGO,EMA_08,2024_25,AINI_to_RET,0.001619,-0.027177,0.068604,-0.133326,0.019298,w2
39,AVGO,normalized_AINI,2024_25,AINI_to_RET,0.001626,-0.02931,0.053,-0.089164,-0.008647,w2
40,AVGO,normalized_AINI_z,2024_25,AINI_to_RET,0.001699,-0.02931,0.002764,-0.00465,-0.000451,w2
20,IRBO,EMA_02,2024_25,AINI_to_RET,-0.000437,-0.074439,0.020509,-0.2156,0.126075,w1


Create reporting tables for thesis

In [6]:
# Builds two reporting tables from `gc_df_sign`: rows that have the lowest AIC or
# BIC within a group AND at least one selected p-value column below 0.05.
# NOTE(review): `gc_df_sign` is not defined in any cell visible in this notebook;
# the recorded output of this cell is "NameError: name 'gc_df_sign' is not defined".
# It must be created (with Ticker, Variable, Year, AIC, BIC and p_* columns)
# before this cell can run.

# --- Index labels of the lowest-AIC / lowest-BIC row per (Ticker, Variable, Year) ---
idx_best_aic = gc_df_sign.groupby(["Ticker", "Variable", "Year"])["AIC"].idxmin()
idx_best_bic = gc_df_sign.groupby(["Ticker", "Variable", "Year"])["BIC"].idxmin()

# Boolean flags: True where the row's index label is one of the per-group minima.
# These assignments add columns to gc_df_sign in place.
gc_df_sign["best_aic_ticker_variable_year"] = gc_df_sign.index.isin(idx_best_aic)
gc_df_sign["best_bic_ticker_variable_year"] = gc_df_sign.index.isin(idx_best_bic)

# --- Same, but per (Ticker, Year) only, i.e. the minimum is taken across all variables ---
idx_best_aic_ty = gc_df_sign.groupby(["Ticker", "Year"])["AIC"].idxmin()
idx_best_bic_ty = gc_df_sign.groupby(["Ticker", "Year"])["BIC"].idxmin()

gc_df_sign["best_aic_ticker_year"] = gc_df_sign.index.isin(idx_best_aic_ty)
gc_df_sign["best_bic_ticker_year"] = gc_df_sign.index.isin(idx_best_bic_ty)

# --- Select every column whose name starts with "p_" and does not contain "const" ---
# NOTE(review): the frames loaded earlier in this notebook have a `p_x` column whose
# displayed values are 1 and 3; if gc_df_sign has such a column too, this
# prefix match would treat it as a p-value — confirm.
aini_p_cols = [col for col in gc_df_sign.columns if col.startswith("p_") and "const" not in col]

# --- Filter 1: best AIC or BIC by (Ticker, Variable, Year) + any selected p-value < 0.05 ---
gc_df_best_tv_y = gc_df_sign[
    (gc_df_sign["best_aic_ticker_variable_year"] | gc_df_sign["best_bic_ticker_variable_year"]) &
    (gc_df_sign[aini_p_cols] < 0.05).any(axis=1)
]

# --- Filter 2: best AIC or BIC by (Ticker, Year) + any selected p-value < 0.05 ---
gc_df_best_t_y = gc_df_sign[
    (gc_df_sign["best_aic_ticker_year"] | gc_df_sign["best_bic_ticker_year"]) &
    (gc_df_sign[aini_p_cols] < 0.05).any(axis=1)
]

# --- Export Filter 1 (ticker-variable-year) to HTML only; NaN rendered as empty cells ---
gc_df_best_tv_y.to_html(
    table_path / "granger_causality_best_aic_or_bic_ticker_variable_year.html", na_rep=''
)

# --- Export Filter 2 (ticker-year) to HTML (table_path) and CSV (var_path) ---
gc_df_best_t_y.to_html(
    table_path / "granger_causality_best_aic_or_bic_ticker_year.html", na_rep=''
)
gc_df_best_t_y.to_csv(
    var_path / "granger_causality_best_aic_or_bic_ticker_year.csv", index=False
)

# Preview the ticker-year table
gc_df_best_t_y


NameError: name 'gc_df_sign' is not defined

Re-run the Granger causality analysis to include t-statistics for the variables with the lowest AIC/BIC

In [None]:
# Re-index the results by Year (this sets the index; it does not group the rows).
# NOTE(review): `gc_df` and `gc_df_sort` are not defined in any cell visible in
# this notebook — they must be created before this cell can run.
gc_df_index = gc_df.set_index('Year')
# Not the last expression of the cell, so its value is not displayed.
gc_df.columns
gc_df_sort

In [None]:
# One plot per lag label, "t-1" through "t-20".
# plot_aini_lags_by_year comes from the plot_granger_causality import that is
# commented out in the first cell of this notebook.
lag_list = []
for offset in range(1, 21):
    lag_list.append(f't-{offset}')

for lag in lag_list:
    plot_aini_lags_by_year(gc_df_index, lag=lag)

In [None]:
# Draw one figure per distinct Year value. The values are de-duplicated with
# Series.unique() and sorted by their string form so the figures come out in the
# same order on every run (iterating a set() gives no such guarantee).
# NOTE(review): gc_df and plot_aini_lags_for_year are not defined/imported in the
# cells visible in this notebook — confirm they are in scope before running.
for year in sorted(gc_df["Year"].unique(), key=str):
    plot_aini_lags_for_year(gc_df, year)