Notebook used to inspect results of Granger Causality analysis

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import math
import sys
from pathlib import Path
import re

# Project root = parent of the current working directory
# (the original note describes it as notebooks/AI_narrative_index)
root_dir = Path.cwd().parent

# Make the project's own source folders importable
for src_subdir in ("scripts", "visualizations", "modelling"):
    sys.path.append(str(root_dir / "src" / src_subdir))

# import custom functions
#rom plot_granger_causality import plot_aini_lags_by_year, plot_aini_lags_for_year
from plot_functions import plot_n_articles_with_extrema_events, plot_stock_growth
from construct_tables import df_to_pptx 
from compute_rejections import compute_rejection_rates_all, add_trading_days_columns, export_rejection_rates_to_pptx_all_tables_only

In [28]:
# Input folder with the Granger-causality result files and output folder for tables
var_path = root_dir / "data" / "processed" / "variables"
table_path = root_dir / "reports" / "tables"

# Results estimated without the VIX control, one file per model
gc_c = pd.read_csv(var_path / "granger_causality_binary.csv")
gc_w0 = pd.read_csv(var_path / "granger_causality_w0.csv")
gc_w1 = pd.read_csv(var_path / "granger_causality_w1.csv")
gc_w2 = pd.read_csv(var_path / "granger_causality_w2.csv")

# Stack all models into a single frame and persist it next to the inputs
gc_all_results = pd.concat((gc_c, gc_w0, gc_w1, gc_w2), ignore_index=True)
gc_all_results.to_csv(var_path / "granger_causality_all.csv", index=False)

# One HTML table per model for the online appendix
html_exports = (
    (gc_c, "granger_causality_custom_model.html"),
    (gc_w0, "granger_causality_w0.html"),
    (gc_w1, "granger_causality_w1.html"),
    (gc_w2, "granger_causality_w2.html"),
)
for frame, html_name in html_exports:
    frame.to_html(table_path / html_name, index=False)

gc_all_results

In [29]:
# Count the unique trading dates available in each analysis period

# Daily financial data (the limiting factor for the number of observations)
fin = pd.read_csv(root_dir / "data" / "raw" / "financial" / "full_daily_2023_2025.csv")

# Parse the raw "Date" column into a datetime column called "date"
fin["date"] = pd.to_datetime(fin["Date"])

# One entry per distinct date
dates = fin["date"].drop_duplicates()

# Overall sample bounds
start_all = pd.Timestamp("2023-04-01")
end_all   = pd.Timestamp("2025-06-16")

# Boolean masks per period (Series.between is inclusive on both ends by default)
mask_all        = dates.between(start_all, end_all)
mask_2023       = dates.between("2023-04-01", "2023-12-31")
mask_2024       = (dates.dt.year == 2024)
mask_2025       = dates.between("2025-01-01", "2025-06-16")
mask_2023_2024  = dates.between("2023-04-01", "2024-12-31")
mask_2024_2025  = dates.between("2024-01-01", "2025-06-16")

# Number of unique dates per period, keyed by a descriptive label
period_masks = {
    "All_2023-04-01_to_2025-06-16": mask_all,
    "2023_after_03-31":             mask_2023,
    "2024_full":                    mask_2024,
    "2025_to_06-16":                mask_2025,
    "Span_2023-2024":               mask_2023_2024,
    "Span_2024-2025":               mask_2024_2025,
}
counts = {period: dates.loc[mask].nunique() for period, mask in period_masks.items()}

print(pd.Series(counts, name="n_unique_dates"))

In [30]:
# Keep only the AINI → returns tests
is_a2r = gc_all_results["Direction"].str.contains("AINI_to_RET", case=False)
a2r = gc_all_results[is_a2r].copy()

# Year and Ticker as strings
a2r = a2r.astype({"Year": str, "Ticker": str})

# BH rejection flags (bootstrap and HC3) as booleans
# NOTE(review): astype(bool) maps missing values to True — confirm the flags contain no NaN
a2r = a2r.assign(
    rej_bh_boot=a2r["BH_reject_F"].astype(bool),
    rej_bh_hc=a2r["BH_reject_F_HC3"].astype(bool),
)

# A model counts as rejected only when both methods reject
a2r["rej_both"] = a2r["rej_bh_boot"] & a2r["rej_bh_hc"]

# Number of models tested, number rejected by both methods, and the rate in percent
total = a2r["rej_both"].count()
n_reject = a2r["rej_both"].sum()
rejection_rate = n_reject / total * 100

print(f"Total models: {total}")
print(f"Both-method rejections: {n_reject}")
print(f"Rejection rate: {rejection_rate:.2f}%")


In [31]:
# Keep only the returns → AINI tests
r2a = gc_all_results.loc[
    gc_all_results["Direction"].str.contains("RET_to_AINI", case=False)
].copy()

# Year and Ticker as strings
for key_col in ("Year", "Ticker"):
    r2a[key_col] = r2a[key_col].astype(str)

# BH rejection flags (bootstrap and HC3) as booleans
# NOTE(review): astype(bool) maps missing values to True — confirm the flags contain no NaN
r2a["rej_bh_boot"] = r2a["BH_reject_F"].astype(bool)
r2a["rej_bh_hc"] = r2a["BH_reject_F_HC3"].astype(bool)

# Rejected by both methods
r2a["rej_both"] = r2a["rej_bh_hc"] & r2a["rej_bh_boot"]

# Models tested, both-method rejections, and the rejection rate in percent
both_flags = r2a["rej_both"]
total = both_flags.count()
n_reject = both_flags.sum()
rejection_rate = n_reject / total * 100

print(f"Total models: {total}")
print(f"Both-method rejections: {n_reject}")
print(f"Rejection rate: {rejection_rate:.2f}%")


In [32]:

def _both_method_rate_by(column):
    """Both-method rejection count, model count and rate (in %) of `a2r`, grouped by `column`.

    The result is sorted by rate, highest first.
    """
    grouped = a2r.groupby(column)["rej_both"].agg(["sum", "count"])
    grouped = grouped.assign(rate=lambda x: 100 * x["sum"] / x["count"])
    return grouped.sort_values("rate", ascending=False)


# Rejection rate by Year
by_year = _both_method_rate_by("Year")

print("By Year (both-method rejection rate):")
print(by_year)

# Rejection rate by Ticker (only the top 8 are shown)
by_ticker = _both_method_rate_by("Ticker")

print("\nBy Ticker (both-method rejection rate):")
print(by_ticker.head(8))

# Compact "label: rate" listing of the same numbers
for idx, row in by_year.iterrows():
    print(f"{idx}: {row['rate']:.2f}%")

for idx, row in by_ticker.head(8).iterrows():
    print(f"{idx}: {row['rate']:.2f}%")

# NOTE(review): when the cells run in order, rejection_rate was last assigned in the
# returns → AINI cell above, not from a2r — confirm this is the value meant to be shown here
rejection_rate

In [33]:
# Keep rows with a BH-corrected p-value below alpha = 0.1
alpha = 0.1


def _significant_rows(results):
    """Copy of the rows whose BH-corrected F p-value is below `alpha` for either test.

    NOTE(review): the two p-value columns are combined with OR, whereas the figure
    titles further down say "both methods" — confirm which criterion is intended.
    """
    keep = (results["BH_corr_F_pval"] < alpha) | (results["BH_corr_F_pval_HC3"] < alpha)
    return results[keep].copy()


gc_c_sub = _significant_rows(gc_c)
gc_w0_sub = _significant_rows(gc_w0)
gc_w1_sub = _significant_rows(gc_w1)
gc_w2_sub = _significant_rows(gc_w2)

# Same order as the model labels used later: custom, w0, w1, w2
dfs = [gc_c_sub, gc_w0_sub, gc_w1_sub, gc_w2_sub]
gc_w1_sub.columns

In [34]:
labels = ["custom", "w0", "w1", "w2"]  # same order as dfs

# Columns to drop in thesis-ready table
drop_cols = [
    "p_x","N_boot","N_obs","N_boot_valid","F_stat","df_den",
    "Original_F_pval","Empirical_F_pval","r2_u","BH_reject_F","BH_reject_F_HC3"
]

cleaned = []

# Split every significant-result subset by test direction.
# The per-model frames get their own names (a2r_sig / r2a_sig) so that the
# notebook-level `a2r` / `r2a` frames, which carry the `rej_both` column used by
# the rejection-rate cells, are not overwritten by this loop.
for name, df in zip(labels, dfs):
    d = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")
    id_cols = [c for c in ["Ticker","AINI_variant","Year","Direction","BH_corr_F_pval","BH_corr_F_pval_HC3","adj_r2_u"] if c in d.columns]

    # identifier columns plus the coefficient columns of the respective direction
    a2r_sig = d.loc[d["Direction"]=="AINI_to_RET", id_cols + [c for c in d.columns if c.startswith("A2R_beta_")]].copy()
    r2a_sig = d.loc[d["Direction"]=="RET_to_AINI", id_cols + [c for c in d.columns if c.startswith("R2A_beta_")]].copy()

    # tag which df it came from
    a2r_sig["Model"] = name
    r2a_sig["Model"] = name

    cleaned.append({"Model": name, "A2R": a2r_sig, "R2A": r2a_sig})

# combined frames with the tag:
a2r_all = pd.concat([x["A2R"] for x in cleaned], ignore_index=True)
r2a_all = pd.concat([x["R2A"] for x in cleaned], ignore_index=True)
a2r_all_sort = a2r_all.sort_values(["Ticker","Year"])

In [35]:
# Inspect the main findings: AINI → return rows ranked by adjusted R², highest first
a2r_all.sort_values(by="adj_r2_u", ascending=False)


In [36]:
# Check whether adjusted R² and the first return-lag coefficient move together
a2r_all.adj_r2_u.corr(a2r_all.A2R_beta_ret_lag1)

In [37]:
# Number of significant rows per (Ticker, Year)
model_group_tickers = (
    a2r_all_sort
    .groupby(["Ticker", "Year"])
    .size()
    .reset_index(name="n_measures")
)

# Display labels for the plot
model_group_tickers["Ticker"] = model_group_tickers["Ticker"].replace({"TSM": "TSMC"})
model_group_tickers["Year"] = model_group_tickers["Year"].replace({
    "2023_24": "2023-2024",
    "2024_25": "2024-2025",
    "2023_24_25": "2023-2025",
})

# Grouped bar chart: one bar per Year within each Ticker
fig, ax = plt.subplots(figsize=(14, 7), dpi=300)
sns.barplot(data=model_group_tickers, x="Ticker", y="n_measures", hue="Year", dodge=True, ax=ax)
ax.set_title("Number of Significant Measures per Ticker and Year", fontsize=14)
ax.set_xlabel("Ticker", fontsize=12)
ax.set_ylabel("n_measures", fontsize=12)
ax.tick_params(axis="x", rotation=90)
ax.legend(title="Year", fontsize=10, title_fontsize=11, loc="best")
plt.tight_layout()

# Save the figure and close it without displaying
out_path = root_dir / "reports" / "figures" / "significant_measures_year_ticker_nocontrol_gc.png"
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close()
a2r_all_sort

In [38]:
# Number of significant rows per (AINI variant, Year)
model_group_variant = (
    a2r_all_sort
    .groupby(["AINI_variant", "Year"])
    .size()
    .reset_index(name="n_variants")
)

# Display labels for the period codes
model_group_variant["Year"] = model_group_variant["Year"].replace({"2023_24": "2023-2024"})
model_group_variant["Year"] = model_group_variant["Year"].replace({"2024_25": "2024-2025"})
model_group_variant["Year"] = model_group_variant["Year"].replace({"2023_24_25": "2023-2025"})

plt.figure(figsize=(14, 7), dpi=300)
ax = sns.barplot(data=model_group_variant, x="AINI_variant", y="n_variants", hue="Year", dodge=True)
ax.set_title("Number of Significant AINI Variants per Year", fontsize=14)
# The x-axis shows AINI variants, so it is labelled as such (it previously read "Ticker")
ax.set_xlabel("AINI Variant", fontsize=12)
ax.set_ylabel("n_measures", fontsize=12)
ax.tick_params(axis="x", rotation=90)
ax.legend(title="Year", fontsize=10, title_fontsize=11, loc="best")
plt.tight_layout()

# Save the figure and close it without displaying
out_path = root_dir / "reports" / "figures" / "significant_measures_year_variant_nocontrol_gc.png"
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close()
model_group_variant

In [39]:
# Tickers with at least one significant AINI → return result.
# Read from a2r_all_sort (raw ticker symbols) rather than from model_group_tickers:
# the latter has "TSM" relabelled to "TSMC" for plotting, so it would never match
# the raw "TSM" symbol in gc_w0 and TSM would always be reported as insignificant.
relevant_tickers = set(a2r_all_sort["Ticker"])

# All tickers that were tested
all_tickers = set(gc_w0["Ticker"])

# Tickers without any significant result
ins_tickers = all_tickers - relevant_tickers
ins_tickers

In [40]:
# Prepare the AINI → return table for reporting

# Raw column name -> column header used in the report
rename_dict = {
    "AINI_variant": "AINI Variant",
    "BH_corr_F_pval": "BH-corr. F (Bootstrap)",
    "BH_corr_F_pval_HC3": "BH-corr. F (Analytic HC3)",
    "adj_r2_u": "Adj. R²",
    "A2R_beta_ret_lag1": "β₁",
    "A2R_beta_x_lag1": "γ₁",
    "A2R_beta_x_lag2": "γ₂",
    "A2R_beta_x_lag3": "γ₃",
}

# Column order of the final table
order = [
    "Model",
    "Ticker",
    "AINI Variant",
    "Year",
    "β₁", "γ₁",
    "γ₂",
    "γ₃",
    "BH-corr. F (Bootstrap)",
    "BH-corr. F (Analytic HC3)",
    "Adj. R²",
]

# Rename, drop Direction if it is present, then select the columns in report order
a2r_all_sort = (
    a2r_all_sort
    .rename(columns=rename_dict)
    .drop(columns=["Direction"], errors="ignore")
    .loc[:, order]
)


Controlled for VIX

In [41]:
# Results controlled for the log growth of the VIX, one file per model
gc_c_VIX = pd.read_csv(var_path / "granger_causality_log_growth_VIX_binary.csv")
gc_w0_VIX = pd.read_csv(var_path / "granger_causality_log_growth_VIX_w0.csv")
gc_w1_VIX = pd.read_csv(var_path / "granger_causality_log_growth_VIX_w1.csv")
gc_w2_VIX = pd.read_csv(var_path / "granger_causality_log_growth_VIX_w2.csv")

# Stack all models and persist the merged table
gc_all_results_VIX = pd.concat(
    (gc_c_VIX, gc_w0_VIX, gc_w1_VIX, gc_w2_VIX),
    ignore_index=True,
)
gc_all_results_VIX.to_csv(var_path / "granger_causality_VIX_all.csv", index=False)

# One HTML table per model for the online appendix
vix_html_exports = (
    (gc_c_VIX, "granger_causality_VIX_custom_model.html"),
    (gc_w0_VIX, "granger_causality_VIX_w0.html"),
    (gc_w1_VIX, "granger_causality_VIX_w1.html"),
    (gc_w2_VIX, "granger_causality_VIX_w2.html"),
)
for vix_frame, vix_html_name in vix_html_exports:
    vix_frame.to_html(table_path / vix_html_name, index=False)

gc_c_VIX.columns

In [42]:
# AINI → returns tests from the VIX-controlled results
vix_is_a2r = gc_all_results_VIX["Direction"].str.contains("AINI_to_RET", case=False)
a2r_VIX = gc_all_results_VIX.loc[vix_is_a2r].copy()

# Year and Ticker as strings
for key_col in ("Year", "Ticker"):
    a2r_VIX[key_col] = a2r_VIX[key_col].astype(str)

# BH rejection flags (bootstrap and HC3) as booleans
# NOTE(review): astype(bool) maps missing values to True — confirm the flags contain no NaN
a2r_VIX["rej_bh_boot"] = a2r_VIX["BH_reject_F"].astype(bool)
a2r_VIX["rej_bh_hc"] = a2r_VIX["BH_reject_F_HC3"].astype(bool)

# Rejected by both methods
a2r_VIX["rej_both"] = a2r_VIX["rej_bh_boot"] & a2r_VIX["rej_bh_hc"]

# Models tested, both-method rejections, and the rejection rate in percent
vix_both = a2r_VIX["rej_both"]
total = vix_both.count()
n_reject = vix_both.sum()
rejection_rate = n_reject / total * 100

print(f"Total models: {total}")
print(f"Both-method rejections: {n_reject}")
print(f"Rejection rate: {rejection_rate:.2f}%")


Create PPT for rejection rates

In [43]:
# Rejection rates for all methods (boot, HC3, both) on the VIX-controlled results,
# extended with trading-day columns and exported as tables only
rates = compute_rejection_rates_all(gc_all_results_VIX, direction_substr="AINI_to_RET")
rates_with_days = add_trading_days_columns(rates)

rejection_ppt_path = table_path / "rejection_rates_all_methods_VIX.pptx"
ppt = export_rejection_rates_to_pptx_all_tables_only(
    rates_with_days,
    outpath=rejection_ppt_path,
    top_tickers=15,
)
print("Saved PPT:", ppt)


In [44]:
# All AINI → return rows of the VIX-controlled results (source of the Ticker/Year pairs).
# Taken as an explicit copy because a later cell assigns to its "Year" column;
# writing into a filtered slice would trigger pandas' SettingWithCopyWarning.
gc_all_results_VIX_a2r = gc_all_results_VIX[gc_all_results_VIX["Direction"] == "AINI_to_RET"].copy()
gc_all_results_VIX_a2r

In [45]:
labels = ["custom", "w0", "w1", "w2"]  # same order as dfs

# Columns to drop in thesis-ready table
drop_cols = [
    "p_x","N_boot","N_obs","N_boot_valid","F_stat","df_den",
    "Original_F_pval","Empirical_F_pval","r2_u","BH_reject_F","BH_reject_F_HC3"
]

cleaned = []

# iterate over dfs to create subsets by direction
# NOTE(review): `dfs` holds the subsets of the results WITHOUT the VIX control;
# `dfs_VIX` is only created in a later cell. Despite the *_VIX names, the frames
# built here therefore come from the uncontrolled results. The later cell that
# iterates over `dfs_VIX` overwrites a2r_all_VIX / a2r_all_sort_VIX — confirm
# whether this cell is still needed.
for name, df in zip(labels, dfs):
    d = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")
    id_cols = [c for c in ["Ticker","AINI_variant","Year","Direction","BH_corr_F_pval","BH_corr_F_pval_HC3","adj_r2_u"] if c in d.columns]

    a2r_VIX = d.loc[d["Direction"]=="AINI_to_RET", id_cols + [c for c in d.columns if c.startswith("A2R_beta_")]].copy()
    r2a_VIX = d.loc[d["Direction"]=="RET_to_AINI", id_cols + [c for c in d.columns if c.startswith("R2A_beta_")]].copy()
    
    # tag which df it came from
    a2r_VIX["Model"] = name   
    r2a_VIX["Model"] = name

    cleaned.append({"Model": name, "A2R": a2r_VIX, "R2A": r2a_VIX})

# combined frames with the tag:
a2r_all_VIX = pd.concat([x["A2R"] for x in cleaned], ignore_index=True)
r2a_all_VIX = pd.concat([x["R2A"] for x in cleaned], ignore_index=True)
# sort the frame built in this cell (this line previously sorted `a2r_all`)
a2r_all_sort_VIX = a2r_all_VIX.sort_values(["Ticker","Year"])

In [46]:
# Rank the rows by adjusted R², lowest first, before looking at its relation to the betas
a2r_all_sort_VIX.sort_values(by="adj_r2_u", ascending=True)

In [47]:
# Correlation between adjusted R² and the first return-lag coefficient
a2r_all_sort_VIX["adj_r2_u"].corr(a2r_all_sort_VIX["A2R_beta_ret_lag1"])

In [48]:
# Keep rows whose BH-corrected F p-value is below alpha = 0.1 for the bootstrap or the HC3 test
alpha = 0.1

gc_c_VIX_sub, gc_w0_VIX_sub, gc_w1_VIX_sub, gc_w2_VIX_sub = (
    results[
        (results["BH_corr_F_pval"] < alpha) | (results["BH_corr_F_pval_HC3"] < alpha)
    ].copy()
    for results in (gc_c_VIX, gc_w0_VIX, gc_w1_VIX, gc_w2_VIX)
)

# Same order as the model labels: custom, w0, w1, w2
dfs_VIX = [gc_c_VIX_sub, gc_w0_VIX_sub, gc_w1_VIX_sub, gc_w2_VIX_sub]

In [49]:
labels = ["custom", "w0", "w1", "w2"]

# Columns that are not shown in the thesis-ready table
drop_cols = [
    "p_x","N_boot","N_obs","N_boot_valid","F_stat","df_den",
    "Original_F_pval","Empirical_F_pval","r2_u","BH_reject_F","BH_reject_F_HC3"
]

# Identifier columns kept in front of the coefficient columns (when present)
id_candidates = [
    "Ticker","AINI_variant","Year","Direction",
    "BH_corr_F_pval","BH_corr_F_pval_HC3","adj_r2_u"
]

cleaned_VIX = []

# Split each VIX-controlled subset by test direction and tag it with its model label
for name, df in zip(labels, dfs_VIX):
    d = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")
    id_cols = [c for c in id_candidates if c in d.columns]

    a2r_beta_cols = [c for c in d.columns if c.startswith("A2R_beta_")]
    r2a_beta_cols = [c for c in d.columns if c.startswith("R2A_beta_")]

    a2r_VIX = d.loc[d["Direction"]=="AINI_to_RET", id_cols + a2r_beta_cols].assign(Model=name)
    r2a_VIX = d.loc[d["Direction"]=="RET_to_AINI", id_cols + r2a_beta_cols].assign(Model=name)

    cleaned_VIX.append({"Model": name, "A2R": a2r_VIX, "R2A": r2a_VIX})

# One combined frame per direction
a2r_all_VIX = pd.concat([entry["A2R"] for entry in cleaned_VIX], ignore_index=True)
r2a_all_VIX = pd.concat([entry["R2A"] for entry in cleaned_VIX], ignore_index=True)

# Sorted by Ticker and Year for readability
a2r_all_sort_VIX = a2r_all_VIX.sort_values(["Ticker","Year"])
r2a_all_sort_VIX = r2a_all_VIX.sort_values(["Ticker","Year"])

# Show the available columns and the AINI → return table
print(a2r_all_sort_VIX.columns)
a2r_all_sort_VIX

In [50]:
# Find the (Ticker, Year) pairs that have no significant AINI → return result

# Compare Year as string on both sides. DataFrame.assign returns new frames, so
# the filtered slice of gc_all_results_VIX is never written to in place
# (which would raise pandas' SettingWithCopyWarning).
a2r_all_sort_VIX = a2r_all_sort_VIX.assign(Year=a2r_all_sort_VIX["Year"].astype(str))
gc_all_results_VIX_a2r = gc_all_results_VIX_a2r.assign(
    Year=gc_all_results_VIX_a2r["Year"].astype(str)
)

# (Ticker, Year) pairs among the significant rows and among all tested rows
pairs_a2r = set(zip(a2r_all_sort_VIX["Ticker"], a2r_all_sort_VIX["Year"]))
pairs_gc  = set(zip(gc_all_results_VIX_a2r["Ticker"], gc_all_results_VIX_a2r["Year"]))

# set difference: tested pairs that never show up among the significant rows
no_significance = pairs_gc - pairs_a2r
a2r_all_sort_VIX

In [51]:
# Number of significant rows per (Ticker, Year) in the VIX-controlled results
model_group_tickers_VIX = (
    a2r_all_sort_VIX
    .groupby(["Ticker", "Year"])
    .size()
    .reset_index(name="n_measures")
)

# Display labels for the plot
model_group_tickers_VIX["Ticker"] = model_group_tickers_VIX["Ticker"].replace({"TSM": "TSMC"})
model_group_tickers_VIX["Year"] = model_group_tickers_VIX["Year"].replace({
    "2023_24": "2023-2024",
    "2024_25": "2024-2025",
    "2023_24_25": "2023-2025",
})

# Grouped bar chart: one bar per Year within each Ticker
fig, ax = plt.subplots(figsize=(14, 7), dpi=300)
sns.barplot(data=model_group_tickers_VIX, x="Ticker", y="n_measures", hue="Year", dodge=True, ax=ax)
ax.set_title("Number of significant measures (α = 0.1, both methods, ℓ = 1,2,3) per ticker and year (max = 48)", fontsize=14)
ax.set_xlabel("Ticker", fontsize=12)
ax.set_ylabel("n_measures", fontsize=12)
ax.tick_params(axis="x", rotation=90)
ax.legend(title="Year", fontsize=10, title_fontsize=11, loc="best")
plt.tight_layout()

# Save, then display
out_path = root_dir / "reports" / "figures" / "significant_measures_year_ticker_VIX_gc.png"
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.show()
model_group_tickers_VIX

In [None]:
# Use the report header for the variant column.
# `columns=` is required: a bare mapping passed to DataFrame.rename is applied to
# the index labels, which would leave the column untouched and make the
# groupby on "AINI Variant" below fail with a KeyError.
if "AINI_variant" in a2r_all_sort_VIX.columns:
    a2r_all_sort_VIX = a2r_all_sort_VIX.rename(columns={"AINI_variant": "AINI Variant"})

# Number of significant rows per (AINI variant, Year)
model_group_variant_VIX = (
    a2r_all_sort_VIX
    .groupby(["AINI Variant", "Year"])
    .size()
    .reset_index(name="n_variants")
)

# Display labels for the period codes
model_group_variant_VIX["Year"] = model_group_variant_VIX["Year"].replace({"2023_24": "2023-2024"})
model_group_variant_VIX["Year"] = model_group_variant_VIX["Year"].replace({"2024_25": "2024-2025"})
model_group_variant_VIX["Year"] = model_group_variant_VIX["Year"].replace({"2023_24_25": "2023-2025"})

plt.figure(figsize=(14, 7), dpi=300)
ax = sns.barplot(data=model_group_variant_VIX, x="AINI Variant", y="n_variants", hue="Year", dodge=True)
ax.set_title("Number of significant AINI variants (α = 0.1, both methods, ℓ = 1,2,3) by year (max = 45)", fontsize=14)
ax.set_xlabel("", fontsize=12)
ax.set_ylabel("n_measures", fontsize=12)
ax.tick_params(axis="x", rotation=0)
ax.legend(title="Year", fontsize=10, title_fontsize=11, loc="best")
plt.tight_layout()

# Save, then display
out_path = root_dir / "reports" / "figures" / "significant_measures_year_variant_VIX_gc.png"
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Prepare the VIX-controlled AINI → return table for reporting

# Raw column name -> column header used in the report
rename_dict = {
    "AINI_variant": "AINI Variant",
    "BH_corr_F_pval": "BH-corr. F (Bootstrap)",
    "BH_corr_F_pval_HC3": "BH-corr. F (Analytic HC3)",
    "adj_r2_u": "Adj. R²",
    "A2R_beta_ret_1": "β₁",
    "A2R_beta_x_1": "γ₁",
    "A2R_beta_ret_2": "β₂",
    "A2R_beta_x_2": "γ₂",
    "A2R_beta_ret_3": "β₃",
    "A2R_beta_x_3": "γ₃",
}

# Column order of the final table
order = [
    "Model",
    "Ticker",
    "AINI Variant",
    "Year",
    "β₁", "β₂", "β₃",
    "γ₁", "γ₂", "γ₃",
    "BH-corr. F (Bootstrap)",
    "BH-corr. F (Analytic HC3)",
    "Adj. R²",
]

# Rename and drop Direction if it is present
a2r_all_sort_VIX = (
    a2r_all_sort_VIX
    .rename(columns=rename_dict)
    .drop(columns=["Direction"], errors="ignore")
)

# Select the report columns and sort by |γ₁|, largest first
a2r_all_sort_VIX = a2r_all_sort_VIX.loc[:, order].sort_values(
    by="γ₁",
    key=lambda col: col.abs(),
    ascending=False,
)

In [None]:
# Round the numeric report columns to 4 decimals (on a copy, the unrounded table is kept)
cols = ['β₁', 'β₂', 'β₃', 'γ₁', 'γ₂','γ₃','BH-corr. F (Bootstrap)', 'BH-corr. F (Analytic HC3)', 'Adj. R²']
a2r_all_sort_VIX_round = a2r_all_sort_VIX.copy()
a2r_all_sort_VIX_round[cols] = a2r_all_sort_VIX[cols].round(4)

# Beautify the period labels: underscores become hyphens.
# Series.replace({"_": "-"}) only replaces cells that are exactly "_", so it never
# touched labels such as "2023_24"; .str.replace works on substrings. Assigning the
# result back also avoids the chained inplace call on a column selection.
a2r_all_sort_VIX_round["Year"] = (
    a2r_all_sort_VIX_round["Year"].astype(str).str.replace("_", "-", regex=False)
)
a2r_all_sort_VIX_round.columns

In [None]:
# --- DataFrame ---
df = a2r_all_sort_VIX_round.copy()

# order tickers for readability: largest median absolute γ₁ first
ticker_order = (
    df.groupby("Ticker")["γ₁"]
      .apply(lambda x: x.abs().median())
      .sort_values(ascending=False)
      .index
      .tolist()
)
df["Ticker"] = pd.Categorical(df["Ticker"], categories=ticker_order, ordered=True)
df = df.sort_values(["Ticker", "Model"])

# map each Model to a marker shape
markers = ['o','s','^','D','P','X','v','<','>']
model_list = df["Model"].unique().tolist()
model_to_marker = {m: markers[i % len(markers)] for i, m in enumerate(model_list)}

# small vertical offset per model so multiple models per ticker don't overlap
# (the unused base_y / y_pos arrays were removed; positions are computed per model below)
offsets = np.linspace(-0.25, 0.25, num=len(model_list)) if len(model_list) > 1 else [0.0]
model_to_offset = {m: offsets[i] for i, m in enumerate(model_list)}

# --- Plot ---
fig, ax = plt.subplots(figsize=(10, max(4, 0.35*len(ticker_order)+1)))
ax.axvline(0, linestyle="--", color="gray", linewidth=1)

# one scatter series per model: x = γ₁, y = ticker position plus the model's offset
for m in model_list:
    sub = df[df["Model"] == m]
    ax.scatter(
        sub["γ₁"],
        sub["Ticker"].cat.codes + model_to_offset[m],
        marker=model_to_marker[m],
        s=70,
        label=m
    )

ax.set_yticks(np.arange(len(ticker_order)))
ax.set_yticklabels(ticker_order)
ax.set_xlabel(r"$\gamma_1$")
ax.set_ylabel("Ticker")
ax.set_title(r"$\gamma_1$ by Ticker for significant results (α = 0.1, both methods), ℓ = 1,2,3, all periods")
ax.legend(title="Model", frameon=False)
plt.tight_layout()
plt.savefig(root_dir / "reports" / "figures" / "y1_by_ticker_model_all_periods.png")
plt.show()
a2r_all_sort_VIX_round


In [None]:
# Work on a copy of the rounded report table
df = a2r_all_sort_VIX_round.copy()

# Coerce the three γ columns to numeric; values that cannot be parsed become NaN
gamma_cols = ['γ₁','γ₂','γ₃']
df[gamma_cols] = df[gamma_cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

EPS = 0.0   # set e.g. 1e-12 to treat near-zero as zero
DROP_ZEROS = False  # single switch for both overall and by-ticker

def _sign_with_eps(x: pd.Series, eps: float = EPS) -> pd.Series:
    """Element-wise sign of `x`, with values inside [-eps, eps] treated as zero."""
    near_zero = x.between(-eps, eps)
    snapped = x.copy().where(~near_zero, 0.0)
    return np.sign(snapped)

def flip_stats(a: pd.Series, b: pd.Series, drop_zeros: bool = DROP_ZEROS) -> pd.DataFrame:
    """Count sign flips between two aligned series.

    A position is valid when both values are present (and, if `drop_zeros` is set,
    when neither sign is zero). A flip is a valid position where the signs differ,
    i.e. the product of the signs is negative.

    Returns a one-row DataFrame with the columns pair, n_flips, n_valid and
    flip_rate (NaN when there is no valid position).
    """
    sign_a = _sign_with_eps(a)
    sign_b = _sign_with_eps(b)

    usable = a.notna() & b.notna()
    if drop_zeros:
        usable = usable & (sign_a != 0) & (sign_b != 0)

    opposite = (sign_a * sign_b < 0) & usable
    n_flips = int(opposite.sum())
    n_valid = int(usable.sum())

    if n_valid > 0:
        rate = n_flips / n_valid
    else:
        rate = np.nan

    return pd.DataFrame({
        'pair': [f'{a.name} vs {b.name}'],
        'n_flips': [n_flips],
        'n_valid': [n_valid],
        'flip_rate': [rate],
    })

# --- Overall counts (consistent with DROP_ZEROS) ---
# One row per pair of γ columns, in the order (γ₁, γ₂), (γ₁, γ₃), (γ₂, γ₃)
gamma_pairs = [('γ₁', 'γ₂'), ('γ₁', 'γ₃'), ('γ₂', 'γ₃')]
overall = pd.concat(
    [flip_stats(df[first], df[second]) for first, second in gamma_pairs],
    ignore_index=True,
)

# Label describing how zeros were treated, reused in the printed headers
label = "strict, zeros excluded" if DROP_ZEROS else "zeros included"
print(f"Overall sign flips ({label}):")
print(overall)

# --- By Ticker (consistent with DROP_ZEROS) ---
def flip_by_group(group, a_name, b_name, drop_zeros: bool = DROP_ZEROS):
    """Sign-flip statistics between the columns `a_name` and `b_name` of one group.

    Uses the same validity and flip rules as `flip_stats` and returns a Series
    with n_flips, n_valid and flip_rate (NaN when there is no valid row).
    """
    first, second = group[a_name], group[b_name]
    sign_first, sign_second = _sign_with_eps(first), _sign_with_eps(second)

    usable = first.notna() & second.notna()
    if drop_zeros:
        usable = usable & (sign_first != 0) & (sign_second != 0)

    n_flips = int(((sign_first * sign_second < 0) & usable).sum())
    n_valid = int(usable.sum())
    return pd.Series({
        'n_flips': n_flips,
        'n_valid': n_valid,
        'flip_rate': (n_flips / n_valid) if n_valid > 0 else np.nan
    })

# One table per column pair, stacked afterwards
per_pair_tables = []
for a, b in [('γ₁','γ₂')]:  # add ('γ₁','γ₃'), ('γ₂','γ₃') if needed
    grouped = df.groupby('Ticker', dropna=False)
    out = grouped.apply(lambda g: flip_by_group(g, a, b, DROP_ZEROS)).reset_index()
    # put the pair label right after the Ticker column
    out.insert(1, 'pair', f'{a} vs {b}')
    per_pair_tables.append(out)
by_ticker = pd.concat(per_pair_tables, ignore_index=True)

print(f"\nSign flips by Ticker ({label}):")
print(by_ticker)



In [None]:
# identify major peaks: rows where at least one γ coefficient exceeds 0.15 in absolute value
regressors = ['γ₁','γ₂','γ₃']

# Mask every cell that is not an extreme γ value, then keep the rows in which
# at least one of the γ columns survived the mask
majors = a2r_all_sort_VIX_round[
    (a2r_all_sort_VIX_round[regressors] > 0.15) |
    (a2r_all_sort_VIX_round[regressors] < -0.15)
].dropna(how="all", subset=regressors)

# majors.index holds index LABELS. The table has been re-sorted since its index
# was created, so labels and row positions no longer coincide; the full rows are
# therefore looked up with .loc (label-based), not .iloc (position-based).
majors_idx = majors.index
majors_df = a2r_all_sort_VIX_round.loc[majors_idx]

# share of rows with at least one extreme coefficient
frag_large = majors_df.shape[0] /  a2r_all_sort_VIX_round.shape[0]

#rows with at least one regressor < -0.15
neg = a2r_all_sort_VIX_round.loc[
    (a2r_all_sort_VIX_round[regressors] < -0.15).any(axis=1),
    ["AINI Variant", "Year", "Ticker"] + regressors
].drop_duplicates()

# rows with at least one regressor > 0.15
pos = a2r_all_sort_VIX_round.loc[
    (a2r_all_sort_VIX_round[regressors] > 0.15).any(axis=1),
    ["AINI Variant", "Year", "Ticker"] + regressors
].drop_duplicates()

print("Negative extremes (< -0.15):")
print(neg.to_string(index=False))

print("\nPositive extremes (> 0.15):")
print(pos.to_string(index=False))
# "\n" instead of the invalid escape sequence "\T"
print(f"\nTotal of extrema {majors.shape[0]}")

In [None]:
# --- DataFrame ---
df = a2r_all_sort_VIX_round.copy()

# order tickers for readability: largest median absolute β₁ first
ticker_order = (
    df.groupby("Ticker")["β₁"]
      .apply(lambda x: x.abs().median())
      .sort_values(ascending=False)
      .index
      .tolist()
)
df["Ticker"] = pd.Categorical(df["Ticker"], categories=ticker_order, ordered=True)
df = df.sort_values(["Ticker", "Model"])

# map each Model to a marker shape
markers = ['o','s','^','D','P','X','v','<','>']
model_list = df["Model"].unique().tolist()
model_to_marker = {m: markers[i % len(markers)] for i, m in enumerate(model_list)}

# small vertical offset per model so multiple models per ticker don't overlap
# (the unused base_y / y_pos arrays were removed; positions are computed per model below)
offsets = np.linspace(-0.25, 0.25, num=len(model_list)) if len(model_list) > 1 else [0.0]
model_to_offset = {m: offsets[i] for i, m in enumerate(model_list)}

# --- Plot ---
fig, ax = plt.subplots(figsize=(10, max(4, 0.35*len(ticker_order)+1)))
ax.axvline(0, linestyle="--", color="gray", linewidth=1)

# one scatter series per model: x = β₁, y = ticker position plus the model's offset
for m in model_list:
    sub = df[df["Model"] == m]
    ax.scatter(
        sub["β₁"],
        sub["Ticker"].cat.codes + model_to_offset[m],
        marker=model_to_marker[m],
        s=70,
        label=m
    )

ax.set_yticks(np.arange(len(ticker_order)))
ax.set_yticklabels(ticker_order)
ax.set_xlabel(r"$\beta_1$")
ax.set_ylabel("Ticker")
ax.set_title(r"$\beta_1$ by Ticker (markers = Model)")
ax.legend(title="Model", frameon=False)
plt.tight_layout()
plt.show()


In [None]:
# Per-year version of the γ₁-by-ticker scatter plot; one PNG is written per Year value
base = a2r_all_sort_VIX_round.copy()

# γ₁ must be numeric for the ordering below; unparseable values become NaN
base["γ₁"] = pd.to_numeric(base["γ₁"], errors="coerce")

years = sorted(base["Year"].dropna().unique())
for yr in years:
    df = base.loc[base["Year"] == yr].copy()

    # tickers ordered by median |γ₁|, largest first
    median_abs_gamma = df.groupby("Ticker")["γ₁"].apply(lambda x: x.abs().median())
    ticker_order = median_abs_gamma.sort_values(ascending=False).index.tolist()

    # fix that order through an ordered categorical, then sort the rows
    df["Ticker"] = pd.Categorical(df["Ticker"], categories=ticker_order, ordered=True)
    df = df.sort_values(["Ticker", "Model"])

    # marker shape and vertical offset per model (offsets keep models of one ticker apart)
    markers = ['o','s','^','D','P','X','v','<','>']
    model_list = df["Model"].dropna().unique().tolist()
    model_to_marker = {m: markers[i % len(markers)] for i, m in enumerate(model_list)}
    offsets = np.linspace(-0.25, 0.25, num=len(model_list)) if len(model_list) > 1 else [0.0]
    model_to_offset = {m: offsets[i] for i, m in enumerate(model_list)}

    # figure height grows with the number of tickers
    fig, ax = plt.subplots(figsize=(10, max(4, 0.35*len(ticker_order)+1)))
    ax.axvline(0, linestyle="--", color="gray", linewidth=1)

    # one scatter series per model
    for m in model_list:
        sub = df[df["Model"] == m]
        y_positions = sub["Ticker"].cat.codes + model_to_offset[m]
        ax.scatter(sub["γ₁"], y_positions, marker=model_to_marker[m], s=70, label=m)

    # axes, labels and legend are set once per figure
    ax.set_yticks(np.arange(len(ticker_order)))
    ax.set_yticklabels(ticker_order)
    ax.set_xlabel(r"$\gamma_1$")
    ax.set_ylabel("Ticker")
    ax.set_title(rf"$\gamma_1$ by Ticker (markers = Model), Year = {yr}")
    ax.legend(title="Model", frameon=False)
    plt.tight_layout()

    # save once per year and release the figure
    out_path = root_dir / "reports" / "figures" / f"year_{yr}_gc_vix_size_by_ticker.png"
    plt.savefig(out_path, dpi=300)
    plt.close(fig)
    print(f"Saved figure: {out_path}")


In [None]:
# Export the rounded VIX-controlled table to PowerPoint, 7 rows per slide
gc_vix_pptx = table_path / "Granger_causality_AINI_VIX.pptx"
ppt_path = df_to_pptx(a2r_all_sort_VIX_round, outpath=gc_vix_pptx, rows_per_slide=7)
print("Saved PPT:", ppt_path)

In [None]:
# Round the numeric columns of the large-magnitude rows to 4 decimals (on a copy)
rounds_col = [ 'β₁', 'β₂', 'β₃', 'γ₁', 'γ₂','γ₃', 'BH-corr. F (Bootstrap)', 'BH-corr. F (Analytic HC3)', 'Adj. R²']
majors_df_round = majors_df.copy()
majors_df_round[rounds_col] = majors_df_round[rounds_col].round(4)

# Export them to PowerPoint, 7 rows per slide
majors_pptx = table_path / "Granger_causality_AINI_VIX_large_magnitude_results.pptx"
ppt_path = df_to_pptx(majors_df_round, outpath=majors_pptx, rows_per_slide=7)
print("Saved PPT:", ppt_path)

In [None]:
# Non-null counts per column of the large-magnitude rows, grouped by ticker
majors_df_round.groupby(by="Ticker").count()

In [None]:
# plot stock prices
tickers = [
    "AAPL","AIQ","AMD","AMZN","ARKQ",
    "AVGO","BOTZ","GOOGL","IRBO","META",
    "MSFT","NVDA","ROBO","TSLA","TSM"
]
start = pd.Timestamp("2023-04-01")
end   = pd.Timestamp("2025-06-16")

# ensure datetime 
fin["date"] = pd.to_datetime(fin["date"])

# filter by date AND ticker, so that the shared y-limits below are computed only
# from the series that are actually plotted
mask = (fin["date"] >= start) & (fin["date"] <= end) & fin["Ticker"].isin(tickers)
d = fin.loc[mask, ["date","Ticker","Adj Close"]].copy()

# safety: drop rows without price
d = d.dropna(subset=["Adj Close"])

# compute global y-limits to keep same y-scale across all plots
ymin = d["Adj Close"].min()
ymax = d["Adj Close"].max()
pad = 0.03 * (ymax - ymin) if ymax > ymin else 1.0
ylims = (ymin - pad, ymax + pad)

# sort tickers consistently and split into 3 groups of 5
tickers_sorted = sorted(tickers)
groups = [tickers_sorted[i:i+5] for i in range(0, len(tickers_sorted), 5)]

# define fig_path
fig_path = root_dir / "reports" / "figures"

# make one figure per group
for gi, grp in enumerate(groups, start=1):
    fig, ax = plt.subplots(figsize=(10, 5))

    # one line per ticker that has data in the window
    for t in grp:
        sub = d[d["Ticker"] == t].sort_values("date")
        if sub.empty:
            continue
        ax.plot(sub["date"], sub["Adj Close"], linewidth=1.8, label=t)

    # nothing was drawn for this group: do not decorate or save an empty figure
    if not ax.lines:
        plt.show()
        continue

    # decorate and save ONCE per figure, after all lines are drawn
    # (this used to run inside the ticker loop and re-saved the PNG for every ticker)
    ax.set_title(f"Adj Close — Group {gi}: {', '.join(grp)}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Adj Close")
    ax.set_xlim(start, end)   # same x-scale
    ax.set_ylim(*ylims)       # same y-scale
    ax.legend(ncol=3, frameon=False, loc="upper left", bbox_to_anchor=(0, 1.02))
    fig.autofmt_xdate()
    plt.tight_layout()
    fig.savefig( fig_path /f"adj_close_group_{gi}.png", dpi=300, bbox_inches="tight")

    plt.show()


In [None]:
# Tickers to include in the growth plots.
# (The list used to be bound to a name fused with an imported function's name,
# `plot_n_articles_with_extrema_eventstickers`; it now has a descriptive name.)
growth_tickers = ["AAPL","AIQ","AMD","AMZN","ARKQ","AVGO","BOTZ","GOOGL","IRBO","META",
           "MSFT","NVDA","ROBO","TSLA","TSM"]

# Plot stock growth from the daily financial data in groups of 5 tickers.
# `fig_path` is the figures folder defined in the price-plot cell above.
# NOTE(review): the window ends on 2025-06-15 here but on 2025-06-16 in the other
# cells — confirm which end date plot_stock_growth expects.
plot_stock_growth(
    fin,
    tickers=growth_tickers,
    start="2023-04-01",
    end="2025-06-15",
    group_size=5,
    base=100.0,
    save_dir=fig_path,   
    show=True
)

Controlled for number of articles

Create reporting tables for thesis

Re-perform the Granger causality analysis to include t-stats for the lowest-AIC/BIC variables