Notebook to investigate raw FinBERT predictions

In [None]:
import pandas as pd
import sys
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import re
import itertools
import datetime as dt
import sqlite3

# Make the project's helper modules importable. `root` is the parent of the
# current working directory; the helpers are looked up in src/visualizations
# and src/modelling below it.
root = Path.cwd().parent
vis_path = root / "src" / "visualizations"
mod_path = root / "src" / "modelling"
for helper_dir in (vis_path, mod_path):
    # guard so that re-running this cell does not pile up duplicate entries
    if str(helper_dir) not in sys.path:
        sys.path.append(str(helper_dir))

from read_articles import read
# both plotting helpers are called further down (histogram grid and series
# subplots), so the import has to be active for a clean top-to-bottom run
from plot_functions import plot_aini_series_subplots, plot_aini_hist_grid_by_years
from compute_extrema import compute_aini_extrema

In [None]:
# Input and output locations, all relative to the project root
processed_dir = root / "data" / "processed"
var_path = processed_dir / "variables"
art_path = processed_dir / "articles"
reports_dir = root / "reports"
table_path = reports_dir / "tables"
fig_path = reports_dir / "figures"

# AINI variable tables: the binary (custom) variant and the w0, w1, w2 variants
aini_custom, aini_w0, aini_w1, aini_w2 = (
    pd.read_csv(var_path / csv_name)
    for csv_name in (
        "binary_AINI_variables.csv",
        "w0_AINI_variables.csv",
        "w1_AINI_variables.csv",
        "w2_AINI_variables.csv",
    )
)

# Daily financial data with the Date column parsed to datetimes
fin_csv = root / "data" / "raw" / "financial" / "full_daily_2023_2025.csv"
fin = pd.read_csv(fin_csv).assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
fin

In [None]:
# empty frame reserved for visualizations (not filled in this cell)
normalized_AINI = pd.DataFrame()

# order every AINI table chronologically before the extrema are computed
aini_w0, aini_w1, aini_w2, aini_custom = (
    table.sort_values("date")
    for table in (aini_w0, aini_w1, aini_w2, aini_custom)
)

# the project helper returns four objects: merged, tidy, pivot and extrema
merged, tidy, pivot, extrema = compute_aini_extrema(
    aini_w0, aini_w1, aini_w2, aini_custom
)
aini_w1

In [None]:
# Number of distinct trading dates in each sub-period of the sample

# lower-case datetime copy of the Date column, reduced to unique trading dates
fin["date"] = pd.to_datetime(fin["Date"])
dates = fin["date"].drop_duplicates()

# overall sample bounds
start_all = pd.Timestamp("2023-04-01")
end_all = pd.Timestamp("2025-06-16")

# one boolean mask per period (between() includes both end points)
mask_all = dates.between(start_all, end_all)
mask_2023 = dates.between("2023-04-01", "2023-12-31")
mask_2024 = dates.dt.year.eq(2024)
mask_2025 = dates.between("2025-01-01", "2025-06-16")
mask_2023_2024 = dates.between("2023-04-01", "2024-12-31")
mask_2024_2025 = dates.between("2024-01-01", "2025-06-16")

# label -> mask, in the order the counts should be reported
period_masks = {
    "All_2023-04-01_to_2025-06-16": mask_all,
    "2023_after_03-31": mask_2023,
    "2024_full": mask_2024,
    "2025_to_06-16": mask_2025,
    "Span_2023-2024": mask_2023_2024,
    "Span_2024-2025": mask_2024_2025,
}
counts = {label: dates[mask].nunique() for label, mask in period_masks.items()}

print(pd.Series(counts, name="n_unique_dates"))

Explore differences in min, max, mean and std. of AINI variables

In [None]:
def summarize_columns(df, exclude=("date",)):
    """Return mean, std, min and max for each numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarized.
    exclude : str or iterable of str, optional
        Column name(s) to leave out. Defaults to ``("date",)``. ``None`` or an
        empty iterable excludes nothing.

    Returns
    -------
    pandas.DataFrame
        One row per summarized column with the columns ``variable``, ``mean``,
        ``std``, ``min`` and ``max``. When no column qualifies, an empty frame
        with exactly these columns is returned.
    """
    # normalise `exclude` to a list; a bare string is treated as one column name
    if exclude is None:
        skipped = []
    elif isinstance(exclude, str):
        skipped = [exclude]
    else:
        skipped = list(exclude)

    results = []
    for col in df.columns:
        if col in skipped:
            continue
        series = df[col]
        # non-numeric columns (e.g. text labels) have no meaningful mean/std
        if not pd.api.types.is_numeric_dtype(series):
            continue
        results.append({
            "variable": col,
            "mean": series.mean(),
            "std": series.std(),
            "min": series.min(),
            "max": series.max(),
        })
    # explicit columns keep the schema stable even when `results` is empty
    return pd.DataFrame(results, columns=["variable", "mean", "std", "min", "max"])

# summary statistics for every column of the merged frame except "date"
stats_individual = summarize_columns(merged, exclude=["date"])
print(stats_individual)

In [None]:
# Distribution plot per year; raw counts are ignored because the number of
# articles per day is unequal
outpath_hist = fig_path / "aini_hist_year_panels.png"
plot_aini_hist_grid_by_years(df=merged, outpath=outpath_hist)


In [None]:
# inspect which columns the merged frame provides
merged.columns


In [None]:
# persist the extrema table (index included) in the report tables folder
extrema_csv_path = table_path / "aini_extrema.csv"
extrema.to_csv(extrema_csv_path)


In [None]:
# Render the extrema table as a LaTeX tabular: first column left-aligned,
# every remaining column centred
n_centered = len(extrema.columns) - 1
latex_table = extrema.to_latex(
    index=False,
    escape=True,
    column_format="l" + n_centered * "c",
    bold_rows=False,
)

# Pad each booktabs rule with \addlinespace
for rule, padded in (
    ("\\toprule", "\\toprule\n\\addlinespace"),
    ("\\midrule", "\\midrule\n\\addlinespace"),
    ("\\bottomrule", "\\addlinespace\n\\bottomrule"),
):
    latex_table = latex_table.replace(rule, padded)

# Embed the tabular in a floating table, scaled to the text width via adjustbox
latex_wrapped = "".join([
    "\\begin{table}[!htbp]\n",
    "\\centering\n",
    "\\begin{adjustbox}{width=\\textwidth}\n",
    latex_table,
    "\\end{adjustbox}\n",
    "\\caption{AINI extrema}\n",
    "\\label{tab:aini_extrema}\n",
    "\\end{table}\n",
])

# Write the finished table next to the other report tables
output_path = table_path / "aini_extrema.tex"
output_path.write_text(latex_wrapped)

Calculate weekly extrema (by calendar week)

In [None]:
# Map every extremum date to the start of its calendar week. Weekly periods
# from to_period("W") run Monday to Sunday, so the start time is the Monday.
tidy["week"] = tidy["date"].dt.to_period("W").dt.start_time

# Number of extrema per (type, week), most frequent first within each type
counts_by_week = (
    tidy.groupby(["type", "week"])
        .size()
        .reset_index(name="count")
        .sort_values(["type", "count"], ascending=[True, False])
)

# Keep only (type, week) pairs that actually occur. The copy ensures that the
# column added below is not written into a slice of counts_by_week.
extrema_weekly = counts_by_week[counts_by_week["count"] > 0].copy()

# Variables behind each count. They are collected per (type, week) - the same
# grouping as the counts - so that a week holding both minima and maxima does
# not list the other type's variables.
measures_by_type_week = (
    tidy.groupby(["type", "week"])["variable"]
    .apply(list)
    .to_dict()
)
extrema_weekly["measure"] = pd.Series(
    [
        measures_by_type_week.get(key)
        for key in zip(extrema_weekly["type"], extrema_weekly["week"])
    ],
    index=extrema_weekly.index,
    dtype=object,
)

# Presentation copy: week shown as "dd.mm.yyyy - dd.mm.yyyy"
# (week start = Monday, week end = start + 6 days = Sunday)
week_start = extrema_weekly["week"]
week_end = week_start + pd.Timedelta(days=6)
extrema_weekly_clean = (
    extrema_weekly
    .assign(
        week=week_start.dt.strftime("%d.%m.%Y") + " - " + week_end.dt.strftime("%d.%m.%Y"),
        type=extrema_weekly["type"].replace({"min": "minimum", "max": "maximum"}),
    )
    .rename(columns={"count": "n measures"})
)

# save
extrema_weekly_clean.to_csv(table_path / "aini_weekly_extrema.csv", index=False)

In [None]:
# Convert the weekly extrema to a LaTeX tabular (single page): first column
# left-aligned, remaining columns centred
latex_table = extrema_weekly_clean.to_latex(
    index=False,
    escape=True,
    column_format="l" + "c" * (len(extrema_weekly_clean.columns) - 1),
    bold_rows=False
)

# Add booktabs spacing around the rules
latex_table = latex_table.replace("\\toprule", "\\toprule\n\\addlinespace")
latex_table = latex_table.replace("\\midrule", "\\midrule\n\\addlinespace")
latex_table = latex_table.replace("\\bottomrule", "\\addlinespace\n\\bottomrule")

# Wrap in table + adjustbox. Caption and label are specific to the weekly
# table: reusing "tab:aini_extrema" from the daily table would define the
# same LaTeX label twice and make \ref ambiguous.
latex_wrapped = (
    "\\begin{table}[!htbp]\n"
    "\\centering\n"
    "\\begin{adjustbox}{width=\\textwidth}\n"
    + latex_table +
    "\\end{adjustbox}\n"
    "\\caption{Weekly AINI extrema}\n"
    "\\label{tab:aini_extrema_weekly}\n"
    "\\end{table}\n"
)

# Save to file; explicit encoding keeps the output independent of the OS default
output_path = table_path / "aini_extrema_weekly.tex"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(latex_wrapped)

In [None]:
# show the weekly extrema, rows with the highest "n measures" first
extrema_weekly_clean.sort_values("n measures",ascending=False)

Load data with AINI predictions & compare corpora at maxima and minima

In [None]:
# FinBERT AINI predictions are stored as one file per year; load the three
# years of every variant and stack them.
pred_years = (2023, 2024, 2025)

# custom FinBERT (binary) predictions
c_df23, c_df24, c_df25 = (
    pd.read_csv(var_path / f"FinBERT_AINI_prediction_{year}_on_binary.csv")
    for year in pred_years
)
c_df = pd.concat([c_df23, c_df24, c_df25])

# window size 0
w0_df23, w0_df24, w0_df25 = (
    pd.read_csv(var_path / f"FinBERT_AINI_prediction_{year}_windsize_0.csv")
    for year in pred_years
)
w0_df = pd.concat([w0_df23, w0_df24, w0_df25])

# window size 1
w1_df23, w1_df24, w1_df25 = (
    pd.read_csv(var_path / f"FinBERT_AINI_prediction_{year}_windsize_1.csv")
    for year in pred_years
)
w1_df = pd.concat([w1_df23, w1_df24, w1_df25])

# window size 2
w2_df23, w2_df24, w2_df25 = (
    pd.read_csv(var_path / f"FinBERT_AINI_prediction_{year}_windsize_2.csv")
    for year in pred_years
)
w2_df = pd.concat([w2_df23, w2_df24, w2_df25])

# all variants in a fixed order: custom, w0, w1, w2
aini_dfs = [c_df, w0_df, w1_df, w2_df]


In [None]:
# Sanity-check each prediction table: date range, size, id uniqueness, schema
for df in aini_dfs:
    article_ids = df["article_id"]
    report_lines = [
        "",
        f"First entry (date): {df['date'].min()} ",
        f"Last entry (date): {df['date'].max()} ",
        f"# entries: {len(df)} ",
        f"# non-unique article_id: {article_ids.duplicated().sum()} ",
        f"# unique article_id: {article_ids.nunique()} ",
        f"Min article_id: {article_ids.min()} ",
        f"Max article_id: {article_ids.max()} ",
        f"Columns: {list(df.columns)}",
        "",
    ]
    print("\n".join(report_lines))


In [None]:
# rows of the w0 table that share both article_id and title with another row
dup_mask = w0_df.duplicated(subset=["article_id", "title"], keep=False)
dups = w0_df[dup_mask].sort_values("article_id")
dups

In [None]:
# keep only the columns needed to compare predictions across variants
rel_col = ["date","article_id","sentiment_label","hype_score"]
clean_df = [df[rel_col].copy() for df in aini_dfs]

# unpack the subsetted dataframes containing the FinBERT AINI estimates
c_df_sub, w0_df_sub, w1_df_sub, w2_df_sub = clean_df
w2_df_sub

In [None]:
# Repeat the sanity check on the column-reduced tables
for df in clean_df:
    article_ids = df["article_id"]
    report_lines = [
        f"First entry (date): {df['date'].min()} ",
        f"Last entry (date): {df['date'].max()} ",
        f"# entries: {len(df)} ",
        f"# non-unique article_id: {article_ids.duplicated().sum()} ",
        f"Min article_id: {article_ids.min()} ",
        f"Max article_id: {article_ids.max()} ",
        f"Columns: {list(df.columns)}",
        "",
    ]
    print("\n".join(report_lines))


In [None]:
def summ(df, name):
    """Print row count, distinct article_id count and duplicated article_id count of df.

    `name` is right-aligned to three characters and used as the line prefix.
    """
    article_ids = df["article_id"]
    print(
        f"{name:>3} rows={len(df):,}  "
        f"unique_ids={article_ids.nunique():,}  "
        f"dup_ids={article_ids.duplicated().sum():,}"
    )

# per-variant row / id statistics ("c " keeps its trailing space as a label)
labelled_subsets = (
    ("w0", w0_df_sub),
    ("w1", w1_df_sub),
    ("w2", w2_df_sub),
    ("c ", c_df_sub),
)
for label, subset in labelled_subsets:
    summ(subset, label)

# article ids per variant, their union, and how many ids each variant lacks
ids = {label.strip(): set(subset["article_id"]) for label, subset in labelled_subsets}
union = set().union(*ids.values())
print("union unique ids:", len(union))
for variant, variant_ids in ids.items():
    print(f"missing in {variant}:", len(union - variant_ids))


Combine AINI predictions to investigate minima & maxima

In [None]:
# Prepare one row per article in every variant:
#   1. parse dates (unparseable values become NaT) and normalize to 00:00:00
#   2. sort by article_id and date
#   3. keep the last row of every article_id
w0, w1, w2, c = (
    sub.assign(date=pd.to_datetime(sub["date"], errors="coerce").dt.normalize())
       .sort_values(["article_id", "date"])
       .drop_duplicates("article_id", keep="last")
    for sub in (w0_df_sub, w1_df_sub, w2_df_sub, c_df_sub)
)

# Join w1, w2 and the custom variant onto w0, once as a left join and once as
# an outer join (control). Columns of w0 keep their names; the others get a suffix.
complete_left = w0
complete_outer = w0
for right, suffix in ((w1, "_w1"), (w2, "_w2"), (c, "_c")):
    complete_left = complete_left.merge(
        right, on="article_id", how="left", suffixes=("", suffix), validate="one_to_one"
    )
    complete_outer = complete_outer.merge(
        right, on="article_id", how="outer", suffixes=("", suffix), validate="one_to_one"
    )

# compare joins
print("[LEFT] n observations:", len(complete_left))
print("[OUTER] n observations:", len(complete_outer))

In [None]:
def _normalize_ids(id_series):
    """Cast ids to str, drop zero-width characters and BOM, trim whitespace."""
    return (
        id_series
        .astype(str)
        .str.replace(r"[\u200b\u200c\u200d\ufeff]", "", regex=True)
        .str.strip()
    )


# text columns taken from the w1 table
keep = ["article_id", "title", "sub_title","section", "cleaned_corpus", "date"]

# right side: subset, clean the ids, then keep the first row per id so the
# merge below can be validated as one-to-one
for_texts = (
    w1_df[keep]
    .copy()
    .assign(article_id=lambda texts: _normalize_ids(texts["article_id"]))
    .drop_duplicates(subset="article_id", keep="first")
)

# left side: apply the same id normalization
complete_left = complete_left.assign(
    article_id=_normalize_ids(complete_left["article_id"])
)

# left join; colliding text columns receive the "_t" suffix, and validate
# raises if either side still has duplicate keys
complete_df = complete_left.merge(
    for_texts,
    on="article_id",
    how="left",
    suffixes=("", "_t"),
    validate="one_to_one",
)

print("[COMPLETE] n observations:", len(complete_df))
complete_df

In [None]:
# store the extrema ordered by "n measures" (largest first) with the variables
extrema_ranked = extrema.sort_values("n measures", ascending=False)
extrema_ranked.to_csv(var_path / "extrema.csv")

In [None]:
# NOTE(review): 2025-07-15, 2025-09-14 and 2025-06-21 lie after the 2025-06-16
# cut-off used for the trading-date counts earlier in this notebook - confirm
# these dates against the extrema table (possible day/month mix-up).

# dates of the minima (n per date: 7, 2, 2)
min_1, min_2a, min_2b = map(
    pd.Timestamp, ("2025-02-05", "2025-07-15", "2025-02-04")
)

# dates of the maxima (n per date: 4, 3, 3, 3)
max_1, max_2a, max_2b, max_2c = map(
    pd.Timestamp, ("2023-04-01", "2025-02-13", "2025-09-14", "2025-06-21")
)


def _articles_on(day):
    """Return the rows of complete_df whose date equals `day`."""
    return complete_df[complete_df["date"] == day]


# articles published on the minima dates
articles_min1 = _articles_on(min_1)
articles_min2a = _articles_on(min_2a)
articles_min2b = _articles_on(min_2b)

# articles published on the maxima dates
articles_max1 = _articles_on(max_1)
articles_max2a = _articles_on(max_2a)
articles_max2b = _articles_on(max_2b)
articles_max2c = _articles_on(max_2c)

# investigate structure
articles_min1

Ensure article count does not bias AINI

In [None]:
# bring the dates of the merged frame to midnight-normalized datetimes
merged["date"] = pd.to_datetime(merged["date"]).dt.normalize()

# number of articles (non-null article_id) per normalized day
article_days = pd.to_datetime(complete_df["date"]).dt.normalize()
n_per_day = (
    complete_df[["article_id"]]
    .assign(date=article_days)
    .groupby("date")
    .agg(n_articles=("article_id", "count"))
)

# attach the daily article count to the merged frame
aini_article_count = merged.merge(n_per_day, on="date", how="left")

# correlation between the daily article count and every other column
article_counts = aini_article_count["n_articles"]
for col in aini_article_count.columns:
    if col in ("date", "n_articles"):
        continue
    corr = article_counts.corr(aini_article_count[col])
    print(f"{col}: {corr:.3f}")


Manually investigate minima & maxima

In [None]:
# pass the articles dated min_1 to the project's `read` helper
# (presumably shows the article texts for manual inspection - confirm in read_articles)
read(articles_min1)

In [None]:
# drop noisy estimates: exclude the day of min_1 (too low sample size + extrema).
# The rows are filtered with a boolean mask on the date column; indexing
# `merged` with the article frame itself is not a valid row selection.
merged_clean = merged[pd.to_datetime(merged["date"]).dt.normalize() != min_1]
merged_clean

In [None]:
# investigate min 1
# NOTE(review): `merged` comes from compute_aini_extrema, whereas `read` is
# imported from read_articles - confirm that passing rows of `merged` (rather
# than article rows such as articles_min1) is intended here.
read(merged[merged["date"] == min_1])

In [None]:
# investigate 01.04.2023; max 2 according to normalized_AINI_custom, simple_AINI_custom, EMA_02_custom, EMA_08_custom
# articles_max1 holds the rows of complete_df dated max_1 (2023-04-01)
articles_max1

In [None]:
# drop noisy estimates: remove the day of min_1 from the merged frame
merged_clean = merged.loc[merged["date"].ne(min_1)]

# make sure the article dates are datetimes (unparseable values become NaT)
complete_df["date"] = pd.to_datetime(complete_df["date"], errors="coerce")

# one article subset per calendar year
article_year = complete_df["date"].dt.year
df_2023, df_2024, df_2025 = (
    complete_df[article_year == year] for year in (2023, 2024, 2025)
)

# smallest remaining value of normalized_AINI_w2
merged_clean["normalized_AINI_w2"].min()

Investigate AINI by year

In [None]:
# plot the series of the cleaned frame (the min_1 day has been removed)
plot_aini_series_subplots(merged_clean)