Notebook to investigate the custom FinBERT's annotations.

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
import re

In [2]:
# Project root is the parent of the current working directory
# (so the notebook has to be started from inside \notebooks).
project_root = Path().resolve().parents[0]

# Folder holding the processed prediction tables
data = project_root.joinpath("data", "processed", "variables")

# Read each year's predictions and convert 'date' to datetime in a single chain
df_23 = pd.read_csv(data / "AIrel_prediction_2023.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
df_24 = pd.read_csv(data / "AIrel_prediction_2024.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# Display the 2024 frame as the cell output
df_24

Unnamed: 0,article_id,image_src,scanned_time,title,sub_title,corpus,index_id,id,date,link,section,cleaned_corpus,ai_window,predicted_label,predicted_class
0,13068,,2025-04-01 09:47:17,Baidu Terminates $3.6B Deal to Buy JOYY’s Chin...,"As of the end of December, the closing conditi...",Title: Baidu Terminates $3.6B Deal to Buy JOYY...,1,1,2024-01-01,https://www.wsj.com/business/telecom/baidu-ter...,business,"Jan. 1, 644 pm. ET 2 min. As of the end of. De...",,0,No narrative
1,13069,,2025-04-01 09:47:27,The Military’s Phantom ‘Extremists’,An independent study puts to rest another fals...,Title: The Military’s Phantom ‘Extremists’\n\n...,2,2,2024-01-01,https://www.wsj.com/opinion/military-extremist...,opinion,"REVIEW. OUTLOOK. Jan. 1, 545 pm. ET 834 3 min....",,0,No narrative
2,13070,,2025-04-01 09:47:37,Double Dipping in Opioid Lawsuits,OptumRx seeks to disqualify Motley Rice for a ...,Title: Double Dipping in Opioid Lawsuits\n\nAd...,3,3,2024-01-01,https://www.wsj.com/opinion/double-dipping-in-...,opinion,REVIEW. OUTLOOK. OptumRx seeks to disqualify. ...,,0,No narrative
3,13071,,2025-04-01 09:47:49,Xi Jinping Says Happy New Year,China’s leader tries to influence Taiwan’s Jan...,Title: Xi Jinping Says Happy New Year\n\nAdver...,4,4,2024-01-01,https://www.wsj.com/opinion/xi-jinping-says-ha...,opinion,REVIEW. OUTLOOK. China's leader tries to influ...,,0,No narrative
4,13072,,2025-04-01 09:47:59,"Israel Reshuffles Forces, Prepares for Long-Te...",Resisting pressure from U.S. to wind down the ...,"Title: Israel Reshuffles Forces, Prepares for ...",5,5,2024-01-01,https://www.wsj.com/world/middle-east/israel-r...,world,"Israel. Reshuffles. Forces,. Prepares for. Lon...",,0,No narrative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13420,37453,,2025-04-14 12:57:30,"Leonard Riggio, Who Built Barnes & Noble Into ...","A seminal figure in the industry, he transform...","Title: Leonard Riggio, Who Built Barnes & Nobl...",17531,17531,2024-08-27,https://www.wsj.com/business/entrepreneurship/...,business,"Leonard. Riggio,. Who. Built. Barnes. Noble. I...",,0,No narrative
13421,37457,,2025-04-14 12:58:23,Kamala Harris’s Tax Increases and Cuts Take Shape,"Taxes would go up for high-income households, ...",Title: Kamala Harris’s Tax Increases and Cuts ...,17535,17535,2024-08-27,https://www.wsj.com/politics/policy/kamala-har...,politics,Kamala. Harris's. Tax. Increases and. Cuts. Ta...,,0,No narrative
13422,37458,,2025-04-24 15:48:39,"With Arizona and Nevada Calls for Trump, Full ...",Republican president-elect wins 312 Electoral ...,Title: With Arizona and Nevada Calls for Trump...,27488,27488,2024-11-09,https://www.wsj.com/politics/elections/donald-...,politics,"With. Arizona and. Nevada. Calls for. Trump,. ...",,0,No narrative
13423,37459,,2025-04-24 15:48:50,Deals and Deterrence: Trump’s Foreign Policy i...,The president-elect is expected to showcase U....,Title: Deals and Deterrence: Trump’s Foreign P...,27489,27489,2024-11-09,https://www.wsj.com/politics/policy/trump-fore...,politics,Deals and. Deterrence. Trump's. Foreign. Polic...,,0,No narrative


In [None]:
# Create a daily period column ('year_day') from the article date
df_23["year_day"] = df_23["date"].dt.to_period("D")

# Count number of articles per day
daily_counts_23 = df_23.groupby("year_day")["article_id"].count().reset_index()

# Rename columns for clarity
daily_counts_23.columns = ["day", "article_count"]

# Convert period to string for display.
# This must read from daily_counts_23 itself: no plain `daily_counts` is defined
# in the cells above, so referencing it raises NameError on a fresh kernel.
daily_counts_23["day"] = daily_counts_23["day"].astype(str)

# convert article_count to int for plotting
daily_counts_23["article_count"] = daily_counts_23["article_count"].astype(int)

daily_counts_23


In [None]:
# Create a daily period column ('year_day') from the article date
df_24["year_day"] = df_24["date"].dt.to_period("D")

# Count number of articles per day
daily_counts_24 = df_24.groupby("year_day")["article_id"].count().reset_index()

# Rename columns for clarity
daily_counts_24.columns = ["day", "article_count"]

# Convert period to string for display.
# This must read from daily_counts_24 itself: no plain `daily_counts` is defined
# in the cells above, so referencing it raises NameError on a fresh kernel.
daily_counts_24["day"] = daily_counts_24["day"].astype(str)

# convert article_count to int for plotting
daily_counts_24["article_count"] = daily_counts_24["article_count"].astype(int)

daily_counts_24


In [None]:
# Trim 2023 because of the small sample size before April 2023:
# keep only rows dated strictly after 2023-03-30 (i.e. 2023-03-31 onward).
df_23_subset = df_23.loc[df_23["date"].gt(pd.Timestamp("2023-03-30"))]

# Stack the trimmed 2023 rows and all 2024 rows under a fresh 0..n-1 index
df_unlabeled = pd.concat([df_23_subset, df_24], ignore_index=True)

# helper function
def preprocess(texts):
    """Join an iterable of strings with single spaces and return it lowercased.

    Parameters
    ----------
    texts : iterable of str
        Text snippets to merge (every element must be a string).

    Returns
    -------
    str
        The space-joined, lowercased text; an empty iterable gives "".
    """
    return " ".join(texts).lower()

# Split by year: one lowercase text blob per year, built from the non-missing
# 'ai_window' snippets (.loc avoids chained indexing).
article_year = df_unlabeled["date"].dt.year
text_2023 = preprocess(df_unlabeled.loc[article_year == 2023, "ai_window"].dropna())
text_2024 = preprocess(df_unlabeled.loc[article_year == 2024, "ai_window"].dropna())

# create wordclouds
wc_2023 = WordCloud(width=800, height=400, background_color='white').generate(text_2023)
wc_2024 = WordCloud(width=800, height=400, background_color='white').generate(text_2024)

# plot both clouds side by side on explicit axes (no reliance on pyplot's implicit state)
fig, (ax_2023, ax_2024) = plt.subplots(1, 2, figsize=(16, 6))

ax_2023.imshow(wc_2023, interpolation='bilinear')
ax_2023.axis('off')
# no leading space, so both panel titles are formatted the same way
ax_2023.set_title("AI Narrative Snippets – 2023")

ax_2024.imshow(wc_2024, interpolation='bilinear')
ax_2024.axis('off')
ax_2024.set_title("AI Narrative Snippets – 2024")

fig.tight_layout()

# make sure the output folder exists so savefig does not fail on a fresh checkout
figures_dir = project_root / "reports" / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / "wordcloud_FinBERT_2.png")
plt.show()


In [None]:
# Investigate the mean of 'predicted_label' for: the trimmed 2023 subset,
# all of 2023, and 2024 (one value per line, in that order).
# NOTE(review): presumably the label is 0/1, making each mean a share — confirm.
print(
    df_23_subset["predicted_label"].mean(),
    df_23["predicted_label"].mean(),
    df_24["predicted_label"].mean(),
    sep="\n",
)

In [None]:
def plot_daily_counts(ax, counts, year, color):
    """Plot one year's daily article counts on `ax` with a dashed mean line.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    counts : pandas.DataFrame
        Frame with a datetime 'day' column and a numeric 'article_count' column.
    year : int or str
        Year used in the panel title and the legend label.
    color : str
        Colour of the daily-count line.

    Returns
    -------
    tuple
        (mean, variance, legend_handles) where mean/variance are computed on
        'article_count' and legend_handles is the list passed to `ax.legend`.
    """
    mean_count = counts["article_count"].mean()
    var_count = counts["article_count"].var()

    # Daily article counts
    ax.plot(counts["day"], counts["article_count"], color=color)

    # Mean as a horizontal dashed line spanning the observed date range
    # (excluded from the automatic legend; the custom handles below replace it)
    ax.hlines(
        y=mean_count,
        xmin=counts["day"].min(),
        xmax=counts["day"].max(),
        color="red",
        linestyle="--",
        label="_nolegend_"
    )

    # Custom legend showing mean and variance
    legend_handles = [
        Line2D([0], [0], color=color, label=f'Daily Counts {year}'),
        Line2D([0], [0], color='red', linestyle='--',
               label=f'Mean = {mean_count:.2f}, Var = {var_count:.2f}')
    ]
    ax.legend(handles=legend_handles)

    # Title and labels
    ax.set_title(f"Daily Article Count - {year}")
    ax.set_ylabel("Article Count")

    return mean_count, var_count, legend_handles


# Ensure 'day' columns are datetime (unparseable values become NaT)
daily_counts_23["day"] = pd.to_datetime(daily_counts_23["day"], errors="coerce")
daily_counts_24["day"] = pd.to_datetime(daily_counts_24["day"], errors="coerce")

# Create 2 stacked subplots, sharing y-axis
fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharey=True)

# One panel per year; the summary statistics and legend handles keep their
# per-year names so they remain available to later cells.
mean_count_23, var_count_23, legend_elements_23 = plot_daily_counts(
    axes[0], daily_counts_23, 2023, "blue"
)
mean_count_24, var_count_24, legend_elements_24 = plot_daily_counts(
    axes[1], daily_counts_24, 2024, "orange"
)

# Only the bottom panel carries the x-axis label
axes[1].set_xlabel("Date")

# === Final Layout ===
fig.tight_layout()

# make sure the output folder exists so savefig does not fail on a fresh checkout
figures_dir = project_root / "reports" / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / "article_count_23_24.png")
plt.show()
