Notebook used for initial analysis of the scraped WSJ articles.

In [None]:
import pandas as pd
import numpy as np
import os
import sqlite3
import re
import seaborn as sns   
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import sys
from pathlib import Path

# Resolve the project root (one level above the current working directory)
# and the two source folders this notebook adds to the import path.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
annotation_dir = os.path.join(parent_dir, "src", "annotation")
flag_dir = os.path.join(parent_dir, "src", "preprocessing")

# Make both folders importable, skipping any that are already on sys.path.
for src_dir in (annotation_dir, flag_dir):
    if src_dir not in sys.path:
        sys.path.append(src_dir)

# AI-mention flagging helper, applied to the article table further down.
from simple_ai_filter import flag_ai_mentions



In [None]:
# Location of the cleaned 2024 WSJ article database inside the project tree.
db_path = os.path.join(parent_dir, "data", "processed", "articles", "articlesWSJ_clean_2024.db")

# sqlite3.connect() silently creates an empty database when the file is
# missing, which would only surface later as a confusing "no such table" error.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

conn = sqlite3.connect(db_path)
try:
    # List the tables available in the database.
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    table_names = pd.read_sql_query(query, conn)
    print("Table names in the database:")
    print(table_names)

    # Load the full article table into memory.
    df = pd.read_sql_query("SELECT * FROM article", conn)
finally:
    # Release the database handle even if one of the queries above fails.
    conn.close()

In [None]:
# Quick overview of the loaded article table: size and column names.
print("Number of rows in the DataFrame:", df.shape[0])
print("Number of columns in the DataFrame:", df.shape[1])
print("Columns in the DataFrame:", df.columns.tolist())

Flag articles that contain any of the following terms: "AI", "A.I.", "artificial intelligence", "machine learning", "deep learning", "LLM", "GPT", "ChatGPT", "OpenAI", "transformer model", or "generative AI". Matching is case-insensitive and uses word boundaries.

In [None]:
# Flag AI-related articles with flag_ai_mentions (imported from simple_ai_filter).
# Downstream cells read the `mentions_ai` column of the returned frame.
# NOTE(review): whether `df` itself is modified in place depends on the helper - confirm.
df_labeled = flag_ai_mentions(df)

Analyze distribution by section and evolution over time

In [None]:
# Share of all articles that are flagged as AI-related
n_ai_articles = df_labeled["mentions_ai"].sum()
print("Fraction of AI-related articles:", n_ai_articles / len(df_labeled))

# Distinct section labels present in the data
sections = df_labeled["section"].unique()
print("Sections in the dataset:", sections)

# Number of articles per section
section_counts = df_labeled["section"].value_counts()
print("Counts of articles in each section:", section_counts)

# Number of articles per section, split by the AI flag (one column per flag value)
per_section_and_flag = df_labeled.groupby(["section", "mentions_ai"]).size()
section_ai_counts = per_section_and_flag.unstack(fill_value=0)
print("Counts of articles in each section by AI-relatedness:", section_ai_counts)


In [None]:
# Per-day article counts split by the AI flag; the flag values become the
# columns 0 and 1 of the resulting frame.
counts_by_day_and_flag = df_labeled.groupby(["date", "mentions_ai"]).size()
daily_counts_grouped = counts_by_day_and_flag.unstack(fill_value=0).reset_index()

# Daily share of AI-related articles: flagged count over that day's total.
n_ai = daily_counts_grouped[1]
n_not_ai = daily_counts_grouped[0]
daily_counts_grouped["frac_ai"] = n_ai / (n_not_ai + n_ai)

print(daily_counts_grouped.head())

In [None]:
# Visualize the number of articles per section, split by the AI flag.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))

# One bar per (section, flag value) combination.
sns.countplot(data=df_labeled, x="section", hue="mentions_ai", palette=["blue", "red"], ax=ax)

# Annotate every bar of positive height with its article count.
for bar in ax.patches:
    height = bar.get_height()
    if height > 0:
        ax.text(
            bar.get_x() + bar.get_width() / 2.,  # horizontally centred on the bar
            height + 1,                          # just above the top of the bar
            f'n={int(height)}',
            ha="center", va="bottom", fontsize=9
        )

# Final plot formatting
ax.set_title("AI-related Articles by Section in the WSJ (2024)")
ax.set_xlabel("Section")
ax.set_ylabel("Count")
ax.tick_params(axis="x", labelrotation=45)
ax.legend(title="AI-related", loc="upper right", labels=["No", "Yes"])
fig.tight_layout()

# Create the output folder first: savefig raises if the directory is missing,
# e.g. on a fresh checkout where reports/figures has not been created yet.
fig_dir = os.path.join(parent_dir, "reports", "figures")
os.makedirs(fig_dir, exist_ok=True)
fig_path = os.path.join(fig_dir, "WSJ2024_AIrel_articles_by_section.png")
fig.savefig(fig_path, dpi=300)
plt.show()


Next, the number of articles per day is analyzed

In [None]:
# Parse the date column so articles can be grouped by calendar day.
df_labeled['date'] = pd.to_datetime(df_labeled['date'])
daily_counts = df_labeled.groupby(df_labeled['date'].dt.date).size().reset_index(name='count')

# Show every day with fewer than 25 articles. The row limit is lifted only
# for this display, so the global pandas option is left untouched and later
# cells do not dump entire frames by accident.
with pd.option_context('display.max_rows', None):
    display(daily_counts[daily_counts['count'] < 25])

The investigation is then extended to all years (2023-2025).

In [None]:
# Display the project root currently stored in `parent_dir` (sanity check).
parent_dir

In [None]:
# Project root: the parent of the directory containing this file when
# `__file__` is defined, otherwise the parent of the current working directory.
if "__file__" in globals():
    parent_dir = Path(__file__).resolve().parent.parent
else:
    parent_dir = Path.cwd().parent

# One prediction CSV per year, all located in the processed-variables folder.
data_dir = parent_dir / "data" / "processed" / "variables"
years = (2023, 2024, 2025)
files = [data_dir / f"FinBERT_AINI_prediction_{year}_windsize_1.csv" for year in years]

# Read each yearly file and stack them into one frame with a fresh index.
yearly_frames = [pd.read_csv(csv_path) for csv_path in files]
df_total = pd.concat(yearly_frames, ignore_index=True)

print(df_total.shape)
df_total.head()

In [None]:
# Distinct section labels across all loaded years
print(set(df_total.section))

# Keep only articles dated 1 April 2023 or later. Comparing with
# ">= 2023-04-01" (rather than "> 2023-03-31") also drops rows from 31 March
# that carry a time-of-day component, which the strict comparison against
# midnight would otherwise let through.
df_total["date"] = pd.to_datetime(df_total["date"])
df_total = df_total[df_total["date"] >= pd.Timestamp("2023-04-01")].copy()

# Number of articles per calendar year
df_total.groupby(df_total["date"].dt.year)["date"].count()



In [None]:
# Total number of articles remaining after the date filter
print(len(df_total))