In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Determine the project root: take the resolved working directory and step
# up one level (assumes the notebook is run from a folder directly below the
# project root - TODO confirm).
current_path = Path().resolve()
# `.parent` is the idiomatic spelling of `.parents[0]`; unlike the indexed
# form it does not raise IndexError when the working directory is a
# filesystem root.
project_root = current_path.parent


In [None]:
# Determine the project root (same computation as in the import cell).
current_path = Path().resolve()
project_root = current_path.parents[0]

# Define data paths
var_path = project_root / "data" / "processed" / "variables"
art_path = project_root / "data" / "processed" / "articles"

# Load the manually annotated subsample and the naive AI labels
man_data = pd.read_csv(art_path / "annotated_subsample_WSJ_final.csv")
dict_labels = pd.read_csv(art_path / "naive_AI_labels_2023-2025.csv")

# Subset the naive labels to 2024
dict_labels["date"] = pd.to_datetime(dict_labels["date"])
# .copy() detaches the filtered rows from the full frame, so columns added
# to `dict_labels` in later cells do not raise SettingWithCopyWarning.
dict_labels = dict_labels[dict_labels["date"].dt.year == 2024].copy()
dict_labels

In [None]:
# Transform hype level: collapse the manual levels 2 and 3 into 1 and store
# the column as int.
man_data["hype_level"] = man_data["hype_level"].replace([2, 3], 1).astype(int)

# Ensure an integer dtype in the label column. `dict_labels` is a filtered
# slice of the loaded frame, so a direct column assignment would raise
# SettingWithCopyWarning; assign() returns a new frame instead.
dict_labels = dict_labels.assign(hype_level=dict_labels["about_ai"].astype(int))

# Keep only the naive labels whose article_id also occurs in the manual sample
dict_labels_sub = dict_labels[dict_labels["article_id"].isin(man_data["article_id"])]
print(len(dict_labels_sub))

# Sum of the manual hype_level column and its ratio to the number of rows
n_flagged = man_data["hype_level"].sum()
print(
    f"total: {n_flagged} | "
    f"relative: {n_flagged / man_data['hype_level'].shape[0]:.3f}"
)


In [None]:
# Pair each manually annotated article with its naive label (inner join on
# article_id); the two hype_level columns get the suffixes _man / _dict.
# NOTE(review): assumes article_id is unique in both inputs - repeated ids
# would multiply rows in the merge; confirm.
merged = man_data[["article_id", "hype_level"]].merge(
    dict_labels_sub[["article_id", "hype_level"]],
    on="article_id",
    suffixes=("_man", "_dict"),
)

# Row-wise agreement between the manual and the naive label
agree_mask = merged["hype_level_man"] == merged["hype_level_dict"]
agree_total = agree_mask.sum()
# Share of agreeing rows among the rows that were actually compared. The
# denominator is taken from the data instead of a hard-coded 1018, so the
# rate stays correct if the sample size or the number of matches changes.
agree_frac = agree_mask.mean()
# A bare expression in the middle of a cell is never displayed, so the count
# is printed explicitly.
print(f"agreeing rows: {agree_total} of {len(merged)}")
agree_frac

In [None]:
# Read the three yearly FinBERT binary prediction files from var_path
finbert_files = (
    "FinBERT_binary_prediction_2023.csv",
    "FinBERT_binary_prediction_2024.csv",
    "FinBERT_binary_prediction_2025.csv",
)
fin_data_23, fin_data_24, fin_data_25 = (
    pd.read_csv(var_path / file_name) for file_name in finbert_files
)

# Stack the yearly frames in chronological order; each keeps its own row index
yearly_fin_frames = [fin_data_23, fin_data_24, fin_data_25]
fin_data = pd.concat(yearly_fin_frames)

In [None]:
# Load all three yearly prediction outputs
df23 = pd.read_csv(var_path / "FinBERT_binary_prediction_2023.csv")
df24 = pd.read_csv(var_path / "FinBERT_binary_prediction_2024.csv")
df25 = pd.read_csv(var_path / "FinBERT_binary_prediction_2025.csv")

# Cast article_id to string and prefix it with the year of its file
df23["article_id"] = "2023" + df23["article_id"].astype(str)
df24["article_id"] = "2024" + df24["article_id"].astype(str)
df25["article_id"] = "2025" + df25["article_id"].astype(str)

# Combine the years (tagging every row with its year) and collect all rows
# whose prefixed article_id occurs more than once
df_all = pd.concat([df23.assign(year=2023), df24.assign(year=2024), df25.assign(year=2025)])
dupes = df_all[df_all.duplicated("article_id", keep=False)]

# The year prefix makes ids from different years distinct by construction, so
# anything reported here is a repeat within a single year; the message says
# so instead of claiming "across years".
print(f"Duplicate article_ids after year-prefixing (within a year): {dupes['article_id'].nunique()}")
print(dupes[["article_id", "year", "title"]].sort_values("article_id").head(10))



In [None]:
# Row count and id uniqueness of the combined FinBERT frame
df_all_ids = df_all["article_id"]
n_rows = len(df_all)
n_unique_ids = df_all_ids.nunique()
n_repeated_ids = df_all_ids.duplicated().sum()
print("Basic Overview of df_all")
print(f"Total rows: {n_rows}")
print(f"Unique article_ids: {n_unique_ids}")
print(f"Duplicate article_ids: {n_repeated_ids}")

# Smallest and largest value of the date column, compared as stored
# (NOTE(review): the column is not parsed to datetime here, so this is only a
# chronological range if the stored format sorts chronologically - confirm)
df_all_dates = df_all["date"]
print("Date range:")
print(f"  Min date: {df_all_dates.min()}")
print(f"  Max date: {df_all_dates.max()}")

# Mean of predicted_label over all rows
overall_mean = df_all["predicted_label"].mean()
print("Mean predicted_label (overall):", round(overall_mean, 3))

# Mean of predicted_label within each year
yearly_mean = df_all.groupby("year")["predicted_label"].mean()
print("Mean predicted_label by year:")
print(yearly_mean.round(3))

# Number of rows per year, ordered by year
yearly_count = df_all["year"].value_counts()
print("Article count by year:")
print(yearly_count.sort_index())



In [None]:
# FinBERT inference of AI-relatedness: sum of predicted_label and its ratio
# to the number of rows in df_all.
n_predicted_ai = df_all["predicted_label"].sum()
# The separator now has a space on both sides (the original printed
# "...|Relative") and the share is rounded to three decimals, matching the
# equivalent "total | relative" print for the manual labels.
print(
    f"Total: {n_predicted_ai} | "
    f"Relative: {n_predicted_ai / df_all.shape[0]:.3f}"
)

In [None]:
# Load the naively labeled data, parse its date column as datetime, and
# prefix each article_id with the year of its date (both cast to string).
naive_df = (
    pd.read_csv(art_path / "naive_AI_labels_2023-2025.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .assign(
        article_id=lambda frame: (
            frame["date"].dt.year.astype(str) + frame["article_id"].astype(str)
        )
    )
)

In [None]:
# Inner-join the FinBERT predictions with the naive labels on the
# year-prefixed article_id, so only ids present in both frames are compared.
# NOTE(review): repeated article_ids in either frame would multiply rows in
# this merge - confirm the ids are unique before relying on the rate.
merged_fin_naive = df_all[["article_id", "predicted_label"]].merge(
    naive_df[["article_id", "about_ai"]],
    on="article_id",
    how="inner",
)

# Share of matched rows where both labels are equal after casting to int
agreement = (
    merged_fin_naive["predicted_label"].astype(int)
    == merged_fin_naive["about_ai"].astype(int)
).mean()
print("Agreement rate on matched article_ids:", round(agreement, 3))
# A bare `len(...)` in the middle of a cell is evaluated and discarded, so
# the number of matched rows is printed explicitly.
print("Matched rows:", len(merged_fin_naive))
naive_df