Notebook to compare binary vs. three-class classification

In [1]:
# import libraries
import pandas as pd 
import numpy as np
from IPython.display import display, Markdown
import sys
from pathlib import Path

In [11]:
# Set root and target dir
project_root = Path.cwd().parent
data_dir = project_root / "data" 
articles_dir = data_dir / "interim" 

# Define columns to keep (in lowercase for consistency)
target_columns = [
    'article_id', 'title','hype_level'
]

# List of author-annotated CSV files
filenames_author = [
    "articles_WSJ_batch_one_author.csv",
    "articles_WSJ_batch_two_author.csv",
    "articles_WSJ_batch_three_author.csv",
    "articles_WSJ_batch_four_subsample_author.csv"
]

# Initialize empty DataFrame for author annotations
df_author = pd.DataFrame(columns=target_columns)

# Loop through each author file
for csv in filenames_author:
    path = articles_dir / csv

    # Read CSV normally
    df = pd.read_csv(path)

    # Convert all column names to lowercase
    df.columns = df.columns.str.lower()

    # Keep only relevant columns
    subset = df[target_columns].copy()

    # Append to cumulative author DataFrame
    df_author = pd.concat([df_author, subset], ignore_index=True)

# List of annotator-labeled CSV files
filenames_annotator = [
    "articles_WSJ_batch_one_annotator.csv",
    "articles_WSJ_batch_two_annotator.csv",
    "articles_WSJ_batch_three_annotator.csv",
    "articles_WSJ_batch_four_annotator.csv"
]

# Initialize empty DataFrame for annotator annotations
df_annotator = pd.DataFrame(columns=target_columns)

# Loop through each annotator file and handle encoding issues
for csv in filenames_annotator:
    path = articles_dir / csv

    try:
        # Attempt UTF-8 encoding
        df = pd.read_csv(path, encoding='utf-8')
    except UnicodeDecodeError:
        # Fallback to cp1252 encoding (common Windows encoding)
        df = pd.read_csv(path, encoding='cp1252')

    # Convert all column names to lowercase
    df.columns = df.columns.str.lower()

    # Keep only relevant columns
    subset = df[target_columns].copy()

    # Append to cumulative annotator DataFrame
    df_annotator = pd.concat([df_annotator, subset], ignore_index=True)
print(set(df_annotator.hype_level))

  df_author = pd.concat([df_author, subset], ignore_index=True)


{0, 1, 2, 3}


In [12]:
df_merged = df_author.merge(df_annotator, on="article_id", suffixes=("_auth", "_ann"))

# verify that 493 are in the overlapping df
print(318 + 1/4 * 700 == len(df_merged))

# verify values in hype  columns
print("Unique values in hype_level_ann:")
print(set(df_merged['hype_level_ann'].dropna()))

print("\nUnique values in hype_level_auth:")
print(set(df_merged['hype_level_auth'].dropna()))

True
Unique values in hype_level_ann:
{0, 1, 2, 3}

Unique values in hype_level_auth:
{0.0, 1.0, 2.0}


In [13]:
# ensure consistency
df_merged["hype_level_ann"] = df_merged["hype_level_ann"].fillna(0).replace(3,2).astype(int)
df_merged["hype_level_auth"] = df_merged["hype_level_auth"].fillna(0).astype(int)

  df_merged["hype_level_ann"] = df_merged["hype_level_ann"].fillna(0).replace(3,2).astype(int)


Compare three-class annotation

In [14]:
# find total divergence
n_dif_three = np.sum(df_merged.hype_level_auth != df_merged.hype_level_ann)
n_dif_three_rel = n_dif_three / len(df_merged)

# show results
print(f'There are {n_dif_three} aritcles with different hype-levels, which approximiates {round(n_dif_three_rel,2)*100}%')

There are 111 aritcles with different hype-levels, which approximiates 23.0%


Compare binary annotation

In [None]:
# Transform hype scores into binary format
df_merged["hype_level_ann"] = df_merged["hype_level_ann"].fillna(0).replace([3, 2], 1).astype(int)
df_merged["hype_level_auth"] = df_merged["hype_level_auth"].fillna(0).replace(2, 1).astype(int)

# Count divergences between annotator and author
n_dif_three_bin = np.sum(df_merged.hype_level_auth != df_merged.hype_level_ann)
n_dif_three_rel_bin = n_dif_three_bin / len(df_merged)

# Display result as percentage
print(f'There are {n_dif_three_bin} articles with different hype levels, which is approximately {round(n_dif_three_rel_bin * 100, 2)}%.')


There are 69 articles with different hype levels, which is approximately 14.0%.


Compare binary annatoation with results of dictionary-based Finbert

In [25]:
# load binary predictions for 2024 
fin_bin_pred = pd.read_csv(data_dir / "processed" / "variables" / "FinBERT_binary_prediction_2024.csv")

#verify load
print(fin_bin_pred.head())

   article_id  image_src         scanned_time  \
0       13068        NaN  2025-04-01 09:47:17   
1       13069        NaN  2025-04-01 09:47:27   
2       13070        NaN  2025-04-01 09:47:37   
3       13071        NaN  2025-04-01 09:47:49   
4       13072        NaN  2025-04-01 09:47:59   

                                               title  \
0  Baidu Terminates $3.6B Deal to Buy JOYY’s Chin...   
1                The Military’s Phantom ‘Extremists’   
2                  Double Dipping in Opioid Lawsuits   
3                     Xi Jinping Says Happy New Year   
4  Israel Reshuffles Forces, Prepares for Long-Te...   

                                           sub_title  \
0  As of the end of December, the closing conditi...   
1  An independent study puts to rest another fals...   
2  OptumRx seeks to disqualify Motley Rice for a ...   
3  China’s leader tries to influence Taiwan’s Jan...   
4  Resisting pressure from U.S. to wind down the ...   

                               

In [28]:
# subset for overlaps with annotated df
df_merged["article_id"] = df_merged["article_id"].astype(int)
fin_bin_pred["article_id"] = fin_bin_pred["article_id"].astype(int)
fin_sub = fin_bin_pred[fin_bin_pred["article_id"].isin(df_merged["article_id"])]

fin_sub

Unnamed: 0,article_id,image_src,scanned_time,title,sub_title,corpus,index_id,id,date,link,section,cleaned_corpus,ai_window,predicted_label,predicted_class
