Notebook to manually annotate WSJ articles.

In [None]:
import pandas as pd
import numpy as np
import sqlite3
from IPython.display import display
import re
import sys
import os
from pathlib import Path

# Determine the project root
current_path = Path().resolve()
project_root = current_path.parents[0]  

# Define paths to annotation and cleaning modules
annotation_path = project_root / "src" / "annotation"
flagging_path = project_root / "src" / "preprocessing"
cleaning_path = project_root / "src" / "cleaning"

# Add to sys.path if not already present
for path in [annotation_path, cleaning_path, flagging_path]:
    path_str = str(path)
    if path_str not in sys.path:
        sys.path.append(path_str)

#  Imports modules
from label_articles import annotate_articles_with_hype as af, find_training_examples
from simple_ai_filter import flag_ai_mentions

In [None]:
# load samples
batch_1 = pd.read_csv(project_root / "data" / "interim" / "articles_WSJ_batch_one.csv")
batch_2 = pd.read_csv(project_root / "data" / "interim" /  "articles_WSJ_batch_two.csv")
batch_3 = pd.read_csv(project_root / "data" / "interim" / "articles_WSJ_batch_three.csv")
batch_4 = pd.read_csv(project_root / "data" / "interim" / "articles_WSJ_batch_four.csv")
batch_list = [batch_1,batch_2,batch_3,batch_4]

# verify data
i = 1
for batch in batch_list:
    print(f"batch number {i} has {len(batch)} articles with {batch.corpus.isna().sum()} empty corpora")
    i += 1


Annotate articles using 0-1 scale for valence based index

In [None]:
# annotate first batch
first_Batch_articles_WSJ_author = af(df=batch_1)

In [None]:
# write the annotated sample to a csv file 
df_path = project_root / "data" / "interim" / "annotaeted_batches_valence" /  "articles_WSJ_batch_one_author.csv"
first_Batch_articles_WSJ_author.to_csv(df_path, index=False)  

In [None]:
# annotate second batch
df_second_batch_annotated_author = af(df=batch_2)

# save the second annotated sample to a csv file, ensure being in \notebooks
output_path = project_root / "data" / "interim" / "annotaeted_batches_valence" / "articles_WSJ_batch_two_author.csv"
df_second_batch_annotated_author.to_csv(output_path, index=False)

In [None]:
# annotate second batch
df_second_batch_annotated_author = af(df=batch_2)

# save the second annotated sample to a csv file, ensure being in \notebooks
output_path = project_root / "data" / "interim" / "annotaeted_batches_valence" /  "articles_WSJ_batch_two_author.csv"
df_second_batch_annotated_author.to_csv(output_path, index=False)

In [None]:
# annotate second batch
df_second_batch_annotated_author = af(df=batch_2)

# save the second annotated sample to a csv file, ensure being in \notebooks
output_path = project_root / "data" / "interim" / "annotaeted_batches_valence" /  "articles_WSJ_batch_two_author.csv"
df_second_batch_annotated_author.to_csv(output_path, index=False)

In [None]:
# annotate third batch
df_second_batch_annotated_author = af(df=batch_3)

# save the third annotated sample to a csv file, ensure being in \notebooks
output_path = project_root / "data" / "interim" / "annotaeted_batches_valence" / "articles_WSJ_batch_three_author.csv"
df_second_batch_annotated_author.to_csv(output_path, index=False)

In [None]:
# sample 25% at random from batch 4, reproducible with seed
df_fourth_batch_sample = batch_4.sample(frac=0.25, random_state=42).reset_index(drop=True)

# verify the sample size
print(f"Number of articles in the sample: {len(df_fourth_batch_sample)}")

In [None]:
# annotate the articles using the annotator function
df_fourth_batch_sample_annotated_author = af(df=df_fourth_batch_sample);

In [None]:
# write to csv file
df_fourth_batch_sample_annotated_author.to_csv(project_root / "data" / "annotaeted_batches_valence" /  "articles_WSJ_batch_four_subsample_author.csv", index=False)

Annotate articles for few-shot prompting & GPT-evaluation

In [2]:
# Define query
query = "SELECT * FROM article"

# Paths
db_path_23 = project_root / "data" / "processed" / "articles" / "articlesWSJ_clean_2023.db"
db_path_24 = project_root / "data" / "processed" / "articles" / "articlesWSJ_clean_2024.db"
db_path_25 = project_root / "data" / "processed" / "articles" / "articlesWSJ_clean_2025.db"

# Load data
with sqlite3.connect(db_path_23) as conn:
    data_23 = pd.read_sql(query, conn)

with sqlite3.connect(db_path_24) as conn:
    data_24 = pd.read_sql(query, conn)

with sqlite3.connect(db_path_25) as conn:
    data_25 = pd.read_sql(query, conn)


# concat
data = pd.concat([data_23,data_24,data_25])

# verify
print(data.columns)
print(len(data))


Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id', 'id', 'date', 'link', 'section',
       'cleaned_corpus'],
      dtype='object')
22904


In [None]:
# suffle with seed
subset = data.sample(frac=1, random_state=48).reset_index(drop=True)

# annotate first batch
few_shot_data = find_training_examples(df=data,context_window=4)

Extracting AI keyword context windows:   0%|          | 21/22904 [00:04<1:00:58,  6.25it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Extracting AI keyword context windows: 100%|██████████| 22904/22904 [1:10:00<00:00,  5.45it/s]



---
### Article 1/22904 — ID: `36`  
#### Title: Best Buy-Owned Phone Service Faces Angry Customers After 3G Network Shutdown
---

#### Text Snippet:

