Notebook used to create samples for manual annotation of the WSJ articles 

In [None]:
import pandas as pd
import numpy as np
import sqlite3
from IPython.display import display
import re
import sys
import os
from pathlib import Path

# Locate the project root: the parent of the current working directory
# (this assumes the notebook is executed from <project>/notebooks)
current_path = Path().resolve()
project_root = current_path.parents[0]

# Folders containing the project's custom helper modules
flagging_path = project_root / "src" / "preprocessing"
cleaning_path = project_root / "src" / "cleaning"

# Make the helper folders importable, skipping entries already on sys.path
for module_dir in map(str, (cleaning_path, flagging_path)):
    if module_dir in sys.path:
        continue
    sys.path.append(module_dir)

# Import project modules
from simple_ai_filter import flag_ai_mentions

Preparing the First Sample (n=500)

In [5]:
# Path to the cleaned 2024 article database
db_path = project_root / "data" / "processed" / "articles" / "articlesWSJ_clean_2024.db"
print(db_path)

# Read the whole `article` table into a DataFrame; the try/finally guarantees
# the connection is closed even if the query raises
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query("SELECT * FROM article", conn)
finally:
    conn.close()

# inspect columns
print(df.columns)

# inspect first 5 rows of the dataframe
print(df.head())

# count fully duplicated rows (.sum() yields the count; .any() would only
# print True/False inside the sentence)
print(f"There are {df.duplicated().sum()} duplicates in the dataframe")

# verify uniqueness of article_id
print(f"There are {df['article_id'].duplicated().sum()} duplicates in the article_id column")

# check for null values in corpus
print(f"There are {df['corpus'].isnull().sum()} null values in the corpus column")

# check for empty strings in corpus
print(f"There are {(df['corpus'] == '').sum()} empty strings in the corpus column")

# check number of articles
print(f"There are {df['article_id'].nunique()} unique articles in the dataframe")

C:\Users\PC\Desktop\Masterarbeit\AI_narrative_index\data\processed\articles\articlesWSJ_clean_2024.db
Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id', 'id', 'date', 'link', 'section',
       'cleaned_corpus'],
      dtype='object')
   article_id image_src         scanned_time  \
0       13068            2025-04-01 09:47:17   
1       13069            2025-04-01 09:47:27   
2       13070            2025-04-01 09:47:37   
3       13071            2025-04-01 09:47:49   
4       13072            2025-04-01 09:47:59   

                                               title  \
0  Baidu Terminates $3.6B Deal to Buy JOYY’s Chin...   
1                The Military’s Phantom ‘Extremists’   
2                  Double Dipping in Opioid Lawsuits   
3                     Xi Jinping Says Happy New Year   
4  Israel Reshuffles Forces, Prepares for Long-Te...   

                                           sub_title  \
0  As of the end of December, the 

Constructing Initial Subsample (n = 500)

In [None]:
# Draw a reproducible random subsample of 500 articles as the annotation pool
df_sample_large = df.sample(n=500, random_state=42)

In [None]:
# Persist the 500-article subsample as CSV (row index not written)
df_path = project_root / "data" / "interim" / "articles_WSJ_sub500.csv"
df_sample_large.to_csv(path_or_buf=df_path, index=False)

AI-Related Article Filter 

Uses flag_ai_mentions() to detect AI keywords (AI, A.I., artificial intelligence, machine learning, deep learning, LLM, GPT, ChatGPT, OpenAI, transformer model, generative AI, neural network).  
Matching is case-insensitive with word boundaries to avoid false positives.  
Ensures each batch contains relevant AI content.  

In [None]:
# Run the project's keyword filter over the 500-article subsample.
# Later cells read a `mentions_ai` column from the result and compare it
# against True/False; the helper's exact output dtype is not visible here.
df_sample_large = flag_ai_mentions(df_sample_large)

Sample with seed 42 for annotated examples

In [None]:
# Split the flagged subsample by the keyword flag
flag_values = df_sample_large['mentions_ai']
ai_articles = df_sample_large[flag_values == True]
non_ai_articles = df_sample_large[flag_values == False]

# Draw 4 flagged articles and 1 unflagged article (reproducible with seed 42)
sample_ai = ai_articles.sample(n=4, random_state=42)
sample_non_ai = non_ai_articles.sample(n=1, random_state=42)

# Stack the 5 examples, shuffle their order, and renumber the rows
# (the variable name is historical; the frame holds 5 articles)
combined_examples = pd.concat([sample_ai, sample_non_ai])
df_three_articles = combined_examples.sample(frac=1, random_state=42).reset_index(drop=True)

# Show the examples as the cell output
df_three_articles[['article_id', 'title', 'corpus', 'mentions_ai']]


Investigate 4 flagged articles and 1 unflagged article to discuss with the annotator

In [None]:
# Split the flagged subsample by the keyword flag
flag_values = df_sample_large['mentions_ai']
ai_articles = df_sample_large[flag_values == True]
non_ai_articles = df_sample_large[flag_values == False]

# Draw 4 flagged articles and 1 unflagged article (reproducible with seed 41)
sample_ai_non_ann = ai_articles.sample(n=4, random_state=41)
sample_non_ai_non_ann = non_ai_articles.sample(n=1, random_state=41)

# Stack the draws, renumber the rows, and remove the flag column
df_non_ann = (
    pd.concat([sample_ai_non_ann, sample_non_ai_non_ann])
    .reset_index(drop=True)
    .drop(columns=['mentions_ai'])
)

# show columns of the dataframe
print(df_non_ann.columns)

# Lift the column-width limit so the full article text is rendered
# (this changes the pandas display option for the rest of the session)
pd.set_option('display.max_colwidth', None)
display(df_non_ann[['title', 'corpus']])

Sampling the first batch (n=100)

In [None]:
# Batch one: 50 flagged + 50 unflagged articles, each drawn with seed 42.
# NOTE(review): seed 42 is also used for the small example draws taken from
# the same frames earlier in this notebook, so those examples may reappear in
# this batch — confirm this is intended.
sample_ai_100 = ai_articles.sample(n=50, random_state=42)
sample_non_ai_100 = non_ai_articles.sample(n=50, random_state=42)

# Stack both halves, renumber the rows, and remove the flag column
df_non_ann_100 = (
    pd.concat([sample_ai_100, sample_non_ai_100])
    .reset_index(drop=True)
    .drop(columns=['mentions_ai'])
)

# Report the sizes of both halves, the combined batch, and its columns
n_ai_100 = len(sample_ai_100)
n_non_ai_100 = len(sample_non_ai_100)
print(f"Number of AI-related articles in the sample: {n_ai_100}")
print(f"Number of non-AI articles in the sample: {n_non_ai_100}")
print(f"Total number of articles in the sample: {len(df_non_ann_100)}")
print(f"columns: {df_non_ann_100.columns}")

In [None]:
# Write batch one to the interim folder as CSV (row index not written)
df_path = project_root / "data" / "interim" / "articles_WSJ_batch_one.csv"
df_non_ann_100.to_csv(path_or_buf=df_path, index=False)

Sampling the second batch (n=118)

In [None]:
# Re-derive the project root from the working directory (expected: <project>/notebooks)
current_path = Path().resolve()
project_root = current_path.parents[0]

# Locations of the 500-article subsample and of batch one
df_path_sub500 = project_root / "data" / "interim" / "articles_WSJ_sub500.csv"
df_path_b1 = project_root / "data" / "interim" / "articles_WSJ_batch_one.csv"

# Load both files; batch one is needed to exclude its articles from the subsample
df_first_subsample = pd.read_csv(df_path_sub500)
first_batch = pd.read_csv(df_path_b1)

# Report what was loaded
n_subsample = len(df_first_subsample)
n_first_batch = len(first_batch)
print(f"Number of articles in the clean df: {n_subsample}")
print(f"columns: {df_first_subsample.columns}")
print(f"Number of articles in the first batch: {n_first_batch}")

In [None]:
# Give article_id the same integer dtype in both frames before matching ids
for frame in (first_batch, df_first_subsample):
    frame['article_id'] = frame['article_id'].astype('int64')

# Confirm the dtypes
print(first_batch['article_id'].dtype)
print(df_first_subsample['article_id'].dtype)

# Keep only subsample articles whose id does not occur in batch one
in_first_batch = df_first_subsample['article_id'].isin(first_batch['article_id'])
first_sample_not_annotated = df_first_subsample[~in_first_batch]

# Report the size of the remainder
print(f"Number of articles in the filtered dataframe: {len(first_sample_not_annotated)}")

# Show the first 5 remaining rows
print(first_sample_not_annotated.head())

In [None]:
# Run the project's keyword filter over the subsample articles not in batch one.
# The next cell reads a `mentions_ai` column from the result and compares it
# against 1/0; the helper's exact output dtype is not visible here.
first_sample_not_annotated = flag_ai_mentions(first_sample_not_annotated)

In [None]:
# Second batch: every flagged article plus 100 randomly drawn unflagged ones
flag_column = first_sample_not_annotated['mentions_ai']
flagged_rows = first_sample_not_annotated[flag_column == 1]
unflagged_rows = first_sample_not_annotated[flag_column == 0].sample(n=100, random_state=42)
second_sample = pd.concat([flagged_rows, unflagged_rows])

# Shuffle the rows and renumber them from zero
second_sample = second_sample.sample(frac=1, random_state=42).reset_index(drop=True)

# Report how many articles of each kind ended up in the batch
n_flagged = len(second_sample[second_sample['mentions_ai'] == 1])
n_unflagged = len(second_sample[second_sample['mentions_ai'] == 0])
print(f"Number of AI-related articles in the sample: {n_flagged}")
print(f"Number of non-AI articles in the sample: {n_unflagged}")

# Remove the flag column before exporting
second_sample = second_sample.drop(columns=['mentions_ai'])

# Write batch two to the interim folder as CSV (row index not written)
output_path = project_root / "data" / "interim" / "articles_WSJ_batch_two.csv"
second_sample.to_csv(path_or_buf=output_path, index=False)

Sampling the Third Batch (n=100)

In [None]:
# Get the current working directory (expected to be <project>/notebooks)
current_path = Path().resolve()

# Go up one level to get the project root
project_root = current_path.parents[0]

# Load the `article` table from the cleaned database in the interim folder;
# the try/finally guarantees the connection is closed even if the query raises
db_path = project_root / "data" / "interim" / "articlesWSJ_clean.db"
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query("SELECT * FROM article", conn)
finally:
    conn.close()

In [None]:
# List the columns of the full dataset
print(f"Columns in the full dataset: {df.columns}")

# Count missing values once per column, then report the overall total
# and the count for article_id
missing_per_column = df.isna().sum()
print(f"Number of NA values in the full dataset: {missing_per_column.sum()}")
print(f"Number of NA values in the article_id column: {missing_per_column['article_id']}")


In [None]:
# Re-derive the project root from the working directory (expected: <project>/notebooks)
current_path = Path().resolve()
project_root = current_path.parents[0]

# CSV locations of the first two batches inside the interim folder
interim_path = project_root / "data" / "interim"
batch_1_path = interim_path / "articles_WSJ_batch_one.csv"
batch_2_path = interim_path / "articles_WSJ_batch_two.csv"

# Read both batches
batch_1, batch_2 = (pd.read_csv(csv_path) for csv_path in (batch_1_path, batch_2_path))

# Show the columns of each batch
print(f"Columns in the batch 1: {batch_1.columns}")
print(f"Columns in the batch 2: {batch_2.columns}")

# Stack the two batches under a fresh 0..n-1 index
batch_1_2 = pd.concat([batch_1, batch_2], ignore_index=True)

# Report the combined size
print(f"Number of articles in batch 1 and 2: {len(batch_1_2)}")

In [None]:
# Keep only articles whose id does not occur in batch one or two
annotated_ids = batch_1_2['article_id']
df_final = df[~df['article_id'].isin(annotated_ids)]

# Sanity check: the remainder plus the two batches should add up to the full dataset
n_remaining = len(df_final)
print(f"Number of articles in the final dataset: {n_remaining}")
print(n_remaining + len(batch_1_2) == len(df))

In [None]:
# Run the keyword filter over the articles that are not yet in any batch
flagged_df = flag_ai_mentions(df_final)

# Summarise the outcome: total rows, flagged rows, and the remainder
n_total = len(flagged_df)
n_flagged = flagged_df['mentions_ai'].sum()
print(f"Number of articles in the flagged dataset: {n_total}")
print(f"Number of AI-related articles in the flagged dataset: {n_flagged}")
print(f"Number of non-AI articles in the flagged dataset: {n_total - n_flagged}")

In [None]:
# Split the flagged remainder into AI-related and non-AI articles
ai_articles = flagged_df[flagged_df['mentions_ai'] == True]
non_ai_articles = flagged_df[flagged_df['mentions_ai'] == False]

# Randomly select 50 AI-related articles and 50 non-AI articles (reproducible with seed)
sample_ai = ai_articles.sample(50, random_state=42)
sample_non_ai = non_ai_articles.sample(50, random_state=42)

# Combine into one DataFrame, shuffle the rows, and renumber them
sampled_batch_three = pd.concat([sample_ai, sample_non_ai]).sample(frac=1, random_state=42).reset_index(drop=True)

# Preview the batch. display() is required here: a bare expression that is not
# the last statement of a cell is evaluated and silently discarded.
display(sampled_batch_three[['article_id', 'title', 'corpus', 'mentions_ai']].head())

# print number of articles in the batch three
print(f"Number of articles in the batch three: {len(sampled_batch_three)}")

# verify existence of 50 AI-related articles and 50 non-AI articles
print(f"Number of AI-related articles in the batch three: {len(sampled_batch_three[sampled_batch_three['mentions_ai'] == 1])}")
print(f"Number of non-AI articles in the batch three: {len(sampled_batch_three[sampled_batch_three['mentions_ai'] == 0])}")

# drop the mentions_ai column
sampled_batch_three = sampled_batch_three.drop(columns=['mentions_ai'])

# write to csv file (row index not written)
sampled_batch_three.to_csv(interim_path / "articles_WSJ_batch_three.csv", index=False)

Sampling the fourth batch (n=700)

In [None]:
# Re-derive the project root from the working directory (expected: <project>/notebooks)
current_path = Path().resolve()
project_root = current_path.parents[0]

# Folder holding the batch CSV files
interim_path = project_root / "data" / "interim"

# Read batches 1, 2 and 3
batch_1 = pd.read_csv(interim_path / "articles_WSJ_batch_one.csv")
batch_2 = pd.read_csv(interim_path / "articles_WSJ_batch_two.csv")
batch_3 = pd.read_csv(interim_path / "articles_WSJ_batch_three.csv")

# Column names of each batch as sets, for pairwise comparison
cols1 = set(batch_1.columns)
cols2 = set(batch_2.columns)
cols3 = set(batch_3.columns)
columns_by_batch = {"batch_1": cols1, "batch_2": cols2, "batch_3": cols3}

# For every pair of batches, list the columns present in one but not the other
for left, right in (("batch_1", "batch_2"), ("batch_1", "batch_3"), ("batch_2", "batch_3")):
    print(f"In {left} but not in {right}:", columns_by_batch[left] - columns_by_batch[right])
    print(f"In {right} but not in {left}:", columns_by_batch[right] - columns_by_batch[left])

In [None]:
# Stack the three batches under a fresh 0..n-1 index
all_batches = [batch_1, batch_2, batch_3]
batch_1_2_3 = pd.concat(all_batches, ignore_index=True)

# Check the combined size against the expected 318 articles (prints True/False)
has_expected_size = len(batch_1_2_3) == 318
print('there are 318 articles in the concatenated df:', has_expected_size)

In [None]:
# Load the `article` table from the cleaned database in the interim folder;
# the try/finally guarantees the connection is closed even if the query raises
path = interim_path / "articlesWSJ_clean.db"
conn = sqlite3.connect(path)
try:
    df = pd.read_sql_query("SELECT * FROM article", conn)
finally:
    conn.close()

In [None]:
# Report the columns and the row count of the full dataset
n_articles = len(df)
print(
    f"Columns in the full dataset: {df.columns}",
    f"Number of articles in the full dataset: {n_articles}",
    sep="\n",
)

In [None]:
# Cast the batch ids to int64 before matching them against the full dataset
batch_1_2_3['article_id'] = batch_1_2_3['article_id'].astype('int64')

# Keep only articles whose id does not occur in batches 1, 2 or 3
already_sampled = df['article_id'].isin(batch_1_2_3['article_id'])
df_final = df[~already_sampled]

# Report the size of the remainder
print(f"Number of articles in the final dataset: {len(df_final)}")

In [None]:
# Run the keyword filter over the articles that are not yet in any batch
flagged_df = flag_ai_mentions(df_final)

# Summarise the outcome: total rows and flagged rows
n_total = len(flagged_df)
n_flagged = flagged_df['mentions_ai'].sum()
print(f"Number of articles in the flagged dataset: {n_total}")
print(f"Number of AI-related articles in the flagged dataset: {n_flagged}")

In [None]:
# Batch four: 350 flagged + 350 unflagged articles, each drawn with seed 42
flag_column = flagged_df['mentions_ai']
sample_ai = flagged_df[flag_column == True].sample(n=350, random_state=42)
sample_non_ai = flagged_df[flag_column == False].sample(n=350, random_state=42)

# Stack both halves, shuffle the rows, and renumber them
sampled_batch_four = (
    pd.concat([sample_ai, sample_non_ai])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report the batch size and how many of its articles are flagged
n_flagged_in_batch = len(sampled_batch_four[sampled_batch_four['mentions_ai'] == 1])
print(f"Number of articles in the batch four: {len(sampled_batch_four)}")
print(f"Number of AI-related articles in the batch four: {n_flagged_in_batch}")

# Remove the flag column before exporting
sampled_batch_four = sampled_batch_four.drop(columns=['mentions_ai'])

# Write batch four to the interim folder as CSV (row index not written)
batch_four_csv = interim_path / "articles_WSJ_batch_four.csv"
sampled_batch_four.to_csv(path_or_buf=batch_four_csv, index=False)


Annotating 25% of batch 4 (n=175)

In [None]:
# Reload batch four from the CSV written by the batch-four sampling cell.
# `df_fourth_batch` was never assigned anywhere in this notebook, so on a fresh
# kernel the sampling line below raised a NameError.
df_fourth_batch = pd.read_csv(interim_path / "articles_WSJ_batch_four.csv")

# sample 25% at random, reproducible with seed
df_fourth_batch_sample = df_fourth_batch.sample(frac=0.25, random_state=42).reset_index(drop=True)

# verify the sample size
print(f"Number of articles in the sample: {len(df_fourth_batch_sample)}")

In [None]:
# Annotate the 25% subsample of batch four with the annotator function `af`.
# NOTE(review): `af` is not imported or defined in any visible cell of this
# notebook, so this cell fails on a fresh kernel unless `af` is brought into
# scope first — confirm which module provides it and import it at the top.
# The trailing semicolon suppresses the cell's output.
df_fourth_batch_sample_annotated_author = af(df=df_fourth_batch_sample);

In [None]:
# Write the author-annotated subsample to the interim folder as CSV (row index not written)
annotated_csv = interim_path / "articles_WSJ_batch_four_subsample_author.csv"
df_fourth_batch_sample_annotated_author.to_csv(path_or_buf=annotated_csv, index=False)