In [None]:
import pandas as pd
import numpy as np
import sqlite3
from IPython.display import display
import re

# import the annotator function
from annotator_function import annotate_articles_with_hype as af

# import the mentions ai function
from mentions_ai import flag_ai_mentions

# import cleaning function
from text_cleaner_WSJ import clean_article_text

Preparing the First Sample

The initial sample was drawn from the database articlesWSJ_clean_1.db. As the project progressed, an optimized version — articlesWSJ_clean_final.db — was created to capture articles scraped for days with n < 30 and subsequently apply cleaning (see clean_database_WSJ.ipynb & days_leftover_WSJ.ipynb). For full reproducibility, all database references used during sampling are explicitly included in the code.

In [None]:
# Load the complete `article` table of the first cleaned database into a DataFrame.
# The absolute path is kept as-is: it records which database the first sample was drawn from.
final_db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ_clean_1.db"
conn = sqlite3.connect(final_db_path)
try:
    df = pd.read_sql_query("SELECT * FROM article", conn)
finally:
    # close the connection even when the query raises
    conn.close()

# inspect columns
print(df.columns)

# inspect first 5 rows of the dataframe
print(df.head())

# count fully duplicated rows (a count reads correctly in the sentence; a boolean does not)
print(f"There are {df.duplicated().sum()} duplicates in the dataframe")

# verify uniqueness of article_id
print(f"There are {df['article_id'].duplicated().sum()} duplicates in the article_id column")

# check for null values in corpus
print(f"There are {df['corpus'].isnull().sum()} null values in the corpus column")

# check for empty strings in corpus
print(f"There are {(df['corpus'] == '').sum()} empty strings in the corpus column")

# check number of articles
print(f"There are {df['article_id'].nunique()} unique articles in the dataframe")

Constructing Initial Sample (n = 500)

An initial random sample of 500 articles was drawn for manual annotation. Exploratory analysis revealed that the share of AI-related content was too low for effective BERT fine-tuning. As outlined in the exposé, the sample size was later increased to 1,018 articles. From the third batch onward, a revised sampling strategy was applied (see below).

In [None]:
# draw a reproducible random sample of 500 articles (seed 42) to build the annotation data set
df_sample_large = df.sample(500, random_state=42)

# clean the corpus; the cleaner is passed directly, a wrapper lambda is not needed
df_sample_large['corpus'] = df_sample_large['corpus'].apply(clean_article_text)

In [None]:
# preview the first cleaned article texts of the sample
cleaned_preview = df_sample_large['corpus'].head()
print(cleaned_preview)

In [None]:
# persist the cleaned 500-article sample without the row index
sub500_csv = "articles_WSJ_sub500.csv"
df_sample_large.to_csv(sub500_csv, index=False)

AI-Related Article Filter 

Uses flag_ai_mentions() to detect AI keywords (AI, A.I., artificial intelligence, machine learning, deep learning, LLM, GPT, ChatGPT, OpenAI, transformer model, generative AI, neural network).  
Matching is case-insensitive with word boundaries to avoid false positives.  
Ensures each batch contains relevant AI content.  
See mentions_ai.py for implementation.

In [None]:
# run flag_ai_mentions on the sample; the cells below read its result from the mentions_ai column
df_sample_large = df_sample_large.pipe(flag_ai_mentions)

Sample with seed 42 for annotated examples

In [None]:
# split the flagged sample into AI-related and non-AI articles
ai_articles = df_sample_large[df_sample_large['mentions_ai'] == True]
non_ai_articles = df_sample_large[df_sample_large['mentions_ai'] == False]

# draw 4 AI-related articles and 1 non-AI article (reproducible with seed 42)
sample_ai = ai_articles.sample(n=4, random_state=42)
sample_non_ai = non_ai_articles.sample(n=1, random_state=42)

# stack both draws, shuffle the rows with the same seed, and renumber them
stacked_examples = pd.concat([sample_ai, sample_non_ai])
df_three_articles = stacked_examples.sample(frac=1, random_state=42).reset_index(drop=True)

# show the selected articles (last expression of the cell, so it is rendered)
df_three_articles[['article_id', 'title', 'corpus', 'mentions_ai']]


Investigate 4 flagged articles and 1 unflagged article to discuss with the annotator

In [None]:
# split the flagged sample into AI-related and non-AI articles
ai_flag = df_sample_large['mentions_ai']
ai_articles = df_sample_large[ai_flag == True]
non_ai_articles = df_sample_large[ai_flag == False]

# draw 4 AI-related articles and 1 non-AI article (reproducible, seed 41)
sample_ai_non_ann = ai_articles.sample(n=4, random_state=41)
sample_non_ai_non_ann = non_ai_articles.sample(n=1, random_state=41)

# stack both draws, renumber the rows, and remove the mentions_ai column
df_non_ann = (
    pd.concat([sample_ai_non_ann, sample_non_ai_non_ann])
    .reset_index(drop=True)
    .drop(columns=['mentions_ai'])
)

# list the remaining columns
print(df_non_ann.columns)

# lift the column-width limit so the complete article text is rendered
# (pd.set_option is global: it also affects every later display in this kernel)
pd.set_option('display.max_colwidth', None)
display(df_non_ann[['title', 'corpus']])

Constructing the sample for annotation, starting with 100 articles for the first batch. The AI flags are used to ensure that 50 of these articles mention AI.

In [None]:
# draw 50 AI-related and 50 non-AI articles (reproducible with seed 42)
sample_ai_100 = ai_articles.sample(n=50, random_state=42)
sample_non_ai_100 = non_ai_articles.sample(n=50, random_state=42)

# stack both halves, renumber the rows, and remove the mentions_ai column
df_non_ann_100 = (
    pd.concat([sample_ai_100, sample_non_ai_100])
    .reset_index(drop=True)
    .drop(columns=['mentions_ai'])
)

# report the size of each half, the total, and the resulting columns
print(f"Number of AI-related articles in the sample: {len(sample_ai_100)}")
print(f"Number of non-AI articles in the sample: {len(sample_non_ai_100)}")
print(f"Total number of articles in the sample: {len(df_non_ann_100)}")
print(f'columns: {df_non_ann_100.columns}')

In [None]:
# persist the first batch without the row index
batch_one_csv = "articles_WSJ_batch_one.csv"
df_non_ann_100.to_csv(batch_one_csv, index=False)

Annotate sample with 100 examples

In [None]:
# reload the first batch from disk
df_100_sample = pd.read_csv("articles_WSJ_batch_one.csv")

# spot check: show the row at position 1 only
spot_check_columns = ['article_id', 'title', 'corpus']
display(df_100_sample[spot_check_columns][1:2])

# report the number of rows that were loaded
n_loaded = df_100_sample.shape[0]
print(f'There are {n_loaded} articles in the dataframe')

In [None]:
# run the annotator function on the first batch and store what it returns
# (an assignment renders no output, so no trailing semicolon is needed)
first_Batch_articles_WSJ_author = af(df=df_100_sample)

In [None]:
# compare the size of the input batch with the size of the annotated result
print(f"Number of articles in the 100 sample: {len(df_100_sample)}")
print(f"Number of articles in the annotated 100 sample: {len(first_Batch_articles_WSJ_author)}")

# count the rows whose label_ai_related value is not null
print(f"Number of articles with AI-related annotation: {first_Batch_articles_WSJ_author['label_ai_related'].notnull().sum()}")

# write the annotated batch to a csv file
# (the annotated frame is first_Batch_articles_WSJ_author; no df_100_annotated is ever defined)
first_Batch_articles_WSJ_author.to_csv("articles_WSJ_batch_one_author.csv", index=False)

In [None]:
# reload the annotated first batch from disk
first_sample = pd.read_csv("articles_WSJ_batch_one_author.csv")

# report the row count and the columns of the loaded file
n_first_sample = len(first_sample)
print(f"Number of articles in the clean df: {n_first_sample}")
print(f"columns: {first_sample.columns}")

In the next part, the second batch is created: 100 sampled articles without AI mentions, plus the 18 remaining articles that mention AI according to the AI-related keywords and phrases (see below).

In [None]:
# reload the 500-article sub-sample and the annotated first batch;
# the first batch is needed to exclude its articles from the sub-sample
df_first_subsample = pd.read_csv("articles_WSJ_sub500.csv")
first_batch = pd.read_csv("articles_WSJ_batch_one_author.csv")

# report the sizes and columns of the loaded files
n_subsample = len(df_first_subsample)
n_first_batch = len(first_batch)
print(f"Number of articles in the clean df: {n_subsample}")
print(f"columns: {df_first_subsample.columns}")
print(f"Number of articles in the first batch: {n_first_batch}")

In [None]:
# cast article_id to int64 in both frames so the membership test below compares equal dtypes
for id_frame in (first_batch, df_first_subsample):
    id_frame['article_id'] = id_frame['article_id'].astype('int64')

# verify the conversion
print(first_batch['article_id'].dtype)
print(df_first_subsample['article_id'].dtype)

# keep only the sub-sample articles whose article_id does not occur in the first batch
in_first_batch = df_first_subsample['article_id'].isin(first_batch['article_id'])
first_sample_not_annotated = df_first_subsample[~in_first_batch]

# report the size of the filtered dataframe
n_remaining = len(first_sample_not_annotated)
print(f"Number of articles in the filtered dataframe: {n_remaining}")

# preview the first 5 rows of the filtered dataframe
print(first_sample_not_annotated.head())

In [None]:
# run flag_ai_mentions on the not-yet-annotated articles; the next cell reads the mentions_ai column
first_sample_not_annotated = first_sample_not_annotated.pipe(flag_ai_mentions)

In [None]:
# second batch = every flagged article + 100 randomly drawn unflagged articles (seed 42)
flagged_rows = first_sample_not_annotated[first_sample_not_annotated['mentions_ai'] == 1]
unflagged_rows = first_sample_not_annotated[first_sample_not_annotated['mentions_ai'] == 0]
second_sample_100 = pd.concat([flagged_rows, unflagged_rows.sample(100, random_state=42)])

# shuffle the rows and replace the old index with a fresh one
second_sample_100 = second_sample_100.sample(frac=1, random_state=42).reset_index(drop=True)

# verify the composition of the batch
n_ai_rows = len(second_sample_100[second_sample_100['mentions_ai'] == 1])
n_non_ai_rows = len(second_sample_100[second_sample_100['mentions_ai'] == 0])
print(f"Number of AI-related articles in the sample: {n_ai_rows}")
print(f"Number of non-AI articles in the sample: {n_non_ai_rows}")

# remove the mentions_ai column, then write the batch to a csv file
second_sample_100 = second_sample_100.drop(columns=['mentions_ai'])
second_sample_100.to_csv("articles_WSJ_batch_two.csv", index=False)


In [None]:
# reload the second batch from disk to verify the saved file
df_second_sample_100 = pd.read_csv("articles_WSJ_batch_two.csv")

# spot check: show the rows at positions 1 to 9
preview_columns = ['article_id', 'title', 'corpus']
display(df_second_sample_100[preview_columns][1:10])

# report the size of the second sample
n_second = len(df_second_sample_100)
print(f"Number of articles in the second sample: {n_second}")

# report the columns of the second sample
print(f"Columns in the second sample: {df_second_sample_100.columns}")

Next, the second batch is independently annotated by the author

In [None]:
# annotate the second batch using the annotator function.
# The batch reloaded from articles_WSJ_batch_two.csv is df_second_sample_100;
# the name df_second_batch is never defined in this notebook.
df_second_batch_annotated_author = af(df=df_second_sample_100)

In [None]:
# write the annotated second batch to a csv file.
# The file name follows the pattern of the batch one/three author files
# (the stray leading "C" of the previous name was removed).
# NOTE(review): update any other notebook that still reads the old file name.
df_second_batch_annotated_author.to_csv("articles_WSJ_batch_two_author.csv", index=False)

In [None]:
# verify the annotation process by comparing the loaded batch with the annotated result
# (uses the names defined in the cells above: df_second_sample_100 and df_second_batch_annotated_author)
print(f"Number of articles in the second sample: {len(df_second_sample_100)}")
print(f"Number of articles in the annotated second sample: {len(df_second_batch_annotated_author)}")

# sum of the label_ai_related column
print(f"Number of articles with AI-related annotation: {df_second_batch_annotated_author['label_ai_related'].sum()}")
print(f"Columns in the annotated second batch: {df_second_batch_annotated_author.columns}")

Sampling the Third Batch

As noted above, the third batch is drawn from the full corpus rather than the initial sub-sample. From this point onward, the optimized database (articlesWSJ_clean_final.db) is used as the data source. Given this, clean_article_text does not have to be applied. 

In [None]:
# load the complete `article` table from the cleaned final database
# (this rebinds df, which previously held the table of the first database)
path = "articlesWSJ_clean_final.db"
conn = sqlite3.connect(path)
try:
    df = pd.read_sql_query("SELECT * FROM article", conn)
finally:
    # close the connection even when the query raises
    conn.close()

In [None]:
# list the columns of the full dataset
print(f"Columns in the full dataset: {df.columns}")

# total number of missing values across all columns
total_na = df.isna().sum().sum()
print(f"Number of NA values in the full dataset: {total_na}")

# missing values in the article_id column
article_id_na = df['article_id'].isna().sum()
print(f"Number of NA values in the article_id column: {article_id_na}")


In [None]:
# reload batches 1 and 2; their articles are excluded from the full dataset in the next cell
batch_1 = pd.read_csv("articles_WSJ_batch_one.csv")
batch_2 = pd.read_csv("articles_WSJ_batch_two.csv")

# list the columns of each batch
print(f"Columns in the batch 1: {batch_1.columns}")
print(f"Columns in the batch 2: {batch_2.columns}")

# stack the two batches under a fresh index
batches_to_stack = [batch_1, batch_2]
batch_1_2 = pd.concat(batches_to_stack, ignore_index=True)

# report the combined size
print(f"Number of articles in batch 1 and 2: {len(batch_1_2)}")

In [None]:
# drop every article whose article_id occurs in batch 1 or batch 2
in_earlier_batches = df['article_id'].isin(batch_1_2['article_id'])
df_final = df[~in_earlier_batches]

# report the remaining size
print(f"Number of articles in the final dataset: {len(df_final)}")

# True when remaining rows plus batch rows add up to the full row count
sizes_add_up = len(df_final) + len(batch_1_2) == len(df)
print(sizes_add_up)

In [None]:
# run flag_ai_mentions on the remaining articles
flagged_df = df_final.pipe(flag_ai_mentions)

# report the total, the flagged, and the unflagged counts
n_flagged_total = len(flagged_df)
n_flagged_ai = flagged_df['mentions_ai'].sum()
print(f"Number of articles in the flagged dataset: {n_flagged_total}")
print(f"Number of AI-related articles in the flagged dataset: {n_flagged_ai}")
print(f"Number of non-AI articles in the flagged dataset: {n_flagged_total - n_flagged_ai}")

In [None]:
# split the flagged dataset into AI-related and non-AI articles
ai_articles = flagged_df[flagged_df['mentions_ai'] == True]
non_ai_articles = flagged_df[flagged_df['mentions_ai'] == False]

# randomly select 50 AI-related and 50 non-AI articles (reproducible with seed 42)
sample_ai = ai_articles.sample(50, random_state=42)
sample_non_ai = non_ai_articles.sample(50, random_state=42)

# combine into one DataFrame, shuffle the rows, and renumber them
sampled_batch_three = pd.concat([sample_ai, sample_non_ai]).sample(frac=1, random_state=42).reset_index(drop=True)

# display the result: a bare expression is only rendered when it is the LAST statement
# of a cell, so display() is required here; head() keeps the output short
display(sampled_batch_three[['article_id', 'title', 'corpus', 'mentions_ai']].head())

# print number of articles in batch three
print(f"Number of articles in the batch three: {len(sampled_batch_three)}")

# verify that 50 of the sampled articles are flagged as AI-related
print(f"Number of AI-related articles in the batch three: {len(sampled_batch_three[sampled_batch_three['mentions_ai'] == 1])}")

# drop the mentions_ai column
sampled_batch_three = sampled_batch_three.drop(columns=['mentions_ai'])

# write to csv file
sampled_batch_three.to_csv("articles_WSJ_batch_three.csv", index=False)

In [None]:
# list the columns of the in-memory frame that was written to articles_WSJ_batch_three.csv
batch_three_columns = sampled_batch_three.columns
print(batch_three_columns)

Next, the third batch is independently annotated by the author

In [None]:
# reload the third batch from disk
df_third_batch = pd.read_csv("articles_WSJ_batch_three.csv")

# report the row count and the columns of the loaded file
n_third_batch = len(df_third_batch)
print(f"Number of articles in the clean df: {n_third_batch}")
print(f"columns: {df_third_batch.columns}")

In [None]:
# run the annotator function on the third batch and store what it returns
# (an assignment renders no output, so no trailing semicolon is needed)
df_third_batch_annotated_author = af(df=df_third_batch)

In [None]:
# persist the annotated third batch without the row index
batch_three_author_csv = "articles_WSJ_batch_three_author.csv"
df_third_batch_annotated_author.to_csv(batch_three_author_csv, index=False)

In [None]:
# reload the annotated third batch from disk to verify the csv file
df_third_batch_annotated = pd.read_csv("articles_WSJ_batch_three_author.csv")

# report columns and size of the annotated batch
print(f"Columns in the annotated batch three: {df_third_batch_annotated.columns}")
print(f"Number of articles in the annotated batch three: {len(df_third_batch_annotated)}")

# sums of the label_ai_related and hype_level columns
ai_label_sum = df_third_batch_annotated['label_ai_related'].sum()
hype_sum = df_third_batch_annotated['hype_level'].sum()
print(f"Number of articles with AI-related annotation: {ai_label_sum}")
print(f"The total hype score is: {hype_sum}")

Next, the fourth (final) batch (n=700) is created

In [None]:
# preparation for excluding batches 1, 2 and 3 from the full dataset:
# reload the three batches from their csv files
batch_1 = pd.read_csv("articles_WSJ_batch_one.csv")
batch_2 = pd.read_csv("articles_WSJ_batch_two.csv")
batch_3 = pd.read_csv("articles_WSJ_batch_three.csv")

# column names of each batch as sets, so differences can be computed with set subtraction
cols1, cols2, cols3 = (set(batch.columns) for batch in (batch_1, batch_2, batch_3))

# batch 1 vs batch 2
print("In batch_1 but not in batch_2:", cols1 - cols2)
print("In batch_2 but not in batch_1:", cols2 - cols1)

# batch 1 vs batch 3
print("In batch_1 but not in batch_3:", cols1 - cols3)
print("In batch_3 but not in batch_1:", cols3 - cols1)

# batch 2 vs batch 3
print("In batch_2 but not in batch_3:", cols2 - cols3)
print("In batch_3 but not in batch_2:", cols3 - cols2)

In [None]:
# due to the optimization of the cleaning process (see above), some modifications
# have to be made to allow merging of the three batches

# drop columns so that the three batches end up with the same column set
# (each label is listed once; "date" was previously listed twice for batch 3)
batch_1 = batch_1.drop(columns=["date", "section"])
batch_3 = batch_3.drop(columns=["date", "section", "index_id", "scanned_time", "image_src"])

# rename the text column of batch 2 to the name used in the other batches
batch_2 = batch_2.rename(columns={"cleaned_corpus": "corpus"})

# verify that all three batches now share the same set of columns
print(f'columns are identical: {set(batch_1.columns) == set(batch_2.columns) == set(batch_3.columns)}')

In [None]:
# stack the three batches under a fresh index
batch_1_2_3 = pd.concat([batch_1, batch_2, batch_3], ignore_index=True)

# True when the stacked frame has exactly 318 rows
has_expected_size = len(batch_1_2_3) == 318
print('there are 318 articles in the concatenated df:', has_expected_size)

In [None]:
# load the complete `article` table from the cleaned final database
path = "articlesWSJ_clean_final.db"
conn = sqlite3.connect(path)
try:
    df = pd.read_sql_query("SELECT * FROM article", conn)
finally:
    # close the connection even when the query raises
    conn.close()

In [None]:
# report the columns and the row count of the full dataset
n_full = len(df)
print(f"Columns in the full dataset: {df.columns}")
print(f"Number of articles in the full dataset: {n_full}")

In [None]:
# cast the batch ids to int64 before they are used in the membership test
batch_1_2_3['article_id'] = batch_1_2_3['article_id'].astype('int64')

# keep only the articles whose article_id does not occur in batch 1, 2 or 3
already_sampled = df['article_id'].isin(batch_1_2_3['article_id'])
df_final = df[~already_sampled]

# report the remaining size
n_remaining_final = len(df_final)
print(f"Number of articles in the final dataset: {n_remaining_final}")

In [None]:
# run flag_ai_mentions on the remaining articles
flagged_df = df_final.pipe(flag_ai_mentions)

# report the total and the flagged counts
n_flagged_rows = len(flagged_df)
n_flagged_ai_rows = flagged_df['mentions_ai'].sum()
print(f"Number of articles in the flagged dataset: {n_flagged_rows}")
print(f"Number of AI-related articles in the flagged dataset: {n_flagged_ai_rows}")

In [None]:
# draw 350 AI-related and 350 non-AI articles (reproducible with seed 42)
ai_pool = flagged_df[flagged_df['mentions_ai'] == True]
non_ai_pool = flagged_df[flagged_df['mentions_ai'] == False]
sample_ai = ai_pool.sample(n=350, random_state=42)
sample_non_ai = non_ai_pool.sample(n=350, random_state=42)

# stack both halves, shuffle the rows with the same seed, and renumber them
stacked_batch_four = pd.concat([sample_ai, sample_non_ai])
sampled_batch_four = stacked_batch_four.sample(frac=1, random_state=42).reset_index(drop=True)

# report the batch size and the number of flagged articles in it
n_batch_four_ai = len(sampled_batch_four[sampled_batch_four['mentions_ai'] == 1])
print(f"Number of articles in the batch four: {len(sampled_batch_four)}")
print(f"Number of AI-related articles in the batch four: {n_batch_four_ai}")

# remove the mentions_ai column, then write the batch to a csv file
sampled_batch_four = sampled_batch_four.drop(columns=['mentions_ai'])
sampled_batch_four.to_csv("articles_WSJ_batch_four.csv", index=False)


In [None]:
# reload the fourth batch from disk to verify the saved file.
# The file name must match the one written above; the previous literal started with
# a backslash, which Python parses as the "\a" (BEL) escape and therefore as a different path.
df_fourth_batch = pd.read_csv("articles_WSJ_batch_four.csv")

# report the row count and the columns of the loaded file
print(f"Number of articles in the clean df: {len(df_fourth_batch)}")
print(f"columns: {df_fourth_batch.columns}")