In [18]:
import pandas as pd
import numpy as np
import sqlite3
from IPython.display import display
import re

# import the annotator function
from annotator_function import annotate_articles_with_hype as af

# import the mentions ai function
from mentions_ai import flag_ai_mentions

# import cleaning function
from text_cleaner_WSJ import clean_article_text

Preparing the First Sample

The initial sample was drawn from the database articlesWSJ_clean_1.db. As the project progressed, an optimized version — articlesWSJ_clean_final.db — was created to capture articles scraped for days with n < 30 and subsequently apply cleaning (see clean_database_WSJ.ipynb & days_leftover_WSJ.ipynb). For full reproducibility, all database references used during sampling are explicitly included in the code.

In [89]:
# read sql file in dataframe
final_db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ_clean_1.db"
conn = sqlite3.connect(final_db_path) 
df = pd.read_sql_query("SELECT * FROM article", conn)

# close the connection
conn.close() 

#inspect colums 
print(df.columns)

# insect first 5 rows of the dataframe
print(df.head()) 

# check for duplicates 
print(f"There are {df.duplicated().any()} duplicates in the dataframe")

# check for duplicates in article_id
print(f"There are {df['article_id'].duplicated().sum()} duplicates in the article_id column")

 # check for null values in corpus
print(f"There are {df['corpus'].isnull().sum()} null values in the corpus column") 

# check for empty strings in corpus
print(f"There are {(df['corpus'] == '').sum()} empty strings in the corpus column")

# check number of articles
print(f"There are {df['article_id'].nunique()} unique articles in the dataframe")

# veryfy uniquenes of article_id
print(f"There are {df['article_id'].duplicated().sum()} duplicates in the article_id column")

Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id'],
      dtype='object')
   article_id image_src         scanned_time  \
0       13068            2025-04-01 09:47:17   
1       13069            2025-04-01 09:47:27   
2       13070            2025-04-01 09:47:37   
3       13071            2025-04-01 09:47:49   
4       13072            2025-04-01 09:47:59   

                                                                     title  \
0  Baidu Terminates $3.6B Deal to Buy JOYY’s China Live-Streaming Business   
1                                      The Military’s Phantom ‘Extremists’   
2                                        Double Dipping in Opioid Lawsuits   
3                                           Xi Jinping Says Happy New Year   
4        Israel Reshuffles Forces, Prepares for Long-Term Conflict in Gaza   

                                                                                                                      

Constructing Initial Sample (n = 500)

An initial set of 500 articles was randomly sampled from the full corpus for manual annotation. Exploratory analysis, showed that the proportion of AI-related content was too low for effective BERT fine-tuning. As discussed in the exposé, the annotated sample size was later increased to n = 1000. Starting with the third batch, a revised sampling strategy was implemented (see below).

In [None]:
# sample 510 random article from the dataframe to build annotation data set
df_sample_large = df.sample(500, random_state=42)

# clean the corpus 
df_sample_large['corpus'] = df_sample_large['corpus'].apply(lambda x: clean_article_text(x))

In [100]:
# inspect the cleaned corpus
print(df_sample_large['corpus'].head())

9843                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [None]:
# save the cleaned sample to a csv file
df_sample_large.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_sub500.csv", index=False)

Filtering for AI-Relevant Articles Using a Custom Function

To identify AI-related content within the sample, a custom filtering function (flag_ai_mentions) was employed. The filter relies on a predefined set of keywords: AI, A.I., artificial intelligence, machine learning, deep learning, LLM, GPT, ChatGPT, OpenAI, transformer model, generative AI, and neural network. These terms were selected based on an initial manual review of Wall Street Journal articles.

The matching is case-insensitive and uses word boundaries to avoid partial matches (e.g., "ai" would match "AI", but "aimless" would not match "ai"). The purpose of this filtering step is to ensure that each sampled batch contains a meaningful proportion of AI-related content. The corresponding function is implemented in the script mentions_ai.py.

In [102]:
# flag articles with AI mentions by setting mentioned_ai to 1 if the article contains any of the AI-related keywords, 0 otherwise
df_sample_large = flag_ai_mentions(df_sample_large)

68 out of 500 articles mention AI-related topics.


Sample with seed 42 for annotated examples

In [None]:
# Filter  annotated articles
ai_articles = df_sample_large[df_sample_large['mentions_ai'] == True]
non_ai_articles = df_sample_large[df_sample_large['mentions_ai'] == False]

# Randomly select 2 AI-related articles and 1 non-AI article (reproducible with seed)
sample_ai = ai_articles.sample(4, random_state=42)
sample_non_ai = non_ai_articles.sample(1, random_state=42)

# Combine into one DataFrame
df_three_articles = pd.concat([sample_ai, sample_non_ai]).sample(frac=1, random_state=42).reset_index(drop=True)

#  Display the result
df_three_articles[['article_id', 'title', 'corpus', 'mentions_ai']]


Investigate 5 flagged and an unflagged article to discuss with the annotator

In [None]:
# Filter  annotated articles
ai_articles = df_sample_large[df_sample_large['mentions_ai'] == True]
non_ai_articles = df_sample_large[df_sample_large['mentions_ai'] == False]

# Randomly select 3 AI-related articles and 2 non-AI article (reproducible with seed)
sample_ai_non_ann = ai_articles.sample(4, random_state=41)
sample_non_ai_non_ann = non_ai_articles.sample(1, random_state=41)

# Combine into one DataFrame
df_non_ann = pd.concat([sample_ai_non_ann, sample_non_ai_non_ann]).reset_index(drop=True).drop(columns=['mentions_ai'])

# show columns of the dataframe
print(df_non_ann.columns)

# Display full text for each article in the corpus
pd.set_option('display.max_colwidth', None)
display(df_non_ann[['title', 'corpus']])  

Constructing the sample for annotation, starting with 100 articles for the first batch. Use AI flags to ensure that at least 50 article mention AI in some way (see above)

In [105]:
# Randomly select 50 AI-related articles and 50 non-AI article (reproducible with seed)
sample_ai_100 = ai_articles.sample(50, random_state=42)
sample_non_ai_100 = non_ai_articles.sample(50, random_state=42)
df_non_ann_100 = pd.concat([sample_ai_100, sample_non_ai_100]).reset_index(drop=True).drop(columns=['mentions_ai'])

#verify the sample size
print(f"Number of AI-related articles in the sample: {len(sample_ai_100)}")
print(f"Number of non-AI articles in the sample: {len(sample_non_ai_100)}")
print(f"Total number of articles in the sample: {len(df_non_ann_100)}")
print(f'columns: {df_non_ann_100.columns}')

Number of AI-related articles in the sample: 50
Number of non-AI articles in the sample: 50
Total number of articles in the sample: 100
columns: Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id'],
      dtype='object')


In [108]:
# save to csv file  
df_non_ann_100.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one.csv", index=False)

Annotate sample with 100 examples

In [109]:
# read the csv file
df_100_sample = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one.csv")

# Display the result
display(df_100_sample[['article_id', 'title', 'corpus']][1:2])

# verify the sample size
print(f'There are {df_100_sample.shape[0]} articles in the dataframe')

Unnamed: 0,article_id,title,corpus
1,31336,AI Is Helping Scammers Outsmart You—and Your Bank,"ILLUSTRATION. MARK. HARRIS,. ISTOCK 7. June 22, 530 am. ET 8 minutes. Artificial intelligence is making scammers tougher to spot. Gone are the poorly worded messages that easily tipped off authorities as well as the grammar police. The bad guys are now better writers and more convincing conversationalists, who can hold a conversation without revealing they are a bot, say the bank and tech investigators who spend their days tracking the latest schemes. ChatGPT and other. AI tools can even enable scammers to create an imitation of your voice and identity. In recent years, criminals have used. AI-based software to impersonate senior executives and demand wire transfers. ""Your spidey senses are no longer going to prevent you from being victimized,"" said. Matt. O'Neill, a former. Secret. Service agent and co-founder of cybersecurity firm 5OH. Consulting. In these recent cases, the frauds are often similar to old scams. But. AI has enabled scammers to target much larger groups and use more personal information to convince you the scam is real. Fraud-prevention officials say these tactics are often harder to spot because they bypass traditional indicators of scams, such as malicious links and poor wording and grammar. Criminals today are faking driver's licenses and other identification in an attempt to open new bank accounts and adding computer-generated faces and graphics to pass identity-verification processes. All of these methods are hard to stave off, say the officials. JPMorgan. Chase has begun using large-language models to validate payments, which helps fight fraud. Carisma. Ramsey. Fields, vice president of external communications at. JPMorgan. Chase, said the bank has also stepped up its efforts to educate customers about scams. And while banks stop some fraud, the last line of defense will always be you. These security officials say to never financial or personal information unless you're certain about who's on the receiving end. If you do pay, use a credit card because it offers the most protection. ""Somebody who tells you to pay by crypto, cash, gold, wire transfer or a payment app is likely a scam,"" said. Lois. Greisman, an associate director of the. Federal. Trade. Commission. With. AI as an accomplice, fraudsters are reaping more money from victims of all ages. People reported losing a record 10 billion to scams in , up from 9 billion a year prior, according to the. FTC. Since the. FTC estimates only 5 of fraud victims report their losses, the actual number could be closer to 200 billion. Joey. Rosati, who owns a small cryptocurrency firm, never thought he could fall for a scam until a man he believed to be a police officer called him in. May. Finance entrepreneur. Joey. Rosati, who was almost scammed, was surprised how convincing and knowledgeable fraudsters can be. The man told. Rosati he had missed jury duty. The man seemed to know all about him, including his. Social. Security number and that he had just moved to a new house. Rosati followed the officer's instruction to come down to the station in. Hillsborough. County,. Fla. which didn't seem like something a scammer would suggest. On the drive over,. Rosati was asked to wire 4,500 to take care of the fine before he arrived. It was then that. Rosati realized it was a scam and hung up. ""I'm not uneducated, young, immature. I have my head on my shoulders,"". Rosati said. ""But they were perfect."". Social-engineering attacks like the jury-duty scam have grown more sophisticated with. AI. Scammers use. AI tools to unearth details about targets from social media and data breaches, cybersecurity experts say. AI can help them adapt their schemes in real time messages that convincingly mimic trusted individuals, persuading targets to send money or divulge sensitive information. A job scammer played on the emotions of. David. Wenyu, who had been unemployed for six months. David. Wenyu's. LinkedIn profile displayed an ""open to work"" banner when he received an email in. May offering a job opportunity. It appeared to be from. SmartLight. Analytics, a legitimate company, and came six months after he had lost his job. He accepted the offer, even though he noticed the email address was slightly different from those on the company's website. The company issued him a check to purchase work-from-home equipment from a specific website. When they told him to buy the supplies before the money showed up in his account, he knew it was a scam. ""I was just emotionally too desperate, so. I ignored those red flags,"". Wenyu said. In an. April survey of 600 fraud-management officials at banks and financial institutions company. Biocatch, 70 said the criminals were more skilled at using. AI for financial crime than banks are at using it for prevention. Kimberly. Sutherland, vice president of fraud and identity strategy at. LexisNexis. Risk. Solutions, said there has been a noticeable rise in fraud attempts that appear to be. AI related in . Password risks, amplified. Criminals used to have to guess or steal passwords through phishing attacks or data breaches, often targeting high-value accounts one by one. Now, scammers can quickly cross-reference and test reused passwords across platforms. They can use. AI systems to write code that would automate various aspects of their ploys,. O'Neill said. If scammers obtain your email and a commonly used password from a tech company data breach,. AI tools can swiftly check if the same credentials unlock your bank, social media or shopping accounts. Financial institutions are taking new stepsand tapping. AI themselvesto shield your money and data. Banks monitor how you enter credentials, whether you tend to use your left or right hand when swiping on the app, and your device's. IP address to build a profile on you. If a login attempt doesn't match your typical behavior, it is flagged, and you may be prompted to provide more information before proceeding. They can tell when you're being coerced into filling out information, because of shifts in your typing cadence. If digits are copied and pasted, if the voice verification is too perfect, or if is too evenly spaced and grammatically correct, that is a red flag, said. Jim. Taylor, chief product officer at. RSA. Security, a firm with fraud-detection tech used ,. Citibank and others. Consumers paid scammers 1.4 billion in cryptocurrency in , up more than 250 from , according to. FTC data. What steps do you take to protect yourself and your money? Join the conversation below. As a result, security officials suggest that you turn on two-factor authentication, so you get a or email whenever someone tries logging into one of your accounts. If anything feels off during a potential money exchange, take a beat. Pressing pause on a potentially fraudulent situation is also important psychologically. Many scammers try to create a false urgency or confuse victims to manipulate them. If all the information about a transaction or account is coming from one person, that is a red flag. Get a second opinion from a trusted contact. ""If it's going to hurt if you lose it, validate it,"". O'Neill, the former. Secret. Service agent said. JPMorgan. Chase has begun using large-language models to validate payments, which helps fight fraud. An earlier version of this article incorrectly said the bank was using the models to fight identity fraud. Corrected on. June 27. To. Fix. Your. Food. Budget,. Trim the. Waste"


There are 100 articles in the dataframe


In [None]:
# annotate the articles using the annotator function
first_Batch_articles_WSJ_author = af(df=df_100_sample);

In [4]:
# verify the annotation process
print(f"Number of articles in the 100 sample: {len(first_Batch_articles_WSJ_author)}")
print(f"Number of articles in the annotated 100 sample: {len(first_Batch_articles_WSJ_author)}")

# check for unannotated articles
print(f"Number of articles with AI-related annotation: {first_Batch_articles_WSJ_author['label_ai_related'].notnull().sum()}")

# write the annotated sample to a csv file first_Batch_articles_WSJ_author
df_100_annotated.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one_author.csv", index=False)  

NameError: name 'first_Batch_articles_WSJ_author' is not defined

In [5]:
# load clean df
first_sample = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one_author.csv")

# verify the loaded file
print(f"Number of articles in the clean df: {len(first_sample)}")
print(f"columns: {first_sample.columns}")

Number of articles in the clean df: 100
columns: Index(['article_id', 'title', 'sub_title', 'cleaned_corpus',
       'label_ai_related', 'hype_level'],
      dtype='object')


In the next part, the second set of 100 sampled articles is created, to which 18 are added which mention AI as filtered by AI-related keywords and phrases (see bellow)

In [8]:
# load the large sample from the csv file
df_first_subsample = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_sub500.csv")

# load first batch to exclude it from the large sample
first_batch = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one_author.csv")

# verify the loaded files
print(f"Number of articles in the clean df: {len(df_first_subsample)}")
print(f"columns: {df_first_subsample.columns}")
print(f"Number of articles in the first batch: {len(first_batch)}")

Number of articles in the clean df: 500
columns: Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id'],
      dtype='object')
Number of articles in the first batch: 100


In [9]:
# convert article_id to int64
first_batch['article_id'] = first_batch['article_id'].astype('int64')
df_first_subsample['article_id'] = df_first_subsample['article_id'].astype('int64')

# verify the conversion
print(first_batch['article_id'].dtype)
print(df_first_subsample['article_id'].dtype)

# filter fist_sample for article_id not in df_100_annotated
first_sample_not_annotated = df_first_subsample[~df_first_subsample['article_id'].isin(first_batch['article_id'])]

# print the number of articles in the filtered dataframe
print(f"Number of articles in the filtered dataframe: {len(first_sample_not_annotated)}")

# print the first 5 rows of the filtered dataframe
print(first_sample_not_annotated.head())

int64
int64
Number of articles in the filtered dataframe: 400
   article_id  image_src         scanned_time  \
0       29053        NaN  2025-04-04 15:51:12   
1       19489        NaN  2025-04-02 20:38:29   
3       23657        NaN  2025-04-03 14:50:35   
4       20373        NaN  2025-04-03 01:54:34   
5       15366        NaN  2025-04-02 00:54:27   

                                               title  \
0  Fisker Withdraws Guidance Amid Strategic Optio...   
1  Oneok to Acquire EnLink Midstream in $4.3 Bill...   
3  Walmart, Coming Off a Strong Year, Invests to ...   
4  Trump Tax Cuts Come at the Right Time for Stoc...   
5                 Boeing and the Automation Standoff   

                                           sub_title  \
0  Fisker said its options may include restructur...   
1  The deal is expected to close in the first qua...   
3  Company tries to boost business with its $2.3 ...   
4  Lower taxes would be a boon for an equity mark...   
5  A plane maker has too m

In [10]:
# flag articles with AI mentions by setting mentioned_ai to 1 if the article contains any of the AI-related keywords, 0 otherwise
first_sample_not_annotated = flag_ai_mentions(first_sample_not_annotated)

18 out of 400 articles mention AI-related topics.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mentions_ai'] = df[text_column].apply(lambda x: bool(ai_pattern.search(str(x))))


In [11]:
# Construct second sample
second_sample_100 = first_sample_not_annotated[first_sample_not_annotated['mentions_ai'] == 1]
second_sample_100 = pd.concat([second_sample_100, first_sample_not_annotated[first_sample_not_annotated['mentions_ai'] == 0].sample(100, random_state=42)])

# shuffle the sample, reset the index and drop the old index
second_sample_100 = second_sample_100.sample(frac=1, random_state=42).reset_index(drop=True)

# verify sample size   
print(f"Number of AI-related articles in the sample: {len(second_sample_100[second_sample_100['mentions_ai'] == 1])}")
print(f"Number of non-AI articles in the sample: {len(second_sample_100[second_sample_100['mentions_ai'] == 0])}")

# drop the mentions_ai column
second_sample_100 = second_sample_100.drop(columns=['mentions_ai'])

# save the second sample to a csv file
second_sample_100.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_two.csv", index=False)


Number of AI-related articles in the sample: 18
Number of non-AI articles in the sample: 100


In [None]:
# verify the saved file
df_second_sample_100 = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_two.csv")

# Display the result
display(df_second_sample_100[['article_id', 'title', 'corpus']][1:10])

# print the number of articles in the second sample
print(f"Number of articles in the second sample: {len(df_second_sample_100)}")

# print the columns of the second sample
print(f"Columns in the second sample: {df_second_sample_100.columns}")

Next, the second batch is independently annotated by the author

In [None]:
# annotate the articles using the annotator function
df_second_batch_annotated_author = af(df=df_second_batch);

In [None]:
# write to csv file
df_second_batch_annotated_author.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_two_author.csv", index=False)

In [None]:
# veryfy the annotation process
print(f"Number of articles in the second sample: {len(df_second_sample)}")
print(f"Number of articles in the annotated second sample: {len(df_second_sample_annotated)}")
print(f"Number of articles with AI-related annotation: {df_second_sample_annotated['label_ai_related'].sum()}")
print(f"Columns in the annotated second batch: {df_second_sample_annotated.columns}")

Sampling the Third Batch

As noted above, the third batch is drawn from the full corpus rather than the initial sub-sample. From this point onward, the optimized database (articlesWSJ_clean_final.db) is used as the data source. Given this, clean_article_text does not have to be applied. 

In [14]:
# load the dataset from the cleaned database
path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ_clean_final.db"
conn = sqlite3.connect(path)
df = pd.read_sql_query("SELECT * FROM article", conn)
conn.close()

In [None]:
# check the columns of the full dataset
print(f"Columns in the full dataset: {df.columns}")

# check for NA
print(f"Number of NA values in the full dataset: {df.isna().sum().sum()}")

# check for NA in the article_id column
print(f"Number of NA values in the article_id column: {df['article_id'].isna().sum()}") 


In [12]:
# load batch 1 and 2 to exclude them from the full dataset
batch_1 = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one.csv")
batch_2 = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_two.csv")

# check the columns of the batch 1 and batch 2
print(f"Columns in the batch 1: {batch_1.columns}")
print(f"Columns in the batch 2: {batch_2.columns}")

# concatenate the two batches
batch_1_2 = pd.concat([batch_1, batch_2], ignore_index=True)   

# verify the concatenation
print(f"Number of articles in batch 1 and 2: {len(batch_1_2)}")

Columns in the batch 1: Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id'],
      dtype='object')
Columns in the batch 2: Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id'],
      dtype='object')
Number of articles in batch 1 and 2: 218


In [15]:
# exclude the articles that are already annotated
df_final = df[~df['article_id'].isin(batch_1_2['article_id'])]

# verify the exclusion
print(f"Number of articles in the final dataset: {len(df_final)}")
print(len(df_final) + len(batch_1_2) == len(df))

Number of articles in the final dataset: 14040
True


In [16]:
# flag ai-related articles in the final dataset
flagged_df = flag_ai_mentions(df_final)

# verify the flagging process
print(f"Number of articles in the flagged dataset: {len(flagged_df)}")
print(f"Number of AI-related articles in the flagged dataset: {flagged_df['mentions_ai'].sum()}")
print(f"Number of non-AI articles in the flagged dataset: {len(flagged_df) - flagged_df['mentions_ai'].sum()}")

1239 out of 14040 articles mention AI-related topics.
Number of articles in the flagged dataset: 14040
Number of AI-related articles in the flagged dataset: 1239
Number of non-AI articles in the flagged dataset: 12801


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mentions_ai'] = df[text_column].apply(lambda x: bool(ai_pattern.search(str(x))))


In [17]:
# Filter  annotated articles
ai_articles = flagged_df[flagged_df['mentions_ai'] == True]
non_ai_articles = flagged_df[flagged_df['mentions_ai'] == False]

# Randomly select 50 AI-related articles and 50 non-AI article (reproducible with seed)
sample_ai = ai_articles.sample(50, random_state=42)
sample_non_ai = non_ai_articles.sample(50, random_state=42)

# Combine into one DataFrame
sampled_batch_three = pd.concat([sample_ai, sample_non_ai]).sample(frac=1, random_state=42).reset_index(drop=True)

#  Display the result
sampled_batch_three[['article_id', 'title', 'corpus', 'mentions_ai']]

# print number of articles in the batch three
print(f"Number of articles in the batch three: {len(sampled_batch_three)}")

# verify existence of 50 AI-related articles and 50 non-AI articles
print(f"Number of AI-related articles in the batch three: {len(sampled_batch_three[sampled_batch_three['mentions_ai'] == 1])}")

# drop the mentions_ai column
sampled_batch_three = sampled_batch_three.drop(columns=['mentions_ai'])

# write to csv file
sampled_batch_three.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_three.csv", index=False)

Number of articles in the batch three: 100
Number of AI-related articles in the batch three: 50


In [None]:
# verify the saved file
print(sampled_batch_three.columns)

Next, the third batch is independently annotated by the author

In [None]:
# load the third sample from the csv file
df_third_batch = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_three.csv")

# verify the loaded file
print(f"Number of articles in the clean df: {len(df_third_batch)}")
print(f"columns: {df_third_batch.columns}")  

In [None]:
# annotate the articles using the annotator function
df_third_batch_annotated_author = af(df=df_third_batch);

In [None]:
# write to csv file
df_third_batch_annotated_author.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_three_author.csv", index=False)

In [None]:
# verify the csv file
df_third_batch_annotated = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_three_author.csv")
# check the columns of the annotated batch three
print(f"Columns in the annotated batch three: {df_third_batch_annotated.columns}")
print(f"Number of articles in the annotated batch three: {len(df_third_batch_annotated)}")
print(f"Number of articles with AI-related annotation: {df_third_batch_annotated['label_ai_related'].sum()}")
print(f"The total hype score is: {df_third_batch_annotated['hype_level'].sum()}")

Next, the fourth (final) batch (n=700) is created

In [None]:
# exclude  batch 1, batch 2 and batch 3 from the full dataset

#read the batch 1, 2 & 3 from the csv file
batch_1 = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one.csv")
batch_2 = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_two.csv")
batch_3 = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_three.csv")

# find columns that are not in all three batches
cols1 = set(batch_1.columns)
cols2 = set(batch_2.columns)
cols3 = set(batch_3.columns)

print("In batch_1 but not in batch_2:", cols1 - cols2)
print("In batch_2 but not in batch_1:", cols2 - cols1)

print("In batch_1 but not in batch_3:", cols1 - cols3)
print("In batch_3 but not in batch_1:", cols3 - cols1)

print("In batch_2 but not in batch_3:", cols2 - cols3)
print("In batch_3 but not in batch_2:", cols3 - cols2)

In batch_1 but not in batch_2: {'section', 'corpus', 'date'}
In batch_2 but not in batch_1: {'cleaned_corpus'}
In batch_1 but not in batch_3: set()
In batch_3 but not in batch_1: {'image_src', 'index_id', 'scanned_time'}
In batch_2 but not in batch_3: {'cleaned_corpus'}
In batch_3 but not in batch_2: {'section', 'date', 'image_src', 'index_id', 'corpus', 'scanned_time'}


In [None]:
# due to optimization of the cleaning process (see above), some modifacations have to be made to allow merging of the three batches

# drop colukns to ensure that the columns of batch 1 and 2 are identical to batch 3
batch_1 = batch_1.drop(columns=["date","section"])
batch_3 = batch_3.drop(columns=["date","section","index_id","scanned_time","image_src","date"])

# rename the columns to match batch 1 and 2
batch_2 = batch_2.rename(columns={"cleaned_corpus": "corpus",})

# verify identity of the columns
print(f'columns are identical: {set(batch_1.columns) == set(batch_2.columns) == set(batch_3.columns)}')

columns are identical: True


In [43]:
# concatenate the two batches
batch_1_2_3 = pd.concat([batch_1, batch_2, batch_3], ignore_index=True)   

# verify the concatenation
print(f'there are 318 articles in the concatenated df:', len(batch_1_2_3) == 318)

there are 318 articles in the concatenated df: True


In [None]:
# load the dataset from the cleaned database
path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ_clean_final.db"
conn = sqlite3.connect(path)
df = pd.read_sql_query("SELECT * FROM article", conn)
conn.close()

In [None]:
# verify the columns of the full dataset
print(f"Columns in the full dataset: {df.columns}")
print(f"Number of articles in the full dataset: {len(df)}")

In [None]:
# subset for article_id not in batch 1, 2 and 3
batch_1_2_3['article_id'] = batch_1_2_3['article_id'].astype('int64')
df_final = df[~df['article_id'].isin(batch_1_2_3['article_id'])]

# verify the exclusion
print(f"Number of articles in the final dataset: {len(df_final)}")

NameError: name 'batch_1_2_3' is not defined

In [None]:
# flag ai-related articles in the final dataset
flagged_df = flag_ai_mentions(df_final)

# verify the flagging process
print(f"Number of articles in the flagged dataset: {len(flagged_df)}")
print(f"Number of AI-related articles in the flagged dataset: {flagged_df['mentions_ai'].sum()}")

In [None]:
# Randomly select 350 AI-related articles and 350 non-AI article (reproducible with seed)
sample_ai = flagged_df[flagged_df['mentions_ai'] == True].sample(350, random_state=42)
sample_non_ai = flagged_df[flagged_df['mentions_ai'] == False].sample(350, random_state=42)

# Combine into one DataFrame
sampled_batch_four = pd.concat([sample_ai, sample_non_ai]).sample(frac=1, random_state=42).reset_index(drop=True)

#  Display the result
print(f"Number of articles in the batch four: {len(sampled_batch_four)}")
print(f"Number of AI-related articles in the batch four: {len(sampled_batch_four[sampled_batch_four['mentions_ai'] == 1])}")

# drop the mentions_ai column
sampled_batch_four = sampled_batch_four.drop(columns=['mentions_ai'])

# write to csv file
sampled_batch_four.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_four.csv", index=False)


In [None]:
# verify the saved file
df_fourth_batch = pd.read_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_four.csv")

# verify the loaded file
print(f"Number of articles in the clean df: {len(df_fourth_batch)}")
print(f"columns: {df_fourth_batch.columns}") 