In [1]:
# importing libraries
import pandas as pd
import sqlite3
import os
import sys
from IPython.display import display
import numpy as np
import re

import sys

# first NYT 
sys.path.append(r"C:\Users\PC\Desktop\Masterarbeit\Code\NYT")
from text_cleaner_NYT import clean_nyt_article

# then WSJ 
sys.path.append(r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation")
from mentions_ai import flag_ai_mentions



Load and inspect the dataframe containing the uncleaned corpus

In [2]:
# load aticles from the database
con = sqlite3.connect(r'C:\Users\PC\Desktop\Masterarbeit\Code\NYT\articlesNYT.db')
df = pd.read_sql_query("SELECT * FROM article", con)
con.close()

In [3]:
# explore columns
print(df.columns)

# check for duplicates
print(df["article_id"].duplicated().sum())

# check for missing values
print(df.isnull().sum())

# check total number of rows
print(df.shape[0])


Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id'],
      dtype='object')
0
article_id      0
image_src       0
scanned_time    0
title           0
sub_title       0
corpus          0
index_id        0
dtype: int64
9396


Load and inspect the dataframe containing the links and dates columns

In [4]:
# load index file to extract section
con = sqlite3.connect(r'C:\Users\PC\Desktop\Masterarbeit\Code\NYT\articlesNYT.db')
df_index = pd.read_sql_query("SELECT * FROM articles_index", con)
con.close()

In [5]:
# explore columns
print(df_index.columns)

# check for duplicates
print(df_index.id.duplicated().sum())

# check for missing values
print(df_index.isnull().sum())

# check total number of rows
print(df_index.shape[0])

# check link
pd.set_option('display.max_colwidth', None)
display(df_index['link'].head())

# check year, month, day
print(df_index['year'].unique())
print(df_index['month'].unique())
print(df_index['day'].unique())

Index(['id', 'year', 'month', 'day', 'headline', 'article_time', 'keyword',
       'link', 'scraped_at', 'scanned_status'],
      dtype='object')
0
id                0
year              0
month             0
day               0
headline          0
article_time      0
keyword           0
link              0
scraped_at        0
scanned_status    0
dtype: int64
9392


0    https://www.nytimes.com/2023/12/30/us/politics/pentagon-venture-capitalists.html
1             https://www.nytimes.com/2023/12/31/world/americas/cat-prison-chile.html
2            https://www.nytimes.com/2023/12/31/world/middleeast/us-houthi-clash.html
3       https://www.nytimes.com/2023/12/30/us/philadelphia-chinatown-76ers-arena.html
4                      https://www.nytimes.com/2024/01/01/us/new-state-laws-2024.html
Name: link, dtype: object

['2024']
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12']
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16'
 '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30'
 '31']


Construct date column in format yyyy - mm - dd for further analysis

In [6]:
# construct date column from year, month, day
df_index['date'] = df_index['year'].astype(str) + '-' + df_index['month'].astype(str) + '-' + df_index['day'].astype(str)
df_index['date'] = pd.to_datetime(df_index['date'], format='%Y-%m-%d')

# verify date column
print(df_index['date'].min(), df_index['date'].max())


2024-01-01 00:00:00 2024-12-31 00:00:00


In [7]:
# drop year, month, day columns
df_index.drop(columns=['year', 'month', 'day'], inplace=True)


Extract sections from link, investigate where no section is pressent

In [8]:
# extract section from link using regex
sections = df_index.link.str.extract(r'nytimes\.com/(?:live/)?\d{4}/\d{2}/\d{2}/([^/]+)/')[0]
df_index['section'] = sections

# check if section is extracted correctly
print(df_index['section'])

0           us
1        world
2        world
3           us
4           us
         ...  
9387        us
9388    health
9389        us
9390     world
9391     world
Name: section, Length: 9392, dtype: object


In [9]:
# inspect where section is NaN
no_sec = df_index[df_index['section'].isnull()]

# check if there are any other sections
print(no_sec.head(11))
print(f"There are {len(no_sec)} articles without a section.")

        id  \
1497  1570   
1620  1694   
3729  3868   
3938  4086   
4225  4379   
5659  5863   
6350  6570   
8502  8778   
8817  9101   

                                                                        headline  \
1497       France Moves Closer to Enshrining Abortion Access in Its Constitution   
1620       As ‘Zombie Fires’ Smolder, Canada Braces for Another Season of Flames   
3729  A Public Park or Private Spa: A City Debates the Future of an Island Oasis   
3938                  For Heroes of D-Day, This Reunion Might Be a ‘Last Hurrah’   
4225              For Most Refugees in Canada, a Warm Embrace. For Others, Jail.   
5659           Paris, Uncharacteristically Giddy, Bids Au Revoir to the Olympics   
6350                                              N.F.L. 2024 Season Predictions   
8502          ‘Mr. Every Man’: The 50 Others Accused in France’s Mass Rape Trial   
8817                                France’s First Big #MeToo Case Goes to Trial   

     article_time 

Investigating the articles without section showed that they are not related to any section in the NYT. Those are dropped.

In [10]:
# dropping the articles without a section
df_index.drop(no_sec.index, inplace=True)
print(f"After dropping the articles without a section, there are {len(df_index)} articles left.")   

After dropping the articles without a section, there are 9383 articles left.


Merging both dataframes to have one dataframe containing all relevant collumns

In [11]:
# perform inner join
df_merged = pd.merge(df, df_index, left_on='article_id', right_on="id", how='inner')
print(f"After merging the articles with the index, there are {len(df_merged)} articles left.")

After merging the articles with the index, there are 9099 articles left.


Due to introduction of a scrapinglimit of maximum of 30 articles / day for the scraping process, some article links were crawled, without the corpus being downloaded. Those are dropped, resulting in n=9099 articles.

In [12]:
# verify the merge
print(df_merged['article_id'].isnull().sum())
print(df_merged.columns)
print(df_merged['section'].unique())
print(df_merged['date'].min(), df_merged['date'].max())

0
Index(['article_id', 'image_src', 'scanned_time', 'title', 'sub_title',
       'corpus', 'index_id', 'id', 'headline', 'article_time', 'keyword',
       'link', 'scraped_at', 'scanned_status', 'date', 'section'],
      dtype='object')
['us' 'world' 'arts' 'business' 'style' 'obituaries' 'nyregion'
 'your-money' 'science' 'opinion' 'sports' 'health' 'theater' 'dining'
 'books' 'movies' 'technology' 'climate' 'upshot' 'headway' 'briefing'
 'travel' 'well' 'fashion' 'realestate' 'weather' 'magazine' 'pageoneplus'
 'education']
2024-01-01 00:00:00 2024-12-21 00:00:00


Dropping all articles from irrelevant sections, keeping 'us' 'world' "business", "science", "opinion", "technology", "briefing", 'books', "magazine", "health"

In [None]:
# Define relevant sections
relevant_sections = [
    'us', 'world', 'business', 'science', 'opinion', 'technology',
    'books', ,'health' "climate"
]

# Filter the merged DataFrame
df_filtered = df_merged[df_merged['section'].isin(relevant_sections)]

# Optional: print result count
print(f"After filtering, {len(df_filtered)} articles remain in relevant sections.")

After filtering, 7340 articles remain in relevant sections.


Dropping irrelevant collums

In [14]:
# define the columns to drop
to_drop = ['image_src', 'scanned_time', 'index_id', 'id','article_time','keyword','link','scraped_at','scanned_status']

# drop the columns
df_filtered.drop(columns=to_drop, inplace=True)

# verify the drop
print(df_filtered.columns)

Index(['article_id', 'title', 'sub_title', 'corpus', 'headline', 'date',
       'section'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=to_drop, inplace=True)


Apply seperatly constructed cleaner function

In [15]:
# apply the text cleaner function to the text column
df_filtered_clean = df_filtered.copy()
df_filtered_clean["corpus"] = df_filtered["corpus"].apply(clean_nyt_article)

In [None]:
# modify corpus column to ease the preview
df_filtered_clean["corpus_preview"] = df_filtered_clean["corpus"].str.slice(0, ) + "..."
display(df_filtered_clean[["headline", "corpus_preview"]].head())

Filter for AI-related terms

In [None]:
df_flagged = df_filtered_clean.copy()

# apply the flag_ai_mentions function to the corpus column
df_flagged = flag_ai_mentions(df_flagged)

# print share of flagged articles
print(f"Share of flagged articles: {df_flagged['mentions_ai'].mean() * 100:.2f}%")

295 out of 7349 articles mention AI-related topics.


KeyError: 'flagged'

In [20]:
# print share of flagged articles
print(f"Share of flagged articles: {df_flagged['mentions_ai'].mean() * 100:.2f}%")

# number of flagged articles
print(f"Number of flagged articles: {df_flagged['mentions_ai'].sum()}")

# unique sections of articles
print(f"Unique sections of  articles: {df_flagged.sections.unique()}")

Share of flagged articles: 4.01%
Number of flagged articles: 295


AttributeError: 'DataFrame' object has no attribute 'sections'

In [21]:
# write to csv
df_flagged.to_csv(r"C:\Users\PC\Desktop\Masterarbeit\Code\NYT\articlesNYT_cleaned_flagged.csv", index=False)