In [2]:
# import libraries
import os
import pandas as pd
import sqlite3
from text_cleaner import clean_article_text

In [38]:
# connect to the SQLite database and read the data into a DataFrame
db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ.db"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("SELECT * FROM articles_index", conn)
conn.close()

In [39]:
#explore the data
print(df.head())
print(df.info())    
print(df.columns)

   id  year month day                                           headline  \
0   1  2024     1   1  Baidu Terminates $3.6B Deal to Buy JOYY’s Chin...   
1   2  2024     1   1                The Military’s Phantom ‘Extremists’   
2   3  2024     1   1                  Double Dipping in Opioid Lawsuits   
3   4  2024     1   1                     Xi Jinping Says Happy New Year   
4   5  2024     1   1  Israel Reshuffles Forces, Prepares for Long-Te...   

  article_time keyword                                               link  \
0   6:44 PM ET     N/A  https://www.wsj.com/business/telecom/baidu-ter...   
1   5:45 PM ET     N/A  https://www.wsj.com/opinion/military-extremist...   
2   5:45 PM ET     N/A  https://www.wsj.com/opinion/double-dipping-in-...   
3   5:43 PM ET     N/A  https://www.wsj.com/opinion/xi-jinping-says-ha...   
4   5:33 PM ET     N/A  https://www.wsj.com/world/middle-east/israel-r...   

            scraped_at  scanned_status  
0  2025-03-28 15:37:35               1 

Based on the literature review and exploratory analysis (see Exposé), several article categories were identified as irrelevant to the research objective. These are excluded in the following section.

In [40]:
# extract link column  
articles_links = df['link']

# extract the section names from the links
articles_sections = [x[19:40] for x in articles_links]
articles_sections = [x.split('/')[1] for x in articles_sections]

# add section names to the DataFrame
df['section'] = articles_sections

# extract unique section names
unique_sections = set(articles_sections)
print(unique_sections)

# create list of irrelevant sections for later removal
irrelevant_sections = ["health","arts-culture","lifestyle","real-estate","sports","livecoverage","personal-finance","video","science","style","articles","opinition"]

# investigate headlines
print(df[df["headline"].duplicated()])  # Check for duplicates in headlines
display(df[df["headline"].duplicated()])  # Display duplicates in headlines


{'real-estate', 'finance', 'opinion', 'us-news', 'sports', 'politics', 'science', 'video', 'world', 'arts-culture', 'health', 'tech', 'style', 'economy', 'livecoverage', 'lifestyle', 'articles', 'personal-finance', 'business'}
          id  year month day  \
179      180  2024     1   3   
272      273  2024     1   4   
370      371  2024     1   5   
397      398  2024     1   6   
527      528  2024     1   8   
...      ...   ...   ...  ..   
27482  27483  2024     1   2   
27483  27484  2024     1   2   
27484  27485  2024     1   2   
27485  27486  2024     1   2   
27486  27487  2024     1   2   

                                                headline article_time keyword  \
179                                    Pepper...and Salt  12:00 AM ET     N/A   
272                                    Pepper...and Salt  12:00 AM ET     N/A   
370                                    Pepper...and Salt  12:00 AM ET     N/A   
397                                    Pepper...and Salt  12:00 

Unnamed: 0,id,year,month,day,headline,article_time,keyword,link,scraped_at,scanned_status,section
179,180,2024,1,3,Pepper...and Salt,12:00 AM ET,,https://www.wsj.com/opinion/pepper-and-salt-10...,2025-03-28 15:38:09,1,opinion
272,273,2024,1,4,Pepper...and Salt,12:00 AM ET,,https://www.wsj.com/opinion/pepper-and-salt-25...,2025-03-28 15:38:26,1,opinion
370,371,2024,1,5,Pepper...and Salt,12:00 AM ET,,https://www.wsj.com/opinion/pepper-and-salt-96...,2025-03-28 15:38:43,1,opinion
397,398,2024,1,6,Pepper...and Salt,12:00 AM ET,,https://www.wsj.com/opinion/pepper-and-salt-a9...,2025-03-28 15:38:52,1,opinion
527,528,2024,1,8,Pepper...and Salt,12:00 AM ET,,https://www.wsj.com/opinion/pepper-and-salt-a5...,2025-03-28 15:39:25,1,opinion
...,...,...,...,...,...,...,...,...,...,...,...
27482,27483,2024,1,2,Pepper...and Salt,12:00 AM ET,,https://www.wsj.com/articles/pepper-and-salt-0...,2025-03-28 17:56:37,1,articles
27483,27484,2024,1,2,"Powerful Earthquake Hits Japan, Causing Tsunam...",11:59 PM ET,,https://www.wsj.com/world/asia/tsunami-warning...,2025-03-28 17:56:37,1,world
27484,27485,2024,1,2,Evergrande Auto’s Investment Agreement With Du...,9:26 PM ET,,https://www.wsj.com/business/autos/evergrande-...,2025-03-28 17:56:37,1,business
27485,27486,2024,1,2,The Formula to Get More Time Off Using Your Va...,9:00 PM ET,,https://www.wsj.com/lifestyle/careers/math-mak...,2025-03-28 17:56:37,1,lifestyle


In [41]:
# remove irrelevant sections from the DataFrame
df_filtered = df[~df['section'].isin(irrelevant_sections)].copy()

# remove salt and pepper noise from the headlines as manual investigation showed that they are not relevant for the analysis (only comics)
df_filtered = df_filtered[~df_filtered['headline'].str.contains(r'\b(salt|pepper)\b', case=False, na=False)]

# verify sections value counts after filtering
print(df_filtered['section'].value_counts())  # See what's left

# verify drop of duplicates in headlines
print(df_filtered[df_filtered["headline"].duplicated()])  

section
opinion     5438
business    4113
world       2548
finance     2086
politics    1534
economy     1114
tech        1103
us-news     1051
Name: count, dtype: int64
          id  year month day  \
6789    6790  2024     4   1   
7245    7246  2024     4   7   
7697    7698  2024     4  12   
8232    8233  2024     4  19   
9334    9335  2024     5   3   
...      ...   ...   ...  ..   
27478  27479  2024     1   2   
27479  27480  2024     1   2   
27483  27484  2024     1   2   
27484  27485  2024     1   2   
27486  27487  2024     1   2   

                                                headline article_time keyword  \
6789   Russian Court Extends Detention of Radio Free ...   4:10 PM ET     N/A   
7245   Week Ahead for FX, Bonds: Focus on U.S. Inflat...   6:40 PM ET     N/A   
7697                            Week Ahead for FX, Bonds  10:37 AM ET     N/A   
8232                            Week Ahead for FX, Bonds  11:59 AM ET     N/A   
9334                            Week Ahe

  df_filtered = df_filtered[~df_filtered['headline'].str.contains(r'\b(salt|pepper)\b', case=False, na=False)]


In [42]:
# check for misssing values
print(df_filtered.isnull().sum())  # Check for missing values

id                0
year              0
month             0
day               0
headline          0
article_time      0
keyword           0
link              0
scraped_at        0
scanned_status    0
section           0
dtype: int64


In [43]:
# Define path for new DB
cleaned_db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articles_index_cleaned.db"

# Save DataFrame to a new SQLite database
conn = sqlite3.connect(cleaned_db_path)
df_filtered.to_sql("articles_index_cleaned", conn, if_exists="replace", index=False)
conn.close()

In [44]:
conn = sqlite3.connect(db_path)
c = conn.cursor()
c.execute("PRAGMA table_info(article)")
print(c.fetchall())
conn.close()


[(0, 'article_id', 'INTEGER', 0, None, 1), (1, 'image_src', 'TEXT', 0, None, 0), (2, 'scanned_time', 'TEXT', 0, None, 0), (3, 'title', 'TEXT', 0, None, 0), (4, 'sub_title', 'TEXT', 0, None, 0), (5, 'corpus', 'TEXT', 0, None, 0), (6, 'index_id', 'INTEGER', 0, None, 0)]


The article_ids from the cleaned index table are joined with the original articles table (containing the full text) to retain only relevant articles with available corpora. The original database is preserved to ensure reproducibility.

In [45]:
# Paths to your databases
cleaned_index_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articles_index_reduced.db"
original_db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ.db"
final_db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ_cleaned.db"

# Load the cleaned articles_index table
print("Loading cleaned articles_index from cleaned_index_path...")
conn_cleaned = sqlite3.connect(cleaned_index_path)
df_cleaned_index = pd.read_sql_query("SELECT * FROM articles_index_cleaned", conn_cleaned)
conn_cleaned.close()
print(f"Loaded {len(df_cleaned_index)} cleaned index entries.")

# Write the cleaned index into the original DB temporarily
print("Attaching cleaned index to original database...")
conn_full = sqlite3.connect(original_db_path)
df_cleaned_index.to_sql("articles_index_cleaned", conn_full, if_exists="replace", index=False)

# Perform the join to filter articles based on valid index_ids
print("Joining article table with cleaned index on index_id...")
query = """
SELECT article.*, articles_index_cleaned.section, articles_index_cleaned.year, articles_index_cleaned.month, articles_index_cleaned.day
FROM article
JOIN articles_index_cleaned
ON article.index_id = articles_index_cleaned.id
"""
df_filtered_articles = pd.read_sql_query(query, conn_full)
conn_full.close()
print(f"Filtered down to {len(df_filtered_articles)} articles.")

# Save cleaned articles and index into final DB
print("Saving filtered article and cleaned articles_index into final database...")
conn_final = sqlite3.connect(final_db_path)
df_filtered_articles.to_sql("article", conn_final, if_exists="replace", index=False)
df_cleaned_index.to_sql("articles_index", conn_final, if_exists="replace", index=False)
conn_final.close()

print("Final cleaned database successfully created.")

Loading cleaned articles_index from cleaned_index_path...
Loaded 18987 cleaned index entries.
Attaching cleaned index to original database...
Joining article table with cleaned index on index_id...
Filtered down to 14306 articles.
Saving filtered article and cleaned articles_index into final database...
Final cleaned database successfully created.


A previous use of LEFT JOIN instead of INNER JOIN introduced entries without valid article_ids. These, along with articles that were scraped without a corpus, are removed in the following section to ensure data quality and consistency. Year, Month and Day column are merched to single Date column.

In [48]:
# load data from new database to verify
final_db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ_cleaned.db"
conn = sqlite3.connect(final_db_path)
df = pd.read_sql_query("SELECT * FROM article", conn)
c = conn.cursor()
conn.close()

In [None]:
# find duplicats in corpus
duplicates = df[df.duplicated(subset=['corpus'], keep=False)]
print("Duplicated corpus rows:\n", duplicates[['corpus', 'article_id']].head(10))
print("Number of duplicated corpus rows:", len(duplicates))

# drop duplicates
df = df.drop_duplicates(subset=['corpus'], keep='first')   

# verify that duplicates are removed
duplicates_after = df[df.duplicated(subset=['corpus'], keep=False)] 
print("Duplicated corpus rows after dropping duplicates:\n", duplicates_after[['corpus', 'article_id']].head(10))

Duplicated corpus rows:
 Empty DataFrame
Columns: [corpus, article_id]
Index: []
Number of duplicated corpus rows: 0
Duplicated corpus rows after dropping duplicates:
 Empty DataFrame
Columns: [corpus, article_id]
Index: []


In [65]:
# drop image_src
df = df.drop(columns=['image_src'])

In [51]:
# Drop rows where article_id is NULL
df_no_na = df.dropna(subset=["article_id"])

# verify drop
print("Number of rows after dropping rows with NULL article_id:", len(df_no_na[df_no_na['article_id'].isnull()]))

# check for any duplicates in the 'article_id' column
duplicates_article_id = df_no_na[df_no_na.duplicated(subset=['article_id'], keep=False)]
print("Duplicated article_id rows:\n", duplicates_article_id[['article_id', 'corpus']].head(10))

# check for duplicates in corpus column
duplicates_corpus = df_no_na[df_no_na.duplicated(subset=['corpus'], keep=False)]
print("Duplicated corpus rows:\n", duplicates_corpus[['corpus', 'article_id']].head(10))

# check for missing values in the 'corpus' column
missing_corpus = df_no_na[(df_no_na['corpus'].isnull()) | (df_no_na['corpus'] == '')]
print("Missing corpus rows:\n", missing_corpus[['article_id', 'corpus']].head(10))

Number of rows after dropping rows with NULL article_id: 0
Duplicated article_id rows:
 Empty DataFrame
Columns: [article_id, corpus]
Index: []
Duplicated corpus rows:
 Empty DataFrame
Columns: [corpus, article_id]
Index: []
Missing corpus rows:
 Empty DataFrame
Columns: [article_id, corpus]
Index: []


In [54]:
# merching year, month, day into a single date column
df_no_na['date'] = pd.to_datetime(df_no_na[['year', 'month', 'day']].astype(str).agg('-'.join, axis=1), format='%Y-%m-%d')

# verify the new date column
print("Date column:\n", df_no_na[['year', 'month', 'day', 'date']].head(10))
print("Date column data type:", df_no_na['date'].dtype)

# drop the old columns
df_no_na = df_no_na.drop(columns=['year', 'month', 'day'])

Date column:
    year month day       date
0  2024     1   1 2024-01-01
1  2024     1   1 2024-01-01
2  2024     1   1 2024-01-01
3  2024     1   1 2024-01-01
4  2024     1   1 2024-01-01
5  2024     1   1 2024-01-01
6  2024     1   1 2024-01-01
7  2024     1   1 2024-01-01
8  2024     1   1 2024-01-01
9  2024     1   1 2024-01-01
Date column data type: datetime64[ns]


In [57]:
# use cleaner function to clean the corpus
cleaned_df = df_no_na.copy()
cleaned_df['cleaned_corpus'] = df_no_na['corpus'].apply(clean_article_text)

In [None]:
# verify corpus cleaning
print("Cleaned corpus:\n", cleaned_df[['article_id', 'cleaned_corpus']].head(10))
print("Number of cleaned corpus rows:", len(cleaned_df))

Cleaned corpus:
    article_id                                             corpus
0       13068  Advertisement\nBUSINESS\nTELECOM\nBaidu Termin...
1       13069  Advertisement\nOPINION\nREVIEW & OUTLOOK\nFoll...
2       13070  Advertisement\nOPINION\nREVIEW & OUTLOOK\nFoll...
3       13071  Advertisement\nOPINION\nREVIEW & OUTLOOK\nFoll...
4       13072  Israel Reshuffles Forces, Prepares for Long-Te...
5       13073  Advertisement\nWORLD\nMIDDLE EAST\nIsrael’s Hi...
6       13075  Advertisement\nOPINION\nMAIN STREET\nMaking Di...
7       13076  Advertisement\nOPINION\nCOMMENTARY\nFollow\nDo...
8       13077  Advertisement\nOPINION\nCOMMENTARY\nFollow\nBe...
9       13078  Advertisement\nOPINION\nCOMMENTARY\nFollow\nBi...
Number of cleaned corpus rows: 14258


In [66]:
# overrite the original DataFrame with the cleaned corpus
df_no_na['corpus'] = cleaned_df['cleaned_corpus']

# drop # drop image_src
df_no_na = df_no_na.drop(columns=['image_src'])

In [67]:
# Temporarily expand column width and disable truncation
pd.set_option('display.max_colwidth', None)

# Inspect full cleaned corpus
print("Cleaned corpus:\n", df_no_na['corpus'].iloc[:10])
print("column names:\n", df_no_na.columns)
print("DataFrame info:\n", df_no_na.info())

Cleaned corpus:
 0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [5]:
# connect to the final database
final_db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\articlesWSJ_cleaned.db"
conn = sqlite3.connect(final_db_path)

# Write the cleaned DataFrame to the 'article' table
df_no_na.to_sql('article', conn, if_exists='replace', index=False)

# Close the connection
conn.close()

NameError: name 'df_no_na' is not defined

In [4]:
# verify that the corpus is cleaned
conn = sqlite3.connect(final_db_path)
df = pd.read_sql_query("SELECT * FROM article", conn)
conn.close()

# column names of the DataFrame 
print("Column names in the DataFrame:\n", df.columns)

# cleaned corpus
print("Cleaned corpus:\n", df['corpus'].head(10))

# check for missing values in the DataFrame
print("Missing values in the DataFrame:\n", df.isnull().sum())

# number of rows in the DataFrame
print("Number of rows in the DataFrame:", len(df))

NameError: name 'final_db_path' is not defined

In [2]:
import sqlite3

# Connect to the database
conn = sqlite3.connect('articlesWSJ_cleaned.db')
cursor = conn.cursor()

# Query to get all table names
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# Print the table names
for table in tables:
    print(table[0])

# Close the connection
conn.close()

articles_index_cleaned
articles_index
article
