In [None]:
import sqlite3
import os   
import pandas as pd

# import text_cleaner
from text_cleaner_WSJ import clean_article_text

In [None]:
# Paths are resolved from the current working directory, which this notebook
# treats as the repository root.
repo_root = os.getcwd()
wsj_base = os.path.join(repo_root, "WSJ")

# Year to process; it must be one of the keys of db_filenames below.
year = 2024

# Source database file name for each supported year.
db_filenames = {
    2023: "articlesWSJ_2023.db",
    2024: "articlesWSJ_2024.db",
}

# Stop early when the chosen year has no database entry.
if year not in db_filenames:
    raise ValueError(f"Unsupported year: {year}. Supported years: {list(db_filenames.keys())}")

# Layout on disk: <repo_root>/WSJ/<year>/WSJ_DB/<database file>
db_dir = os.path.join(wsj_base, str(year), "WSJ_DB")
db_path = os.path.join(db_dir, db_filenames[year])

# sqlite3.connect would silently create a new, empty database for a missing
# path, so the file's existence is checked explicitly first.
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database not found at: {db_path}")

print(f"Connecting to database (relative path): {os.path.relpath(db_path, repo_root)}")
conn = sqlite3.connect(db_path)

In [None]:
# Load every row of the `article` table into a DataFrame.
try:
    df = pd.read_sql_query("SELECT * FROM article;", conn)
finally:
    # Close the connection even when the query fails, so the database file
    # handle opened earlier is never leaked.
    conn.close()
print(f"Loaded {len(df)} articles.")

In [None]:
# Report every row whose corpus text appears more than once
# (keep=False marks all copies, including the first occurrence).
dup_mask = df.duplicated(subset=['corpus'], keep=False)
duplicates = df.loc[dup_mask]
print("Duplicated corpus rows:\n", duplicates[['corpus', 'article_id']].head(10))
print("Number of duplicated corpus rows:", len(duplicates))

# Keep only the first row for each distinct corpus text.
df = df.drop_duplicates(subset=['corpus'], keep='first')

# Repeat the check on the deduplicated frame; the printed frame should be empty.
dup_mask_after = df.duplicated(subset=['corpus'], keep=False)
duplicates_after = df.loc[dup_mask_after]
print("Duplicated corpus rows after dropping duplicates:\n", duplicates_after[['corpus', 'article_id']].head(10))

In [None]:
# Merge year, month and day into a single datetime column.
# pd.to_datetime assembles dates directly from a frame whose columns are named
# year / month / day, so no per-row string joining is needed and numeric or
# numeric-string components are both accepted.
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

# Stay fail-fast: a missing date component must not silently become NaT.
if df['date'].isna().any():
    raise ValueError(
        f"Could not build a date for {int(df['date'].isna().sum())} row(s); "
        "check the year/month/day columns for missing values."
    )

# verify the new date column
print("Date column:\n", df[['year', 'month', 'day', 'date']].head(10))
print("Date column data type:", df['date'].dtype)

# The separate components are redundant once `date` exists.
df = df.drop(columns=['year', 'month', 'day'])

In [None]:
# Run the imported WSJ cleaner on each article's corpus text and store the
# result next to the raw text in a new column.
df['cleaned_corpus'] = df['corpus'].apply(clean_article_text)

# Show the first few cleaned texts as a quick sanity check.
print("Sample cleaned corpus:\n", df['cleaned_corpus'].head(10))

In [None]:
# Load the full `article_index` table from the source database.
conn = sqlite3.connect(db_path)
try:
    df_index = pd.read_sql_query("SELECT * FROM article_index;", conn)
finally:
    # Close the connection even if the query raises.
    conn.close()

# verify the loaded index
print("Loaded article_index with columns:\n", df_index.columns)

In [None]:
# Output database file name for each supported year.
clean_filenames = {
    2023: "articlesWSJ_clean_final_2023.db",
    2024: "articlesWSJ_clean_final_2024.db"
}

# The cleaned database is written next to the source database for that year.
clean_db_path = os.path.join(wsj_base, str(year), "WSJ_DB", clean_filenames[year])

print(f"Saving cleaned data to (relative path): {os.path.relpath(clean_db_path, repo_root)}")

# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does not close the connection. Close it explicitly so
# the file handle is released even when a write fails.
clean_conn = sqlite3.connect(clean_db_path)
try:
    with clean_conn:
        # if_exists="replace" drops and recreates each table, so re-running
        # this cell overwrites the previous output instead of appending.
        df.to_sql("article", clean_conn, if_exists="replace", index=False)
        # NOTE(review): the index is read from table "article_index" but is
        # written here as "articles_index" — confirm downstream readers
        # expect the plural name before changing it.
        df_index.to_sql("articles_index", clean_conn, if_exists="replace", index=False)
finally:
    clean_conn.close()
print("✅ Cleaned data saved successfully.")