In [None]:
import polars as pl
from datetime import date
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Path (relative to the notebook) of the parquet file that every
# pl.scan_parquet(FILE) call in this notebook reads.
# NOTE(review): the file name suggests rows with null articles were already
# removed upstream - confirm against the filtering step that produced it.
FILE = "../data/filtered/04_no_null_articles.parquet"

In [None]:
# Lazily scan the parquet file, coerce "date" to a calendar date, and build
# one row per day holding the counts of the url, title and article columns,
# ordered chronologically.
lf = (
    pl.scan_parquet(FILE)
    .with_columns(pl.col("date").cast(pl.Date))
    .group_by("date")
    .agg(
        num_urls=pl.col("url").count(),
        num_titles=pl.col("title").count(),
        num_articles=pl.col("article").count(),
    )
    .sort("date")
)

# Execute the lazy query; the bare name on the last line displays the result.
date_agg_df = lf.collect()
date_agg_df

In [None]:
# Line plot of the number of URLs per day over the whole date range.
# The size is handed to plt.subplots itself: a separate
# plt.figure(figsize=...) call only opens an extra, empty figure while the
# axes created by plt.subplots() keep the default size.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(date_agg_df["date"].to_numpy(), date_agg_df["num_urls"].to_numpy())
ax.set_title("URLs per day")
ax.set_xlabel("Date")
ax.set_ylabel("Number of URLs")
plt.show()

In [None]:
def filter_outliers(
    df: pl.DataFrame, col: str, iqr_multiplier: float = 1.5
) -> pl.DataFrame:
    """Drop the rows whose ``col`` value is a high outlier.

    The upper bound is ``Q3 + iqr_multiplier * (Q3 - Q1)``, where both
    quartiles are computed with "nearest" interpolation. Only the upper side
    is filtered; unusually low values are kept.

    Args:
        df: Frame to filter.
        col: Name of the numeric column to inspect.
        iqr_multiplier: Number of interquartile ranges above Q3 at which the
            upper bound is placed. Defaults to 1.5.

    Returns:
        The rows of ``df`` whose ``col`` value is at or below the upper
        bound. If the quartiles cannot be computed (``quantile`` returns
        ``None``, e.g. for an empty column), ``df`` is returned unchanged
        because no row can be classified as an outlier.
    """
    q1 = df[col].quantile(0.25, "nearest")
    q3 = df[col].quantile(0.75, "nearest")

    # quantile() yields None when there is nothing to rank; doing arithmetic
    # on it would raise a TypeError, so stop before computing the bound.
    if q1 is None or q3 is None:
        return df

    iqr = q3 - q1
    upper_bound = q3 + iqr_multiplier * iqr

    # Keep everything at or below the bound (high outliers are removed).
    return df.filter(pl.col(col) <= upper_bound)

# Daily URL counts with the high outliers removed.
no_outliers_df = filter_outliers(date_agg_df, "num_urls")

# The size is handed to plt.subplots itself: a separate
# plt.figure(figsize=...) call only opens an extra, empty figure while the
# axes created by plt.subplots() keep the default size.
fig, ax = plt.subplots(figsize=(14, 5))
ax.scatter(no_outliers_df["date"].to_numpy(), no_outliers_df["num_urls"].to_numpy())
ax.set_title("URLs per day (high outliers removed)")
ax.set_xlabel("Date")
ax.set_ylabel("Number of URLs")
plt.show()

In [None]:
# Daily article counts with the high outliers removed.
no_outliers_df = filter_outliers(date_agg_df, "num_articles")

# The size is handed to plt.subplots itself: a separate
# plt.figure(figsize=...) call only opens an extra, empty figure while the
# axes created by plt.subplots() keep the default size.
fig, ax = plt.subplots(figsize=(14, 5))
ax.scatter(no_outliers_df["date"].to_numpy(), no_outliers_df["num_articles"].to_numpy())
ax.set_title("Articles per day (high outliers removed)")
ax.set_xlabel("Date")
ax.set_ylabel("Number of articles")
plt.show()

In [None]:
# Restrict the daily counts to the days before 2010-01-01.
before2010 = date_agg_df.filter(pl.col("date") < date(2010, 1, 1))

fig, ax = plt.subplots()
ax.scatter(before2010["date"].to_numpy(), before2010["num_articles"].to_numpy())

# Major ticks every July and December, labelled as year-month.
months = mdates.MonthLocator(bymonth=[7, 12])
months_fmt = mdates.DateFormatter("%Y-%m")
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(months_fmt)

# Rotate the labels through the Axes, after the data is drawn, so the date
# locator is never evaluated against an axis that has no dates on it yet.
ax.tick_params(axis="x", labelrotation=45)

ax.set_title("Articles per day before 2010")
ax.set_xlabel("Date")
ax.set_ylabel("Number of articles")
plt.show()

In [None]:
# Pre-2010 daily article counts with the high outliers removed.
before2010_no_outliers = filter_outliers(before2010, "num_articles")

fig, ax = plt.subplots()
ax.scatter(before2010_no_outliers["date"].to_numpy(), before2010_no_outliers["num_articles"].to_numpy())

# Major ticks every July and December, labelled as year-month.
months = mdates.MonthLocator(bymonth=[7, 12])
months_fmt = mdates.DateFormatter("%Y-%m")
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(months_fmt)

# Rotate the labels through the Axes, after the data is drawn, so the date
# locator is never evaluated against an axis that has no dates on it yet.
ax.tick_params(axis="x", labelrotation=45)

ax.set_title("Articles per day before 2010 (high outliers removed)")
ax.set_xlabel("Date")
ax.set_ylabel("Number of articles")
plt.show()

In [None]:
# Monthly counts before 2010: truncate every date to its month, keep the
# months before 2010-01-01, then count url / title / article per month and
# order the result chronologically.
lf = (
    pl.scan_parquet(FILE)
    .with_columns(pl.col("date").cast(pl.Date).dt.truncate("1mo"))
    .filter(pl.col("date") < date(2010, 1, 1))
    .group_by("date")
    .agg(
        num_urls=pl.col("url").count(),
        num_titles=pl.col("title").count(),
        num_articles=pl.col("article").count(),
    )
    .sort("date")
)

# Preview the first rows of the (still lazy) query.
lf.head().collect()

In [None]:
# Execute the lazy query held in `lf` and keep the full result in memory.
df = lf.collect()

In [None]:
fig, ax = plt.subplots()
ax.plot(df["date"].to_numpy(), df["num_articles"].to_numpy())

# Major ticks every July and December, labelled as year-month.
months = mdates.MonthLocator(bymonth=[7, 12])
months_fmt = mdates.DateFormatter("%Y-%m")
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(months_fmt)

# Rotate the labels through the Axes, after the data is drawn, so the date
# locator is never evaluated against an axis that has no dates on it yet.
ax.tick_params(axis="x", labelrotation=45)

ax.set_title("Articles per month before 2010")
ax.set_xlabel("Month")
ax.set_ylabel("Number of articles")
plt.show()

In [None]:
# Article counts in `df` with the high outliers removed.
no_outliers_df = filter_outliers(df, "num_articles")

fig, ax = plt.subplots()
ax.plot(no_outliers_df["date"].to_numpy(), no_outliers_df["num_articles"].to_numpy())

# Major ticks every July and December, labelled as year-month.
months = mdates.MonthLocator(bymonth=[7, 12])
months_fmt = mdates.DateFormatter("%Y-%m")
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(months_fmt)

# Rotate the labels through the Axes, after the data is drawn, so the date
# locator is never evaluated against an axis that has no dates on it yet.
ax.tick_params(axis="x", labelrotation=45)

ax.set_title("Articles per month before 2010 (high outliers removed)")
ax.set_xlabel("Month")
ax.set_ylabel("Number of articles")
plt.show()

# Summary Statistics

This section computes summary statistics for the dataset: the date range
it covers and the number of documents, tokens, and sentences.

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Tokenizer data needed by word_tokenize / sent_tokenize. Recent NLTK
# releases (3.9+) load the "punkt_tab" resource instead of the pickled
# "punkt" models, so fetch both to work with either generation of the library.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
# Start again from a fresh lazy scan of the source file; this replaces
# whatever query `lf` was bound to before.
lf = pl.scan_parquet(FILE)

In [None]:
# Smallest and largest value of the "date" column.
lf.select(
    start_date=pl.col("date").min(),
    end_date=pl.col("date").max(),
).collect()

In [None]:
# Number of documents, taken as the count of the "article" column.
lf.select(pl.col("article").count()).collect()

In [None]:
def count_tokens(s: str) -> int:
    """Return how many tokens NLTK's ``word_tokenize`` produces for ``s``."""
    return len(word_tokenize(s))

def count_sentences(s: str) -> int:
    """Return how many sentences NLTK's ``sent_tokenize`` finds in ``s``."""
    return len(sent_tokenize(s))

In [None]:
# Materialise only the "article" column; the token and sentence counts
# below are computed from this eager frame.
df = lf.select("article").collect()

In [None]:
# Run count_tokens on every article and store the result as an Int64
# column named "n_tokens".
df = df.with_columns(
    n_tokens=pl.col("article").map_elements(count_tokens, return_dtype=pl.Int64)
)

In [None]:
# Run count_sentences on every article and store the result as an Int64
# column named "n_sentences".
df = df.with_columns(
    n_sentences=pl.col("article").map_elements(count_sentences, return_dtype=pl.Int64)
)

In [None]:
# Save the frame (articles plus the n_tokens / n_sentences columns) to disk
# so the counts can be read back without re-running the tokenization cells.
df.write_parquet("../data/processed/tokenized_news.parquet")

In [None]:
# Reload the saved token / sentence counts from disk and display the frame.
df = pl.read_parquet("../data/processed/tokenized_news.parquet")
df

In [None]:
# Mean number of tokens and sentences per article.
mean_tokens = df["n_tokens"].mean()
mean_sentences = df["n_sentences"].mean()
print("Average tokens:", mean_tokens)
print("Average sentences:", mean_sentences)

In [None]:
# Token and sentence counts summed over all articles.
total_tokens = df["n_tokens"].sum()
total_sentences = df["n_sentences"].sum()
print("Total tokens:", total_tokens)
print("Total sentences:", total_sentences)