In [None]:
import polars as pl
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# word_tokenize / sent_tokenize need the Punkt models. NLTK 3.9+ loads them
# from the "punkt_tab" resource, while older releases use the pickled "punkt"
# one, so fetch both to keep the tokenizers working on either version.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

In [None]:
# Read the processed communications CSV into a Polars DataFrame and show it.
communications_csv = "../data/processed/communications.csv"
df = pl.read_csv(communications_csv)
df

In [None]:
def count_tokens(s: str) -> int:
    """Return how many tokens NLTK's ``word_tokenize`` produces for ``s``."""
    return len(word_tokenize(s))

def count_sentences(s: str) -> int:
    """Return how many sentences NLTK's ``sent_tokenize`` finds in ``s``."""
    return len(sent_tokenize(s))

In [None]:
# Add an "n_tokens" column: count_tokens is applied to each "Text" value
# through a Python-level callback (map_elements).
n_tokens_expr = pl.col("Text").map_elements(count_tokens, return_dtype=pl.Int64)
df = df.with_columns(n_tokens=n_tokens_expr)

In [None]:
# Add an "n_sentences" column: count_sentences is applied to each "Text" value
# through a Python-level callback (map_elements).
n_sentences_expr = pl.col("Text").map_elements(count_sentences, return_dtype=pl.Int64)
df = df.with_columns(n_sentences=n_sentences_expr)

In [None]:
# Mean token / sentence count per row of the communications table.
avg_tokens = df.get_column("n_tokens").mean()
avg_sentences = df.get_column("n_sentences").mean()
print("Average tokens:", avg_tokens)
print("Average sentences:", avg_sentences)

In [None]:
# Token / sentence counts summed over all rows of the communications table.
total_tokens = df.get_column("n_tokens").sum()
total_sentences = df.get_column("n_sentences").sum()
print("Total tokens:", total_tokens)
print("Total sentences:", total_sentences)

# Publishers

In [None]:
import tldextract

# Lazily scan the filtered articles file and materialise only the "url" column.
# NOTE: this rebinds `df`, replacing the communications frame used above.
lf = pl.scan_parquet("../data/filtered/04_no_null_articles.parquet")
url_only = lf.select("url")
df = url_only.collect()
df

In [None]:
def extract_domain(url: str) -> str | None:
    """Return the ``"<domain>.<suffix>"`` part of ``url`` as split by tldextract.

    Returns ``None`` when ``url`` is empty/falsy or when tldextract finds no
    domain or no suffix in it.
    """
    if not url:
        return None

    parts = tldextract.extract(url)
    has_domain_and_suffix = bool(parts.domain) and bool(parts.suffix)
    return f"{parts.domain}.{parts.suffix}" if has_domain_and_suffix else None

In [None]:
# Derive a "domain" column by running extract_domain over every "url" value.
domain_expr = pl.col("url").map_elements(extract_domain, return_dtype=pl.String)
df = df.with_columns(domain=domain_expr)
df

In [None]:
# Number of articles per extracted domain, most frequent first.
# `len()` counts every row in a group, whereas `count()` (in current Polars)
# skips null URLs. Rows without a URL get no domain from `extract_domain` and
# land in the null-domain group, which `count()` would under-report; groups
# with a real domain always have non-null URLs and are unaffected.
domain_counts = (
    df
    .group_by("domain")
    .agg(pl.col("url").len().alias("n_articles"))
    .sort("n_articles", descending=True)
)
domain_counts

In [None]:
# Re-scan the filtered articles file, keeping only the rows whose "url" is null,
# and materialise them (all columns) into `df`.
url_is_missing = pl.col("url").is_null()
lf = pl.scan_parquet("../data/filtered/04_no_null_articles.parquet").filter(url_is_missing)
df = lf.collect()

In [None]:
# For the rows without a URL, run extract_domain over the "article" column and
# list the distinct results.
# NOTE(review): extract_domain is written for URLs; presumably "article" is
# being probed for URL-like content here — confirm this is intentional.
article_domain_expr = pl.col("article").map_elements(extract_domain, return_dtype=pl.String)
df = df.with_columns(domain=article_domain_expr)
df.select("domain").unique()