In [None]:
from dotenv import dotenv_values
import polars as pl
import pathlib
from datetime import date
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# ISO date string used as the inclusive lower bound when filtering rows by date.
EARLIEST = "1999-12-21"
# Path of the parquet file holding the rows that remain after the Cyrillic-title filter.
NO_RUSSIANS = "../data/filtered/03_no_russians.parquet"

In [None]:
# Read project settings from the .env file one directory up; DATA_DIR is
# resolved relative to that parent directory.
config: dict[str, str] = dotenv_values("../.env")
data_dir = pathlib.Path("../" + config["DATA_DIR"])

# Lazily scan the raw CSV and add a `parsed_date` datetime column built from
# the textual `Date` column with its " UTC" marker stripped.
raw_csv_path = data_dir / "raw/nasdaq_exteral_data.csv"
date_without_utc = pl.col("Date").str.replace(" UTC", "")
lf = pl.scan_csv(raw_csv_path).with_columns(
    date_without_utc
    .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
    .alias("parsed_date")
)

In [None]:
# Materialise only the first few rows to eyeball the raw columns and the new `parsed_date`.
lf.head().collect()

In [None]:
# Mapping from the raw CSV column names to the short names used downstream.
rename_cols = {
    "parsed_date": "date",
    "Article_title": "title",
    "Article": "article",
    "Url": "url",
    "Publisher": "publisher",
    "Author": "author",
}

# Expose the CSV's unnamed index column as an integer `row_number`;
# strict=False turns values that cannot be cast into nulls instead of raising.
row_number_expr = pl.col("Unnamed: 0").cast(pl.Int64, strict=False)
lf = lf.with_columns(row_number_expr.alias("row_number"))

# Keep `row_number` plus exactly the columns being renamed (same order as the
# mapping's keys), then apply the renames.
kept_columns = ["row_number", *rename_cols]
select_columns_lf = lf.select(kept_columns).rename(rename_cols)
select_columns_lf.head().collect()

In [None]:
# Execute the lazy query and write the trimmed, renamed columns to the stage-1 parquet file.
select_columns_lf.sink_parquet("../data/filtered/01_relevant_cols.parquet")

In [None]:
# Reload stage 1 and keep only rows dated on or after EARLIEST (midnight of that day).
lf = pl.scan_parquet("../data/filtered/01_relevant_cols.parquet")
limit_date = date.fromisoformat(EARLIEST)
earliest_midnight = pl.datetime(
    limit_date.year, limit_date.month, limit_date.day, 0, 0, 0
)
date_filtered_lf = lf.filter(pl.col("date") >= earliest_midnight)

# Chronologically ordered view, used here only to preview the oldest rows.
sorted_lf = date_filtered_lf.sort("date")
sorted_lf.head().collect()

In [None]:
# Execute the date filter and write the result to the stage-2 parquet file.
# NOTE(review): this sinks `date_filtered_lf`, not `sorted_lf`, so no sort is
# applied to the written file — confirm that is intended.
date_filtered_lf.sink_parquet("../data/filtered/02_1999-2023.parquet")

In [None]:
# Reload stage 2 and pull the `author` column into memory for inspection.
lf = pl.scan_parquet("../data/filtered/02_1999-2023.parquet")
author_only_lf = lf.select("author")
authors = author_only_lf.collect()
authors

In [None]:
# Collect the `publisher` column and reduce it to the set of distinct values.
publishers = set(lf.select("publisher").collect().get_column("publisher"))
publishers

In [None]:
# NOTE(review): no cell visible in this notebook writes 03_english_only.parquet;
# the stage-3 file written further down is 03_no_russians.parquet — confirm
# which file this cell is meant to read.
english_lf = pl.scan_parquet("../data/filtered/03_english_only.parquet")

# Add a lowercased copy of the title so later substring searches ignore case.
lowercase_title_expr = pl.col("title").str.to_lowercase().alias("lowercase_title")
lf = english_lf.with_columns(lowercase_title_expr)
lf.head().collect()

In [None]:
# Rows whose lowercased title contains the phrase "federal reserve".
title_mentions_fed = pl.col("lowercase_title").str.contains("federal reserve")
mentions_fed_lf = lf.filter(title_mentions_fed)

In [None]:
def mentions_count(lf: pl.LazyFrame, s: str) -> pl.DataFrame:
    """Count the rows of ``lf`` whose ``lowercase_title`` matches ``s``.

    Args:
        lf: Lazy frame that has a ``lowercase_title`` string column.
        s: Pattern handed to ``str.contains``; it is matched against the
            lowercased titles, so it should itself be lowercase.

    Returns:
        The collected result of ``LazyFrame.count()`` on the matching rows.
    """
    title_matches = pl.col("lowercase_title").str.contains(s)
    matching_rows = lf.filter(title_matches)
    return matching_rows.count().collect()


mentions_count(lf, "inflation")

In [None]:
# Keep only rows that actually carry an article body, then count what is left.
article_present = pl.col("article").is_not_null()
not_null_article_lf = lf.filter(article_present)
not_null_article_lf.count().collect()

In [None]:
# List every distinct publisher in the stage-2 data: first the count, then one name per line.
lf = pl.scan_parquet("../data/filtered/02_1999-2023.parquet")
unique_publishers = lf.select("publisher").unique().collect()
print(unique_publishers.height)
for p in unique_publishers.get_column("publisher"):
    print(p)

In [None]:
# Same listing for authors: number of distinct values, then each value on its own line.
unique_authors = lf.select("author").unique().collect()
print(unique_authors.height)
for a in unique_authors.get_column("author"):
    print(a)

In [None]:
# Matches any single character in the Unicode range U+0400–U+04FF.
cyrillic_pattern = re.compile(r'[\u0400-\u04FF]')


def contains_cyrillic(s: str) -> bool:
    """Return True when ``s`` has at least one character in U+0400–U+04FF.

    Falsy input (``None`` or the empty string) yields False.
    """
    if not s:
        return False
    return cyrillic_pattern.search(s) is not None

In [None]:
# Drop every row whose title contains a character matched by `cyrillic_pattern`
# and write the remainder to the NO_RUSSIANS parquet file.
lf = pl.scan_parquet("../data/filtered/02_1999-2023.parquet")

# Use polars' native regex matching with the same pattern instead of calling a
# Python function once per row through `map_elements`; the native expression
# runs inside the query engine and avoids the per-row Python overhead.
has_cyrillic_title = pl.col("title").str.contains(cyrillic_pattern.pattern)

# NOTE: a null title yields a null predicate, and `filter` only keeps rows
# where the predicate is True, so rows with a null title are dropped as well.
no_russians_lf = lf.filter(~has_cyrillic_title)

# NO_RUSSIANS holds the same path that was previously spelled out here.
no_russians_lf.sink_parquet(NO_RUSSIANS)

In [None]:
# Print the counts before and after the Cyrillic-title filter, in that order.
for parquet_path in (
    "../data/filtered/02_1999-2023.parquet",
    "../data/filtered/03_no_russians.parquet",
):
    lf = pl.scan_parquet(parquet_path)
    print(lf.count().collect())

In [None]:
# Re-list the distinct publishers, this time from the Cyrillic-filtered data:
# the number of distinct values first, then one publisher per line.
lf = pl.scan_parquet("../data/filtered/03_no_russians.parquet")
unique_publishers = lf.select("publisher").unique().collect()
print(unique_publishers.height)
for p in unique_publishers.get_column("publisher"):
    print(p)

In [None]:
# Stage 4: starting from the Cyrillic-filtered file, keep only rows that have
# an article body and write them to their own parquet file.
lf = pl.scan_parquet(NO_RUSSIANS)
article_present = pl.col("article").is_not_null()
no_null_articles_lf = lf.filter(article_present)
no_null_articles_lf.sink_parquet("../data/filtered/04_no_null_articles.parquet")