This notebook's purpose is to serve as a first look into the FNSPID dataset.

In [None]:
import polars as pl

# Location of the raw CSV, relative to this notebook.
# NOTE(review): "exteral" looks like a typo, but presumably mirrors the file
# name on disk -- confirm before renaming.
FILE = "../data/raw/nasdaq_exteral_data.csv"

# The dataset is too large to read eagerly, so build a lazy scan instead;
# nothing is loaded until a later cell calls .collect().
lf = pl.scan_csv(source=FILE)

In [None]:
# Materialize only the first five rows for a quick look at the columns.
df = lf.head(5).collect()
df

In [None]:
# Same preview as before, widened to the first hundred rows.
df = lf.head(100).collect()
df

In [None]:
# Collect just the first row and show its "Url" value in full
# (the table rendering may shorten long strings).
df = lf.head(1).collect()
df.item(0, "Url")

In [None]:
from functools import reduce

# Substrings that identify the sources to leave out of the sample.
exclude = ["nasdaq.com", "benzinga", "lenta.ru"]

# One literal (non-regex) substring test on "Url" per excluded source ...
url_matches = [
    pl.col("Url").str.contains(needle, literal=True) for needle in exclude
]
# ... OR-ed together, left to right, into a single "is excluded" predicate.
condition = reduce(lambda left, right: left | right, url_matches)

# First hundred rows whose "Url" matches none of the excluded sources.
sample = lf.filter(~condition).limit(100).collect()
sample

In [None]:
# Capture whatever sits between the scheme (plus an optional "www.") and the
# next "/" in each URL, and expose it as a "domain" column.
domain_expr = (
    pl.col("Url")
    .str.extract(r"https?://(?:www\.)?([^/]+)", group_index=1)
    .alias("domain")
)

# Distinct captured values across the whole scan.
domains = lf.select(domain_expr).unique().collect()

print(domains)

In [None]:
# Print every distinct domain on its own line, since printing the frame
# itself may not show all rows.
for domain_name in domains.get_column("domain"):
    print(domain_name)

In [None]:
# Remove the " UTC" text from "Date" so the remainder matches the format
# string, then parse it into a Datetime column.
parsed_date_expr = (
    pl.col("Date")
    .str.replace(" UTC", "", literal=True)
    .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
)
lf = lf.with_columns(parsed_date_expr.alias("parsed_date"))

# Keep only rows dated 2000-01-01 00:00:00 or later.
cutoff = pl.datetime(2000, 1, 1, 0, 0, 0)
filtered_lf = lf.filter(pl.col("parsed_date") >= cutoff)

# Ascending sort puts the oldest rows first; take the ten earliest.
# Despite its name, sorted_lf is the collected (eager) result.
sorted_lf = filtered_lf.sort("parsed_date").limit(10).collect()

print(sorted_lf)