In [None]:
import polars as pl
import pathlib
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt

# Run-specific locations: one output directory per corpus, plus the folder
# that receives the exported figures.
NEWS_OUTPUT_PATH = "../output/stm_news_2025-06-14_14-11-59_topics_13"
FED_OUTPUT_PATH = "../output/stm_fed_14_2025-06-13_10-32-17"
FIGURE_DIR = "../figures"

# Path objects so later cells can join file names with the `/` operator.
news_path, fed_path, fig_path = (
    pathlib.Path(location)
    for location in (NEWS_OUTPUT_PATH, FED_OUTPUT_PATH, FIGURE_DIR)
)

In [2]:
# Human-readable names for the Fed topics. Order matters: labels are attached
# to embedding rows by position, so entry i names row i of the topic matrix.
FED_TOPICS = [
    # policy mechanics and leadership eras
    "Monetary Policy Implementation",
    "Foreign Exchange",
    "Bernanke Admin",
    "JPowell Admin",
    # pandemic period
    "Coronavirus",
    "Coronavirus + Vaccine",
    "Economic Recovery",
    "Strong Economy",
    # financial crisis
    "Great Financial Crisis",
    "Great Financial Crisis & TALF",
    # remaining topics
    "Market Operations",
    "Tight Economy",
    "Construction & Real Estate",
    "Fed Forecasts",
]

# Human-readable names for the news topics. Order matters: labels are attached
# to embedding rows by position, so entry i names row i of the topic matrix.
NEWS_TOPICS = [
    "Energy & Dividend Stocks",
    "Consumer-facing Companies",
    "Economic Commentary and Predictions",
    "Investment Strategies",
    "Trading Platforms & Technical Indicators",
    "Leveraged Instruments & ETFs",  # was misspelled "LEveraged"; this text appears in figures and CSVs
    "International Politics",
    "Press Releases",
    "Transcripts & CEO Commentary",
    "Stock Options & Derivatives",
    "Biotech",
    "Earnings Season & Analyst Forecasts",
    "Big Tech & AI",
]

In [8]:
def generate_embeddings(
    beta: pl.DataFrame,
    topic_labels: list[str],
    model: str,
    perplexity: float = 5,
    random_state: int = 42,
) -> pl.DataFrame:
    """Reduce each row of ``beta`` to two dimensions and label it.

    Args:
        beta: Numeric frame with one row per item to embed. Every row gets
            one entry of ``topic_labels``, so rows are expected to be topics.
        topic_labels: Labels assigned to the rows of ``beta`` by position;
            must contain exactly one label per row.
        model: ``"pca"`` or ``"tsne"``.
        perplexity: Passed to ``TSNE`` (ignored for PCA). Defaults to the
            value previously hard-coded here.
        random_state: Seed forwarded to the estimator so repeated runs yield
            the same coordinates. Pass ``None`` for unseeded behaviour.

    Returns:
        A frame with columns ``topic``, ``x`` and ``y``, one row per row of
        ``beta``.

    Raises:
        ValueError: If ``model`` is not a supported name, or if the number of
            labels differs from the number of rows in ``beta``.
    """
    if model == "pca":
        reducer = PCA(n_components=2, random_state=random_state)
    elif model == "tsne":
        reducer = TSNE(
            n_components=2,
            perplexity=perplexity,
            random_state=random_state,
        )
    else:
        raise ValueError(
            f"You must select a model: expected 'pca' or 'tsne', got {model!r}"
        )

    # Fail before the (potentially slow) fit rather than after it.
    if len(topic_labels) != beta.height:
        raise ValueError(
            f"Got {len(topic_labels)} topic labels for {beta.height} rows; "
            "exactly one label per row is required"
        )

    coords = reducer.fit_transform(beta.to_numpy())

    # Build the polars frame directly instead of detouring through pandas.
    return pl.DataFrame(
        {
            "topic": topic_labels,
            "x": coords[:, 0],
            "y": coords[:, 1],
        }
    )

# Load the stored topic distribution of each corpus.
fed_beta = pl.read_parquet(fed_path / "topic_dist.parquet")
news_beta = pl.read_parquet(news_path / "topic_dist.parquet")

# Two projections per corpus, computed in the order PCA then t-SNE.
fed_pca_df, fed_tsne_df = (
    generate_embeddings(fed_beta, topic_labels=FED_TOPICS, model=kind)
    for kind in ("pca", "tsne")
)
news_pca_df, news_tsne_df = (
    generate_embeddings(news_beta, topic_labels=NEWS_TOPICS, model=kind)
    for kind in ("pca", "tsne")
)

# Persist every embedding next to the model output it was derived from.
for embedding_df, csv_path in (
    (fed_pca_df, fed_path / "pca_embeddings.csv"),
    (fed_tsne_df, fed_path / "tsne_embeddings.csv"),
    (news_pca_df, news_path / "pca_embeddings.csv"),
    (news_tsne_df, news_path / "tsne_embeddings.csv"),
):
    embedding_df.write_csv(csv_path)

In [9]:
def plot_embeddings(df: pl.DataFrame) -> alt.LayerChart:
    """Draw a labelled 2-D scatter of topic embeddings.

    Args:
        df: Frame with columns ``topic``, ``x`` and ``y``.

    Returns:
        A layered chart: coloured points (no legend, topic shown as tooltip)
        with each topic's name written to the right of its point.
    """
    # Both layers plot the same coordinates, so the positional encoding is
    # declared once and reused.
    base = alt.Chart(df).encode(x='x', y='y')

    points = (
        base.mark_circle(size=100)
        .encode(
            tooltip=['topic'],
            color=alt.Color('topic', legend=None),
        )
        .properties(width=600, height=500)
        .interactive()
    )

    # dx nudges each label to the right so it does not sit on top of its point.
    labels = base.mark_text(
        align='left',
        baseline='middle',
        dx=7,
    ).encode(text='topic')

    # clip=False lets labels extend beyond the plotting area.
    return alt.layer(points, labels).configure_view(clip=False)

# Saving fails if the target folder is missing, so create it up front
# (a no-op when it already exists).
fig_path.mkdir(parents=True, exist_ok=True)

# One SVG per (corpus, projection) pair, written in the same order as before.
for figure_name, embedding_df in (
    ("fed_tsne.svg", fed_tsne_df),
    ("fed_pca.svg", fed_pca_df),
    ("news_pca.svg", news_pca_df),
    ("news_tsne.svg", news_tsne_df),
):
    chart = plot_embeddings(embedding_df)
    chart.save(fig_path / figure_name)