In [None]:
import datetime
from importlib import reload
from itertools import product
from pathlib import Path

import numpy as np
import polars as pl
from numpy.typing import NDArray

from src import data_util
from src import evt_embeddings as ee
from src import plotlyplot as pp
from src import util_agg as util

# Re-import the local helper modules so that edits made on disk are picked up
# without restarting the kernel.
for _mod in (pp, util, data_util):
    reload(_mod)

pp.set_plotly_template()


# Read the data configuration, then the event table plus its event types.
data_conf = data_util.load_config()
print(data_conf)
df, evt_types = data_util.load_data(data_conf)

# Counts keyed by the values of "type_name" (presumably ordered by frequency,
# since later cells take the first entries as "most common" -- confirm in
# data_util.unique_counts).
evt_counts = data_util.unique_counts(df, col="type_name")
print(evt_counts)
display(df.tail())


In [None]:
# Distinct dates present in the raw event table, in ascending order.
# `Series.unique()` alone does not promise any particular order, and these
# dates are later stored as row labels next to the day embeddings (which follow
# the date-sorted df_agg), so sort explicitly.
all_dates: list[datetime.date] = df["date"].unique().sort().to_list()
print(f"{len(all_dates) = } | {all_dates = }")


In [None]:
# Per-day aggregation: yields the aggregate frame and a companion "top-k" frame.
df_agg, df_topk = util.agg_duration_per_day(df)
display(df_agg.head(3), df_topk.head(3))

# Persist the aggregate so it can be reused outside this notebook; the target
# folder is created on first use.
fp = Path("tmp_data/agg_day.parquet")
fp.parent.mkdir(exist_ok=True)
df_agg.write_parquet(fp)


In [None]:
reload(ee)

# (input transforms, dimensionality reductions); every transform is paired
# with every reduction below.
variants = (["std", "tfidf"], ["pca", "umap"])
_combos = list(product(*variants))

# Pipelines used for the day embeddings, keyed "<transform>_<reduction>".
pipes_day = dict(
    (f"{inp_trans}_{red}", ee.make_embedding_pipe(inp_trans, red))  # pyright: ignore[reportArgumentType]
    for inp_trans, red in _combos
)
# Same combinations with `transp_between=True`, used for the event embeddings
# (presumably transposes the data between the two steps -- confirm in
# ee.make_embedding_pipe).
pipes_evt = dict(
    (f"{inp_trans}_{red}", ee.make_embedding_pipe(inp_trans, red, transp_between=True))  # pyright: ignore[reportArgumentType]
    for inp_trans, red in _combos
)

# One pipeline name per line.
print("\n".join(pipes_evt))

## Day embeddings


In [None]:
# Sanity check: the per-day aggregate is expected to be ordered by date.
assert df_agg["date"].is_sorted()
# Feature matrix: every column of the aggregate except "date".
X = df_agg.drop("date").to_numpy()

# Text shown per point: "<date>: <top-k entries joined by ', '>".
# It is the same for every pipeline, so it is built once up front.
_day_labels = [
    str(d) for d in df_agg["date"] + ": " + df_topk["topk"].list.join(", ")
]

# Fit each day pipeline on X, keep the result and plot it.
day_embs: dict[str, NDArray] = {}
for name, pipe in pipes_day.items():
    emb = pipe.fit_transform(X)
    day_embs[name] = emb
    fig = pp.scatter_embs(emb, _day_labels)
    fig.update_layout(title=f"Day embedding ({name})").show()

<!-- ## Event embeddings -->


## Event embeddings


In [None]:
evt_embs: dict[str, NDArray] = {}

# Point labels: all column names of the aggregate after the first one ("date").
_evt_labels = df_agg.columns[1:]

# Fit each event pipeline on the same matrix X, report the shape and plot it.
for name, pipe in pipes_evt.items():
    emb = pipe.fit_transform(X)
    evt_embs[name] = emb
    print(f"{name}. embs:{emb.shape}")

    fig = pp.scatter_embs(emb, texts=_evt_labels)
    fig.update_layout(title=f"Event embedding ({name})").show()

## Load CBOW embeddings


In [None]:
# Read the stored CBOW vocabulary and embedding matrix from the .npz archive.
with np.load(Path("tmp_data/embs_cbow.npz")) as arrdata:
    embs_cbow = arrdata["embs"]
    vocab_cbow = arrdata["vocab"].tolist()

# The CBOW rows are labelled with evt_types elsewhere, so both must agree.
assert vocab_cbow == evt_types, "expects same vocab"

# Reduce the CBOW vectors with both reducers and register the results next to
# the other event embeddings.
cbow_pca = ee.make_embedding_pipe("std", "pca").fit_transform(embs_cbow)
cbow_umap = ee.make_embedding_pipe("std", "umap").fit_transform(embs_cbow)
evt_embs.update({"cbow_pca": cbow_pca, "cbow_umap": cbow_umap})

# Plot only the CBOW-derived entries.
for name, emb in evt_embs.items():
    if not name.startswith("cbow"):
        continue
    fig = pp.scatter_embs(emb, texts=vocab_cbow)
    fig.update_layout(title=f"Event embedding (CBOW, {name})").show()

In [None]:
# Save all embeddings together with the labels of their rows.
np.savez(
    "tmp_data/evt_embs_all.npz", allow_pickle=True, evt_types=evt_types, **evt_embs
)
# The day embeddings were computed from df_agg, which is asserted to be sorted
# by date. Sort the stored dates explicitly so they are in ascending order too,
# independent of the order in which all_dates was collected.
# NOTE(review): this assumes df_agg has exactly one row per distinct date in
# df -- confirm against util.agg_duration_per_day.
np.savez(
    "tmp_data/day_embs_all.npz",
    allow_pickle=True,
    dates=[d.isoformat() for d in sorted(all_dates)],
    **day_embs,
)

# Calendar-like view


In [None]:
# Pick up on-disk edits to the plotting helpers before any of them is used in
# this cell, so that all calls below run the same version of the module.
reload(pp)

# Key of the embedding variant used for colouring, here and in the cells below.
# "tfidf_umap" is the other variant that was tried.
MODE = "tfidf_pca"


print(f"day embeddings: {day_embs.keys()}")

# Add the calendar layout columns, then one colour per day derived from that
# day's embedding vector.
df_cal = util.add_week_calendar_cols(df_agg)
df_cal = df_cal.with_columns(pl.Series("color", pp.vecs2color(day_embs[MODE])))

pp.weeks_cal_grid(
    df_cal,
    df_topk,
    width=280,
    # only show the days whose week number lies between 19 and 31
    filter_expr=pl.col("date").dt.week().is_between(19, 31),
).update_layout(title=f"Calendar ({MODE} colors)", margin_t=40).show(
    config={"displayModeBar": False}
)

## decompose to event colors

- We aggregate $n$ event types per day, giving weights $w^{(k)}_1,\dots,w^{(k)}_n$ for day $k$.
- Each day is embedded as $\mathbf{w}^{(k)}\to \mathbf{y}^{(k)}\in\mathbb{R}^{d}$.
- Consider the embedding as a linear combination $\mathbf{y}^{(k)} = \sum_{i=1}^{n} w^{(k)}_i\mathbf{x}_i$, where $\mathbf{x}_i\in\mathbb{R}^{d}$ represents event $i$.
- Each day contributes $d$ equations; the $n\cdot d$ unknowns (the entries of all $\mathbf{x}_i$) are shared across days.
- There is an optimal least-squares (LS) solution as long as we have $>n$ days of data.

At the same time, since PCA is linear, we don't need to do that: we can just transpose the data.


## Long term overview


In [None]:
from datetime import timedelta

# Aggregate re-binned into periods of four weeks.
df_periodic = util.agg_durations_periodic(df_agg, evt_types, timedelta(weeks=4))

# One colour per event type, derived from its embedding vector.
# Type names are all columns except the first ("date"); strict=True makes a
# length mismatch between names and colours raise instead of truncating.
evt_colors = dict(
    zip(df_agg.columns[1:], pp.vecs2color(evt_embs[MODE]), strict=True)
)

# First ten keys of the counts (the most common types overall, presumably
# because the counts are ordered by frequency -- confirm).
show_types = list(evt_counts.keys())[:10]

pp.date_ts_agg_events(
    df_agg=df_periodic,
    evt_colors=evt_colors,
    show_types=show_types,
).show()

## Similarity search


In [None]:
reload(ee)

# Query events: the first 20 keys of the counts.
top_events = [*evt_counts.keys()][:20]

# For each query event, report its 4 closest events by cosine metric in the
# selected embedding; rows are labelled with the non-"date" columns of df_agg.
# The result is bound to `_` so the cell shows only the verbose output.
_ = ee.find_all_closest(
    evt_embs[MODE],
    df_agg.columns[1:],
    top_events,
    metric="cosine",
    top_count=4,
    verbose=True,
)


# subjective evaluation


In [None]:
# Some things should be more similar than others are.
# We can make triplets (anchor, positive, negative)
# and check whether the computed ranking matches.
reload(ee)

# Only the CSV read is guarded: the triples file is optional, and a missing
# file is reported and the evaluation skipped. A FileNotFoundError raised by
# anything else (e.g. inside the evaluation) is no longer mistaken for it.
try:
    subj_triples = pl.read_csv(
        "aux_data/subjective_triples.csv",
        has_header=False,
        new_columns=["anchor", "p", "n"],
    )
except FileNotFoundError as err:
    print(err)
else:
    display(subj_triples.head())

    print(f"evaluating on {len(subj_triples)} triples\n")
    for name, embs in evt_embs.items():
        # Row prefix; the verbose output of eval_subj follows on the same line.
        print(name.ljust(15), end="")
        # Return value kept for ad-hoc inspection (presumably an accuracy).
        acc = ee.eval_subj(
            embs,
            df_agg.columns[1:],
            subj_triples,
            ("cosine", "L2"),
            verbose=True,
            n_bootstrap=5,
        )