In [None]:
import itertools
from importlib import reload
from pathlib import Path

import numpy as np
import polars as pl
from numpy.typing import NDArray

from src import data_util, util_agg
from src import evt_embeddings as ee
from src import plotlyplot as pp

# Re-import the plotting helpers so edits to the module are picked up
# without restarting the kernel.
reload(pp)
# reload(util_agg)

pp.set_plotly_template()


# Load the configuration first; the data loader takes it as its only input.
data_conf = data_util.load_config()
print(data_conf)
df, evt_types = data_util.load_data(data_conf)

# Counts per distinct value of the "type_name" column (computed once).
evt_counts = data_util.unique_counts(df, col="type_name")
print(evt_counts)
display(df.tail())

# Per-day aggregation of the events (see util_agg.agg_duration_per_day),
# then the columns named in evt_types as a plain NumPy matrix.
df_evt_agg, df_topk = util_agg.agg_duration_per_day(df)
X = df_evt_agg.select(evt_types).to_numpy()


## Try all variants of the embedding pipeline


In [None]:
# Re-import the embedding module so local edits take effect.
reload(ee)

# First list: input transforms; second list: dimensionality reducers.
# UMAP is a bit slow, so it is left out of the reducers; the full set is
# ["pca", "umap", "isomap", "lle", "tsne"].
variants = (
    ["std", "tfidf"],
    ["pca", "isomap", "lle", "tsne"],
)

# One pipeline for every (transform, reducer) pair, keyed "<transform>_<reducer>".
pipes_evt = {}
for inp_trans, red in itertools.product(*variants):
    pipe_key = f"{inp_trans}_{red}"
    pipes_evt[pipe_key] = ee.make_embedding_pipe(inp_trans, red, transp_between=True)  # pyright: ignore[reportArgumentType]

# List the pipeline names, one per line.
print("\n".join(pipes_evt))

In [None]:
# Embedding produced by each pipeline, keyed by pipeline name.
evt_embs: dict[str, NDArray] = {}

for pipe_name, embedder in pipes_evt.items():
    # Fit the pipeline on the aggregated matrix and keep its output.
    embedding = embedder.fit_transform(X)
    evt_embs[pipe_name] = embedding

    # Scatter plot of this embedding, labelled with the event type names.
    emb_fig = pp.scatter_embs(embedding, texts=evt_types).update_layout(
        title=f"Event embedding ({pipe_name})"
    )
    emb_fig.show()

## Subjective evaluation


In [None]:
# Some event types should be more similar to each other than to others.
# This is expressed as triples (anchor, positive, negative); each embedding
# is scored on whether its computed ranking matches the triples.


reload(ee)
reload(data_util)
# Metric whose accuracy decides the winner; must be one of the metrics
# passed to ee.eval_subj below.
COMP_METRIC = "cosine"
# COMP_METRIC = "L2"

# Load all triples, except those containing ignored types
subj_triples = data_util.load_subj_triples(ignore=data_conf.ignore_evts)
display(subj_triples.head())

print(f"evaluating on {len(subj_triples)} triples\n")
# Accuracy under COMP_METRIC for each embedding, keyed by pipeline name.
acc_comp_metric = {}
for name, embs in evt_embs.items():
    # Left-align the name; eval_subj is called with verbose=True and is
    # presumably printing its results on the same line -- TODO confirm.
    print(name.ljust(20), end="")
    acc = ee.eval_subj(
        embs,
        evt_types,
        subj_triples,
        ("cosine", "L2"),
        verbose=True,
        n_bootstrap=5,
    )
    acc_comp = acc[COMP_METRIC]
    # The per-metric result may be an array (presumably one value per
    # bootstrap round -- TODO confirm) or a scalar; reduce arrays to their mean.
    acc_comp_metric[name] = (
        acc_comp.mean() if isinstance(acc_comp, np.ndarray) else acc_comp
    )

# Pick winner based on metric. max() needs a single pass and, on ties,
# returns the first entry encountered.
winner_name, winner_acc = max(acc_comp_metric.items(), key=lambda item: item[1])
print(f"Winner ({COMP_METRIC}): {winner_name} ({winner_acc:.0%})")

pp.scatter_embs(evt_embs[winner_name], texts=evt_types).update_layout(
    title=f"Event embedding ({winner_name})"
).show()

In [None]:
# Re-import the helper modules once so local edits take effect.
reload(ee)
reload(pp)
# Name of the embedding (a key of evt_embs) used for the calendar figure.
MODE = "std_isomap"

# Per-day embeddings derived from the day matrix X and the chosen event
# embedding (see ee.day_linear_comb).
day_embs = ee.day_linear_comb(X, evt_embs[MODE])
df_cal = util_agg.add_week_calendar_cols(df_evt_agg)
# Attach one colour per row, computed from the day embeddings.
df_cal = df_cal.with_columns(pl.Series("color", pp.vecs2color(day_embs)))
# Event type -> colour; strict=True raises if the two lengths differ.
evt_colors = dict(zip(evt_types, pp.vecs2color(evt_embs[MODE]), strict=True))

# Only these event types get a text label; the rest get an empty string.
show_evts = set("walk,bike,lunch,dinner,driving,work,project,study".split(","))

show_names = [t if t in show_evts else "" for t in evt_types]
show_mask = [t in show_evts for t in evt_types]

fig = pp.cal_and_embs(
    df_cal,
    show_names,
    evt_embs[MODE],
    # Restrict the calendar to weeks 16 through 34.
    filter_expr=pl.col("date").dt.week().is_between(16, 34),
    width=700,
    mode=MODE,
)
fig.show(config={"displayModeBar": False})

# Create the output directory first so the export also works on a fresh
# checkout where it does not exist yet.
out_path = Path(f"media/cal_emb_{MODE}.png")
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(out_path, scale=2.0)