In [None]:
import sys
from importlib import reload
from pathlib import Path

import lightgbm as lgb
import numpy as np
import polars as pl
from sklearn.metrics import confusion_matrix

sys.path.append("src")
import evt_embeddings as ee
import plotlyplot as pp
import util_agg

# Apply the plotting template defined in the project-local `plotlyplot` module.
pp.set_plotly_template()

## load day-vectors
# Read the aggregated event table (day-level, going by the file name) from parquet.
fp = Path("tmp_data/agg_day.parquet")
df_evt_agg = pl.read_parquet(fp)
# Every column except the first is used as a feature name further down.
# NOTE(review): presumably the first column is the "date" key used for the
# join in the next cell -- confirm against the parquet schema.
evt_types = df_evt_agg.columns[1:]
# Quick visual sanity check of the first rows.
display(df_evt_agg.head())

In [None]:
# Load the daily health table. `infer_schema_length=None` tells polars to
# use the whole file for dtype inference instead of only the first rows, and
# `try_parse_dates=True` lets it parse date-like columns into temporal dtypes.
df_health = pl.read_csv(
    "aux_data/health_daily.csv", infer_schema_length=None, try_parse_dates=True
)

# Left join on "date": every row of the event aggregates is kept, whether or
# not a matching health record exists. Columns of `df_health` whose names
# collide with columns of `df_evt_agg` get the "_HD" suffix.
df = df_evt_agg.join(df_health, on="date", how="left", suffix="_HD")

In [None]:
# Pick up any edits made to the project-local helper modules since import.
reload(ee)
reload(util_agg)

# Name of the label column to predict.
TARGET = "c"
data = util_agg.DailyEvtAggDataset(df, features=evt_types, seed=1337)
print(data)

# One embedding pipeline per variant; only the "tfidf" one is used below.
pipes_day = {variant: ee.make_embedding_pipe(variant, None) for variant in ("std", "tfidf", "tfidf+std")}  # pyright: ignore[reportArgumentType]
splits = data.preprocessed_splits(pipes_day["tfidf"], TARGET)

# `splits` maps a split name to an (x, y) pair; report the shapes of each.
for s in splits:
    x, y = splits[s]
    print(f"{s}:  {x.shape}, {y.shape}")

# Mean of the preprocessed training features, as a quick sanity check.
print(splits["train"][0].mean())


In [None]:
# The labels are imbalanced. To give the model a fair chance of beating the
# trivial "always predict the most frequent label" baseline, classes can be
# weighted by their inverse frequency in the training labels -- or,
# alternatively, LGBM can be given class_weight="balanced".

ytrain = splits["train"][1]
uc = np.unique_counts(ytrain)
# Weight of a class = 1 / (number of training samples with that label).
clw = dict(
    (label.item(), float(1 / count)) for label, count in zip(uc.values, uc.counts)
)
print(clw)

## LGBM


In [None]:
import warnings
from typing import Any

# Installs a global filter: UserWarnings stay hidden from here on,
# not only within this cell.
warnings.filterwarnings("ignore", category=UserWarning)

# prediction task
# Deliberately tiny trees: max_depth=1 with num_leaves=2 allows one split per tree.
params: dict[str, Any] = dict(
    num_leaves=2,
    max_depth=1,
    # reg_alpha=0.9,
    # reg_lambda=0.9,
    n_estimators=100,
    # class_weight="balanced",
)
clf = lgb.LGBMClassifier(force_row_wise=True, verbosity=-1, **params)  # pyright: ignore[reportGeneralTypeIssues]
clf.fit(*splits["train"])

# Per-split predictions are computed in a later cell; here we only report
# the classifier's own score on each split.
for k in splits:
    acc = clf.score(splits[k][0], splits[k][1])  # pyright: ignore[reportAttributeAccessIssue]
    print(f"score({k})= {acc:.1%}")

In [None]:
# Predicted and true labels for every split, keyed by split name.
pred = {name: clf.predict(splits[name][0]) for name in splits}
ytrue = {name: splits[name][1] for name in splits}


In [None]:
# Pick up any edits made to the plotting helpers since import.
reload(pp)

# Use one shared label set for every split. Without an explicit `labels=`,
# confusion_matrix only uses the labels that occur in that particular split,
# so a split missing a rare class yields a smaller matrix whose rows/columns
# do not line up with the other splits' heatmaps. Taking the union over all
# true and predicted labels keeps the axes identical and drops no sample.
all_labels = np.unique(
    np.concatenate([np.ravel(arr) for arr in (*ytrue.values(), *pred.values())])
)

for k in splits.keys():
    pk, yk = pred[k], ytrue[k]
    # Accuracy = fraction of predictions equal to the true label.
    print(
        f"{k} acc: {np.average(pk == yk):.1%}",
    )
    # Rows are true labels, columns are predicted labels, both ordered as in
    # `all_labels`.
    cm = confusion_matrix(yk, pk, labels=all_labels)  # type: ignore
    pp.heatmap(cm, logscale=False).show()