In [None]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import polars as pl
from sklearn import linear_model
from sklearn.decomposition import PCA

## load day-vectors
# One row per day; the Dataset class below requires a "date" column in this table.
fp = Path("tmp_data") / "agg_day.parquet"
df = pl.read_parquet(fp)

# Preview the first rows as a rich table.
display(df.head())

In [None]:
## data


class Dataset:
    def __init__(
        self,
        df: pl.DataFrame,
        train_ratio: float = 0.7,
        shuffle_init: bool = True,
        target: str = "weekday",
        seed: int | None = None,
    ) -> None:
        # validation
        assert "date" in df.columns
        self.features = df.drop("date").columns

        # optionally shuffle
        if shuffle_init:
            df = df.sample(len(df), shuffle=True, seed=seed)
        self.shuffle_init = shuffle_init

        if target == "weekday":
            self.df = df.with_columns(label=pl.col("date").dt.weekday())
        else:
            raise ValueError(f"unknown target: {target}")

        self.ntrain = int(train_ratio * len(df))

    def __len__(self):
        return len(self.df)

    def __str__(self) -> str:
        nval = len(self) - self.ntrain
        s = f"dataset: {len(self)} records ({self.ntrain} train, {nval} val)"

        if self.shuffle_init:
            s += " (shuffled)"
        return s

    def preprocessed(self, normalize=True, pca_dim: int | None = None):
        # split
        dfs = {
            "train": self.df.slice(0, self.ntrain),
            "val": self.df.slice(self.ntrain),
        }
        Xs = {k: dfs[k].select(self.features).to_numpy() for k in dfs}
        Ys = {k: dfs[k]["label"].to_numpy() for k in dfs}

        if normalize:
            xmin, xmax = Xs["train"].min().item(), Xs["train"].max().item()
            print(f"Normalize (Xtrain-> {xmin=}, {xmax=})")
            Xs = {k: (Xs[k] - xmin) / xmax for k in Xs}

        if pca_dim is not None:
            pca = PCA(pca_dim)
            pca.fit(Xs["train"])
            Xs = {k: pca.transform(Xs[k]) for k in Xs}

        return {k: (Xs[k], Ys[k]) for k in dfs}


# Build the dataset with a fixed seed so the shuffle is repeatable.
data = Dataset(df, seed=1337)
print(data)

# Default normalization, then projection onto 24 principal components.
splits = data.preprocessed(pca_dim=24)
for s, (x, y) in splits.items():
    print(f"{s}:  {x.shape}, {y.shape}")

# Sanity check: mean of the training features, then of the training labels.
for train_part in splits["train"]:
    print(train_part.mean())


# LogReg


In [None]:
# Cross-validated logistic regression over 7 evenly spaced C values in [0.1, 10].
c_grid = list(np.linspace(0.1, 10, 7))
clf = linear_model.LogisticRegressionCV(Cs=c_grid)

X_train, y_train = splits["train"]
clf.fit(X_train, y_train)

# Report the classifier's score on every split.
for k, (X_k, y_k) in splits.items():
    acc = clf.score(X_k, y_k)
    print(f"score({k})= {acc:.1%}")

## LGBM


In [None]:
# prediction task
# Feature names must describe the columns actually present in `splits`.
# When the matrices were reduced with PCA their width no longer matches the raw
# column list in `data.features`, and LightGBM rejects a name list of the wrong
# length once the Dataset is constructed.
n_cols = splits["train"][0].shape[1]
if n_cols == len(data.features):
    feature_names = list(data.features)
else:
    feature_names = [f"pc{i}" for i in range(n_cols)]

# NOTE: these native Datasets are not consumed by the sklearn-API classifier
# below (it is fitted on the numpy arrays directly).
ld_train = lgb.Dataset(*splits["train"], feature_name=feature_names)
ld_val = lgb.Dataset(*splits["val"], reference=ld_train, feature_name=feature_names)

# model
# Depth-2 trees with at most 2 leaves each, i.e. single-split stumps, with
# L1/L2 regularization and 50 boosting rounds.
params = {
    "num_leaves": 2,
    "max_depth": 2,
    "reg_alpha": 0.5,
    "reg_lambda": 0.1,
    "n_estimators": 50,
}
clf = lgb.LGBMClassifier(force_row_wise=True, **params)  # pyright: ignore[reportGeneralTypeIssues]
clf.fit(*splits["train"])

# Report the classifier's score on every split.
for k in splits:
    acc = clf.score(*splits[k])  # pyright: ignore[reportAttributeAccessIssue]
    print(f"score({k})= {acc:.1%}")