In [None]:
import numpy as np
import torch
from apopfail.model import clean
from apopfail.utils.loading import load_data
from sklearn import set_config
from sklearn.model_selection import train_test_split

# Ask scikit-learn transformers to return pandas objects rather than bare arrays.
set_config(transform_output="pandas")
# Let PyTorch use its faster, reduced-precision kernels for float32 matmuls.
torch.set_float32_matmul_precision("high")

In [None]:
# Load features and labels from the project root; the middle return value is not used here.
raw_X, _, raw_y = load_data(root="..")
# Project-specific cleaning of features and labels together.
X, y = clean(raw_X, raw_y)
# Disabled preprocessing step, kept for reference:
# X = get_pipeline(reducer="passthrough", scaler=StandardScaler()).fit_transform(X)
# Cast the features to 32-bit floats.
X = X.astype("float32")

In [None]:
# Attach the labels to the features and discard every row containing a missing value.
data = X.join(y).dropna()
# Hold out 20% of the rows, preserving the class balance of the "target" column.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["target"],
)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=0)

# X_train, y_train = SMOTE(sampling_strategy=0.5, random_state=0).fit_resample(X_train.reshape(-1, dim*dim), y_train)
# X_train = X_train.reshape(-1, dim, dim)

In [None]:
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
)
from pytorch_tabular.models import CategoryEmbeddingModelConfig

# Treat every column except the label as a continuous feature. The names are read
# from the frame itself instead of assuming they are exactly "0".."n-2", so this
# cell keeps working if cleaning drops or renames feature columns.
feature_cols = [col for col in train.columns if col != "target"]

data_config = DataConfig(
    # target should always be a list. Multi-targets are only supported for
    # regression; multi-task classification is not implemented.
    target=["target"],
    continuous_cols=feature_cols,
    categorical_cols=[],
    # NOTE(review): worker count is hard-coded for one machine — tune for the host.
    num_workers=23,
)
trainer_config = TrainerConfig(
    batch_size=128,
    max_epochs=100,
)
# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="256-128-64",  # hidden layer widths, dash-separated
    learning_rate=0.001,
    activation="LeakyReLU",
    metrics=["f1_score", "precision", "recall"],
)

In [None]:
from pytorch_tabular import TabularModel

# Gather the four configuration objects, then build the model with verbose output enabled.
model_components = {
    "data_config": data_config,
    "model_config": model_config,
    "optimizer_config": optimizer_config,
    "trainer_config": trainer_config,
}
tabular_model = TabularModel(**model_components, verbose=True)

In [None]:
# Train on the training split only; no explicit validation frame is passed here.
tabular_model.fit(train=train)

In [None]:
# Evaluate the fitted model on the test split; the result is shown as the cell output.
tabular_model.evaluate(test)

In [None]:
# Run inference on the test split and keep the result for inspection in the next cell.
y_pred = tabular_model.predict(test)

In [None]:
# Mean of the prediction output.
# NOTE(review): presumably compared with the true label mean in the next cell; the
# exact columns of y_pred depend on the pytorch_tabular version — confirm.
y_pred.mean()

In [None]:
# Mean of the true labels in the test split.
# NOTE(review): equals the positive-class rate only if target is encoded 0/1 — confirm.
test["target"].mean()