In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import torch
from apopfail.model import clean
from torchinfo import summary

# Plot styling: use seaborn's "whitegrid" style for the figures below.
sns.set_style("whitegrid")

# Ask scikit-learn transformers to return pandas objects instead of ndarrays.
sklearn.set_config(transform_output="pandas")

# Seed every RNG the notebook relies on. Seeding NumPy alone is not enough:
# the autoencoder is trained with PyTorch, which keeps its own generator.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Feature matrix, stored as parquet next to the notebook's parent directory.
X = pd.read_parquet("../data/train_set_p53mutant.parquet")

# Labels: the file's first row is skipped, the first column becomes the index
# and the remaining column is named "target".
labels_raw = pd.read_csv(
    "../data/train_labels_p53mutant.csv",
    index_col=0,
    skiprows=1,
    names=["target"],
)

# Encode the string labels as integers (inactive -> 0, active -> 1); any other
# value maps to NaN.
y = labels_raw["target"].map({"inactive": 0, "active": 1})

In [None]:
# Run the project's cleaning routine (apopfail.model.clean) on features and labels.
# NOTE(review): this overwrites X and y in place of the raw data, so re-running
# the cell feeds already-cleaned data back into `clean` — confirm that is harmless.
X, y = clean(X, y)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Preprocessing: fill missing values with the column mean, then standardise.
# Both steps are fitted on the full (cleaned) feature matrix.
mean_imputer = SimpleImputer(strategy="mean")
standard_scaler = StandardScaler()

X = standard_scaler.fit_transform(mean_imputer.fit_transform(X))

In [None]:
# Row masks for the two classes (0 = inactive, 1 = active after label encoding).
normal_mask = y == 0
abnormal_mask = y == 1

# Abnormal samples are kept as a DataFrame.
X_abnormal = X[abnormal_mask]

# Normal samples are converted to a float32 NumPy array for model training.
X_normal = X[normal_mask].astype("float32").to_numpy()

In [None]:
# needs pip install git+https://github.com/MoritzM00/drcomp.git
from drcomp.autoencoder import FullyConnectedAE
from drcomp.reducers import AutoEncoder
from torch import nn
from torch.optim.lr_scheduler import OneCycleLR

In [None]:
from skorch.callbacks import EarlyStopping, LRScheduler

In [None]:
# Fully connected autoencoder configured with hidden layer sizes 256 and 64 and
# an intrinsic (bottleneck) dimension of 32; the input width is the number of
# feature columns in X.
ae = FullyConnectedAE(
    input_size=X.shape[1],
    hidden_layer_dims=[256, 64],
    intrinsic_dim=32,
    include_batch_norm=True,
    encoder_act_fn=nn.ReLU,
    decoder_act_fn=nn.Identity,
)

# NOTE(review): this callback is created but never added to `callbacks` below,
# so early stopping is NOT active during training. Kept as-is to preserve the
# current training behaviour; attach it explicitly if early stopping is wanted.
early_stopping = EarlyStopping(monitor="train_loss", patience=20, load_best=True)

batch_size = 128
max_epochs = 200
inital_lr = 1e-3

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One-cycle learning-rate schedule peaking at 0.1. `total_steps` is set to the
# number of epochs — assumes the scheduler is stepped once per epoch; TODO confirm.
# NOTE(review): OneCycleLR derives its starting LR from max_lr, so the `lr`
# passed to the model below is presumably overridden — verify.
scheduler = LRScheduler(policy=OneCycleLR, max_lr=0.1, total_steps=max_epochs)
model = AutoEncoder(
    ae,
    max_epochs=max_epochs,
    device=device,
    lr=inital_lr,
    batch_size=batch_size,
    callbacks=[scheduler],
    n_jobs=2,
)

# Disable the train/validation split so the model is fitted on all normal
# samples (presumably no validation loss is recorded as a consequence).
model.train_split = None
model.fit(X_normal)

In [None]:
# Training-loss curve, one value per history entry. The validation loss is not
# plotted: the training cell sets `model.train_split = None`.
train_loss_history = model.history[:, "train_loss"]
sns.lineplot(train_loss_history, label="train loss")

In [None]:
# Learning rate over the course of training, read from the "event_lr" history
# column (presumably written by the LRScheduler callback — confirm).
sns.lineplot(model.history[:, "event_lr"], label="lr")

In [None]:
# Show torchinfo's summary of the autoencoder network defined above.
summary(ae)

In [None]:
# Draw the evaluation subset from the normal samples.
# - The sample size is capped at the number of available rows, because sampling
#   without replacement raises when more rows are requested than exist.
# - `random_state` is pinned so the subset does not depend on how much global
#   RNG state earlier cells consumed or on cell execution order.
n_eval_samples = min(5000, len(X_normal))
X_subset = pd.DataFrame(X_normal).sample(n=n_eval_samples, random_state=0)

In [None]:
# Score the trained autoencoder on the evaluation subset (passed as a NumPy array).
# NOTE(review): `max_K=30` is presumably the largest neighbourhood size used by
# drcomp's quality metrics — confirm against the drcomp documentation.
metrics_ae = model.evaluate(X_subset.to_numpy(), max_K=30)

In [None]:
# PCA baseline (drcomp's `PCA` reducer) with the same intrinsic_dim=32 as the
# autoencoder, so the two reducers can be compared.
# NOTE(review): this cell was previously labelled "kernel pca", but the class
# imported here is `PCA` — confirm a kernel variant was not intended.
from drcomp.reducers import PCA

pca = PCA(intrinsic_dim=32)

# Fit on the normal samples only and keep the transformed output.
X_pca = pca.fit_transform(X_normal)

In [None]:
# Score the PCA baseline on the same evaluation subset as the autoencoder.
# The subset is passed as a NumPy array, matching both the autoencoder
# evaluation and the array type `pca` was fitted on (previously a DataFrame
# was passed here, making the two evaluations inconsistent).
metrics_pca = pca.evaluate(X_subset.to_numpy(), max_K=30)

In [None]:
from drcomp.plotting import compare_metrics

In [None]:
# Collect the evaluation results per reducer, keyed by display label, and hand
# them to drcomp's comparison plot.
metrics_by_reducer = {"AE": metrics_ae, "PCA": metrics_pca}
compare_metrics(metrics_by_reducer)