In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import torch
from apopfail.model import clean
from torchinfo import summary

sns.set_style("whitegrid")

# Make scikit-learn transformers return DataFrames, so the row index and
# column labels survive preprocessing (later cells index X with a boolean
# Series built from y).
sklearn.set_config(transform_output="pandas")

# Seed every RNG this notebook relies on. Seeding NumPy alone does not touch
# PyTorch's generator, so the torch-based autoencoder trained further down
# would otherwise start from different weights on every run.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Feature matrix, stored as parquet.
X = pd.read_parquet("../data/train_set_p53mutant.parquet")

# Labels: the file's own header row is skipped and the single value column is
# named "target"; the first column becomes the index.
labels_raw = pd.read_csv(
    "../data/train_labels_p53mutant.csv",
    index_col=0,
    skiprows=1,
    names=["target"],
)

# Encode the string labels as integers: inactive -> 0, active -> 1.
y = labels_raw["target"].map({"inactive": 0, "active": 1})

In [None]:
# Pass the raw features and labels through the project's `clean` helper
# (apopfail.model). Both names are rebound to the returned pair, so
# re-running this cell feeds already-cleaned data back into `clean`.
X, y = clean(X, y)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Preprocessing is fitted on all samples (both classes), before the
# normal/abnormal split performed in the next cell.
imputer = SimpleImputer(strategy="mean")  # fill missing values with the column mean
scaler = StandardScaler()  # then standardise each column

X = scaler.fit_transform(imputer.fit_transform(X))

In [None]:
# Boolean masks over the label Series: 0 = inactive ("normal"),
# 1 = active ("abnormal").
normal_mask = y == 0
abnormal_mask = y == 1

X_abnormal = X[abnormal_mask]

# Only the normal samples are converted to a float32 NumPy array; they are
# what the models below are fitted on.
X_normal = X[normal_mask].astype("float32").to_numpy()

In [None]:
# needs pip install git+https://github.com/MoritzM00/drcomp.git
from drcomp.autoencoder import FullyConnectedAE
from drcomp.reducers import AutoEncoder
from torch import nn

In [None]:
from skorch.callbacks import EarlyStopping, LRScheduler

In [None]:
# Fully connected autoencoder: hidden layers of 256 -> 128 -> 64 units around a
# 32-dimensional bottleneck, with batch norm and ReLU in encoder and decoder.
ae = FullyConnectedAE(
    input_size=X.shape[1],
    hidden_layer_dims=[256, 128, 64],
    intrinsic_dim=32,
    include_batch_norm=True,
    encoder_act_fn=nn.ReLU,
    decoder_act_fn=nn.ReLU,
)

# Exponentially decay the learning rate (factor 0.96 per scheduler step).
scheduler = LRScheduler(policy="ExponentialLR", gamma=0.96)

# Stop once the validation loss has not improved for 20 epochs and restore the
# best checkpoint seen so far.
early_stopping = EarlyStopping(monitor="valid_loss", patience=20, load_best=True)

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Train on the normal (inactive) samples only.
model = AutoEncoder(
    ae,
    max_epochs=200,
    device=device,
    lr=1e-1,
    callbacks=[early_stopping, scheduler],
    n_jobs=3,
).fit(X_normal)

In [None]:
# Show torchinfo's summary of the autoencoder network as the cell output.
summary(ae)

In [None]:
# Score the fitted autoencoder on the same data it was trained on (in-sample).
# NOTE(review): max_K presumably caps the neighbourhood size used by drcomp's
# quality metrics — confirm against the drcomp docs. The same value (30) is
# used for the PCA / kernel PCA evaluation further down.
metrics_ae = model.evaluate(X_normal, max_K=30)

In [None]:
# PCA and kernel PCA, for comparison with the autoencoder
from drcomp import estimate_intrinsic_dimension
from drcomp.reducers import PCA, KernelPCA

# Estimate the intrinsic dimension of the normal samples and use it as the
# target dimensionality for both linear and kernel PCA.
# NOTE(review): the autoencoder above uses a fixed bottleneck of 32, whereas
# PCA / kernel PCA use this estimate — if the two differ, the methods are
# compared at different embedding sizes. Confirm this is intended.
intrinsic_dim = estimate_intrinsic_dimension(X_normal)
print(f"{intrinsic_dim=}")

# Kernel PCA with a polynomial kernel, and plain PCA, at the same dimension.
kpca = KernelPCA(intrinsic_dim=intrinsic_dim, kernel="poly")
pca = PCA(intrinsic_dim=intrinsic_dim)

# Fit each reducer on the normal samples and keep the embeddings; they are
# passed to `evaluate` in the next cell.
X_kpca = kpca.fit_transform(X_normal)
X_pca = pca.fit_transform(X_normal)

In [None]:
# Evaluate both reducers on the normal samples and their embeddings, with the
# same keyword settings for each.
eval_kwargs = {"max_K": 30}

metrics_pca = pca.evaluate(X_normal, X_pca, **eval_kwargs)
metrics_kpca = kpca.evaluate(X_normal, X_kpca, **eval_kwargs)

In [None]:
from drcomp.plotting import compare_metrics

In [None]:
# Metrics of each method, keyed by the label passed to the comparison plot.
metrics_by_method = {
    "AE": metrics_ae,
    "PCA": metrics_pca,
    "KernelPCA": metrics_kpca,
}
compare_metrics(metrics_by_method)