In [None]:
import numpy as np
import pandas as pd

import random
import pickle

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import MeanSquaredError

from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# One seed for every RNG the pipeline touches (Python, NumPy, PyTorch).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Lightning's helper is called last; workers=True asks it to also handle
# seeding of DataLoader worker processes.
pl.seed_everything(SEED, workers=True)

In [None]:
# ---- data location ----
DATA_DIR = "/home/oleg28/shteyn/vlad/SIBUR_HACK/data"

# ---- cross-validation / training loop ----
N_FOLDS = 5          # number of KFold splits
BATCH_SIZE = 256     # DataLoader batch size (train and validation)
MAX_EPOCHS = 300     # upper bound per fold; early stopping may end sooner

# ---- features ----
EMB_PCA_DIM = 128    # PCA components kept from the embedding; must be <= 512

# ---- network ----
HIDDEN = (1024, 768, 512, 256)   # widths of the hidden Linear layers, in order
DROP_P = 0.40                    # dropout probability after every hidden block

# ---- optimisation ----
LR_MAX = 5e-4            # AdamW learning rate and OneCycleLR peak
WD = 3e-3                # AdamW weight decay
LABEL_NOISE_STD = 0.05   # std of Gaussian noise added to training targets (0 disables)
HUBER_DELTA = 1.0        # beta passed to nn.SmoothL1Loss

In [None]:
# Precomputed embeddings; later cells read the SMILES, LogP and "embedding" columns.
# NOTE(security): read_pickle can execute arbitrary code — only load trusted files.
df_emb  = pd.read_pickle(f"{DATA_DIR}/df_with_embeddings.pkl")

# Descriptor table. ID and mol are dropped so they are not picked up as features
# when every remaining non-key column is turned into the descriptor matrix.
df_desc = (
    pd.read_csv(f"{DATA_DIR}/train_data_all_descriptors.csv")
      .drop(columns=["ID", "mol"])
)

# Inner join on (SMILES, LogP). LogP is a float key, so a value that differs by a
# rounding error between the two files silently drops the row, and duplicated
# keys silently multiply rows — surface both cases instead of training on them.
df = pd.merge(df_emb, df_desc, on=["SMILES", "LogP"])

if len(df) != len(df_emb):
    print(
        f"WARNING: merge changed the row count "
        f"(embeddings: {len(df_emb)}, descriptors: {len(df_desc)}, merged: {len(df)})"
    )
print(f"merged frame: {df.shape[0]} rows x {df.shape[1]} columns")

# Preview only; rendering the full frame adds nothing over head() + shape.
df.head()

In [None]:
# Regression target as float32.
y = df["LogP"].to_numpy().astype("float32")

# Stack the per-row embedding vectors into one float32 matrix — (n, 512).
emb = np.vstack(df["embedding"]).astype("float32")

# Every column that is not a key, the target or the embedding is a descriptor.
non_feature_cols = ["SMILES", "LogP", "embedding"]
desc = df.drop(columns=non_feature_cols).to_numpy()                # (n, 103)

# Model input width: PCA-reduced embedding concatenated with the descriptors.
full_feat_dim = desc.shape[1] + EMB_PCA_DIM

In [None]:
class MLP(pl.LightningModule):
    """Fully connected regressor trained with a Huber (SmoothL1) loss.

    The network is a stack of ``Linear -> LayerNorm -> GELU -> Dropout`` blocks
    whose widths come from the module-level ``HIDDEN`` tuple, followed by a
    single-output linear head.

    Logged metrics: ``train_loss`` / ``train_rmse`` and ``val_loss`` /
    ``val_rmse`` (the latter is what the callbacks in this notebook monitor).

    Args:
        in_dim: Number of input features per sample.
    """

    def __init__(self, in_dim):
        super().__init__()
        # Keep in_dim inside the checkpoint so the model can be rebuilt from it;
        # passing in_dim to load_from_checkpoint explicitly still works.
        self.save_hyperparameters()

        dims, layers = (in_dim, *HIDDEN), []
        for d_in, d_out in zip(dims[:-1], dims[1:]):
            layers += [
                nn.Linear(d_in, d_out),
                nn.LayerNorm(d_out),
                nn.GELU(),
                nn.Dropout(DROP_P),
            ]
        layers.append(nn.Linear(dims[-1], 1))
        self.net = nn.Sequential(*layers)
        self.huber = nn.SmoothL1Loss(beta=HUBER_DELTA)

        # One metric object per stage: a shared instance would mix train and
        # validation batches in the same accumulated state.
        self.train_rmse = MeanSquaredError(squared=False)
        self.val_rmse = MeanSquaredError(squared=False)

    def forward(self, x):
        """Return one prediction per row: the (batch, 1) head output squeezed to (batch,)."""
        return self.net(x).squeeze(1)

    def _step(self, batch, tag):
        """Shared train/validation step.

        Args:
            batch: ``(features, targets)`` pair from the DataLoader.
            tag: ``"train"`` or ``"val"``; selects the metric and the log prefix.

        Returns:
            The Huber loss for the batch.
        """
        x, y = batch
        target = y
        if tag == "train" and LABEL_NOISE_STD > 0:
            # Label-noise regularisation: perturb only the target fed to the loss.
            target = y + torch.randn_like(y) * LABEL_NOISE_STD
        y_hat = self(x)
        loss = self.huber(y_hat, target)

        # RMSE is always measured against the clean targets so that train and
        # validation values are comparable.
        metric = self.train_rmse if tag == "train" else self.val_rmse
        metric(y_hat, y)

        self.log(f"{tag}_loss", loss, prog_bar=True)
        # Logging the Metric object (not a per-batch tensor) lets Lightning call
        # compute()/reset() itself, so the epoch-level val_rmse is the RMSE over
        # the whole validation set rather than an average of per-batch RMSEs.
        self.log(f"{tag}_rmse", metric, prog_bar=True)
        return loss

    def training_step(self, b, _):
        """Run one optimisation step's forward pass and return its loss."""
        return self._step(b, "train")

    def validation_step(self, b, _):
        """Log validation loss and RMSE for one batch."""
        self._step(b, "val")

    def configure_optimizers(self):
        """AdamW driven by a per-step one-cycle learning-rate schedule.

        Only OneCycleLR is attached: it rewrites the optimizer's learning rate
        on every step, so a ReduceLROnPlateau on the same optimizer would have
        its reductions overwritten by the next training step.
        """
        opt = torch.optim.AdamW(self.parameters(), lr=LR_MAX, weight_decay=WD)
        sched = torch.optim.lr_scheduler.OneCycleLR(
            opt, max_lr=LR_MAX, total_steps=self.trainer.estimated_stepping_batches
        )
        return {
            "optimizer": opt,
            "lr_scheduler": {"scheduler": sched, "interval": "step"},
        }

In [None]:
# Shuffled K-fold splitter; the fixed random_state keeps fold membership repeatable.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Per-fold validation scores, and the out-of-fold prediction buffer
# (zeros with the same shape and dtype as y, filled in fold by fold).
fold_rmse = []
oof_pred = np.zeros_like(y)

In [None]:
# Pinned host memory only speeds up host-to-GPU copies; without CUDA, PyTorch
# warns that the flag is unused, so enable it only when a GPU is present.
use_gpu = torch.cuda.is_available()

for fold, (tr_idx, vl_idx) in enumerate(kf.split(emb), 1):
    emb_tr, emb_vl = emb[tr_idx], emb[vl_idx]
    desc_tr, desc_vl = desc[tr_idx], desc[vl_idx]
    y_tr, y_vl = y[tr_idx], y[vl_idx]

    # ---- PCA fit on train embeddings only, then applied to the validation fold ----
    pca = PCA(n_components=EMB_PCA_DIM, svd_solver="full", random_state=SEED)
    emb_tr_pca = pca.fit_transform(emb_tr).astype("float32")
    emb_vl_pca = pca.transform(emb_vl).astype("float32")
    explained = pca.explained_variance_ratio_.sum()
    print(f"[Fold {fold}]  PCA variance kept: {explained:.3%}")

    # ---- reduced embedding + descriptors, side by side ----
    X_tr = np.hstack([emb_tr_pca, desc_tr])
    X_vl = np.hstack([emb_vl_pca, desc_vl])
    assert X_tr.shape[1] == full_feat_dim, "feature width does not match full_feat_dim"

    # ---- standardise with statistics of the training fold only ----
    scaler = StandardScaler().fit(X_tr)
    X_tr = scaler.transform(X_tr).astype("float32")
    X_vl = scaler.transform(X_vl).astype("float32")

    ds_tr = TensorDataset(torch.from_numpy(X_tr), torch.from_numpy(y_tr))
    ds_vl = TensorDataset(torch.from_numpy(X_vl), torch.from_numpy(y_vl))
    dl_tr = DataLoader(ds_tr, batch_size=BATCH_SIZE, shuffle=True,
                       num_workers=4, pin_memory=use_gpu)
    dl_vl = DataLoader(ds_vl, batch_size=BATCH_SIZE, shuffle=False,
                       num_workers=4, pin_memory=use_gpu)

    # A fresh model and Trainer per fold, so nothing carries over between folds.
    model = MLP(full_feat_dim)

    trainer = pl.Trainer(
        max_epochs=MAX_EPOCHS,
        accelerator="gpu" if use_gpu else "cpu",
        devices=1,
        precision=32,
        callbacks=[
            EarlyStopping("val_rmse", patience=20, mode="min"),
            ModelCheckpoint(save_top_k=1, monitor="val_rmse", mode="min")
        ],
        log_every_n_steps=25,
    )
    trainer.fit(model, dl_tr, dl_vl)

    # Score the fold with the best checkpoint (lowest val_rmse), not the last epoch.
    best = MLP.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path,
        in_dim=full_feat_dim,
        map_location=torch.device("cpu")
    )
    best.eval()
    with torch.no_grad():
        oof_pred[vl_idx] = best(torch.from_numpy(X_vl)).numpy()

    rmse_fold = float(np.sqrt(((oof_pred[vl_idx] - y_vl) ** 2).mean()))
    fold_rmse.append(rmse_fold)
    print(f"Fold {fold} RMSE: {rmse_fold:.4f}")

In [None]:
# Overall RMSE over every row's held-out (out-of-fold) prediction.
oof_rmse = np.sqrt(np.mean((oof_pred - y) ** 2))
print("\nOOF RMSE:", oof_rmse.round(4))