In [130]:
import numpy as np
import pandas as pd

import random
import pickle

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import MeanSquaredError

from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

In [131]:
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
pl.seed_everything(SEED, workers=True)

Seed set to 42


42

In [132]:
DATA_DIR   = "/home/oleg28/shteyn/vlad/SIBUR_HACK/data"
N_FOLDS    = 5
BATCH_SIZE = 256
MAX_EPOCHS = 300
EMB_PCA_DIM = 128          # <= 512
HIDDEN     = (1024, 768, 512, 256)
DROP_P     = 0.40
LR_MAX     = 5e-4
WD         = 3e-3
LABEL_NOISE_STD = 0.05
HUBER_DELTA = 1.0 

In [133]:
df_emb  = pd.read_pickle(f"{DATA_DIR}/df_with_embeddings.pkl")

df_desc = (
    pd.read_csv(f"{DATA_DIR}/train_data_all_descriptors.csv")
      .drop(columns=["ID", "mol"])
)

df = pd.merge(df_emb, df_desc, on=["SMILES", "LogP"])
df

Unnamed: 0,SMILES,LogP,embedding,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,FpDensityMorgan1,...,fr_NH2,fr_Nhpyrrole,fr_allylic_oxid,fr_amide,fr_benzene,fr_bicyclic,fr_halogen,fr_imidazole,fr_priamide,fr_unbrch_alkane
0,C1(NON=C1C2=CC=CC=C2)=N,3.093,"[0.020701513, 0.010955291, 0.038745042, -0.054...",0.195556,0.658376,161.164,154.108,161.058912,60,1.250000,...,0,1,0,0,1,0,0,0,0,0
1,C=1C=CC=CC=1CC(NC=2C=CC(=CC=2)Br)=O,5.245,"[-0.113395296, 0.17722493, 0.0033778166, -0.16...",-0.001368,0.918521,290.160,278.064,289.010226,86,1.000000,...,0,0,0,1,2,0,1,0,0,0
2,C(C)C1=CC=CS1,4.294,"[0.0038519718, -0.00040961266, -0.11154813, 0....",1.178241,0.522665,112.197,104.133,112.034671,38,1.714286,...,0,0,0,0,0,0,0,0,0,0
3,N1C=CNC1=NC2C(OC)=NC(=NC2Cl)C,2.254,"[0.048929833, -0.068438314, 0.03529113, 0.0211...",-0.496157,0.549765,241.682,229.586,241.073038,86,1.437500,...,0,2,0,0,0,0,1,1,0,0
4,CC(C)CCO,1.939,"[0.011915015, 0.033852004, -0.055152994, 0.024...",0.331019,0.534089,88.150,76.054,88.088815,38,1.500000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12948,C=1N=CC(=CC=1)CNCC,2.497,"[0.08328996, -0.1480503, -0.18738534, -0.03468...",0.920972,0.674806,136.198,124.102,136.100048,54,1.600000,...,0,0,0,0,0,0,0,0,0,0
12949,CC(C)COC(=O)NC1=CC=C(OCC)C(=C1)OCC,4.704,"[0.04149374, -0.12154684, 0.038277026, -0.0348...",-0.467307,0.827794,281.352,258.168,281.162708,112,1.150000,...,0,0,0,1,1,0,0,0,0,0
12950,C(O)(COC(C(C)=C)=O)C,2.566,"[-0.010112917, -0.014311982, -0.012961797, -0....",-0.608056,0.462790,144.170,132.074,144.078644,58,1.800000,...,0,0,0,0,0,0,0,0,0,0
12951,O=S(=O)(C=1C(C(OC)=O)=CC=CC=1)NC(N=C2N=C(C)NC(...,3.694,"[-0.05270728, 0.019474091, -0.01440288, -0.026...",-4.379930,0.696733,381.370,366.250,381.074304,138,1.192308,...,0,1,0,2,1,0,0,0,0,0


In [134]:
emb = np.vstack(df["embedding"]).astype("float32")                 # (n, 512)
desc = df.drop(columns=["SMILES", "LogP", "embedding"]).values     # (n, 103)
y = df["LogP"].values.astype("float32")


full_feat_dim = EMB_PCA_DIM + desc.shape[1]

In [124]:
class MLP(pl.LightningModule):
    def __init__(self, in_dim):
        super().__init__()
        dims, layers = (in_dim, *HIDDEN), []
        for i in range(len(dims)-1):
            layers += [
                nn.Linear(dims[i], dims[i+1]),
                nn.LayerNorm(dims[i+1]),
                nn.GELU(),
                nn.Dropout(DROP_P)
            ]
        layers.append(nn.Linear(dims[-1], 1))
        self.net  = nn.Sequential(*layers)
        self.huber = nn.SmoothL1Loss(beta=HUBER_DELTA)
        self.rmse  = MeanSquaredError(squared=False)

    def forward(self, x): return self.net(x).squeeze(1)

    def _step(self, batch, tag):
        x, y = batch
        if tag == "train" and LABEL_NOISE_STD > 0:
            y = y + torch.randn_like(y)*LABEL_NOISE_STD
        y_hat = self(x)
        loss  = self.huber(y_hat, y)
        rmse  = self.rmse(y_hat, y)
        self.log_dict({f"{tag}_loss": loss, f"{tag}_rmse": rmse}, prog_bar=True)
        return loss
    def training_step  (self,b,_): return self._step(b,"train")
    def validation_step(self,b,_):  self._step(b,"val")

    def configure_optimizers(self):
        opt  = torch.optim.AdamW(self.parameters(), lr=LR_MAX, weight_decay=WD)
        sched1 = torch.optim.lr_scheduler.OneCycleLR(
            opt, max_lr=LR_MAX, total_steps=self.trainer.estimated_stepping_batches
        )
        sched2 = torch.optim.lr_scheduler.ReduceLROnPlateau(
            opt, mode="min", factor=0.5, patience=5, min_lr=1e-5
        )
        return ([opt],
                [{"scheduler": sched1, "interval": "step"},
                 {"scheduler": sched2, "interval": "epoch", "monitor": "val_rmse"}])

In [125]:
kf = KFold(N_FOLDS, shuffle=True, random_state=SEED)
oof_pred = np.zeros_like(y)
fold_rmse = []

In [128]:
for fold, (tr_idx, vl_idx) in enumerate(kf.split(emb), 1):
    emb_tr, emb_vl = emb[tr_idx], emb[vl_idx]
    desc_tr, desc_vl = desc[tr_idx], desc[vl_idx]
    y_tr, y_vl = y[tr_idx], y[vl_idx]

    # ---- PCA fit on train embeddings ----
    pca = PCA(n_components=EMB_PCA_DIM, svd_solver="full", random_state=SEED)
    emb_tr_pca = pca.fit_transform(emb_tr).astype("float32")
    emb_vl_pca = pca.transform(emb_vl).astype("float32")
    explained = pca.explained_variance_ratio_.sum()
    print(f"[Fold {fold}]  PCA variance kept: {explained:.3%}")

    X_tr = np.hstack([emb_tr_pca, desc_tr])
    X_vl = np.hstack([emb_vl_pca, desc_vl])

    scaler = StandardScaler().fit(X_tr)
    X_tr = scaler.transform(X_tr).astype("float32")
    X_vl = scaler.transform(X_vl).astype("float32")

    ds_tr = TensorDataset(torch.from_numpy(X_tr), torch.from_numpy(y_tr))
    ds_vl = TensorDataset(torch.from_numpy(X_vl), torch.from_numpy(y_vl))
    dl_tr = DataLoader(ds_tr, BATCH_SIZE, True,  num_workers=4, pin_memory=True)
    dl_vl = DataLoader(ds_vl, BATCH_SIZE, False, num_workers=4, pin_memory=True)

    model = MLP(full_feat_dim)

    trainer = pl.Trainer(
        max_epochs=MAX_EPOCHS,
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        devices=1,
        precision=32,
        callbacks=[
            EarlyStopping("val_rmse", patience=20, mode="min"),
            ModelCheckpoint(save_top_k=1, monitor="val_rmse", mode="min")
        ],
        log_every_n_steps=25,
    )
    trainer.fit(model, dl_tr, dl_vl)

    best = MLP.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path,
        in_dim=full_feat_dim,
        map_location=torch.device("cpu")
    )
    best.eval()
    with torch.no_grad():
        oof_pred[vl_idx] = best(torch.from_numpy(X_vl)).numpy()

    rmse_fold = float(np.sqrt(((oof_pred[vl_idx] - y_vl) ** 2).mean()))
    fold_rmse.append(rmse_fold)
    print(f"Fold {fold} RMSE: {rmse_fold:.4f}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


[Fold 1]  PCA variance kept: 99.957%



  | Name  | Type             | Params | Mode 
---------------------------------------------------
0 | net   | Sequential       | 1.6 M  | train
1 | huber | SmoothL1Loss     | 0      | train
2 | rmse  | MeanSquaredError | 0      | train
---------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.213     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Epoch 99: 100%|██████████| 41/41 [00:00<00:00, 65.89it/s, v_num=15, train_loss=0.249, train_rmse=0.860, val_loss=0.456, val_rmse=1.530] 
Fold 1 RMSE: 1.5949
[Fold 2]  PCA variance kept: 99.958%


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name  | Type             | Params | Mode 
---------------------------------------------------
0 | net   | Sequential       | 1.6 M  | train
1 | huber | SmoothL1Loss     | 0      | train
2 | rmse  | MeanSquaredError | 0      | train
---------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.213     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Epoch 91: 100%|██████████| 41/41 [00:00<00:00, 65.00it/s, v_num=16, train_loss=0.296, train_rmse=0.914, val_loss=0.389, val_rmse=1.210] 
Fold 2 RMSE: 1.1954
[Fold 3]  PCA variance kept: 99.957%


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name  | Type             | Params | Mode 
---------------------------------------------------
0 | net   | Sequential       | 1.6 M  | train
1 | huber | SmoothL1Loss     | 0      | train
2 | rmse  | MeanSquaredError | 0      | train
---------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.213     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Epoch 94: 100%|██████████| 41/41 [00:00<00:00, 64.29it/s, v_num=17, train_loss=0.328, train_rmse=1.030, val_loss=0.455, val_rmse=1.420] 
Fold 3 RMSE: 1.4522
[Fold 4]  PCA variance kept: 99.958%


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name  | Type             | Params | Mode 
---------------------------------------------------
0 | net   | Sequential       | 1.6 M  | train
1 | huber | SmoothL1Loss     | 0      | train
2 | rmse  | MeanSquaredError | 0      | train
---------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.213     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Epoch 76: 100%|██████████| 41/41 [00:00<00:00, 69.33it/s, v_num=18, train_loss=0.359, train_rmse=1.070, val_loss=0.426, val_rmse=1.360] 
Fold 4 RMSE: 1.3578
[Fold 5]  PCA variance kept: 99.958%


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name  | Type             | Params | Mode 
---------------------------------------------------
0 | net   | Sequential       | 1.6 M  | train
1 | huber | SmoothL1Loss     | 0      | train
2 | rmse  | MeanSquaredError | 0      | train
---------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.213     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Epoch 104: 100%|██████████| 41/41 [00:00<00:00, 63.35it/s, v_num=19, train_loss=0.273, train_rmse=0.898, val_loss=0.422, val_rmse=1.360] 
Fold 5 RMSE: 1.3792


In [129]:
print("\nOOF RMSE:", np.sqrt(((oof_pred - y) ** 2).mean()).round(4))


OOF RMSE: 1.402
