In [1]:
import numpy as np
import pandas as pd

# Sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

# CatBoost
from catboost import CatBoostRegressor

# RDKit
from rdkit import Chem
from rdkit.Chem import AllChem

# PyTorch Lightning
import torch
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl

# MoLeR wrapper
from molecule_generation.wrapper import load_model_from_directory

2025-04-20 19:48:00.469814: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# ========== Пути ==========
TRAIN_CSV = "/home/oleg28/shteyn/vlad/SIBUR_HACK/data/train_data_all_descriptors.csv"
TEST_CSV  = "/home/oleg28/shteyn/vlad/SIBUR_HACK/data/test_data_all_descriptors.csv"
MODEL_DIR = "/home/oleg28/shteyn/vlad/SIBUR_HACK/moler/molecule-generation/molecule_generation/model_checkpoint"

In [3]:
# ========== 1. Загрузка и очистка ==========
df_train = pd.read_csv(TRAIN_CSV)
df_test  = pd.read_csv(TEST_CSV)

# Удаляем столбцы с NaN и строки
na_cols = df_train.columns[df_train.isna().any()].tolist()
df_train.drop(columns=na_cols, inplace=True)
df_test .drop(columns=na_cols, inplace=True)
df_train.dropna(inplace=True)

# Уникальность по ID и SMILES
for df in (df_train, df_test):
    df.drop_duplicates(subset=['ID'], inplace=True)
    df.drop_duplicates(subset=['SMILES'], inplace=True)

In [4]:
# ========== 2. Табличные дескрипторы ==========
y = df_train['LogP'].values
X = df_train.drop(columns=['ID','SMILES','LogP','mol'], errors='ignore').select_dtypes(include=[np.number])
# Фильтрация
miss = X.isna().mean(); drop_m = miss[miss>0.5].index.tolist()
vari = X.var(); drop_v = vari[vari<1e-5].index.tolist()
corr = X.corr().abs(); upper = corr.where(np.triu(np.ones(corr.shape),1).astype(bool))
drop_c = [c for c in upper.columns if any(upper[c]>0.95)]
to_drop = set(drop_m + drop_v + drop_c)
X_filt = X.drop(columns=list(to_drop))

# Импутация и нормализация
imp = SimpleImputer(strategy='median')
X_desc = imp.fit_transform(X_filt)
sc  = StandardScaler()
X_desc = sc.fit_transform(X_desc)

# Табличные фичи для теста
X_test_tab = df_test.drop(columns=['ID','SMILES'], errors='ignore').select_dtypes(include=[np.number])
X_test_tab = X_test_tab.drop(columns=list(to_drop), errors='ignore')
X_test_tab = sc.transform(imp.transform(X_test_tab))


In [5]:
# ========== 3. Генерация SMILES-фич ==========
# MoLeR-эмбеддинги
def generate_moler_embeddings(smiles_list):
    with load_model_from_directory(MODEL_DIR, num_workers=8, beam_size=1) as moler:
        emb_list = moler.encode(smiles_list)
    return np.stack(emb_list)
smiles_train = df_train['SMILES'].tolist()
smiles_test  = df_test ['SMILES'].tolist()
X_emb = generate_moler_embeddings(smiles_train)
X_emb_te = generate_moler_embeddings(smiles_test)

# Morgan Fingerprints
def mol2fp(smiles_list, radius=2, nBits=2048):
    fps=[]
    for sm in smiles_list:
        m = Chem.MolFromSmiles(sm)
        v = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=nBits)
        fps.append(np.array(v, dtype=float))
    return np.stack(fps)
X_fp    = mol2fp(smiles_train)
X_fp_te = mol2fp(smiles_test)

Loading a trained model from: /home/oleg28/shteyn/vlad/SIBUR_HACK/moler/molecule-generation/molecule_generation/model_checkpoint/GNN_Edge_MLP_MoLeR__2022-02-24_07-16-23_best.pkl


2025-04-20 19:48:01.735838: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:01.751568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:01.755768: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:01.755904: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:01.756345: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized wit

2025-04-20 19:48:01.829566: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-20 19:48:01.830832: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:01.831091: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:01.832717: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be 

Loading a trained model from: /home/oleg28/shteyn/vlad/SIBUR_HACK/moler/molecule-generation/molecule_generation/model_checkpoint/GNN_Edge_MLP_MoLeR__2022-02-24_07-16-23_best.pkl


2025-04-20 19:48:13.946948: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:13.959171: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:13.967751: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:13.967890: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-04-20 19:48:13.968291: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized wit

In [8]:
# ========== 4. Lightning MLP ==========
class LitMLP(pl.LightningModule):
    def __init__(self, in_dim):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(in_dim, 512), torch.nn.ReLU(),
            torch.nn.Linear(512, 256), torch.nn.ReLU(),
            torch.nn.Linear(256, 1)
        )
        self.loss_fn = torch.nn.MSELoss()
    def forward(self, x):
        # Если передаётся список/кортеж, извлекаем первый элемент
        if isinstance(x, (tuple, list)):
            x = x[0]
        return self.net(x).squeeze(1)
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.log('train_loss', loss)
        return loss
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.log('val_loss', loss, prog_bar=True)
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)
    

In [9]:
# ========== 5. 5-Fold CV базовых моделей ==========
folds = KFold(n_splits=5, shuffle=True, random_state=42)
oof_desc = np.zeros(len(X_desc))
oof_emb  = np.zeros(len(X_emb))
oof_fp   = np.zeros(len(X_fp))
pred_desc_test = np.zeros(len(df_test))
pred_emb_test  = np.zeros(len(df_test))
pred_fp_test   = np.zeros(len(df_test))

for i, (tr, va) in enumerate(folds.split(X_desc), 1):
    # Splits
    dtr, dva = X_desc[tr], X_desc[va]
    etr, eva = X_emb[tr],   X_emb[va]
    ftr, fva = X_fp[tr],    X_fp[va]
    ytr, yva = y[tr],       y[va]

    # CatBoost
    cb = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                           loss_function='RMSE', task_type='GPU', verbose=False)
    cb.fit(dtr, ytr, eval_set=(dva, yva), early_stopping_rounds=50)
    oof_desc[va] = cb.predict(dva)
    pred_desc_test += cb.predict(X_test_tab)

    # Lightning MLP on embeddings
    ds_tr = TensorDataset(torch.Tensor(etr), torch.Tensor(ytr))
    ds_va = TensorDataset(torch.Tensor(eva), torch.Tensor(yva))
    dl_tr = DataLoader(ds_tr, batch_size=64, shuffle=True)
    dl_va = DataLoader(ds_va, batch_size=64)
    model_e = LitMLP(X_emb.shape[1])
    trainer=pl.Trainer(max_epochs=30, logger=False, enable_checkpointing=False,
                       accelerator='gpu' if torch.cuda.is_available() else 'cpu', devices=1)
    trainer.fit(model_e, dl_tr, dl_va)
    preds_e = trainer.predict(model_e, dl_va)
    preds_e = torch.cat([p if isinstance(p, torch.Tensor) else torch.tensor(p) for p in preds_e]).cpu().numpy()
    oof_emb[va] = preds_e
    # test
    test_dl_e = DataLoader(TensorDataset(torch.Tensor(X_emb_te)), batch_size=64)
    preds_te_e = trainer.predict(model_e, test_dl_e)
    preds_te_e = torch.cat([p if isinstance(p, torch.Tensor) else torch.tensor(p) for p in preds_te_e]).cpu().numpy()
    pred_emb_test += preds_te_e

    # Lightning MLP on fingerprints
    ds_trf = TensorDataset(torch.Tensor(ftr), torch.Tensor(ytr))
    ds_vaf = TensorDataset(torch.Tensor(fva), torch.Tensor(yva))
    dl_trf = DataLoader(ds_trf, batch_size=64, shuffle=True)
    dl_vaf = DataLoader(ds_vaf, batch_size=64)
    model_f = LitMLP(X_fp.shape[1])
    trainer.fit(model_f, dl_trf, dl_vaf)
    preds_f = trainer.predict(model_f, dl_vaf)
    preds_f = torch.cat([p if isinstance(p, torch.Tensor) else torch.tensor(p) for p in preds_f]).cpu().numpy()
    oof_fp[va] = preds_f
    # test
    test_dl_f = DataLoader(TensorDataset(torch.Tensor(X_fp_te)), batch_size=64)
    preds_te_f = trainer.predict(model_f, test_dl_f)
    preds_te_f = torch.cat([p if isinstance(p, torch.Tensor) else torch.tensor(p) for p in preds_te_f]).cpu().numpy()
    pred_fp_test += preds_te_f

    print(f"Fold {i} RMSE desc={np.sqrt(mean_squared_error(yva,oof_desc[va])):.4f} "
          f"emb={np.sqrt(mean_squared_error(yva,oof_emb[va])):.4f} "
          f"fp={np.sqrt(mean_squared_error(yva,oof_fp[va])):.4f}")

# average test preds
pred_desc_test /= folds.get_n_splits()
pred_emb_test  /= folds.get_n_splits()
pred_fp_test   /= folds.get_n_splits()
print("OOF RMSE base:",
      np.sqrt(mean_squared_error(y, oof_desc)),
      np.sqrt(mean_squared_error(y, oof_emb)),
      np.sqrt(mean_squared_error(y, oof_fp)))

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 394 K  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
394 K     Trainable params
0         Non-trainable params
394 K     Total params
1.577     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 505.46it/s, val_loss=2.450]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 504.45it/s, val_loss=2.450]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1176.98it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1967.27it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 1.2 M  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.723     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

`Trainer.fit` stopped: `max_epochs=30` reached.




LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1367.43it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1536.99it/s]
Fold 1 RMSE desc=1.3771 emb=1.5655 fp=4.1972


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 394 K  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
394 K     Trainable params
0         Non-trainable params
394 K     Total params
1.577     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 507.36it/s, val_loss=1.620]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 506.57it/s, val_loss=1.620]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1653.05it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1725.58it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 1.2 M  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.723     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

`Trainer.fit` stopped: `max_epochs=30` reached.




LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1369.74it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1534.94it/s]
Fold 2 RMSE desc=1.2938 emb=1.2717 fp=4.1395


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 394 K  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
394 K     Trainable params
0         Non-trainable params
394 K     Total params
1.577     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 512.52it/s, val_loss=1.930]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 511.74it/s, val_loss=1.930]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1651.71it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1886.92it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 1.2 M  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.723     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

`Trainer.fit` stopped: `max_epochs=30` reached.




LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1538.08it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1753.11it/s]
Fold 3 RMSE desc=1.3292 emb=1.3903 fp=4.2016


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 394 K  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
394 K     Trainable params
0         Non-trainable params
394 K     Total params
1.577     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 513.99it/s, val_loss=3.050]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 513.18it/s, val_loss=3.050]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1647.96it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1896.81it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 1.2 M  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.723     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                             

`Trainer.fit` stopped: `max_epochs=30` reached.




LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1536.20it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1693.17it/s]
Fold 4 RMSE desc=1.6620 emb=1.7462 fp=4.1488


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 394 K  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
394 K     Trainable params
0         Non-trainable params
394 K     Total params
1.577     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 455.98it/s, val_loss=2.770]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 137/137 [00:00<00:00, 455.21it/s, val_loss=2.770]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/oleg28/anaconda3/envs/moler-env/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1484.92it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1682.08it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | net     | Sequential | 1.2 M  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.723     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

`Trainer.fit` stopped: `max_epochs=30` reached.




LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 35/35 [00:00<00:00, 1409.09it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 42/42 [00:00<00:00, 1588.49it/s]
Fold 5 RMSE desc=1.4884 emb=1.6646 fp=4.2537
OOF RMSE base: 1.4362543236947456 1.5375651476415355 4.1883648770841875


In [10]:
# ========== 6. Стэкинг ==========
X_stack_tr = np.vstack([oof_desc, oof_emb, oof_fp]).T
X_stack_te = np.vstack([pred_desc_test, pred_emb_test, pred_fp_test]).T
meta = Ridge()
meta.fit(X_stack_tr, y)
oof_meta = meta.predict(X_stack_tr)
print("OOF RMSE meta:", np.sqrt(mean_squared_error(y, oof_meta)))

# Финальный сабмит
submission = pd.DataFrame({'ID': df_test['ID'], 'LogP': meta.predict(X_stack_te)})
submission.to_csv('submission_stacked.csv', index=False)
print("Готов файл submission_stacked.csv")

OOF RMSE meta: 1.4025015491716006
Готов файл submission_stacked.csv


In [11]:
# 1. Очищенные исходные данные
df_train.to_csv("train_clean.csv", index=False)
df_test .to_csv("test_clean.csv",  index=False)

# 2. Отфильтрованные табличные дескрипторы
X_filt_df = pd.DataFrame(X_filt, columns=X_filt.columns)
X_filt_df.to_csv("X_filtered_descriptors.csv", index=False)

# 3. Препроцессированные (импутация + масштабирование)
X_desc_df = pd.DataFrame(X_desc, columns=X_filt.columns)
X_desc_df.to_csv("X_desc_scaled.csv", index=False)

# 4. MoLeR‑эмбеддинги и фингерпринты
np.save("X_emb.npy",    X_emb)
np.save("X_emb_te.npy", X_emb_te)
np.save("X_fp.npy",     X_fp)
np.save("X_fp_te.npy",  X_fp_te)