In [None]:
import os

import numpy as np
import pandas as pd

# Sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

# CatBoost
from catboost import CatBoostRegressor

# RDKit
from rdkit import Chem
from rdkit.Chem import AllChem

# PyTorch Lightning
import torch
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl

# MoLeR wrapper
from molecule_generation.wrapper import load_model_from_directory

In [None]:
# ========== Paths ==========
# The project root defaults to the original checkout location and can be
# overridden with the SIBUR_HACK_ROOT environment variable, so the notebook can
# run on another machine without editing this cell.
PROJECT_ROOT = os.environ.get("SIBUR_HACK_ROOT", "/home/oleg28/shteyn/vlad/SIBUR_HACK")
TRAIN_CSV = f"{PROJECT_ROOT}/data/train_data_all_descriptors.csv"
TEST_CSV  = f"{PROJECT_ROOT}/data/test_data_all_descriptors.csv"
MODEL_DIR = f"{PROJECT_ROOT}/moler/molecule-generation/molecule_generation/model_checkpoint"

In [None]:
# ========== 1. Load and clean ==========
df_train = pd.read_csv(TRAIN_CSV)
df_test  = pd.read_csv(TEST_CSV)

# Drop every descriptor column that has a NaN in train. The key columns are
# protected: dropping 'LogP' / 'ID' / 'SMILES' as a "NaN column" would break
# the later cells, so rows with a missing key are removed instead.
key_cols = ['ID', 'SMILES', 'LogP']
has_nan = df_train.isna().any()
na_cols = [col for col in df_train.columns[has_nan] if col not in key_cols]
df_train = df_train.drop(columns=na_cols)
# errors='ignore': a column that exists only in train must not raise here.
df_test = df_test.drop(columns=na_cols, errors='ignore')
# After the column drop only key columns can still hold NaN.
df_train = df_train.dropna()

# Keep one row per ID and one row per SMILES.
# NOTE(review): de-duplicating the test set by SMILES removes rows (and their
# IDs) from the final submission - confirm this is intended.
df_train = df_train.drop_duplicates(subset=['ID']).drop_duplicates(subset=['SMILES'])
df_test  = df_test.drop_duplicates(subset=['ID']).drop_duplicates(subset=['SMILES'])

In [None]:
# ========== 2. Tabular descriptors ==========
y = df_train['LogP'].values
X = (
    df_train
    .drop(columns=['ID', 'SMILES', 'LogP', 'mol'], errors='ignore')
    .select_dtypes(include=[np.number])
)

# Feature filtering, decided on the train set only:
#   - more than 50% missing values
#   - variance below 1e-5
#   - absolute correlation above 0.95 with an earlier column
miss = X.isna().mean()
drop_m = miss[miss > 0.5].index.tolist()
vari = X.var()
drop_v = vari[vari < 1e-5].index.tolist()
corr = X.corr().abs()
# Strict upper triangle, so each correlated pair is counted once.
upper = corr.where(np.triu(np.ones(corr.shape), 1).astype(bool))
drop_c = [c for c in upper.columns if any(upper[c] > 0.95)]
to_drop = set(drop_m + drop_v + drop_c)
X_filt = X.drop(columns=list(to_drop))

# Imputation and scaling (fitted on train).
imp = SimpleImputer(strategy='median')
X_desc = imp.fit_transform(X_filt)
sc  = StandardScaler()
X_desc = sc.fit_transform(X_desc)

# Test features: take exactly the columns the imputer/scaler were fitted on, in
# the same order. Selecting test columns independently (by dtype) can yield a
# different column set or order than train; a missing column raises KeyError
# here instead of silently misaligning features.
X_test_tab = df_test.loc[:, X_filt.columns]
X_test_tab = sc.transform(imp.transform(X_test_tab))


In [None]:
# ========== 3. SMILES-derived features ==========
# MoLeR embeddings
def generate_moler_embeddings(smiles_list):
    """Encode SMILES strings with the MoLeR model stored in MODEL_DIR.

    The model is opened with load_model_from_directory (num_workers=8,
    beam_size=1) as a context manager, its encode() is called on
    ``smiles_list`` and the returned items are combined with np.stack.
    """
    with load_model_from_directory(MODEL_DIR, num_workers=8, beam_size=1) as encoder:
        latents = encoder.encode(smiles_list)
    stacked = np.stack(latents)
    return stacked
# SMILES strings in row order for train and test, then their MoLeR embeddings
# (train is processed first, test second).
smiles_train, smiles_test = (frame['SMILES'].tolist() for frame in (df_train, df_test))
X_emb, X_emb_te = (generate_moler_embeddings(batch) for batch in (smiles_train, smiles_test))

# Morgan fingerprints
def mol2fp(smiles_list, radius=2, nBits=2048):
    """Convert SMILES strings to Morgan fingerprint bit vectors.

    Args:
        smiles_list: iterable of SMILES strings.
        radius: Morgan radius passed to RDKit.
        nBits: fingerprint length passed to RDKit.

    Returns:
        Float array with one row per input SMILES; for empty input an array of
        shape (0, nBits).

    Raises:
        ValueError: if a SMILES string cannot be parsed by RDKit.
    """
    fps = []
    for pos, sm in enumerate(smiles_list):
        m = Chem.MolFromSmiles(sm)
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # message that names the offending entry instead of an opaque RDKit error.
        if m is None:
            raise ValueError(f"mol2fp: cannot parse SMILES at position {pos}: {sm!r}")
        v = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=nBits)
        fps.append(np.array(v, dtype=float))
    if not fps:
        # np.stack rejects an empty list; return an empty matrix instead.
        return np.zeros((0, nBits), dtype=float)
    return np.stack(fps)
# Fingerprint matrices for train and test (default radius and nBits).
X_fp, X_fp_te = (mol2fp(smiles) for smiles in (smiles_train, smiles_test))

In [None]:
# ========== 4. Lightning MLP ==========
class LitMLP(pl.LightningModule):
    """MLP regressor (in_dim -> 512 -> 256 -> 1, ReLU activations) trained with MSE loss."""

    def __init__(self, in_dim):
        super().__init__()
        layers = [
            torch.nn.Linear(in_dim, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 1),
        ]
        self.net = torch.nn.Sequential(*layers)
        self.loss_fn = torch.nn.MSELoss()

    def forward(self, x):
        """Return one prediction per row; a list/tuple input is reduced to its first element."""
        inputs = x[0] if isinstance(x, (tuple, list)) else x
        return self.net(inputs).squeeze(1)

    def _mse_on_batch(self, batch):
        """Compute the MSE loss for a (features, target) batch."""
        features, target = batch
        return self.loss_fn(self(features), target)

    def training_step(self, batch, batch_idx):
        loss = self._mse_on_batch(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        self.log('val_loss', self._mse_on_batch(batch), prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)
    

In [None]:
# ========== 5. 5-fold CV of the base models ==========
SEED = 42
N_SPLITS = 5
USE_GPU = torch.cuda.is_available()

# Seed the RNGs so MLP initialisation and DataLoader shuffling are reproducible.
pl.seed_everything(SEED)


def _predict_mlp(trainer, model, features):
    """Run trainer.predict over `features` and return the predictions as a 1-D numpy array."""
    loader = DataLoader(TensorDataset(torch.Tensor(features)), batch_size=64)
    batches = trainer.predict(model, loader)
    return torch.cat(batches).cpu().numpy()


def _fit_predict_mlp(x_tr, y_tr, x_va, y_va, x_te):
    """Train a fresh LitMLP on one fold; return (validation predictions, test predictions)."""
    dl_tr = DataLoader(TensorDataset(torch.Tensor(x_tr), torch.Tensor(y_tr)),
                       batch_size=64, shuffle=True)
    dl_va = DataLoader(TensorDataset(torch.Tensor(x_va), torch.Tensor(y_va)), batch_size=64)
    model = LitMLP(x_tr.shape[1])
    # A new Trainer per model: a Trainer keeps its loop progress after fit(), so
    # reusing one that already reached max_epochs for a second model can stop
    # immediately and leave that model untrained.
    trainer = pl.Trainer(max_epochs=30, logger=False, enable_checkpointing=False,
                         accelerator='gpu' if USE_GPU else 'cpu', devices=1)
    trainer.fit(model, dl_tr, dl_va)
    return _predict_mlp(trainer, model, x_va), _predict_mlp(trainer, model, x_te)


folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
oof_desc = np.zeros(len(X_desc))
oof_emb  = np.zeros(len(X_emb))
oof_fp   = np.zeros(len(X_fp))
pred_desc_test = np.zeros(len(df_test))
pred_emb_test  = np.zeros(len(df_test))
pred_fp_test   = np.zeros(len(df_test))

for i, (tr, va) in enumerate(folds.split(X_desc), 1):
    ytr, yva = y[tr], y[va]

    # CatBoost on tabular descriptors. The device follows the same availability
    # check as the Lightning trainer instead of being hard-coded to GPU.
    # NOTE(review): early stopping uses the validation fold, so the OOF score
    # of this model is slightly optimistic.
    cb = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                           loss_function='RMSE', task_type='GPU' if USE_GPU else 'CPU',
                           verbose=False)
    cb.fit(X_desc[tr], ytr, eval_set=(X_desc[va], yva), early_stopping_rounds=50)
    oof_desc[va] = cb.predict(X_desc[va])
    pred_desc_test += cb.predict(X_test_tab)

    # Lightning MLP on MoLeR embeddings
    oof_emb[va], fold_te_emb = _fit_predict_mlp(X_emb[tr], ytr, X_emb[va], yva, X_emb_te)
    pred_emb_test += fold_te_emb

    # Lightning MLP on Morgan fingerprints
    oof_fp[va], fold_te_fp = _fit_predict_mlp(X_fp[tr], ytr, X_fp[va], yva, X_fp_te)
    pred_fp_test += fold_te_fp

    print(f"Fold {i} RMSE desc={np.sqrt(mean_squared_error(yva,oof_desc[va])):.4f} "
          f"emb={np.sqrt(mean_squared_error(yva,oof_emb[va])):.4f} "
          f"fp={np.sqrt(mean_squared_error(yva,oof_fp[va])):.4f}")

# Average the per-fold test predictions.
pred_desc_test /= folds.get_n_splits()
pred_emb_test  /= folds.get_n_splits()
pred_fp_test   /= folds.get_n_splits()
print("OOF RMSE base:",
      np.sqrt(mean_squared_error(y, oof_desc)),
      np.sqrt(mean_squared_error(y, oof_emb)),
      np.sqrt(mean_squared_error(y, oof_fp)))

In [None]:
# ========== 6. Stacking ==========
# One column per base model: OOF predictions for train, fold-averaged
# predictions for test.
X_stack_tr = np.vstack([oof_desc, oof_emb, oof_fp]).T
X_stack_te = np.vstack([pred_desc_test, pred_emb_test, pred_fp_test]).T

# Out-of-fold estimate for the meta-learner: each row is predicted by a Ridge
# model that did not see it. Scoring a model on the rows it was fitted on would
# report an in-sample error under the "OOF" label.
oof_meta = np.zeros(len(y))
for tr, va in KFold(n_splits=5, shuffle=True, random_state=42).split(X_stack_tr):
    oof_meta[va] = Ridge().fit(X_stack_tr[tr], y[tr]).predict(X_stack_tr[va])
print("OOF RMSE meta:", np.sqrt(mean_squared_error(y, oof_meta)))

# The final meta-model is fitted on all OOF rows and used for the submission.
meta = Ridge()
meta.fit(X_stack_tr, y)

# Final submission
submission = pd.DataFrame({'ID': df_test['ID'].values, 'LogP': meta.predict(X_stack_te)})
submission.to_csv('submission_stacked.csv', index=False)
print("Готов файл submission_stacked.csv")

In [None]:
# 1. Cleaned source data
for frame, out_name in ((df_train, "train_clean.csv"), (df_test, "test_clean.csv")):
    frame.to_csv(out_name, index=False)

# 2. Filtered tabular descriptors
feature_names = X_filt.columns
X_filt_df = pd.DataFrame(X_filt, columns=feature_names)
X_filt_df.to_csv("X_filtered_descriptors.csv", index=False)

# 3. Preprocessed descriptors (imputation + scaling)
X_desc_df = pd.DataFrame(X_desc, columns=feature_names)
X_desc_df.to_csv("X_desc_scaled.csv", index=False)

# 4. MoLeR embeddings and fingerprints
for npy_name, matrix in (
    ("X_emb.npy", X_emb),
    ("X_emb_te.npy", X_emb_te),
    ("X_fp.npy", X_fp),
    ("X_fp_te.npy", X_fp_te),
):
    np.save(npy_name, matrix)