In [43]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

# RDKit для фингерпринтов
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

# sklearn для селекции и предобработки
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing     import RobustScaler, StandardScaler
from sklearn.decomposition     import PCA
from sklearn.ensemble          import RandomForestRegressor
from sklearn.inspection        import permutation_importance

# molecule-generation для эмбеддингов
from molecule_generation import load_model_from_directory

# CatBoost и метрики
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Устройство для MoLeR (если используется GPU)
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device for embeddings:", device)

2025-04-19 18:40:29.129741: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Using device for embeddings: cuda


In [60]:
# Пути
TRAIN_RAW = "/home/oleg28/shteyn/vlad/SIBUR_HACK/data/train_data_all_descriptors.csv"
TEST_RAW  = "/home/oleg28/shteyn/vlad/SIBUR_HACK/data/test_data_all_descriptors.csv"
MODEL_DIR = "/home/oleg28/shteyn/vlad/SIBUR_HACK/moler/molecule-generation/molecule_generation/model_checkpoint"
OUTPUT    = "/home/oleg28/shteyn/vlad/SIBUR_HACK/processed"
os.makedirs(OUTPUT, exist_ok=True)

### I. Базовый EDA

1. Считаем пропуски и дубликаты по SMILES.

2. Удаляем полностью дублирующиеся молекулы.

In [52]:
df_train = pd.read_csv(TRAIN_RAW)
df_test  = pd.read_csv(TEST_RAW)

In [53]:
# Пропуски
print("Missing in train:\n", df_train.isnull().sum().loc[lambda x: x>0])
print("Missing in test:\n",  df_test .isnull().sum().loc[lambda x: x>0])

Missing in train:
 Series([], dtype: int64)
Missing in test:
 Series([], dtype: int64)


In [54]:
# Дубликаты SMILES
dups = df_train.duplicated("SMILES", keep=False).sum()
print("SMILES duplicates in train:", dups)

df_train = df_train.drop_duplicates("SMILES").reset_index(drop=True)
print("Train shape after deduplication:", df_train.shape)

SMILES duplicates in train: 3053
Train shape after deduplication: (10898, 105)


### II. Добавление Morgan‑fingerprints  
  
`Fingerprint` — это битовая маска, отражающая окружение атомов.  
Здесь используем `1024‑битный` `Morgan‑fingerprint` `(radius=2)`.

In [55]:
def smiles_to_fp(smi, radius=2, n_bits=1024):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return np.zeros(n_bits, dtype=int)
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, n_bits)
    arr = np.zeros(n_bits, dtype=int)
    arr[list(fp.GetOnBits())] = 1
    return arr

In [56]:
# Генерация
fps_tr = np.vstack([smiles_to_fp(s) for s in tqdm(df_train["SMILES"], desc="Morgan train")])
fps_te = np.vstack([smiles_to_fp(s) for s in tqdm(df_test ["SMILES"], desc="Morgan test" )])

Morgan train: 100%|██████████| 10898/10898 [00:01<00:00, 8334.38it/s]
Morgan test: 100%|██████████| 2630/2630 [00:00<00:00, 10053.83it/s]


In [57]:
# DataFrames
fp_cols      = [f"FP_{i}" for i in range(fps_tr.shape[1])]
df_fp_train  = pd.DataFrame(fps_tr, columns=fp_cols)
df_fp_test   = pd.DataFrame(fps_te,  columns=fp_cols)

In [58]:
# Склейка и сохранение
train_morgan = pd.concat([df_train[["ID","LogP"]].reset_index(drop=True), df_fp_train], axis=1)
test_morgan  = pd.concat([df_test [["ID"]].reset_index(drop=True), df_fp_test ], axis=1)
train_morgan.to_csv(f"{OUTPUT}/train_morgan.csv", index=False)
test_morgan .to_csv(f"{OUTPUT}/test_morgan.csv" , index=False)
print("Morgan shapes:", train_morgan.shape, test_morgan.shape)

Morgan shapes: (10898, 1026) (2630, 1025)


### III. MoLeR Embeddings

In [64]:
print("Загружаем модель из:", MODEL_DIR)
with load_model_from_directory(MODEL_DIR, num_workers=8, beam_size=1) as model:
    smiles_list = df_train["SMILES"].tolist()
    # Генерируем эмбеддинги (возвращается List[np.ndarray])
    emb_tr = model.encode(smiles_list)
    print("Получено эмбеддингов для train:", len(emb_tr))
    smiles_list_te = df_test["SMILES"].tolist()
    emb_te = model.encode(smiles_list_te)
    print("Получено эмбеддингов для test:", len(emb_te))


Загружаем модель из: /home/oleg28/shteyn/vlad/SIBUR_HACK/moler/molecule-generation/molecule_generation/model_checkpoint
Loading a trained model from: /home/oleg28/shteyn/vlad/SIBUR_HACK/moler/molecule-generation/molecule_generation/model_checkpoint/GNN_Edge_MLP_MoLeR__2022-02-24_07-16-23_best.pkl


2025-04-19 18:52:00.245616: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NOT_INITIALIZED: initialization error
2025-04-19 18:52:00.245649: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: DevServ
2025-04-19 18:52:00.260203: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: DevServ
2025-04-19 18:52:00.260316: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 535.183.1
2025-04-19 18:52:00.260346: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 535.183.1
2025-04-19 18:52:00.260351: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 535.183.1
2025-04-19 18:52:00.261004: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in perform

2025-04-19 18:52:00.320351: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NOT_INITIALIZED: initialization error
2025-04-19 18:52:00.320380: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: DevServ
2025-04-19 18:52:00.320386: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: DevServ
2025-04-19 18:52:00.320464: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 535.183.1
2025-04-19 18:52:00.320489: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 535.183.1
2025-04-19 18:52:00.320493: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 535.183.1
2025-04-19 18:52:00.320773: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in perform

Получено эмбеддингов для train: 10898


opriate compiler flags.


Получено эмбеддингов для test: 2630


In [65]:
# Собираем массивы
emb_tr = np.vstack(emb_tr)
emb_te = np.vstack(emb_te)
print("Raw embed shapes:", emb_tr.shape, emb_te.shape)

Raw embed shapes: (10898, 512) (2630, 512)


In [66]:
# PCA => 50 компонент
pca      = PCA(n_components=128, random_state=42)
pca_tr   = pca.fit_transform(emb_tr)
pca_te   = pca.transform(emb_te)

embed_cols     = [f"embed_pca_{i}" for i in range(128)]
df_embed_train = pd.DataFrame(pca_tr, columns=embed_cols)
df_embed_test  = pd.DataFrame(pca_te, columns=embed_cols)
df_embed_train["ID"] = df_train["ID"].values
df_embed_test ["ID"] = df_test ["ID"].values

In [67]:
# Стандартизация
ss = StandardScaler().fit(df_embed_train[embed_cols])
df_embed_train[embed_cols] = ss.transform(df_embed_train[embed_cols])
df_embed_test [embed_cols]   = ss.transform(df_embed_test [embed_cols])

In [68]:
# Сохраняем
train_embed = df_embed_train[["ID"]+embed_cols]
test_embed  = df_embed_test [["ID"]+embed_cols]
train_embed.to_csv(f"{OUTPUT}/train_embed.csv", index=False)
test_embed .to_csv(f"{OUTPUT}/test_embed.csv" , index=False)
print("Embed shapes:", train_embed.shape, test_embed.shape)

Embed shapes: (10898, 129) (2630, 129)


### IV. Очистка и удаление признаков с нулевой дисперсией
  
- Убираем колонки `SMILES` и `mol` (RDKit‑объект).  
- Сохраняем `ID` и `LogP`.  
- Удаляем признаки, где вариация равна нулю.

In [69]:
# Объединяем фичи
train_full = train_morgan.merge(train_embed, on="ID")
test_full  = test_morgan .merge(test_embed,  on="ID")

In [70]:
# Zero‑variance filter
feat_cols = train_full.columns.drop(["ID","LogP"])
vt = VarianceThreshold(0.0)
vt.fit(train_full[feat_cols])

In [71]:
keep_cols    = feat_cols[vt.get_support()]
removed_zero = feat_cols[~vt.get_support()].tolist()
print("Removed zero‑var features:", removed_zero)

Removed zero‑var features: []


In [72]:
train_cl = pd.concat([train_full[["ID","LogP"]], train_full[keep_cols]], axis=1)
test_cl  = pd.concat([test_full [["ID"]]        , test_full [keep_cols]], axis=1)
print("After cleaning:", train_cl.shape, test_cl.shape)

After cleaning: (10898, 1154) (2630, 1153)


### V. Удаление сильно коррелированных признаков  
  
Убираем избыточные фичи с |corr| > 0.90.

In [73]:
corr_mat = train_cl.drop(columns=["ID","LogP"]).corr().abs()
upper    = corr_mat.where(np.triu(np.ones(corr_mat.shape), k=1).astype(bool))
to_drop  = [c for c in upper.columns if upper[c].gt(0.90).any()]
print("Dropping correlated:", len(to_drop))

Dropping correlated: 1


In [74]:
train_dc = train_cl.drop(columns=to_drop)
test_dc  = test_cl .drop(columns=to_drop)
print("After drop corr:", train_dc.shape, test_dc.shape)

After drop corr: (10898, 1153) (2630, 1152)


### VI. Лог‑преобразование и масштабирование  
  
- Выявляем фичи со skewness > 1.0.  
- Применяем `log1p` (с `clip(lower=0)`).  
- Масштабируем через `RobustScaler`.

In [75]:
# Разделяем
train_id = train_dc["ID"]
y_train  = train_dc["LogP"]
X_train  = train_dc.drop(columns=["ID","LogP"])
X_test   = test_dc .drop(columns=["ID"])

In [76]:
# Логируем сильно скошенные >1.0
skews  = X_train.skew().abs()
to_log = skews[skews > 1.0].index.tolist()
print("Log1p on:", to_log)

Log1p on: ['FP_0', 'FP_1', 'FP_2', 'FP_3', 'FP_4', 'FP_5', 'FP_6', 'FP_7', 'FP_8', 'FP_9', 'FP_10', 'FP_11', 'FP_12', 'FP_13', 'FP_14', 'FP_15', 'FP_16', 'FP_17', 'FP_18', 'FP_19', 'FP_20', 'FP_21', 'FP_22', 'FP_23', 'FP_24', 'FP_25', 'FP_26', 'FP_27', 'FP_28', 'FP_29', 'FP_30', 'FP_31', 'FP_32', 'FP_34', 'FP_35', 'FP_36', 'FP_37', 'FP_38', 'FP_39', 'FP_40', 'FP_41', 'FP_42', 'FP_43', 'FP_44', 'FP_45', 'FP_46', 'FP_47', 'FP_48', 'FP_49', 'FP_50', 'FP_51', 'FP_52', 'FP_53', 'FP_54', 'FP_55', 'FP_56', 'FP_57', 'FP_58', 'FP_59', 'FP_60', 'FP_61', 'FP_62', 'FP_63', 'FP_65', 'FP_66', 'FP_67', 'FP_68', 'FP_69', 'FP_70', 'FP_71', 'FP_72', 'FP_73', 'FP_74', 'FP_75', 'FP_76', 'FP_77', 'FP_78', 'FP_79', 'FP_81', 'FP_82', 'FP_83', 'FP_84', 'FP_85', 'FP_86', 'FP_87', 'FP_88', 'FP_89', 'FP_90', 'FP_91', 'FP_92', 'FP_93', 'FP_94', 'FP_95', 'FP_96', 'FP_97', 'FP_98', 'FP_99', 'FP_100', 'FP_101', 'FP_102', 'FP_103', 'FP_104', 'FP_105', 'FP_106', 'FP_107', 'FP_108', 'FP_109', 'FP_110', 'FP_111', 'FP_11

In [77]:
for c in to_log:
    X_train[c] = np.log1p(X_train[c].clip(lower=0))
    X_test [c] = np.log1p(X_test [c].clip(lower=0))

In [78]:
# RobustScaler
rs = RobustScaler().fit(X_train)
X_train_s = pd.DataFrame(rs.transform(X_train), columns=X_train.columns)
X_test_s  = pd.DataFrame(rs.transform(X_test),  columns=X_test.columns)

In [79]:
train_pre = pd.concat([train_id, X_train_s, y_train], axis=1)
test_pre  = pd.concat([test_dc[["ID"]], X_test_s], axis=1)
print("After log+scale:", train_pre.shape, test_pre.shape)

After log+scale: (10898, 1153) (2630, 1152)


### VII. Анализ важности признаков и отбор  
  
1. Обучим RandomForest на полном наборе.

2. Вычислим Permutation Importance (чтобы увидеть не только встроенную «важность» дерева, но и насколько падёт качество при «перемешивании» каждого признака).

3. Выберем порог:

Уберём фичи с отрицательной или близкой к нулю важностью.

Оставим, скажем, топ‑100 или все > 0.001 по mean importance.

In [80]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

In [81]:
X = train_pre.drop(columns=["ID","LogP"])
y = train_pre["LogP"]

In [82]:
# Обучаем RF (быстрее — меньше деревьев, но глубже)
rf = RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42, n_jobs=-1)
rf.fit(X, y)

In [83]:
perm = permutation_importance(rf, X, y, n_repeats=10, random_state=42, n_jobs=-1)
imp = pd.DataFrame({"feature":X.columns, "mean_imp":perm.importances_mean})
imp = imp.sort_values("mean_imp", ascending=False).reset_index(drop=True)
print("Top 20 features:\n", imp.head(20))

Top 20 features:
          feature  mean_imp
0    embed_pca_7  0.370756
1    embed_pca_2  0.274023
2    embed_pca_1  0.151682
3         FP_650  0.131520
4   embed_pca_12  0.113362
5    embed_pca_6  0.087028
6    embed_pca_5  0.066134
7         FP_659  0.026896
8    embed_pca_4  0.023775
9         FP_561  0.023475
10        FP_942  0.014005
11        FP_366  0.010766
12  embed_pca_65  0.010764
13        FP_875  0.010639
14   embed_pca_0  0.009168
15        FP_356  0.008235
16   embed_pca_8  0.007946
17  embed_pca_68  0.006690
18        FP_726  0.006380
19        FP_171  0.006244


In [84]:
selected = imp.loc[imp.mean_imp>0.001, "feature"].tolist()
print(f"Selecting {len(selected)}/{X.shape[1]} features")

Selecting 150/1151 features


In [85]:
train_final = pd.concat([train_pre[["ID","LogP"]], X[selected]], axis=1)
test_final  = pd.concat([test_pre [["ID"]],        test_pre [selected]], axis=1)

# Сохраняем финальные таблицы
train_final.to_csv(f"{OUTPUT}/train_final.csv", index=False)
test_final .to_csv(f"{OUTPUT}/test_final.csv" , index=False)
print("Final shapes:", train_final.shape, test_final.shape)

Final shapes: (10898, 152) (2630, 151)


### VII. Обучение CatBoost и оценка RMSE

In [86]:
X_tr, X_val, y_tr, y_val = train_test_split(
    train_final.drop(columns=["ID","LogP"]),
    train_final["LogP"],
    test_size=0.2, random_state=42
)

In [87]:
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    eval_metric="RMSE",
    random_seed=42,
    verbose=False
)
model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

<catboost.core.CatBoostRegressor at 0x7c9cd1267d90>

In [88]:
y_pred = model.predict(X_val)
rmse   = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

Validation RMSE: 1.4136
