In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

# RDKit для фингерпринтов
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

# sklearn для селекции и предобработки
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing     import RobustScaler, StandardScaler
from sklearn.decomposition     import PCA
from sklearn.ensemble          import RandomForestRegressor
from sklearn.inspection        import permutation_importance

# molecule-generation для эмбеддингов
from molecule_generation import load_model_from_directory

# CatBoost и метрики
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Device for MoLeR (when a GPU is used).
# NOTE(review): `device` is only printed here; no later visible cell passes it
# to the embedding model — confirm whether it is still needed.
import torch
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device for embeddings:", device)

In [None]:
# Paths.
# The project root defaults to the original location but can be overridden
# with the SIBUR_HACK_DIR environment variable, so the notebook also runs on
# machines where that home directory does not exist.
BASE_DIR  = os.environ.get("SIBUR_HACK_DIR", "/home/oleg28/shteyn/vlad/SIBUR_HACK")
TRAIN_RAW = os.path.join(BASE_DIR, "data", "train_data_all_descriptors.csv")
TEST_RAW  = os.path.join(BASE_DIR, "data", "test_data_all_descriptors.csv")
MODEL_DIR = os.path.join(
    BASE_DIR, "moler", "molecule-generation", "molecule_generation", "model_checkpoint"
)
OUTPUT    = os.path.join(BASE_DIR, "processed")
# Every CSV written below goes into OUTPUT; create it up front.
os.makedirs(OUTPUT, exist_ok=True)

### I. Базовый EDA

1. Считаем пропуски и дубликаты по SMILES.

2. Удаляем полностью дублирующиеся молекулы.

In [None]:
# Read the raw train and test tables from the configured CSV paths.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_RAW, TEST_RAW))

In [None]:
# Missing values: report only the columns that contain at least one null.
na_train = df_train.isnull().sum()
na_test  = df_test.isnull().sum()
print("Missing in train:\n", na_train[na_train > 0])
print("Missing in test:\n",  na_test[na_test > 0])

In [None]:
# SMILES duplicates: keep=False flags every row of a duplicated group,
# so the count includes the first occurrence as well.
dup_mask = df_train.duplicated(subset="SMILES", keep=False)
dups = dup_mask.sum()
print("SMILES duplicates in train:", dups)

# Keep the first row per SMILES and rebuild a contiguous index.
df_train = df_train.drop_duplicates(subset="SMILES").reset_index(drop=True)
print("Train shape after deduplication:", df_train.shape)

### II. Добавление Morgan‑fingerprints  
  
`Fingerprint` — это битовая маска, отражающая окружение атомов.  
Здесь используем `1024‑битный` `Morgan‑fingerprint` `(radius=2)`.

In [None]:
def smiles_to_fp(smi, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Parameters
    ----------
    smi : str
        SMILES string. Any non-string value (e.g. NaN/None coming from a
        missing cell) is treated like an unparsable molecule.
    radius : int, default 2
        Radius passed to the Morgan fingerprint.
    n_bits : int, default 1024
        Length of the returned bit vector.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``n_bits`` holding 0/1 values. It is all
        zeros when the input is not a string or RDKit cannot parse it.
    """
    # A missing value would make MolFromSmiles raise instead of returning None,
    # so handle it here the same way as an unparsable SMILES.
    if not isinstance(smi, str):
        return np.zeros(n_bits, dtype=int)
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return np.zeros(n_bits, dtype=int)
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros(n_bits, dtype=int)
    # Set the positions of the on-bits reported by the fingerprint.
    arr[list(fp.GetOnBits())] = 1
    return arr

In [None]:
# Generation: one fingerprint row per molecule, stacked into a 2-D array.
train_bits = [smiles_to_fp(smi) for smi in tqdm(df_train["SMILES"], desc="Morgan train")]
fps_tr = np.vstack(train_bits)
test_bits = [smiles_to_fp(smi) for smi in tqdm(df_test["SMILES"], desc="Morgan test")]
fps_te = np.vstack(test_bits)

In [None]:
# DataFrames: one FP_<index> column per fingerprint bit.
n_fp_bits = fps_tr.shape[1]
fp_cols = [f"FP_{i}" for i in range(n_fp_bits)]
df_fp_train = pd.DataFrame(data=fps_tr, columns=fp_cols)
df_fp_test = pd.DataFrame(data=fps_te, columns=fp_cols)

In [None]:
# Join and save: put ID (and the LogP target for train) next to the
# fingerprint columns. Indexes are reset so the column-wise concat lines up.
train_keys = df_train[["ID", "LogP"]].reset_index(drop=True)
test_keys = df_test[["ID"]].reset_index(drop=True)
train_morgan = pd.concat([train_keys, df_fp_train], axis=1)
test_morgan = pd.concat([test_keys, df_fp_test], axis=1)

train_morgan.to_csv(f"{OUTPUT}/train_morgan.csv", index=False)
test_morgan.to_csv(f"{OUTPUT}/test_morgan.csv", index=False)
print("Morgan shapes:", train_morgan.shape, test_morgan.shape)

### III. MoLeR Embeddings

In [None]:
print("Загружаем модель из:", MODEL_DIR)
# Prepare both SMILES lists first, then encode them inside one model context.
smiles_list = df_train["SMILES"].tolist()
smiles_list_te = df_test["SMILES"].tolist()
with load_model_from_directory(MODEL_DIR, num_workers=8, beam_size=1) as model:
    # Generate embeddings for the train molecules.
    emb_tr = model.encode(smiles_list)
    print("Получено эмбеддингов для train:", len(emb_tr))
    # Same for the test molecules.
    emb_te = model.encode(smiles_list_te)
    print("Получено эмбеддингов для test:", len(emb_te))


In [None]:
# Assemble the arrays: stack the per-molecule embeddings row-wise.
emb_tr, emb_te = np.vstack(emb_tr), np.vstack(emb_te)
print("Raw embed shapes:", emb_tr.shape, emb_te.shape)

In [None]:
# PCA => up to 128 components.
# The requested count is capped by the smaller dimension of the train
# embeddings, because PCA cannot return more components than
# min(n_samples, n_features). With large enough data this is exactly 128.
n_pca    = min(128, *emb_tr.shape)
pca      = PCA(n_components=n_pca, random_state=42)
pca_tr   = pca.fit_transform(emb_tr)   # fit on train only
pca_te   = pca.transform(emb_te)       # reuse the train projection for test

# Column names follow the actual number of components produced.
embed_cols     = [f"embed_pca_{i}" for i in range(pca_tr.shape[1])]
df_embed_train = pd.DataFrame(pca_tr, columns=embed_cols)
df_embed_test  = pd.DataFrame(pca_te, columns=embed_cols)
# Rows are in the same order as df_train / df_test, so IDs are attached by position.
df_embed_train["ID"] = df_train["ID"].values
df_embed_test ["ID"] = df_test ["ID"].values

In [None]:
# Standardisation: the scaler is fitted on train and applied to both sets.
ss = StandardScaler()
ss.fit(df_embed_train[embed_cols])
scaled_tr = ss.transform(df_embed_train[embed_cols])
scaled_te = ss.transform(df_embed_test[embed_cols])
df_embed_train[embed_cols] = scaled_tr
df_embed_test[embed_cols] = scaled_te

In [None]:
# Save: ID first, then the embedding columns.
embed_out_cols = ["ID"] + embed_cols
train_embed = df_embed_train[embed_out_cols]
test_embed = df_embed_test[embed_out_cols]

train_embed.to_csv(f"{OUTPUT}/train_embed.csv", index=False)
test_embed.to_csv(f"{OUTPUT}/test_embed.csv", index=False)
print("Embed shapes:", train_embed.shape, test_embed.shape)

### IV. Очистка и удаление признаков с нулевой дисперсией
  
- Убираем колонки `SMILES` и `mol` (RDKit‑объект).  
- Сохраняем `ID` и `LogP`.  
- Удаляем признаки, где вариация равна нулю.

In [None]:
# Combine the fingerprint and embedding features on ID.
# validate="one_to_one" makes pandas raise a MergeError if an ID occurs more
# than once on either side; without it a duplicated ID would silently
# multiply rows and corrupt every later step.
train_full = train_morgan.merge(train_embed, on="ID", validate="one_to_one")
test_full  = test_morgan .merge(test_embed,  on="ID", validate="one_to_one")

In [None]:
# Zero-variance filter: fitted on every column except ID and the target.
non_feature_cols = ["ID", "LogP"]
feat_cols = train_full.columns.drop(non_feature_cols)
vt = VarianceThreshold(threshold=0.0)
vt.fit(train_full[feat_cols])

In [None]:
# Split the feature names into kept and removed by the selector's mask.
support_mask = vt.get_support()
keep_cols = feat_cols[support_mask]
removed_zero = feat_cols[~support_mask].tolist()
print("Removed zero‑var features:", removed_zero)

In [None]:
# Cleaned tables: identifier (plus target for train) followed by kept features.
kept = list(keep_cols)
train_cl = train_full.loc[:, ["ID", "LogP"] + kept]
test_cl = test_full.loc[:, ["ID"] + kept]
print("After cleaning:", train_cl.shape, test_cl.shape)

### V. Удаление сильно коррелированных признаков  
  
Убираем избыточные фичи с |corr| > 0.90.

In [None]:
# Absolute pairwise correlations between features (ID and target excluded).
features_only = train_cl.drop(columns=["ID", "LogP"])
corr_mat = features_only.corr().abs()
# Keep only the strict upper triangle so each pair is examined once and the
# diagonal is ignored; masked cells become NaN and never exceed the cut-off.
upper_mask = np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)
upper = corr_mat.where(upper_mask)
# A column is dropped when it correlates above 0.90 with any earlier column.
too_correlated = (upper > 0.90).any(axis=0)
to_drop = upper.columns[too_correlated].tolist()
print("Dropping correlated:", len(to_drop))

In [None]:
# Remove the same correlated columns from both tables.
train_dc, test_dc = (frame.drop(columns=to_drop) for frame in (train_cl, test_cl))
print("After drop corr:", train_dc.shape, test_dc.shape)

### VI. Лог‑преобразование и масштабирование  
  
- Выявляем фичи со skewness > 1.0.  
- Применяем `log1p` (с `clip(lower=0)`).  
- Масштабируем через `RobustScaler`.

In [None]:
# Split into identifier, target and feature matrices.
train_id, y_train = train_dc["ID"], train_dc["LogP"]
X_train = train_dc.drop(columns=["ID", "LogP"])
X_test = test_dc.drop(columns=["ID"])

In [None]:
# Log-transform strongly skewed features (|skew| > 1.0).
# Only columns that are non-negative on train are eligible: log1p is applied
# after clip(lower=0), which would otherwise wipe out every negative value of
# a zero-centred feature such as the standardised PCA embeddings.
skews      = X_train.skew().abs()
skewed     = skews[skews > 1.0].index.tolist()
to_log     = [c for c in skewed if X_train[c].min() >= 0]
not_logged = [c for c in skewed if c not in set(to_log)]
print("Log1p on:", to_log)
print("Skewed but left untouched (contain negative values):", not_logged)

In [None]:
# Replace each selected column by log1p of its values clipped at zero,
# applying the identical transform to train and test.
X_train = X_train.assign(**{name: np.log1p(X_train[name].clip(lower=0)) for name in to_log})
X_test = X_test.assign(**{name: np.log1p(X_test[name].clip(lower=0)) for name in to_log})

In [None]:
# RobustScaler: fitted on train only, then applied to train and test.
rs = RobustScaler()
rs.fit(X_train)
train_scaled = rs.transform(X_train)
test_scaled = rs.transform(X_test)
X_train_s = pd.DataFrame(train_scaled, columns=X_train.columns)
X_test_s = pd.DataFrame(test_scaled, columns=X_test.columns)

In [None]:
# Put ID, scaled features and (for train) the target back into one table.
train_parts = [train_id, X_train_s, y_train]
test_parts = [test_dc[["ID"]], X_test_s]
train_pre = pd.concat(train_parts, axis=1)
test_pre = pd.concat(test_parts, axis=1)
print("After log+scale:", train_pre.shape, test_pre.shape)

### VII. Анализ важности признаков и отбор  
  
1. Обучим RandomForest на полном наборе.

2. Вычислим Permutation Importance (чтобы увидеть не только встроенную «важность» дерева, но и насколько упадёт качество при «перемешивании» каждого признака).

3. Выберем порог:

Уберём фичи с отрицательной или близкой к нулю важностью.

Оставим, скажем, топ‑100 или все > 0.001 по mean importance.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

In [None]:
# Feature matrix and target used for the importance analysis.
X, y = train_pre.drop(columns=["ID", "LogP"]), train_pre["LogP"]

In [None]:
# Train the RF (faster: fewer trees, but deeper).
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X, y)

In [None]:
# Permutation importance of every feature, averaged over 10 shuffles.
# NOTE(review): this is computed on the same X, y the forest was fitted on,
# so the scores are in-sample — consider a held-out split.
perm = permutation_importance(rf, X, y, n_repeats=10, random_state=42, n_jobs=-1)
imp = (
    pd.DataFrame({"feature": X.columns, "mean_imp": perm.importances_mean})
    .sort_values("mean_imp", ascending=False)
    .reset_index(drop=True)
)
print("Top 20 features:\n", imp.head(20))

In [None]:
# Keep the features whose mean permutation importance exceeds 0.001.
important_mask = imp["mean_imp"] > 0.001
selected = imp.loc[important_mask, "feature"].tolist()
print(f"Selecting {len(selected)}/{X.shape[1]} features")

In [None]:
# Final tables: identifier (plus target for train) and the selected features.
train_head = train_pre[["ID", "LogP"]]
test_head = test_pre[["ID"]]
train_final = pd.concat([train_head, X[selected]], axis=1)
test_final = pd.concat([test_head, test_pre[selected]], axis=1)

# Save the final tables.
train_final.to_csv(f"{OUTPUT}/train_final.csv", index=False)
test_final.to_csv(f"{OUTPUT}/test_final.csv", index=False)
print("Final shapes:", train_final.shape, test_final.shape)

### VIII. Обучение CatBoost и оценка RMSE

In [None]:
# Hold out 20% of the training rows for validation (fixed seed).
final_features = train_final.drop(columns=["ID", "LogP"])
final_target = train_final["LogP"]
X_tr, X_val, y_tr, y_val = train_test_split(
    final_features, final_target, test_size=0.2, random_state=42
)

In [None]:
# CatBoost regressor configuration, kept in one place for easy tuning.
catboost_params = {
    "iterations": 500,
    "learning_rate": 0.05,
    "depth": 6,
    "eval_metric": "RMSE",
    "random_seed": 42,
    "verbose": False,
}
model = CatBoostRegressor(**catboost_params)
# NOTE(review): the validation split doubles as eval_set for best-model
# selection, so an RMSE later reported on it is likely optimistic.
model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

In [None]:
# Score the held-out split: RMSE is the square root of the mean squared error.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Validation RMSE: {rmse:.4f}")