In [2]:
# Load the cleaned dataset; the "active" subset (pIC50 >= 6.0) is selected below.
import pandas as pd
import numpy as np
active_df = pd.read_csv("df_cleaned_rdkit_morgan.csv")

# Keep only strictly positive IC50 values so that log10 is never taken of
# zero or a negative number.
positive_ic50 = active_df['IC50_nM'] > 0
active_df = active_df.loc[positive_ic50].copy()

# Convert IC50 from nM to mol/L, then pIC50 = -log10(IC50 [mol/L]).
ic50_molar = active_df['IC50_nM'] * 1e-9
active_df['pIC50'] = -np.log10(ic50_molar)

# Unique SMILES of the actives, written one per line without header or index.
is_active = active_df["pIC50"] >= 6
actives = active_df.loc[is_active, "canonical_smiles"].drop_duplicates()
actives.to_csv("fine_tune_set.smi", index=False, header=False)


In [8]:
import os
os.chdir("REINVENT4")
!python -m reinvent --config configs/my_config.toml

FileNotFoundError: [WinError 2] Не удается найти указанный файл: 'REINVENT4'

In [None]:
# Run generate.py with the model name "reinvent_model_finetuned", passing
# --num 3000 and writing to gen_raw.smi (the file the validation cell reads).
# NOTE(review): generate.py is not part of this notebook; presumably it samples
# SMILES from the fine-tuned model — confirm its location and arguments.
!python generate.py --model reinvent_model_finetuned --num 3000 --output gen_raw.smi

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

def validate_smiles(smiles_list):
    """Return the unique, RDKit-parsable SMILES from ``smiles_list``.

    Args:
        smiles_list: Iterable of SMILES strings (e.g. lines of a .smi file).

    Returns:
        list[str]: Valid SMILES with exact-string duplicates removed, in
        first-seen order. The order is deterministic, so downstream results
        that are aligned with this list by position are reproducible between
        runs (a plain ``set`` gives a run-dependent order).
    """
    # A dict is used as an insertion-ordered set.
    valids = {}
    for smi in smiles_list:
        # Skip blank entries explicitly: RDKit parses an empty string to a
        # zero-atom molecule instead of None, which would otherwise be kept.
        if not smi or not smi.strip():
            continue
        if Chem.MolFromSmiles(smi) is not None:
            valids[smi] = None
    return list(valids)

# Read the raw generator output, one SMILES per line. The context manager
# closes the file handle deterministically instead of leaving it to the GC.
with open("gen_raw.smi") as smi_file:
    gen_smiles = smi_file.read().splitlines()

# Keep only parsable, de-duplicated SMILES and persist them for later steps.
valid_smiles = validate_smiles(gen_smiles)
pd.DataFrame({"SMILES": valid_smiles}).to_csv("gen_valid.csv", index=False)


In [None]:
from joblib import load

# Paths of the serialized estimator and feature scaler.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files
# that come from a trusted source.
model_path, scaler_path = "rf_model.pkl", "scaler.pkl"
model = load(model_path)
scaler = load(scaler_path)

# Фингерпринты
from rdkit.Chem import AllChem
import numpy as np

def featurize(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan fingerprint radius (default 2, as before).
        n_bits: Length of the bit vector (default 1024, as before).
            NOTE(review): must match the featurization the loaded model and
            scaler were fitted with — confirm against the training code.

    Returns:
        numpy.ndarray: 1-D array of length ``n_bits``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``. Without this check a
            ``None`` molecule reaches the fingerprint call and fails with an
            opaque argument error.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Cannot featurize invalid SMILES: {smiles!r}")
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(fp)

# Featurize every validated SMILES, scale the features and predict pIC50.
if valid_smiles:
    X = np.array([featurize(s) for s in valid_smiles])
    X_scaled = scaler.transform(X)
    pred_pIC50 = model.predict(X_scaled)
else:
    # With no valid molecules np.array([]) is 1-D and the scaler would raise;
    # produce empty results so the rest of the notebook yields empty tables.
    X = X_scaled = np.empty((0, 0))
    pred_pIC50 = np.empty(0)

# The scoring step pairs predictions with valid_smiles by position.
assert len(pred_pIC50) == len(valid_smiles), "prediction/SMILES length mismatch"


In [None]:
from rdkit.Chem import QED, Crippen, Lipinski
from sascorer import calculateScore
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from toxalerts import check_toxic_alerts  # реализовать через SMARTS или BRENK

# Acceptance thresholds for a generated molecule.
PIC50_MIN = 6.0              # predicted pIC50 must be strictly greater
QED_MIN = 0.7                # minimum QED drug-likeness
SA_MIN, SA_MAX = 2, 6        # synthetic accessibility must lie strictly between
MAX_LIPINSKI_VIOLATIONS = 1  # allowed number of Lipinski rule violations

rows = []
skipped = []  # (SMILES, error) for molecules whose descriptors could not be computed

for smi, pred in zip(valid_smiles, pred_pIC50):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        skipped.append((smi, "unparsable SMILES"))
        continue
    try:
        qed = QED.qed(mol)
        sa = calculateScore(mol)
        tox = check_toxic_alerts(mol)  # 0 = no toxic alert
        mw = Descriptors.MolWt(mol)
        logp = Crippen.MolLogP(mol)
        h_don = Lipinski.NumHDonors(mol)
        h_acc = Lipinski.NumHAcceptors(mol)
    except Exception as exc:
        # Skip molecules a descriptor cannot handle, but keep a record: a bare
        # `except` would also swallow KeyboardInterrupt and hide systematic
        # failures behind a silently empty result table.
        skipped.append((smi, repr(exc)))
        continue

    # Number of violated Lipinski rule-of-five criteria.
    violations = sum([
        mw > 500, logp > 5, h_don > 5, h_acc > 10
    ])

    # Filter
    if (
        pred > PIC50_MIN
        and qed >= QED_MIN
        and SA_MIN < sa < SA_MAX
        and tox == 0
        and violations <= MAX_LIPINSKI_VIOLATIONS
    ):
        # Weighted score; the tox term is always 0 here because the filter
        # requires tox == 0, it is kept so the formula stays explicit.
        final_score = 0.4 * (pred/10) + 0.3 * qed - 0.2 * (sa/10) - 0.1 * tox
        rows.append([smi, pred, qed, sa, tox, violations, final_score])

if skipped:
    print(f"Skipped {len(skipped)} of {len(valid_smiles)} molecules; first error: {skipped[0]}")

df = pd.DataFrame(rows, columns=["SMILES", "pred_pIC50", "QED", "SA_Score", "ToxicAlert", "LipinskiViolations", "FinalScore"])


In [None]:
from rdkit import DataStructs

SIMILARITY_CUTOFF = 0.7  # Tanimoto similarity above which two molecules count as redundant
N_HITS = 10              # number of final hits to keep

# Rank by score first so that, within a group of similar molecules, the
# best-scoring one is the representative that survives the greedy pass.
ranked_df = df.sort_values("FinalScore", ascending=False).reset_index(drop=True)
fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2) for s in ranked_df["SMILES"]]

selected = []      # fingerprints of the molecules kept so far
selected_idx = []  # their row positions in ranked_df

for pos, mol_fp in enumerate(fps):
    # Reject the molecule if it is too similar to any already-kept molecule.
    if any(DataStructs.TanimotoSimilarity(mol_fp, sel_fp) > SIMILARITY_CUTOFF for sel_fp in selected):
        continue
    selected.append(mol_fp)
    selected_idx.append(pos)

# Take the rows that actually passed the diversity filter. Slicing the first
# len(selected) rows would return molecules unrelated to the selection.
final_df = ranked_df.iloc[selected_idx].head(N_HITS)


In [None]:
# Persist the shortlisted molecules.
final_df.to_csv("selected_hits.csv", index=False)

from rdkit.Chem import Draw

# Build the molecule objects and per-molecule captions, then render a grid
# (five structures per row); the grid image is the cell's displayed output.
hit_smiles = final_df["SMILES"]
hit_scores = final_df["pred_pIC50"]
mols = list(map(Chem.MolFromSmiles, hit_smiles))
legends = [f"pIC₅₀={p:.2f}" for p in hit_scores]
Draw.MolsToGridImage(mols, molsPerRow=5, legends=legends)
