In [58]:
from rdkit.Chem import RDConfig
import os
import sys
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer

In [None]:
# Hugging Face factory used to build the text-generation pipeline.
from transformers import pipeline
from rdkit import Chem
from rdkit.Chem.rdchem import Mol
from rdkit.Chem import Descriptors

from tqdm.auto import tqdm

In [69]:
# Build a text-generation pipeline from the `msb-roshan/molgpt` checkpoint.
pipe = pipeline(
    task="text-generation",
    model="msb-roshan/molgpt",
)

Device set to use cpu


In [70]:
# Collect 100 generations from an empty prompt; each pipeline call result is
# stored as-is and unpacked later via [0]['generated_text'].
s = []
for i in tqdm(range(100)):
    generation = pipe('', num_workers=12)
    s.append(generation)

  0%|          | 0/100 [00:00<?, ?it/s]

In [71]:
def smiles_valid(smiles_str: str) -> bool:
    """Tell whether RDKit can turn ``smiles_str`` into a molecule.

    Returns:
        True when ``Chem.MolFromSmiles`` yields a molecule object; False when
        it returns ``None`` or raises any exception.
    """
    try:
        parsed = Chem.MolFromSmiles(smiles_str)
    except Exception:
        return False

    return parsed is not None

# Pattern for aromatic amines: a nitrogen single-bonded to an aromatic atom.
# NOTE(review): H1,H0 limits the nitrogen to at most one hydrogen, so primary
# amines (-NH2) are not matched, and nothing constrains the nitrogen's other
# substituents - confirm both are intended.
aromatic_amine_smarts = Chem.MolFromSmarts('[N,n;H1,H0;$(N-[a])]-[a]')

# Pattern for phenols: a hydroxyl group attached to a six-membered ring whose
# ring bonds are all aromatic.
phenol_smarts = Chem.MolFromSmarts('[OH]-[c,C]1:[c,C]:[c,C]:[c,C]:[c,C]:[c,C]:1')


def is_aromatic_amine_or_phenol(smiles_str: str) -> bool:
    """Check the parsed molecule against the phenol and aromatic-amine patterns.

    The phenol pattern is tried first; the amine pattern is only evaluated
    when the phenol pattern does not match. The SMILES is expected to be
    parseable (no handling of a failed parse is done here).
    """
    mol = Chem.MolFromSmiles(smiles_str)
    patterns = (phenol_smarts, aromatic_amine_smarts)
    return any(mol.HasSubstructMatch(pattern) for pattern in patterns)


def has_radical(mol: Mol) -> bool:
    """Return True if any atom of ``mol`` reports one or more radical electrons."""
    return any(atom.GetNumRadicalElectrons() > 0 for atom in mol.GetAtoms())


def neutral_and_no_radical(smiles_str: str) -> bool:
    """Return True when the parsed molecule has zero formal charge and no radicals.

    The radical check is skipped when the formal charge is non-zero. The
    SMILES is expected to be parseable.
    """
    mol = Chem.MolFromSmiles(smiles_str)
    if Chem.GetFormalCharge(mol) != 0:
        return False
    return not has_radical(mol)

def mol_weight(smiles_str: str) -> float:
    """Return ``Descriptors.MolWt`` of the molecule parsed from ``smiles_str``."""
    return Descriptors.MolWt(Chem.MolFromSmiles(smiles_str))

# Element symbols a molecule may contain to pass `only_allowed_atoms`.
allowed_atoms = {'C', 'H', 'N', 'O', 'P', 'S'}


def only_allowed_atoms(smiles_str: str) -> bool:
    """Return True if every atom symbol of the parsed molecule is in ``allowed_atoms``.

    The SMILES is expected to be parseable; a molecule with no atoms yields True.
    """
    mol = Chem.MolFromSmiles(smiles_str)
    return all(atom.GetSymbol() in allowed_atoms for atom in mol.GetAtoms())

def log_p(smiles_str: str) -> float:
    """Return ``Descriptors.MolLogP`` of the molecule parsed from ``smiles_str``."""
    return Descriptors.MolLogP(Chem.MolFromSmiles(smiles_str))

def sa_score(smiles_str) -> float:
    """Return ``sascorer.calculateScore`` for the molecule parsed from ``smiles_str``."""
    parsed = Chem.MolFromSmiles(smiles_str)
    score = sascorer.calculateScore(parsed)
    return score

In [75]:
import pandas as pd

# One record per generation: the raw text, its validity flag and, for strings
# RDKit could parse, the per-molecule metrics.
rows = []
for el in s:
    generated_text = el[0]['generated_text']
    valid = smiles_valid(generated_text)

    row_data = {
        'generated_text': generated_text,
        'valid': valid,
    }

    # The metric helpers assume a parseable SMILES, so only call them when valid.
    if valid:
        row_data.update({
            'sa_score': sa_score(generated_text),
            'log_p': log_p(generated_text),
            'only_allowed_atoms': only_allowed_atoms(generated_text),
            'mol_weight': mol_weight(generated_text),
            'neutral_and_no_radical': neutral_and_no_radical(generated_text),
            'is_aromatic_amine_or_phenol': is_aromatic_amine_or_phenol(generated_text),
        })

    rows.append(row_data)

# Build the DataFrame with an explicit column order.
# Records of invalid SMILES carry no metric keys, so pandas already leaves those
# cells as NaN. No row-wide `df.where(df['valid'], pd.NA)` is applied: that mask
# would also blank `generated_text` and `valid` themselves for invalid rows.
df = pd.DataFrame(rows, columns=[
    'generated_text',
    'valid',
    'sa_score',
    'log_p',
    'only_allowed_atoms',
    'mol_weight',
    'neutral_and_no_radical',
    'is_aromatic_amine_or_phenol',
])

[19:27:52] SMILES Parse Error: extra open parentheses while parsing: C(=O)NC(C
[19:27:52] SMILES Parse Error: check for mistakes around position 8:
[19:27:52] C(=O)NC(C
[19:27:52] ~~~~~~~^
[19:27:52] SMILES Parse Error: Failed parsing SMILES 'C(=O)NC(C' for input: 'C(=O)NC(C'
[19:27:52] SMILES Parse Error: unclosed ring for input: 'CC1CN'
[19:27:52] SMILES Parse Error: unclosed ring for input: 'CCC(=O)O.CCC1'
[19:27:52] SMILES Parse Error: unclosed ring for input: 'C(=O)NC1'
[19:27:52] SMILES Parse Error: syntax error while parsing: CC(N)=S.N#
[19:27:52] SMILES Parse Error: check for mistakes around position 10:
[19:27:52] CC(N)=S.N#
[19:27:52] ~~~~~~~~~^
[19:27:52] SMILES Parse Error: Failed parsing SMILES 'CC(N)=S.N#' for input: 'CC(N)=S.N#'
[19:27:52] SMILES Parse Error: unclosed ring for input: 'C(=O)NCC1'
[19:27:52] SMILES Parse Error: extra open parentheses while parsing: c1nc(C
[19:27:52] SMILES Parse Error: check for mistakes around position 5:
[19:27:52] c1nc(C
[19:27:52] ~~~~

In [77]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df['valid']` selection: the in-place form acts on an intermediate object and
# does not update `df` under pandas Copy-on-Write.
# Going through the nullable 'boolean' dtype yields a plain bool column with
# missing flags turned into False.
df['valid'] = df['valid'].astype('boolean').fillna(False).astype(bool)

In [79]:
# Fraction of generations flagged as valid SMILES (mean of the 0/1 flags).
valid_flags = df['valid'].astype('int')
valid_flags.mean()

np.float64(0.76)

In [86]:
# Print every non-null generated string, separated by blank lines.
non_null_texts = df['generated_text'].dropna()
print('\n\n\n'.join(non_null_texts.tolist()))

CC(N)=S.CCC(C) c1ccccc1.O=C1 CCC c2n c3cc(C#C c4ccccn4)ccc3 c(=O)n2 CC1.O=c1 c2ccc(C#C c3ccccn3)cc2 nc2n1 CC1 CCCN C21.O=c1 c2ccc(C#C c3ccccn3)cc2 nc2n1 CCC1 CCCN C21.O=c1 c2ccc(C#C c3ccccn3)cc2 nc2n1 CCC1 CCCN C21.O=c1 c2ccc(C#C c3ccccn3)cc2 nc2n1 CCC1 N CCC21.O=c1 c2ccc(C#C c3ccccn3)cc2 nc2n1 CCC1 NCCN C21.O=c1 c2ccc(C#C c3ccccn3)cc2 nc2n1


C(=O)c1ccccc1 Br.CCC (C(=O) C1CCCCC1) C(=O)c1ccccc1 OC1CCCCO1.COC(=O) CC(=O)c1ccccc1 OC1CCCCO1.COC(=O) CC(=O)c1ccccc1 OC1CCCCO1.COC(=O) Cc1ccccc1 OC1CCCCO1.O=C(C (C(=O)c1ccccc1) =C([O-]) c1ccccc1)c1ccccc1 OC1CCCCO1.O=C1 CCCN1 c1ccccc1 OC1CCCCO1.O=C1 CCCN1 c1ccccc1 OC1CCCCO1.O=C1 CCCN1 c1ccccc1 OC1CCCCO1.[Br-].[ M g+2].[ Z n]. c1ccoc1.c1ccc(P(c2ccccc2)c2ccccc2)cc1.c1ccc2 [nH+] c[nH]c2c1.c1ccc2 [nH+] c[nH]c2c1.c1ccc2 [nH+]


CC(=O)OC (C=C) C1C OC(C)(C)O1.CC(C)C O c1ccc(C(=O) OC2 COC(=O)C(C 3 (O) CC3) CC(OC(=O) /C=C/ c3ccc(O)cc3) CC(=O)OC 3 CC(C) CCC3 C(C)C 2)cc1.CC(C)C Oc1ccc(C2 OCC(C)C O2)cc1.CC(C)C c1ccc( B2OC(C)(C)C(C)(C)O2)cc1.O CC1O C(O)C(O) C