In [1]:
!pip install rdkit




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
import typing
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Descriptors
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import FunctionTransformer

In [82]:
# Read the spreadsheet into a DataFrame and preview its first five rows.
data = pd.read_excel(io='data/All-1614.xlsx')
data.head(n=5)

Unnamed: 0,Title,"IC50, mmg/ml","CC50-MDCK, mmg/ml",SI,Molecular weight,Hydrogen bond acceptors,Hydrogen bond donors,Polar SA,SMILES,Pictures
0,1007-Ya-213,2.7,500.0,185.185185,195.307,2,1,32.59,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,50.0
1,1007-Ya-213,0.7,447.0,638.571429,195.307,2,1,32.59,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,51.0
2,1008-Ya-187,9.9,144.0,14.545455,250.431,1,0,15.6,CCN(CC)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,52.0
3,1009-As-106,8.3,500.0,60.240964,222.377,1,0,15.6,CN(C)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,53.0
4,1010-Ya-208,39.4,143.0,3.629442,239.361,2,0,29.54,CN(C)CC(=O)O[C@H]1C[C@H](CC2)C(C)(C)[C@@]12C,54.0


In [83]:
# Shorten the two assay column names. Reassign instead of mutating in place so
# the frame is not silently altered underneath cells that already displayed it.
data = data.rename(columns={'IC50, mmg/ml': 'IC50', 'CC50-MDCK, mmg/ml': 'CC50'})

In [48]:
def mol_dsc_calc(mols):
    """Compute every descriptor in the module-level ``descriptors`` mapping.

    Parameters
    ----------
    mols : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES (in input order, fresh positional index) and
        one column per key of ``descriptors``.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    def parse(smiles):
        # Chem.MolFromSmiles reports an unparsable string by returning None;
        # fail here with the offending SMILES instead of inside a descriptor.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        return mol

    # Parse each SMILES once and reuse the molecule for all descriptors
    # (the string used to be re-parsed for every single descriptor).
    parsed = (parse(smiles) for smiles in mols)
    return pd.DataFrame([{name: func(mol) for name, func in descriptors.items()} for mol in parsed])


# Constitutional and physico-chemical descriptors taken from the RDKit library.
# Keys become the output column names; values are the RDKit callables.
descriptors = dict(
    HeavyAtomCount=Descriptors.HeavyAtomCount,
    NHOHCount=Descriptors.NHOHCount,
    NOCount=Descriptors.NOCount,
    NumHAcceptors=Descriptors.NumHAcceptors,
    NumHDonors=Descriptors.NumHDonors,
    NumHeteroatoms=Descriptors.NumHeteroatoms,
    NumRotatableBonds=Descriptors.NumRotatableBonds,
    NumValenceElectrons=Descriptors.NumValenceElectrons,
    NumAromaticRings=Descriptors.NumAromaticRings,
    NumAliphaticHeterocycles=Descriptors.NumAliphaticHeterocycles,
    RingCount=Descriptors.RingCount,
    MW=Descriptors.MolWt,
    LogP=Descriptors.MolLogP,
    MR=Descriptors.MolMR,
    TPSA=Descriptors.TPSA,
)

# sklearn transformer wrapping mol_dsc_calc so it can be used as a pipeline step.
descriptors_transformer = FunctionTransformer(func=mol_dsc_calc)

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,14,1,2,2,1,2,2,80,0,0,2,195.306,2.2659,58.6168,32.59
1,14,1,2,2,1,2,2,80,0,0,2,195.306,2.2659,58.6168,32.59
2,18,0,2,2,0,2,5,104,0,0,2,250.43,3.6154,79.319,15.6
3,16,0,2,2,0,2,3,92,0,0,2,222.376,2.8352,70.085,15.6
4,17,0,3,3,0,3,3,98,0,0,2,239.359,2.306,67.663,29.54


In [49]:
def rdkit_fp(smiles_column: pd.Series, radius=3, nBits=2048, useChirality=False):
    """Compute RDKit Morgan fingerprint bit vectors for a column of SMILES.

    Parameters
    ----------
    smiles_column : pandas.Series
        SMILES strings, one per molecule.
    radius, nBits, useChirality
        Forwarded unchanged to ``AllChem.GetMorganFingerprintAsBitVect``.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES in input order (fresh positional index; the index
        of ``smiles_column`` is not carried over) with columns
        ``bit_id_0`` ... ``bit_id_{nBits-1}``.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    def desc_gen(smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles returns None for an unparsable string; name the
            # offending input instead of failing inside the fingerprint call.
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        # Placeholder array: ConvertToNumpyArray resizes it to the fingerprint length.
        bit_vec = np.zeros((1,), np.int16)
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits, useChirality=useChirality), bit_vec)
        return bit_vec

    # Hand from_records a plain list rather than a Series. NOTE(review): with a
    # Series, pandas appears to probe the input with label-based `[0]`, which
    # fails when the index has no label 0 (e.g. a filtered or shuffled frame).
    fingerprints = [desc_gen(smiles) for smiles in smiles_column]
    return pd.DataFrame.from_records(fingerprints, columns=[f'bit_id_{i}' for i in range(nBits)])


def rdkit_2d(smiles_column: pd.Series):
    """Compute every descriptor registered in ``Descriptors._descList``.

    Parameters
    ----------
    smiles_column : pandas.Series
        SMILES strings, one per molecule (any iterable of strings works).

    Returns
    -------
    pandas.DataFrame
        One row per SMILES in input order (fresh positional index) and one
        column per descriptor name.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    # _descList holds (name, function) pairs. A distinct local name avoids
    # shadowing the module-level `descriptors` mapping.
    all_descriptors = dict(Descriptors._descList)

    def parse(smiles):
        # Chem.MolFromSmiles returns None when the string cannot be parsed.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        return mol

    # Parse each SMILES once and reuse the molecule for all descriptors
    # (the string used to be re-parsed for every single descriptor).
    parsed = (parse(smiles) for smiles in smiles_column)
    return pd.DataFrame([{name: func(mol) for name, func in all_descriptors.items()} for mol in parsed])

In [84]:
from dataclasses import dataclass


@dataclass
class SMILESDescriptor:
    """A feature-computing callable together with an optional label.

    ``descriptor`` is required; ``title`` defaults to ``None``.
    """
    # Callable that produces the features.
    descriptor: typing.Callable
    # Optional label for this descriptor set; None when no label is given.
    title: typing.Union[str, None] = None

In [85]:
class SMILESMolDataset(Dataset):
    """Torch dataset over a molecule table, optionally extended with SMILES-derived features.

    Parameters (keyword-only)
    -------------------------
    data : pandas.DataFrame
        Source table. It must contain a ``'SMILES'`` column when
        ``smiles_descriptors`` is given. The caller's frame is not modified.
    x_columns, y_columns : list of str
        Columns returned as features / targets by ``__getitem__``. They may
        name columns produced by the descriptors.
    smiles_descriptors : iterable of SMILESDescriptor, optional
        Each ``descriptor`` is called with the ``'SMILES'`` column and must
        return a DataFrame with one row per molecule, in the same order. The
        result is joined onto the table; on a column-name clash the
        descriptor's ``title`` is appended to the *new* column.
    """

    def __init__(
            self,
            *,
            data: pd.DataFrame,
            x_columns: typing.List[str],
            y_columns: typing.List[str],
            smiles_descriptors: typing.Optional[typing.Iterable["SMILESDescriptor"]] = None
    ):
        # Use a positionally indexed copy. Descriptor frames come back with a
        # 0..n-1 index and samples are requested by position, so labels must
        # equal positions for the join and for __getitem__ to hit the right rows.
        self.data = data.reset_index(drop=True)
        self.x_columns = x_columns
        self.y_columns = y_columns
        self.smiles_descriptors = smiles_descriptors

        self._descript_smiles()

    def _descript_smiles(self):
        """Run every configured descriptor on the SMILES column and join its output."""
        if self.smiles_descriptors:
            for d in self.smiles_descriptors:
                d_data = d.descriptor(self.data['SMILES'])
                # rsuffix (not lsuffix): on a name clash the title must tag the
                # descriptor's new columns, not rename the existing ones.
                # With an empty title a clash still raises in pandas.
                self.data = self.data.join(d_data, rsuffix=d.title or '')

    def columns(self):
        """Return the column index of the table, including joined descriptor columns."""
        return self.data.columns

    def __len__(self):
        """Return the number of rows (molecules)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(x, y)`` value arrays for the row(s) selected by ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Select the columns first so each subset keeps its own dtype; the
        # index was reset in __init__, so label lookup equals positional lookup.
        x = self.data[self.x_columns].loc[idx].values
        y = self.data[self.y_columns].loc[idx].values
        return x, y

In [86]:
# Build the dataset with three descriptor sets joined onto the table.
# NOTE(review): x_columns lists only 'Title', 'SMILES' and 'SI', so none of the
# computed descriptor columns are returned as features — confirm this is intended.
# NOTE(review): in the preview rows SI equals CC50 / IC50 (500.0 / 2.7 = 185.19),
# so using SI as an input for the CC50 target may leak the target — confirm.
dataset = SMILESMolDataset(
    data=data,
    x_columns=['Title', 'SMILES', 'SI'],
    y_columns=['CC50'],
    smiles_descriptors=[
        SMILESDescriptor(descriptor=descriptors_transformer.transform),
        SMILESDescriptor(descriptor=rdkit_fp, title='rdkit_fp'),
        SMILESDescriptor(descriptor=rdkit_2d, title='rdkit_2d'),
    ],
)

In [88]:
# Inspect one sample: the (x, y) pair of value arrays for row 1.
dataset[1]

(array(['1007-Ya-213', 'OCC\\N=C(\\[C@]12C)C[C@@H](C1(C)C)CC2',
        638.5714285714286], dtype=object),
 array([447.]))