In [1]:
!pip install rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
import numpy as np
import pandas as pd
from rdkit.Chem import Descriptors, MolFromSmiles
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
from imblearn.over_sampling import SMOTE

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


The function of physico-chemical descriptors RDKit

In [3]:
def get_rdkit(df):

    '''The function accepts a dataframe as input. The Smiles contained in the "Drug"
       column are converted to the Mol format, for which all possible descriptors contained
       in the rdkit library are calculated. At the output, we get the original dataframe containing
       new columns containing calculated physico-chemical descriptors of rdkit.'''
    try:
        computed_descriptors = Chem.Descriptors.descList
        for descriptor in computed_descriptors:
            name = descriptor[0]
            df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
        return df

    except Exception as e:
        print("Ошибка:", type(e).__name__)

In [17]:
# Example data
example_data = {'Drug': ['CCC', 'COc1ccccc1']}
df_example = pd.DataFrame(example_data)
get_rdkit(df_example)

  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[1](MolFromSmiles(x)))
  df[name] = df["Drug"].apply(lambda x: descriptor[

Unnamed: 0,Drug,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CCC,2.125,2.125,1.25,1.25,0.385471,6.0,44.097,36.033,44.0626,...,0,0,0,0,0,0,0,0,0,0
1,COc1ccccc1,4.914167,4.914167,0.909722,0.909722,0.531625,8.625,108.14,100.076,108.057515,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Example data
example_data_er1 = {'Drug': ['CCC_', 'COc1ccccc1']}
df_example_er1 = pd.DataFrame(example_data_er1)
get_rdkit(df_example_er1)

Ошибка: AttributeError


[17:11:33] SMILES Parse Error: syntax error while parsing: CCC_
[17:11:33] SMILES Parse Error: Failed parsing SMILES 'CCC_' for input: 'CCC_'


In [19]:
# Example data
example_data_er2 = {'Drug': ['1', '2']}
df_example_er2 = pd.DataFrame(example_data_er2)
get_rdkit(df_example_er2)

Ошибка: AttributeError


[17:11:35] SMILES Parse Error: syntax error while parsing: 1
[17:11:35] SMILES Parse Error: Failed parsing SMILES '1' for input: '1'


In [20]:
# Example data
example_data_er3 = {'Dru': ['CCC', 'COc1ccccc1']}
df_example_er3 = pd.DataFrame(example_data_er3)
get_rdkit(df_example_er3)

Ошибка: KeyError


Functions that check the function of physico-chemical descriptors RDKit


In [21]:
def test_columns(df):
    expected_columns = [x[0] for x in Chem.Descriptors.descList]
    df_rdkit=get_rdkit(df)
    df_without_drug=df_rdkit.drop(['Drug'], axis=1)
    actual_columns = list(df_without_drug.columns)

    assert set(expected_columns) == set(actual_columns), \
           f'Missing or extra columns: Expected={expected_columns}, Actual={actual_columns}'

def check_data_type(df):
    incorrect_message = "incorrect data type"
    get_rdkit(df)
    df=df.drop(['Drug'], axis=1)
    for column in df.columns:
        data_type = df[column].dtype
        if data_type != 'float64' and data_type != 'int64':
            print(incorrect_message)
            break

def test_rows(df):
    result_df = get_rdkit(df)

    assert len(df) == len(result_df), \
           f'Number of rows mismatch: Expected={len(df)}, Actual={len(result_df)}'

def main(df):
    test_columns(df)
    check_data_type(df)
    test_rows(df)

if __name__ == '__main__':
    main(df_example)



The function of morgan fingerprint

In [22]:
def get_morgan(df, radius=2, nBits=1024):
    try:
        df['Morgan'] = df['Drug'].apply(lambda x:  AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=radius, nBits=nBits))
        for i in range(nBits):
            df[f'Bit_{i}'] =  df['Morgan'][0][i]
            df_new=df.drop(['Drug', 'Morgan'], axis=1)
        return df_new

    except Exception as e:
        print("Ошибка:", type(e).__name__)

In [23]:
# Example data
example_data = {'Drug': ['CCC', 'COc1ccccc1']}
df_example = pd.DataFrame(example_data)
get_morgan(df_example)

  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['Morgan'][0][i]
  df[f'Bit_{i}'] =  df['M

Unnamed: 0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_1014,Bit_1015,Bit_1016,Bit_1017,Bit_1018,Bit_1019,Bit_1020,Bit_1021,Bit_1022,Bit_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Example data
example_data_er1 = {'Drug': ['CCC_', 'COc1ccccc1']}
df_example_er1 = pd.DataFrame(example_data_er1)
get_morgan(df_example_er1)

Ошибка: ArgumentError


[17:12:36] SMILES Parse Error: syntax error while parsing: CCC_
[17:12:36] SMILES Parse Error: Failed parsing SMILES 'CCC_' for input: 'CCC_'


In [25]:
# Example data
example_data_er2 = {'Dru': ['CCC', 'COc1ccccc1']}
df_example_er2 = pd.DataFrame(example_data_er2)
get_morgan(df_example_er2)

Ошибка: KeyError


Functions that check the function of morgan fingerprint

In [26]:
def test_columns_morgan(df):

    expected_columns = np.shape(df)[1] + 1023 # кол-во столбцов битов (1024) - столбец "Drug" (1)
    df_morgan = get_morgan(df)
    actual_columns = np.shape(df_morgan)[1]

    if expected_columns != actual_columns:
            print('incorrect number of columns')

def check_data_type_morgan(df):
    incorrect_message = "incorrect data type"
    df_morgan = get_morgan(df)

    for column in df_morgan.columns:
        data_type = df_morgan[column].dtype
        if data_type != 'float64' and data_type != 'int64':
            print(incorrect_message)
            break

def test_rows_morgan(df):
    result_df = get_morgan(df)

    assert len(df) == len(result_df), \
           f'Number of rows mismatch: Expected={len(df)}, Actual={len(result_df)}'

def main_morgan(df):
    test_columns_morgan(df)
    check_data_type_morgan(df)
    test_rows_morgan(df)

if __name__ == '__main__':
    main_morgan(df_example)



incorrect number of columns


