In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem import Descriptors3D
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors

In [None]:
# Load the dataset (the columns displayed later in this notebook are id, smiles and lgK).
# NOTE(review): the path points into /content/drive/MyDrive, so it assumes Google Drive is
# already mounted; no mount cell is visible in this notebook — confirm before a fresh run.
df = pd.read_csv('/content/drive/MyDrive/Информация о связывании медицинского радионуклида различными молекулами.csv')

# Base features

In [None]:
# Parse every SMILES string and add explicit hydrogens; all features below are read from 'mol'.
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles).apply(Chem.AddHs)

# Column name -> per-molecule calculator, listed in the order the columns are added to df.
base_feature_calculators = {
    'num_of_atoms': lambda mol: mol.GetNumAtoms(),
    'num_of_heavy_atoms': lambda mol: mol.GetNumHeavyAtoms(),
    'tpsa': Descriptors.TPSA,
    'mol_w': Descriptors.ExactMolWt,
    'num_valence_electrons': Descriptors.NumValenceElectrons,
    'num_heteroatoms': Descriptors.NumHeteroatoms,
    'num_rings': Descriptors.RingCount,
    'Fp_Density_Morgan1': Descriptors.FpDensityMorgan1,
    'Fp_Density_Morgan2': Descriptors.FpDensityMorgan2,
    'Fp_Density_Morgan3': Descriptors.FpDensityMorgan3,
}
for feature_name, calculator in base_feature_calculators.items():
    df[feature_name] = df['mol'].apply(calculator)

In [None]:
# Element symbols passed to number_of_atoms; each one yields a 'num_of_<symbol>_atoms' column.
symbols = ["C", "O", "N", "P", "S"]

In [None]:
def number_of_atoms(atom_list, df):
    """Add one ``num_of_<symbol>_atoms`` column to ``df`` for every entry of ``atom_list``.

    Each entry is parsed as a SMILES string and used as a substructure query
    against every molecule in ``df['mol']``; the stored value is the number of
    matches returned by ``GetSubstructMatches``.

    Args:
        atom_list: iterable of SMILES strings (element symbols in this notebook).
        df: DataFrame with a ``'mol'`` column of RDKit molecules. Modified in place.

    Returns:
        None. The new columns are written directly into ``df``.
    """
    for symbol in atom_list:
        # The query depends only on the symbol, so build it once per symbol
        # instead of re-parsing it for every row.
        pattern = Chem.MolFromSmiles(symbol)
        df['num_of_{}_atoms'.format(symbol)] = df['mol'].apply(
            lambda mol: len(mol.GetSubstructMatches(pattern))
        )

In [None]:
number_of_atoms(symbols, df)

In [None]:
df

Unnamed: 0,id,smiles,lgK,mol,num_of_atoms,num_of_heavy_atoms,tpsa,mol_w,num_valence_electrons,num_heteroatoms,num_rings,Fp_Density_Morgan1,Fp_Density_Morgan2,Fp_Density_Morgan3,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_P_atoms,num_of_S_atoms
0,0,CC(C)(O)C(=O)O,3.41,<rdkit.Chem.rdchem.Mol object at 0x7efb1dea62d0>,15,7,57.53,104.047344,42,3,0,2.000000,2.714286,2.857143,4,3,0,0,0
1,1,CCC(O)(CC)C(=O)O,3.25,<rdkit.Chem.rdchem.Mol object at 0x7efb1dea6340>,21,9,57.53,132.078644,54,3,0,1.888889,2.555556,2.888889,6,3,0,0,0
2,2,O=C(O)C1(O)CCCC1,3.22,<rdkit.Chem.rdchem.Mol object at 0x7efb1dea63b0>,19,9,57.53,130.062994,52,3,1,1.666667,2.333333,2.777778,6,3,0,0,0
3,3,O=C(O)C1(O)CCCCC1,2.78,<rdkit.Chem.rdchem.Mol object at 0x7efb1dea6420>,22,10,57.53,144.078644,58,3,1,1.500000,2.200000,2.700000,7,3,0,0,0
4,4,O=C(O)C(O)c1ccccc1,2.82,<rdkit.Chem.rdchem.Mol object at 0x7efb1dea6490>,19,11,57.53,152.047344,58,3,1,1.636364,2.363636,2.909091,8,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,244,O=C(O)c1cccc([N+](=O)[O-])c1,1.64,<rdkit.Chem.rdchem.Mol object at 0x7efb1dee4dd0>,17,12,80.44,167.021858,62,5,1,1.750000,2.500000,3.083333,7,4,1,0,0
243,245,CCC(O)C(=O)O,3.13,<rdkit.Chem.rdchem.Mol object at 0x7efb1dee4e40>,15,7,57.53,104.047344,42,3,0,2.571429,3.428571,3.857143,4,3,0,0,0
244,246,CCCC(O)C(=O)O,2.76,<rdkit.Chem.rdchem.Mol object at 0x7efb1dee4eb0>,18,8,57.53,118.062994,48,3,0,2.375000,3.250000,3.750000,5,3,0,0,0
245,247,CC(C)C(O)C(=O)O,2.99,<rdkit.Chem.rdchem.Mol object at 0x7efb1dee4f20>,18,8,57.53,118.062994,48,3,0,2.000000,2.750000,3.125000,5,3,0,0,0


# Descriptors (rdkit)

In [None]:
# Same expression as in the "Base features" section: re-running it recreates the
# hydrogen-added Mol objects in df['mol'] from the 'smiles' column.
df['mol'] = df['smiles'].apply(lambda x: Chem.AddHs(Chem.MolFromSmiles(x)))

In [None]:
def get_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for each SMILES string.

    Each SMILES string is parsed, explicit hydrogens are added, and the
    descriptor calculator is applied to the resulting molecule.

    Args:
        smiles: iterable of SMILES strings.

    Returns:
        A pair ``(Mol_descriptors, desc_names)``: a list with one entry of
        descriptor values per input (in input order) and the descriptor names
        reported by the calculator.

    Raises:
        ValueError: if a SMILES string cannot be parsed into a molecule.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()

    Mol_descriptors = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Fail with the offending entry instead of an opaque error from AddHs(None).
            raise ValueError(
                'Could not parse SMILES at position {}: {!r}'.format(position, smi)
            )
        Mol_descriptors.append(calc.CalcDescriptors(Chem.AddHs(mol)))
    return Mol_descriptors, desc_names

Mol_descriptors,desc_names = get_descriptors(df['smiles'])

In [None]:
df

Unnamed: 0,id,smiles,lgK
0,0,CC(C)(O)C(=O)O,3.41
1,1,CCC(O)(CC)C(=O)O,3.25
2,2,O=C(O)C1(O)CCCC1,3.22
3,3,O=C(O)C1(O)CCCCC1,2.78
4,4,O=C(O)C(O)c1ccccc1,2.82
...,...,...,...
242,244,O=C(O)c1cccc([N+](=O)[O-])c1,1.64
243,245,CCC(O)C(=O)O,3.13
244,246,CCCC(O)C(=O)O,2.76
245,247,CC(C)C(O)C(=O)O,2.99


In [None]:
df1 = pd.DataFrame(Mol_descriptors, columns=desc_names)
df1

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,10.983611,10.983611,1.960648,-3.456597,0.485716,30.857143,104.105,96.041,104.047344,42,...,0,0,0,0,0,0,0,0,0,0
1,11.616944,11.616944,2.291898,-4.029514,0.591504,36.444444,132.159,120.063,132.078644,54,...,0,0,0,0,0,0,0,0,0,0
2,11.505833,11.505833,2.025231,-3.576505,0.538992,62.222222,130.143,120.063,130.062994,52,...,0,0,0,0,0,0,0,0,0,0
3,11.747188,11.747188,2.193750,-3.876505,0.571016,66.400000,144.170,132.074,144.078644,58,...,0,0,0,0,0,0,0,0,0,0
4,11.298995,11.298995,0.717295,-3.032523,0.658691,25.090909,152.149,144.085,152.047344,58,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,11.076557,11.076557,0.844583,-1.447593,0.531152,14.833333,167.120,162.080,167.021858,62,...,0,0,0,0,0,0,0,0,0,0
243,10.825903,10.825903,1.917940,-3.472338,0.511508,37.714286,104.105,96.041,104.047344,42,...,0,0,0,0,0,0,0,0,0,0
244,11.052518,11.052518,2.045718,-3.758796,0.557411,40.000000,118.132,108.052,118.062994,48,...,0,0,0,0,0,0,0,0,0,0
245,11.142569,11.142569,2.083565,-3.784838,0.539805,40.000000,118.132,108.052,118.062994,48,...,0,0,0,0,0,0,0,0,0,0


# Fingerprints

In [None]:
def ecfc_molstring(molecule, radius=5, size=4096):
    """Return the hashed Morgan fingerprint of ``molecule`` as a NumPy integer array.

    Args:
        molecule: RDKit molecule to fingerprint.
        radius: Morgan radius forwarded to ``GetHashedMorganFingerprint``.
        size: fingerprint size forwarded to ``GetHashedMorganFingerprint``.

    Returns:
        The array filled in by ``DataStructs.ConvertToNumpyArray``.
    """
    fingerprint = AllChem.GetHashedMorganFingerprint(molecule, radius, size, useFeatures=False)
    # One-element placeholder; ConvertToNumpyArray writes the fingerprint into it
    # (presumably resizing it to the fingerprint length — confirm against RDKit docs).
    vector = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(fingerprint, vector)
    return vector

In [None]:
# Compute one fingerprint vector per molecule in df['mol'] and stack them into a
# DataFrame (one row per molecule, one column per fingerprint position).
features_test = pd.DataFrame(np.array([ecfc_molstring(m) for m in df['mol']]), index=None)
# Convert the integer column labels to strings.
# NOTE(review): presumably done so the frame can be fed to estimators that expect
# string feature names — confirm.
features_test.columns = features_test.columns.astype(str)

# Descriptors (mordred)

In [None]:
!pip install mordred

In [None]:
from mordred import Calculator, descriptors

In [None]:
def All_Mordred_descriptors(data):
    """Compute the full mordred descriptor set for an iterable of SMILES strings.

    Args:
        data: iterable of SMILES strings.

    Returns:
        The DataFrame produced by ``Calculator.pandas`` for the parsed molecules.
    """
    molecules = list(map(Chem.MolFromSmiles, data))
    # NOTE(review): ignore_3D=False requests 3D descriptors, but the molecules are
    # parsed straight from SMILES with no conformer generated here — presumably
    # those descriptors come back as missing values; confirm.
    calculator = Calculator(descriptors, ignore_3D=False)
    return calculator.pandas(molecules)

In [None]:
df2 = All_Mordred_descriptors(df['smiles'])

 68%|██████▊   | 168/247 [00:39<00:19,  4.02it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 247/247 [00:51<00:00,  4.83it/s]


In [None]:
df_new = pd.concat([df, df2], axis=1)

In [None]:
# Mordred-based modelling table: drop the identifier columns and the 'ABC' / 'ABCGG' descriptors.
# NOTE(review): 'mol' is not dropped here although earlier cells add it to df; the output
# displayed below has no 'mol' column, so this cell was apparently run against a df without
# it — confirm behaviour on a fresh Restart & Run All.
df_ml = df_new.drop(columns=['id', 'smiles', 'ABC', 'ABCGG'])

In [None]:
df_ml

Unnamed: 0,lgK,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,3.41,1,0,6.602720,2.175328,4.350655,6.602720,0.943246,2.792781,2.453689,...,8.466531,34.219913,104.047344,6.936490,42,6,30.0,30.0,5.173611,1.500000
1,3.25,1,0,10.062904,2.263821,4.527642,10.062904,1.118100,3.040971,2.698650,...,8.867427,37.691952,132.078644,6.289459,86,12,38.0,42.0,5.673611,2.250000
2,3.22,1,0,10.622595,2.345150,4.579023,10.622595,1.180288,3.114136,2.795083,...,9.026297,51.977986,130.062994,6.845421,81,10,44.0,50.0,4.173611,2.000000
3,2.78,1,0,11.869769,2.326846,4.653693,11.869769,1.186977,3.207159,2.928474,...,9.163354,39.932155,144.078644,6.549029,111,13,48.0,54.0,4.423611,2.250000
4,2.82,1,0,13.619695,2.250875,4.501749,13.619695,1.238154,3.286693,3.095000,...,8.906935,40.567492,152.047344,8.002492,156,13,50.0,55.0,4.583333,2.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,1.64,1,0,14.379234,2.288246,4.576491,14.379234,1.198270,3.375790,3.213997,...,9.089866,42.251151,167.021858,9.824815,197,15,56.0,62.0,5.444444,2.722222
243,3.13,1,0,7.662988,2.052881,4.105762,7.662988,1.094713,2.766317,2.459832,...,7.890957,32.688753,104.047344,6.936490,46,6,26.0,26.0,4.472222,1.777778
244,2.76,1,0,8.647053,2.074313,4.148627,8.647053,1.080882,2.900310,2.591795,...,8.011355,34.323564,118.062994,6.559055,70,7,30.0,30.0,4.722222,2.027778
245,2.99,1,0,8.424292,2.135779,4.271558,8.424292,1.053037,2.911963,2.618036,...,8.297793,35.071670,118.062994,6.559055,65,8,32.0,33.0,5.333333,1.888889


# Dataset for model

In [None]:
df_new = pd.concat([df, df1], axis=1)

In [None]:
# Columns removed before modelling: the identifier, the Mol objects, the raw SMILES,
# and the 'Ipc' and 'BCUT2D_*' descriptors.
# NOTE(review): the reason for excluding Ipc / BCUT2D_* is not shown — presumably they
# produce NaN or extreme values on this dataset; confirm and document.
col_to_drop = ['id', 'mol', 'Ipc', 'smiles', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
               'BCUT2D_LOGPHI','BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW']

In [None]:
# Build the modelling table from df_new by removing the columns listed in col_to_drop.
df_ml = df_new.drop(columns=col_to_drop)
# NOTE(review): the row with index label 47 is removed without explanation — presumably
# an outlier or a row with invalid descriptor values; confirm and document the reason.
df_ml = df_ml.drop(index=47)

In [None]:
# NOTE(review): this reassigns df_ml and discards the table built from df_new in the
# preceding cell (including its column and row drops). From here on the models see only
# the columns of df other than 'smiles', 'id' and 'mol'. Confirm which feature set is
# intended and remove the unused alternative.
df_ml = df.drop(columns=['smiles', 'id', 'mol'])

# Catboost

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
import math

In [None]:
X = df_ml.drop(columns=['lgK'])
y = df_ml['lgK']

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split, and
# therefore every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def evaluation(model, X_test, y_test):
    """Return the root-mean-squared error of ``model`` on the given data.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: features passed to ``model.predict``.
        y_test: true target values compared against the predictions.

    Returns:
        The square root of the mean squared error, rounded to 5 decimal places.
    """
    predicted = model.predict(X_test)
    rmse = math.sqrt(mean_squared_error(y_test, predicted))
    return round(rmse, 5)

In [None]:
cb = CatBoostRegressor(depth=None, iterations=1000, learning_rate=0.01, grow_policy='Lossguide',
                        min_data_in_leaf=None, loss_function='RMSE', verbose=False)

In [None]:
cb.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7efae8d204c0>

In [None]:
evaluation(cb, X_test, y_test)

3.36221

# RandomForest

In [None]:
# Random forest with 1500 unrestricted-depth trees and log2 feature sub-sampling.
# random_state pins the bootstrap and feature sampling so the fitted model and its
# score are repeatable between runs.
rf = RandomForestRegressor(n_estimators=1500, max_depth=None, max_features='log2', n_jobs=-1, random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
evaluation(rf, X_test, y_test)

2.81541

# Ridge regression

In [None]:
# Ridge regression fitted with the stochastic 'sag' solver; random_state pins the data
# shuffling so the coefficients are repeatable between runs.
# NOTE(review): scikit-learn documents that 'sag' converges quickly only when features
# have roughly the same scale, and X is not scaled anywhere in this notebook — consider
# standardizing the features before fitting.
lr = Ridge(alpha=0.7, max_iter=5000, solver='sag', random_state=42)

In [None]:
lr.fit(X_train, y_train)

In [None]:
evaluation(lr, X_test, y_test)

3.71993

# XGBoost

In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBRegressor

In [None]:
xgb = XGBRegressor(verbosity=0, eta=0.1, grow_policy='lossguide', eval_metric='rmse', n_estimators=800)

In [None]:
xgb.fit(X_train, y_train)

In [None]:
evaluation(xgb, X_test, y_test)

3.50112

# Stacking

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

In [None]:
estimators_small = [
              ('Catboost', cb),
              ('RandomForest', rf),
              ]

In [None]:
estimators_big = [
              ('Catboost', cb),
              ('RandomForest', rf),
              ('Ridge', lr),
              ('XGBoost', xgb),
              ]

In [None]:
stacking_regressor_small = StackingRegressor(estimators=estimators_small, final_estimator=RidgeCV(), n_jobs=-1, cv=4)

In [None]:
stacking_regressor_small.fit(X, y)

In [None]:
stacking_regressor_big = StackingRegressor(estimators=estimators_big, final_estimator=RidgeCV(), n_jobs=-1, cv=4)

In [None]:
stacking_regressor_big.fit(X, y)

In [None]:
from joblib import dump, load

In [None]:
dump(stacking_regressor_small, 'model_2.joblib')

['model_2.joblib']

In [None]:
dump(stacking_regressor_big, 'model_4.joblib')

['model_4.joblib']