In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose values are all zero.

    A column is dropped only when every one of its values compares equal
    to 0; a column containing NaN or any non-zero value is kept. The input
    frame is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining columns in their original order.
    """
    # One vectorised comparison instead of a drop(inplace=True) per column,
    # which rebuilt the frame once for every removed column.
    all_zero = data.eq(0).all(axis=0)
    # Select by position (plain boolean array) so that duplicated column
    # labels are handled; the label-based loop raised ValueError on them.
    keep = ~all_zero.to_numpy(dtype=bool)
    return data.loc[:, keep].copy()

In [3]:
# Read the input table and pull out its SMILES column for inspection.
data = pd.read_csv(filepath_or_buffer="source/smiles_e1_01.csv")
smiles = data.loc[:, 'SMILES']
# Bare last expression so the notebook renders the series.
smiles

0                            O=C1NC(=O)c2cc(-c3cncs3)ccc21
1                                  CNCc1ccc(S(C)(=O)=O)cc1
2               c1ccc2c(c1)sc1c(-c3c4ncsc4cc4ncsc34)cccc12
3        C1=Nc2c(cc3ncsc3c2-c2cccc3c2sc2ccccc23)[SH]1c1...
4        O=P(c1ccccc1)(c1ccccc1)c1cccc(-c2ccc3c(c2)oc2c...
                               ...                        
25867    Cc1cc(-c2occc2C2=CC(=O)C=CC2=O)cc(C)c1B1c2cccc...
25868    Cc1cc(-c2ccco2)cc(C)c1B1c2ccccc2B(c2c(C)ccc(-n...
25869    Cc1cccc(C)c1B1c2ccccc2B(c2c(C)ccc(-c3ccc4cccc5...
25870    Cc1cccc(C)c1B1c2ccccc2B(c2c(C)c(-c3c(F)c(F)c(F...
25871    Cc1cccc(C)c1B1c2ccccc2B(c2c(C)c(-c3c(F)c(F)c(F...
Name: SMILES, Length: 25872, dtype: object

In [4]:
def _descriptor_to_float(value):
    """Return ``value`` as a float, or NaN when it is not a real number."""
    # np.bool_ is not a subclass of bool/int, so it is listed explicitly.
    if isinstance(value, (bool, int, float, np.integer, np.floating, np.bool_)):
        return float(value)
    return np.nan


def _column_to_float(column):
    """Convert one descriptor column to float64, with NaN for unusable cells."""
    if column.dtype == object:
        # Object columns mix numbers with non-numeric cells (presumably the
        # error/missing markers mordred emits for failed descriptors -- verify
        # against the mordred docs); convert by type, not by string matching.
        column = column.map(_descriptor_to_float)
    numeric = column.astype(float)
    # The previous string mask also blanked 'inf'/'nan'; keep that behaviour.
    return numeric.where(np.isfinite(numeric))


# Parse every SMILES string into an RDKit molecule.
data['Mol'] = data['SMILES'].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for strings it cannot parse; stop here with a
# clear message instead of handing None to the descriptor calculator.
unparsed_rows = data.index[data['Mol'].isna()]
if len(unparsed_rows) > 0:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_rows)} SMILES string(s); "
        f"first offending rows: {list(unparsed_rows[:10])}"
    )

# Compute the 2D mordred descriptors (3D descriptors are skipped).
calc_mordred = Calculator(descriptors, ignore_3D=True)
desc_mordred = calc_mordred.pandas(data['Mol'])

# Previously the frame was cast to str and every cell containing a letter was
# blanked. That also wiped valid values whose text form has letters, such as
# booleans ('True'/'False') and floats printed in scientific notation
# ('1e-05'). Converting by type keeps those values (booleans become 1.0/0.0).
mordred = desc_mordred.apply(_column_to_float)

# Boolean frame marking the cells that ended up as NaN.
masks = mordred.isna()

mordred

  0%|▏                                                                            | 48/25872 [00:11<1:18:47,  5.46it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▏                                                                              | 67/25872 [00:12<33:41, 12.76it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▏                                                                              | 74/25872 [00:13<38:09, 11.27it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                              | 91/25872 [00:15<56:19,  7.63it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                              | 98/25872 [00:16<55:12,  7.78it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▌                                                                             | 189/25872 [00:22<25:43, 16.64it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▊                                                                             | 250/25872 [00:29<38:00, 11.23it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▊                                                                             | 271/25872 [00:31<45:09,  9.45it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▊                                                                             | 288/25872 [00:33<51:52,  8.22it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████| 25872/25872 [47:56<00:00,  8.99it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,12.784941,11.306906,0.0,0.0,21.099587,2.465486,4.800181,21.099587,1.318724,3.738438,...,9.754233,64.677355,230.014998,10.455227,418.0,23.0,88.0,106.0,4.666667,3.444444
1,9.607534,8.765714,0.0,1.0,15.185460,2.321935,4.643870,15.185460,1.168112,3.457903,...,9.278653,43.923001,199.066700,7.656412,262.0,16.0,62.0,68.0,5.784722,2.916667
2,20.849242,16.631104,0.0,0.0,34.706954,2.575194,5.044330,34.706954,1.388278,4.221171,...,10.485731,77.987666,374.000611,10.685732,1295.0,45.0,150.0,189.0,4.861111,5.250000
3,25.011003,19.555614,0.0,0.0,41.657614,2.593815,5.073291,41.657614,1.388587,4.401414,...,10.653086,84.662224,458.999231,10.674401,2180.0,54.0,180.0,227.0,5.833333,6.305556
4,26.682387,19.363781,0.0,0.0,45.470608,2.521928,5.029415,45.470608,1.377897,4.461703,...,10.656247,84.190602,444.127917,8.224591,3115.0,60.0,186.0,228.0,7.562500,7.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25867,54.897124,38.666048,0.0,0.0,89.900110,2.634022,5.254399,89.900110,1.341793,5.178760,...,11.505669,125.038251,865.364703,7.726471,18967.0,138.0,394.0,496.0,18.583333,14.111111
25868,59.547212,40.973357,0.0,0.0,97.851356,2.651211,5.293149,97.851356,1.359047,5.259799,...,11.600304,131.339531,925.401089,7.647943,21002.0,149.0,430.0,544.0,18.055556,15.083333
25869,35.869542,26.055696,0.0,0.0,59.185475,2.611216,5.207937,59.185475,1.345124,4.758445,...,11.133172,97.378661,564.243191,7.624908,6052.0,94.0,258.0,327.0,12.722222,9.333333
25870,45.285358,33.986476,0.0,0.0,74.086616,2.637793,5.250800,74.086616,1.322975,4.990565,...,11.380867,110.609564,743.295402,8.079298,10591.0,123.0,326.0,414.0,18.500000,11.888889


In [5]:
# Save the cleaned descriptor table without the row index.
output_path = Path('result/mordred_ai.csv')
# to_csv does not create missing directories, so make sure the target
# folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
mordred.to_csv(output_path, index=False)