In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose values are all 0.

    A column is removed only when every entry compares equal to 0; a column
    containing NaN (or any other non-zero value) is kept, because ``NaN == 0``
    is False. The input frame is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. Duplicate column labels are supported because the
        selection is done with a positional boolean mask rather than by label.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and the surviving columns in their
        original order.
    """
    # One vectorised pass: True for each column in which every cell equals 0.
    all_zero = (data == 0).all(axis=0)
    # Force a boolean array so the inversion is well defined even when the
    # frame has no columns at all.
    keep = ~all_zero.to_numpy(dtype=bool)
    # .copy() guarantees the result is independent of the caller's frame.
    return data.loc[:, keep].copy()

In [3]:
# Load the input table and pull out its 'SMILES' column for a quick look.
data = pd.read_csv("source/smiles_e1_01.csv")
smiles = data.loc[:, 'SMILES']
smiles

0                            O=C1NC(=O)c2cc(-c3cncs3)ccc21
1                                  CNCc1ccc(S(C)(=O)=O)cc1
2               c1ccc2c(c1)sc1c(-c3c4ncsc4cc4ncsc34)cccc12
3        C1=Nc2c(cc3ncsc3c2-c2cccc3c2sc2ccccc23)[SH]1c1...
4        O=P(c1ccccc1)(c1ccccc1)c1cccc(-c2ccc3c(c2)oc2c...
                               ...                        
28209    CNC(c1ccc(N2CCOC(c3c4ncoc4cc4ncoc34)C2)cc1)C1c...
28210    CN(C)c1ccc(N(C)C(c2ccc(N3CCOC(c4c5ncoc5cc5ncoc...
28211      O=Cc1ccc(C(=O)c2cocn2)cc1-c1ccc2c(c1)oc1ncccc12
28212    O=Cc1ccc(C(=O)c2cocn2)cc1-c1ccc2c(c1)oc1nc(-c3...
28213    Cc1cc(C)c(B(c2nc(-c3ocnc3C(=O)c3ccc(C=O)cc3)cc...
Name: SMILES, Length: 28214, dtype: object

In [4]:
# Parse every SMILES string into an RDKit molecule. MolFromSmiles returns None
# for a string it cannot parse, so stop here with the offending row labels
# instead of failing later inside the descriptor calculation.
data['Mol'] = data['SMILES'].apply(Chem.MolFromSmiles)
unparsed = data['Mol'].isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} SMILES could not be parsed by RDKit; "
        f"first offending rows: {data.index[unparsed][:10].tolist()}"
    )

# Compute the mordred descriptor table with 3D descriptors disabled.
calc_mordred = Calculator(descriptors, ignore_3D=True)
desc_mordred = calc_mordred.pandas(data['Mol'])

# Cells that mordred could not compute hold non-numeric placeholder objects.
# Coerce each column to numbers so only those placeholders become NaN.
# (Filtering on the *string* form for letters would also wipe valid values
# such as '1e-05' and every boolean descriptor, which prints as 'True'/'False'.)
# The final cast stores boolean descriptors as 1.0 / 0.0.
mordred = desc_mordred.apply(pd.to_numeric, errors='coerce').astype(float)

mordred

  0%|          | 73/28214 [00:03<24:18, 19.29it/s]  

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 90/28214 [00:04<17:28, 26.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 92/28214 [00:04<18:16, 25.65it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 98/28214 [00:05<18:52, 24.83it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 132/28214 [00:05<14:02, 33.34it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 141/28214 [00:06<16:38, 28.11it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 197/28214 [00:07<15:49, 29.52it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 230/28214 [00:08<11:20, 41.13it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 237/28214 [00:08<14:08, 32.98it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|▏         | 629/28214 [00:18<09:43, 47.24it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 28214/28214 [20:25<00:00, 23.03it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,12.784941,11.306906,0.0,0.0,21.099587,2.465486,4.800181,21.099587,1.318724,3.738438,...,9.754233,64.677355,230.014998,10.455227,418.0,23.0,88.0,106.0,4.666667,3.444444
1,9.607534,8.765714,0.0,1.0,15.185460,2.321935,4.643870,15.185460,1.168112,3.457903,...,9.278653,43.923001,199.066700,7.656412,262.0,16.0,62.0,68.0,5.784722,2.916667
2,20.849242,16.631104,0.0,0.0,34.706954,2.575194,5.044330,34.706954,1.388278,4.221171,...,10.485731,77.987666,374.000611,10.685732,1295.0,45.0,150.0,189.0,4.861111,5.250000
3,25.011003,19.555614,0.0,0.0,41.657614,2.593815,5.073291,41.657614,1.388587,4.401414,...,10.653086,84.662224,458.999231,10.674401,2180.0,54.0,180.0,227.0,5.833333,6.305556
4,26.682387,19.363781,0.0,0.0,45.470608,2.521928,5.029415,45.470608,1.377897,4.461703,...,10.656247,84.190602,444.127917,8.224591,3115.0,60.0,186.0,228.0,7.562500,7.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28209,32.708297,23.407049,0.0,1.0,55.258432,2.559919,5.017663,55.258432,1.381461,4.667890,...,10.878707,95.214299,528.216141,7.767884,5544.0,74.0,232.0,289.0,8.666667,8.611111
28210,39.986214,28.154646,0.0,0.0,66.837288,2.575711,5.042815,66.837288,1.364026,4.862794,...,11.057771,105.158967,647.289640,7.526624,9225.0,91.0,282.0,349.0,11.861111,10.444444
28211,22.453726,17.283941,0.0,0.0,38.043057,2.506451,4.930011,38.043057,1.358681,4.294932,...,10.385759,79.673155,368.079707,9.201993,2040.0,47.0,156.0,190.0,7.111111,6.111111
28212,38.434101,24.885273,0.0,0.0,64.805014,2.556794,4.983538,64.805014,1.378830,4.825148,...,11.012232,103.080664,609.168856,8.702412,9427.0,87.0,272.0,337.0,10.750000,10.055556


In [5]:
# Write the numeric descriptor table to disk without the row index.
# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise the long descriptor run above would end in an error.
output_path = Path('result/mordred_ai.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
mordred.to_csv(output_path, index=False)