In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose every value equals 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame holding, in their original order, only the columns that
        contain at least one entry which does not compare equal to 0.
        NaN never compares equal to 0, so a column mixing zeros and NaN is
        kept. A frame with no rows loses all of its columns, because an
        empty column is vacuously "all zero".
    """
    # One vectorised comparison instead of dropping columns one at a time.
    # The mask is applied by position (plain boolean array), so frames with
    # duplicate column labels are handled instead of raising on `data[col]`.
    all_zero = data.eq(0).all(axis=0).to_numpy(dtype=bool)
    return data.loc[:, ~all_zero].copy()

In [3]:
# Read the input table and pull out its SMILES column; the bare name on the
# last line makes the notebook display that column.
data = pd.read_csv("source/smiles_e1.csv")
smiles = data.loc[:, 'SMILES']
smiles

0                            O=C1NC(=O)c2cc(-c3cncs3)ccc21
1                                  CNCc1ccc(S(C)(=O)=O)cc1
2               c1ccc2c(c1)sc1c(-c3c4ncsc4cc4ncsc34)cccc12
3        C1=Nc2c(cc3ncsc3c2-c2cccc3c2sc2ccccc23)[SH]1c1...
4        O=P(c1ccccc1)(c1ccccc1)c1cccc(-c2ccc3c(c2)oc2c...
                               ...                        
29618        O=S(=O)(Cn1c2ccc(Cl)cc2c2cc(Cl)ccc21)c1ccccc1
29619    N#CC(C#N)=C1C(=Cc2ccc(-c3cccc(S(=O)(=O)Cn4c5cc...
29620    Clc1ccc2c(c1)c1cc(Cl)ccc1n2Cn1ccc(-c2cc3c(cn2)...
29621    Clc1ccc2c(c1)c1cc(Cl)cc(-c3c4ncsc4cc4ncsc34)c1...
29622      COc1cc(OCc2ccc(N(C)C)cc2)cc(C2=CC(=O)C=CC2=O)c1
Name: SMILES, Length: 29623, dtype: object

In [4]:
# Parse every SMILES string into an RDKit molecule.
data['Mol'] = data['SMILES'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a string it cannot parse. Stop here
# with a readable message rather than handing None to the (long-running)
# descriptor calculation below.
unparsed = data['Mol'].isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} SMILES could not be parsed by RDKit; "
        f"first offending rows: {data.index[unparsed][:5].tolist()}"
    )

# Compute the mordred descriptor table (3D descriptors skipped), one row per
# molecule and one column per descriptor.
calc_mordred = Calculator(descriptors, ignore_3D=True)
desc_mordred = calc_mordred.pandas(data['Mol'])

# Convert the table to floats, turning every non-numeric cell (e.g. the
# objects mordred stores when a descriptor could not be computed) into NaN.
# The previous approach stringified each cell and blanked anything containing
# a letter, which also wiped out valid data: floats printed in scientific
# notation ("1e-05") and boolean results ("True"/"False"). pd.to_numeric keeps
# those; booleans become 1.0 / 0.0.
# Infinite values were also blanked by the old letter filter ("inf"), so they
# are still mapped to NaN here to keep the saved table free of infinities.
mordred = (
    desc_mordred
    .apply(pd.to_numeric, errors='coerce')
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
)

mordred

  0%|▏                                                                              | 49/29623 [00:03<34:10, 14.43it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▏                                                                              | 73/29623 [00:04<25:20, 19.44it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▏                                                                              | 92/29623 [00:05<27:29, 17.90it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                              | 98/29623 [00:06<33:05, 14.87it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                             | 141/29623 [00:08<28:08, 17.46it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▌                                                                             | 237/29623 [00:12<24:43, 19.80it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▋                                                                             | 276/29623 [00:13<20:23, 23.98it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▊                                                                             | 290/29623 [00:14<28:37, 17.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|█▍                                                                            | 538/29623 [00:23<21:13, 22.84it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████| 29623/29623 [23:16<00:00, 21.22it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,12.784941,11.306906,0.0,0.0,21.099587,2.465486,4.800181,21.099587,1.318724,3.738438,...,9.754233,64.677355,230.014998,10.455227,418.0,23.0,88.0,106.0,4.666667,3.444444
1,9.607534,8.765714,0.0,1.0,15.185460,2.321935,4.643870,15.185460,1.168112,3.457903,...,9.278653,43.923001,199.066700,7.656412,262.0,16.0,62.0,68.0,5.784722,2.916667
2,20.849242,16.631104,0.0,0.0,34.706954,2.575194,5.044330,34.706954,1.388278,4.221171,...,10.485731,77.987666,374.000611,10.685732,1295.0,45.0,150.0,189.0,4.861111,5.250000
3,25.011003,19.555614,0.0,0.0,41.657614,2.593815,5.073291,41.657614,1.388587,4.401414,...,10.653086,84.662224,458.999231,10.674401,2180.0,54.0,180.0,227.0,5.833333,6.305556
4,26.682387,19.363781,0.0,0.0,45.470608,2.521928,5.029415,45.470608,1.377897,4.461703,...,10.656247,84.190602,444.127917,8.224591,3115.0,60.0,186.0,228.0,7.562500,7.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29618,20.071797,15.950093,0.0,0.0,32.365655,2.559125,4.944904,32.365655,1.294626,4.175950,...,10.356377,75.032267,389.004405,10.236958,1386.0,43.0,140.0,169.0,7.951389,5.263889
29619,37.575775,26.627446,0.0,0.0,61.959946,2.560598,4.982955,61.959946,1.318297,4.803426,...,10.988119,102.213104,669.068068,9.839236,9123.0,87.0,264.0,323.0,14.451389,10.125000
29620,28.139090,19.695473,0.0,0.0,46.172034,2.553300,4.893372,46.172034,1.358001,4.511012,...,10.698085,88.620220,496.067778,9.539765,3644.0,60.0,200.0,247.0,8.055556,7.083333
29621,38.503051,27.340598,0.0,0.0,62.867219,2.616979,5.109214,62.867219,1.366679,4.823918,...,11.090889,103.676581,686.033719,10.394450,7549.0,86.0,278.0,349.0,10.222222,9.500000


In [5]:
# Write the descriptor table to CSV without the row index. The output folder
# is created first so that a missing directory cannot make the save fail
# after the long descriptor calculation has already finished.
output_path = Path('result/mordred_random.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
mordred.to_csv(output_path, index=False)