In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose every value equals 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns that contain at least one value
        that does not compare equal to 0, in their original order.

    Notes
    -----
    * NaN does not compare equal to 0, so a column mixing zeros and NaN is kept.
    * On a frame with no rows every column is vacuously "all zero" and is
      therefore dropped.
    * Columns are selected by position, so frames with duplicated column
      labels are handled column by column instead of raising ``ValueError``.
    """
    # One vectorised comparison over the whole frame; the result has one
    # boolean per column, aligned by position with ``data.columns``.
    is_all_zero = (data == 0).all(axis=0).to_numpy(dtype=bool)
    # Boolean positional selection keeps duplicate labels apart; ``copy`` makes
    # the result independent of the caller's frame.
    return data.loc[:, ~is_all_zero].copy()

In [3]:
# Read the input table from disk; the file is expected to provide a 'SMILES' column.
data = pd.read_csv("source/smiles_e1_01.csv")

# Keep a handle on the SMILES column and show it as the cell output.
smiles = data.loc[:, "SMILES"]
smiles

0                            O=C1NC(=O)c2cc(-c3cncs3)ccc21
1                                  CNCc1ccc(S(C)(=O)=O)cc1
2               c1ccc2c(c1)sc1c(-c3c4ncsc4cc4ncsc34)cccc12
3        C1=Nc2c(cc3ncsc3c2-c2cccc3c2sc2ccccc23)[SH]1c1...
4        O=P(c1ccccc1)(c1ccccc1)c1cccc(-c2ccc3c(c2)oc2c...
                               ...                        
26102    Cc1cc(C)cc(-c2nc(-c3ccc(N(C)C)c(-c4ccc(C=C(C#N...
26103    CN(C)c1ccc(-c2coc(-n3c4ccccc4c4ccccc43)n2)c(-c...
26104    CN(C)c1ccc(-c2ncoc2-c2cc3ccccc3c3ccccc23)cc1-c...
26105    CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2cc(N(C)C)ccc2-c2...
26106    CN(Cc1c(F)c(F)c(F)c(F)c1F)c1ccc(-c2ncoc2-c2ccc...
Name: SMILES, Length: 26107, dtype: object

In [4]:
# Parse every SMILES string into an RDKit molecule object.
data['Mol'] = data['SMILES'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a SMILES it cannot parse. Stop here with
# a readable message instead of starting the long descriptor run.
# NOTE(review): mordred is not expected to accept None molecules - confirm.
unparsed = data.index[data['Mol'].isna()]
if len(unparsed) > 0:
    raise ValueError(
        f"{len(unparsed)} SMILES could not be parsed by RDKit; "
        f"first offending row labels: {list(unparsed[:5])}"
    )

# Compute the 2D mordred descriptors for every molecule (3D ones are skipped).
calc_mordred = Calculator(descriptors, ignore_3D=True)
desc_mordred = calc_mordred.pandas(data['Mol'])

# Turn every cell into a float, replacing anything non-numeric by NaN.
# The previous approach stringified each cell and blanked every value that
# contained a letter. That also discarded valid numbers printed in scientific
# notation (e.g. '1e-05') and boolean descriptor values ('True'/'False').
# pd.to_numeric(errors='coerce') only blanks cells that really are not numbers.
mordred = (
    desc_mordred
    .apply(lambda column: pd.to_numeric(column, errors='coerce'))
    .astype(float)
    # The letter filter also blanked 'inf'/'-inf'; keep that outcome explicitly.
    .replace([np.inf, -np.inf], np.nan)
)

# Kept as a module-level name for later cells: True where no usable numeric
# value is available for that molecule/descriptor pair.
masks = mordred.isna()

mordred

  0%|▏                                                                              | 57/26107 [00:04<22:08, 19.60it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▏                                                                              | 73/26107 [00:04<22:08, 19.60it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                              | 91/26107 [00:05<28:54, 15.00it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                              | 98/26107 [00:06<32:23, 13.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▍                                                                             | 159/26107 [00:08<25:51, 16.73it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▋                                                                             | 249/26107 [00:12<22:53, 18.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▊                                                                             | 279/26107 [00:13<17:45, 24.25it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▊                                                                             | 290/26107 [00:14<27:09, 15.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████| 26107/26107 [18:12<00:00, 23.90it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,12.784941,11.306906,0.0,0.0,21.099587,2.465486,4.800181,21.099587,1.318724,3.738438,...,9.754233,64.677355,230.014998,10.455227,418.0,23.0,88.0,106.0,4.666667,3.444444
1,9.607534,8.765714,0.0,1.0,15.185460,2.321935,4.643870,15.185460,1.168112,3.457903,...,9.278653,43.923001,199.066700,7.656412,262.0,16.0,62.0,68.0,5.784722,2.916667
2,20.849242,16.631104,0.0,0.0,34.706954,2.575194,5.044330,34.706954,1.388278,4.221171,...,10.485731,77.987666,374.000611,10.685732,1295.0,45.0,150.0,189.0,4.861111,5.250000
3,25.011003,19.555614,0.0,0.0,41.657614,2.593815,5.073291,41.657614,1.388587,4.401414,...,10.653086,84.662224,458.999231,10.674401,2180.0,54.0,180.0,227.0,5.833333,6.305556
4,26.682387,19.363781,0.0,0.0,45.470608,2.521928,5.029415,45.470608,1.377897,4.461703,...,10.656247,84.190602,444.127917,8.224591,3115.0,60.0,186.0,228.0,7.562500,7.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26102,25.691203,20.267844,0.0,0.0,42.197048,2.459940,4.855874,42.197048,1.278698,4.426210,...,10.371082,84.762918,450.151432,8.184571,3438.0,51.0,174.0,205.0,11.083333,7.305556
26103,40.762271,28.284576,0.0,0.0,67.814340,2.567072,5.097086,67.814340,1.356287,4.881322,...,11.113000,105.297909,648.288912,7.538243,9310.0,96.0,288.0,357.0,12.722222,10.638889
26104,42.624371,28.307679,0.0,0.0,71.399210,2.615507,5.126771,71.399210,1.373062,4.929763,...,11.234045,107.720301,666.216809,8.541241,10568.0,105.0,306.0,384.0,12.944444,11.083333
26105,37.785616,27.318137,0.0,0.0,61.431550,2.578790,5.059272,61.431550,1.307054,4.803096,...,10.965211,101.750925,627.280967,7.649768,7999.0,86.0,264.0,322.0,14.972222,9.972222


In [5]:
# DataFrame.to_csv does not create missing parent directories. Make sure the
# target folder exists so the long descriptor computation is not lost to an
# OSError at the final step.
output_path = Path('result/mordred_ai.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned descriptor table without the row index.
mordred.to_csv(output_path, index=False)