In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose every entry equals 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns in their original order.
        A float NaN does not compare equal to 0, so a column that contains
        one is kept.
    """
    # One vectorised comparison instead of dropping columns one by one.
    # The mask is applied by position, so frames with repeated column labels
    # are filtered per physical column rather than per label.
    all_zero = (data == 0).all(axis=0).to_numpy(dtype=bool)
    return data.loc[:, ~all_zero].copy()

In [3]:
# Read the molecule table from disk.
data = pd.read_csv(filepath_or_buffer="source/smiles_e01.csv")
# Pull out the 'SMILES' column and leave it as the cell's last expression so
# the notebook displays it.
smiles = data.loc[:, 'SMILES']
smiles

0                          CN(C)c1ccc(-c2ccc(N(C)C)cc2)cc1
1          CN(C)c1ccc(-c2ccc(N(C)C)cc2-c2ccc(N(C)C)cc2)cc1
2                     CN(C)c1ccc(-c2ccc(C=C(C#N)C#N)s2)cc1
3        CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2ccc(C=C(C#N)C#N)...
4        CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2cc(N(C)C)ccc2-c2...
                               ...                        
26085    CN(Cc1ccc2c(c1)sc1ncccc12)c1ccc(-c2c(F)c(F)c(F...
26086    CN(C)c1ccc(-c2c(F)c(F)c(F)c(F)c2F)c(-c2ccc(N(C...
26087    CN(C)c1ccc(-c2c(-c3c(F)c(F)c(F)c(F)c3F)ccc(N(C...
26088    CN(C)c1cc(-c2c(F)c(F)c(F)c(F)c2F)cc(N(c2ccccc2...
26089    CN(C)c1ccc(-c2c(-c3c(F)c(F)c(F)c(F)c3F)ccc(N(C...
Name: SMILES, Length: 26090, dtype: object

In [4]:
# Parse every SMILES string into an RDKit molecule. MolFromSmiles yields None
# for a string it cannot parse, so check for that before the descriptor run.
data['Mol'] = data['SMILES'].apply(Chem.MolFromSmiles)
unparsed = data.index[data['Mol'].isna()]
if len(unparsed) > 0:
    # Fail immediately with the offending rows instead of part-way through the
    # long descriptor calculation below.
    raise ValueError(
        f"{len(unparsed)} SMILES could not be parsed by RDKit; "
        f"first offending row labels: {list(unparsed[:10])}"
    )

# Compute the Mordred descriptor set (3D descriptors excluded) for every molecule.
calc_mordred = Calculator(descriptors, ignore_3D=True)
desc_mordred = calc_mordred.pandas(data['Mol'])

# Turn every cell that is not a number into NaN. Coercing with pd.to_numeric
# (rather than stringifying and searching for letters) keeps legitimate values
# whose text form contains letters: floats printed in scientific notation such
# as 1e-05, and boolean descriptors (True/False become 1.0/0.0).
# Non-finite values are mapped to NaN as well, so the result only holds finite
# numbers or NaN.
mordred = (
    desc_mordred
    .apply(pd.to_numeric, errors='coerce')
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
)

mordred

  0%|                                                                               | 19/26090 [00:03<55:22,  7.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                             | 108/26090 [00:07<22:20, 19.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                             | 112/26090 [00:08<25:54, 16.71it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|▎                                                                             | 119/26090 [00:08<30:43, 14.09it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▋                                                                             | 222/26090 [00:13<21:09, 20.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▋                                                                             | 246/26090 [00:14<18:22, 23.44it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▊                                                                             | 252/26090 [00:14<18:52, 22.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▊                                                                             | 261/26090 [00:15<23:42, 18.15it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▉                                                                             | 311/26090 [00:18<23:32, 18.26it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|▉                                                                             | 316/26090 [00:19<37:36, 11.42it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|█                                                                             | 343/26090 [00:22<33:06, 12.96it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████| 26090/26090 [50:22<00:00,  8.63it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,13.751268,11.174438,0.0,0.0,22.772325,2.347941,4.695881,22.772325,1.265129,3.804076,...,9.647240,50.768505,240.162649,6.320070,671.0,26.0,90.0,103.0,6.666667,4.000000
1,20.919795,16.309248,0.0,0.0,34.360705,2.442429,4.884859,34.360705,1.272619,4.216663,...,10.193130,62.151002,359.236148,6.414931,1884.0,44.0,140.0,164.0,9.861111,5.916667
2,14.987142,12.791073,0.0,0.0,25.490756,2.369479,4.664554,25.490756,1.274538,3.902730,...,9.642318,66.589086,279.083018,8.457061,922.0,27.0,98.0,112.0,7.166667,4.638889
3,22.155669,17.919982,0.0,0.0,37.081141,2.452734,4.876763,37.081141,1.278660,4.283043,...,10.190282,77.885647,398.156518,7.806991,2343.0,45.0,148.0,173.0,10.361111,6.555556
4,29.324196,22.940203,0.0,0.0,48.670321,2.493568,4.973525,48.670321,1.280798,4.557907,...,10.542522,88.447926,517.230017,7.496087,4592.0,63.0,198.0,234.0,13.555556,8.472222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26085,26.374448,19.211714,0.0,0.0,43.369155,2.489836,4.963846,43.369155,1.314217,4.448845,...,10.624128,84.096217,470.087611,9.793492,3575.0,61.0,184.0,225.0,10.805556,7.055556
26086,32.623340,26.130410,0.0,0.0,52.507335,2.543171,5.086342,52.507335,1.250175,4.661386,...,10.880516,79.913866,597.182652,9.048222,5980.0,83.0,226.0,276.0,18.222222,9.083333
26087,35.123598,28.538752,0.0,0.0,57.591264,2.566444,5.132887,57.591264,1.279806,4.735595,...,10.928435,83.212920,616.262538,7.900802,6038.0,87.0,242.0,295.0,16.861111,9.833333
26088,27.995511,23.302191,0.0,0.0,45.837159,2.534827,5.069653,45.837159,1.273254,4.510830,...,10.682331,73.032535,497.189039,8.286484,3455.0,68.0,192.0,233.0,13.666667,7.888889


In [5]:
# Persist the descriptor table as CSV; the row index is not written.
mordred.to_csv(
    path_or_buf='result/mordred_ai2.csv',
    index=False,
)