In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose every value equals 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns in their original order.

    Notes
    -----
    NaN never compares equal to 0, so a column that contains NaN is kept.
    A frame with no rows loses all of its columns, because "every value is 0"
    is vacuously true for an empty column.
    """
    # One vectorised comparison instead of one in-place drop per column.
    # Selecting by a positional boolean mask (rather than by label) also
    # works when column labels are duplicated.
    all_zero = (data == 0).all(axis=0).to_numpy(dtype=bool)
    return data.loc[:, ~all_zero].copy()

In [3]:
# Read the molecule table from disk; the structures live in the 'SMILES' column.
data = pd.read_csv(filepath_or_buffer="source/smiles_e01.csv")
# Pull the SMILES column out on its own and echo it as the cell's output.
smiles = data.loc[:, 'SMILES']
smiles

0                          CN(C)c1ccc(-c2ccc(N(C)C)cc2)cc1
1          CN(C)c1ccc(-c2ccc(N(C)C)cc2-c2ccc(N(C)C)cc2)cc1
2                     CN(C)c1ccc(-c2ccc(C=C(C#N)C#N)s2)cc1
3        CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2ccc(C=C(C#N)C#N)...
4        CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2cc(N(C)C)ccc2-c2...
                               ...                        
28422    CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2cc(C3c4ccccc4-c4...
28423    N#CC(C#N)=C1c2ccccc2C(=O)c2cc(-c3cccc(-c4ccc5c...
28424    CN(C)c1ccc(-c2ccc3c(c2)c2ccccc2c2nc(C#N)c(C#N)...
28425    CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2cc(N(C)C)ccc2-c2...
28426    Cc1ccc(N(c2ccc3c(c2)sc2ccccc23)c2ccc(C)c(-c3cc...
Name: SMILES, Length: 28427, dtype: object

In [4]:
# Parse every SMILES string into an RDKit molecule object.
data['Mol'] = data['SMILES'].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for a SMILES string it cannot parse. Stop here
# with a clear message rather than failing inside the long descriptor run.
unparsed = data['Mol'].isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} SMILES could not be parsed by RDKit, e.g. "
        f"{data.loc[unparsed, 'SMILES'].head().tolist()}"
    )

# Compute the mordred descriptors for every molecule (3D descriptors skipped).
calc_mordred = Calculator(descriptors, ignore_3D=True)
desc_mordred = calc_mordred.pandas(data['Mol'])

# Turn the raw descriptor table into plain floats, with NaN wherever mordred
# could not produce a number.
# The previous approach cast every cell to str and blanked any cell containing
# a letter. That also discarded valid values printed in scientific notation
# (e.g. '1e-05') and every boolean descriptor ('True'/'False').
# pd.to_numeric(errors='coerce') keeps real numbers and booleans (as 1.0/0.0)
# and maps anything non-numeric to NaN.
# +/-inf is still mapped to NaN explicitly, as the string filter did implicitly.
mordred = (
    desc_mordred
    .apply(pd.to_numeric, errors='coerce')
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
)
# True where a descriptor value is unavailable (NaN) in the cleaned table.
masks = mordred.isna()

mordred

  0%|                                                                               | 37/28427 [00:04<34:13, 13.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|                                                                               | 41/28427 [00:04<27:52, 16.97it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████| 28427/28427 [28:10<00:00, 16.82it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,13.751268,11.174438,0.0,0.0,22.772325,2.347941,4.695881,22.772325,1.265129,3.804076,...,9.647240,50.768505,240.162649,6.320070,671.0,26.0,90.0,103.0,6.666667,4.000000
1,20.919795,16.309248,0.0,0.0,34.360705,2.442429,4.884859,34.360705,1.272619,4.216663,...,10.193130,62.151002,359.236148,6.414931,1884.0,44.0,140.0,164.0,9.861111,5.916667
2,14.987142,12.791073,0.0,0.0,25.490756,2.369479,4.664554,25.490756,1.274538,3.902730,...,9.642318,66.589086,279.083018,8.457061,922.0,27.0,98.0,112.0,7.166667,4.638889
3,22.155669,17.919982,0.0,0.0,37.081141,2.452734,4.876763,37.081141,1.278660,4.283043,...,10.190282,77.885647,398.156518,7.806991,2343.0,45.0,148.0,173.0,10.361111,6.555556
4,29.324196,22.940203,0.0,0.0,48.670321,2.493568,4.973525,48.670321,1.280798,4.557907,...,10.542522,88.447926,517.230017,7.496087,4592.0,63.0,198.0,234.0,13.555556,8.472222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28422,35.893403,25.489619,0.0,0.0,59.454286,2.612214,5.105883,59.454286,1.351234,4.754997,...,10.984496,98.962676,586.244270,7.515952,6200.0,84.0,254.0,315.0,11.500000,9.333333
28423,31.443914,21.499324,0.0,0.0,53.389289,2.546887,5.092938,53.389289,1.368956,4.629836,...,10.860555,91.117449,514.113984,9.019544,5165.0,75.0,222.0,275.0,9.916667,8.500000
28424,48.378685,32.603297,0.0,0.0,81.502809,2.590713,5.181427,81.502809,1.358380,5.055363,...,11.342007,100.064603,774.321943,7.901244,14275.0,122.0,342.0,425.0,16.166667,13.000000
28425,41.809477,27.711495,0.0,0.0,69.453253,2.550697,5.044563,69.453253,1.335639,4.903188,...,11.037451,105.302048,702.339233,7.166727,11307.0,95.0,290.0,352.0,14.722222,11.138889


In [5]:
# Persist the cleaned descriptor table without the row index.
# Create the output directory first: to_csv fails if it does not exist, which
# would otherwise happen only after the long descriptor computation.
output_path = Path('result/mordred_ai2.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
mordred.to_csv(output_path, index=False)