In [1]:
import os

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose every value equals 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding only the columns that contain at
        least one value which does not compare equal to 0. The order of the
        remaining columns is preserved.

    Notes
    -----
    A float NaN does not compare equal to 0, so a column containing NaN is
    kept. A frame with columns but no rows loses all of its columns, because
    "every value is 0" is vacuously true for an empty column.
    """
    # Build the per-column "all zero" flags in one vectorized pass instead of
    # dropping columns one by one (each in-place drop rebuilt the frame).
    # Converting to a plain boolean array makes the selection positional, so
    # frames with duplicated column labels are handled as well.
    all_zero = (data == 0).all(axis=0).to_numpy(dtype=bool)
    return data.loc[:, ~all_zero].copy()

In [3]:
# Read the molecule table from disk and keep a handle on its SMILES column.
data = pd.read_csv("source/smiles_e01.csv")
smiles = data.loc[:, 'SMILES']

# Bare last expression so the notebook renders the column.
smiles

0                          CN(C)c1ccc(-c2ccc(N(C)C)cc2)cc1
1          CN(C)c1ccc(-c2ccc(N(C)C)cc2-c2ccc(N(C)C)cc2)cc1
2                     CN(C)c1ccc(-c2ccc(C=C(C#N)C#N)s2)cc1
3        CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2ccc(C=C(C#N)C#N)...
4        CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2cc(N(C)C)ccc2-c2...
                               ...                        
25540    COc1cc(OCn2c3ccc(Cl)cc3c3cc(Cl)ccc32)cc(OC)c1-...
25541    CN(C)c1ccc(-c2ccccn2)c(-c2ccc(N(C)C)c(-c3ccc(C...
25542    Sc1nc2cc3nc(-c4cc(-n5c6ccccc6c6cnccc65)c(-c5cc...
25543    CN(C)c1ccc(-c2c3[nH]c(S)nc3cc3nc(-c4cc(-n5c6cc...
25544          CN(CN(c1ccccc1)c1ccccc1)c1ccc(-c2ccccc2)cc1
Name: SMILES, Length: 25545, dtype: object

In [4]:
# Parse every SMILES string into an RDKit molecule object.
data['Mol'] = data['SMILES'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a string it cannot parse. Stop here with
# a clear message rather than handing None to the descriptor calculator.
unparsed = data['Mol'].isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} SMILES could not be parsed by RDKit; "
        f"first offending row labels: {list(data.index[unparsed][:10])}"
    )

# Compute the 2D descriptors only (ignore_3D=True) for every molecule.
calc_mordred = Calculator(descriptors, ignore_3D=True)
desc_mordred = calc_mordred.pandas(data['Mol'])

# Turn every cell into a float; anything that is not a number (presumably
# mordred's placeholder objects for descriptors it could not compute) becomes
# NaN. The previous approach stringified each cell and blanked everything
# matching '[a-zA-Z]', which also threw away valid numbers printed in
# scientific notation (e.g. '1e-05'), boolean values ('True'/'False') and inf.
mordred = desc_mordred.apply(pd.to_numeric, errors='coerce').astype(float)

# Same meaning as before: True where the cell ended up as NaN in `mordred`.
masks = mordred.isna()

mordred

  0%|          | 127/25545 [00:07<19:22, 21.86it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 141/25545 [00:07<19:59, 21.19it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 157/25545 [00:09<35:57, 11.77it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 159/25545 [00:10<49:14,  8.59it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 183/25545 [00:10<1:01:38,  6.86it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 192/25545 [00:11<21:07, 20.00it/s]  

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 199/25545 [00:11<22:43, 18.58it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 203/25545 [00:12<25:55, 16.30it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 220/25545 [00:13<43:10,  9.78it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 228/25545 [00:13<21:24, 19.71it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 292/25545 [00:16<19:12, 21.91it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 25545/25545 [23:23<00:00, 18.20it/s] 


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,13.751268,11.174438,0.0,0.0,22.772325,2.347941,4.695881,22.772325,1.265129,3.804076,...,9.647240,50.768505,240.162649,6.320070,671.0,26.0,90.0,103.0,6.666667,4.000000
1,20.919795,16.309248,0.0,0.0,34.360705,2.442429,4.884859,34.360705,1.272619,4.216663,...,10.193130,62.151002,359.236148,6.414931,1884.0,44.0,140.0,164.0,9.861111,5.916667
2,14.987142,12.791073,0.0,0.0,25.490756,2.369479,4.664554,25.490756,1.274538,3.902730,...,9.642318,66.589086,279.083018,8.457061,922.0,27.0,98.0,112.0,7.166667,4.638889
3,22.155669,17.919982,0.0,0.0,37.081141,2.452734,4.876763,37.081141,1.278660,4.283043,...,10.190282,77.885647,398.156518,7.806991,2343.0,45.0,148.0,173.0,10.361111,6.555556
4,29.324196,22.940203,0.0,0.0,48.670321,2.493568,4.973525,48.670321,1.280798,4.557907,...,10.542522,88.447926,517.230017,7.496087,4592.0,63.0,198.0,234.0,13.555556,8.472222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25540,28.357869,20.876362,0.0,0.0,47.144569,2.546339,4.984738,47.144569,1.309571,4.525809,...,10.645925,87.613757,520.132048,8.389227,4224.0,64.0,196.0,237.0,11.555556,7.916667
25541,35.376567,26.981702,0.0,0.0,59.294781,2.534835,4.997121,59.294781,1.317662,4.746442,...,10.838620,99.244118,603.209282,8.151477,7410.0,80.0,244.0,296.0,14.138889,9.944444
25542,31.484354,22.092970,0.0,0.0,52.272578,2.579193,5.031411,52.272578,1.375594,4.623886,...,10.830738,93.215856,510.137514,9.109598,4389.0,69.0,224.0,278.0,8.166667,7.972222
25543,38.612441,26.693184,0.0,0.0,63.922066,2.586930,5.041227,63.922066,1.360044,4.826898,...,11.046611,103.255997,629.211013,8.502852,7853.0,88.0,274.0,340.0,11.361111,9.916667


In [5]:
# Write the numeric descriptor table to CSV without the row index.
# The output directory is created first so that a missing 'result/' folder
# cannot fail the save after the long descriptor computation.
output_path = 'result/mordred_ai2.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
mordred.to_csv(output_path, index=False)