In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors

In [2]:
def remove_all_zero_col(data):
    data = data.copy()
    for col in data.columns:
        if (data[col] == 0).all():
            data.drop(col, axis=1, inplace=True)
    return data

In [3]:
# Morgan Fingerprint (radius = 2)
data = pd.read_csv("source/smiles.csv", index_col=0)

mf_2 = []
for i in range(0,100):
    mol_2 = Chem.MolFromSmiles(data["Smiles"][i])
    fp_2 = AllChem.GetMorganFingerprintAsBitVect(mol_2, 2, 2048)
    mf_2.append(np.array(fp_2, int))

df_2 = pd.DataFrame(mf_2)
df_2 = df_2.add_prefix('MF2_')
print(df_2.shape)
df_2_2 = remove_all_zero_col(df_2)
print(df_2_2.shape)
print(df_2_2)
df_2_2.to_csv('descriptors/MF.csv', index = False)

(100, 2048)
(100, 383)
    MF2_1  MF2_4  MF2_7  MF2_9  MF2_13  MF2_14  MF2_19  MF2_30  MF2_33  \
0       0      0      0      0       0       0       0       0       0   
1       0      0      0      0       0       0       0       0       1   
2       0      0      0      0       0       0       0       0       1   
3       0      0      0      0       0       0       0       0       1   
4       0      0      0      0       0       0       0       0       1   
..    ...    ...    ...    ...     ...     ...     ...     ...     ...   
95      0      0      0      0       0       0       0       0       0   
96      0      0      0      0       0       0       0       0       0   
97      0      0      0      0       0       0       0       0       0   
98      0      0      0      0       0       0       0       0       0   
99      0      0      0      0       0       0       0       0       1   

    MF2_37  ...  MF2_1993  MF2_1998  MF2_2005  MF2_2018  MF2_2021  MF2_2032  \
0        

In [4]:
# MACCSKeys
data = pd.read_csv("source/smiles.csv", index_col=0)

mk = []
for i in range(0,100):
    mol_MK = Chem.MolFromSmiles(data["Smiles"][i])
    fp_MK = MACCSkeys.GenMACCSKeys(mol_MK)
    fp_arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_MK, fp_arr)
    arr_MK = np.array(fp_arr)
    mk.append(arr_MK)

df_MK = pd.DataFrame(mk)
df_MK = df_MK.add_prefix('MK_')
print(df_MK.shape)
df_MK_2 = remove_all_zero_col(df_MK)
print(df_MK_2.shape)
print(df_MK_2)
df_MK_2.to_csv("descriptors/MK.csv", index=False)

(100, 167)
(100, 115)
    MK_18  MK_24  MK_26  MK_31  MK_36  MK_38  MK_41  MK_42  MK_44  MK_45  ...  \
0     0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0  ...   
1     0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0  ...   
2     0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0  ...   
3     0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0  ...   
4     0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0  ...   
..    ...    ...    ...    ...    ...    ...    ...    ...    ...    ...  ...   
95    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
96    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
97    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
98    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   
99    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   

    M

In [5]:
# RDKit"MoleculseDescriptors"
data = pd.read_csv("source/smiles.csv", index_col=0)

def calculate_descriptors(mol, names=None):
    if names is None:
        names = [d[0] for d in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(names)
    descs = calc.CalcDescriptors(mol)
    return descs

md = []
for i in range(0,100):
    mol_MD = Chem.MolFromSmiles(data["Smiles"][i])
    desc_MD = calculate_descriptors(mol_MD)
    md.append(desc_MD)

df_MD = pd.DataFrame(md)
df_MD = df_MD.add_prefix('MD_')
print(df_MD.shape)
df_MD_2 = remove_all_zero_col(df_MD)
print(df_MD_2.shape)
print(df_MD_2)
df_MD_2.to_csv("descriptors/MD.csv", index=False)

(100, 208)
(100, 144)
         MD_0      MD_1       MD_2      MD_3      MD_4      MD_5      MD_6  \
0   12.116548  0.389761  12.116548  0.389761  0.178204   788.914   756.658   
1   13.185042 -0.171343  13.185042  0.168690  0.176200  1237.778  1141.010   
2   12.426856  0.399888  12.426856  0.399888  0.176567   901.130   852.746   
3   12.374043  0.319495  12.374043  0.319495  0.176198  1420.082  1395.890   
4   12.556354  0.210464  12.556354  0.210464  0.116076  1029.122   980.738   
..        ...       ...        ...       ...       ...       ...       ...   
95   6.297255  0.774805   6.297255  0.774805  0.343017   399.466   376.282   
96   2.240741  1.274259   2.240741  1.274259  0.382509   194.257   182.161   
97   2.274907  1.259308   2.274907  1.259308  0.354393   270.355   254.227   
98   2.298634  1.269051   2.298634  1.269051  0.326708   312.436   290.260   
99   2.483634  0.056328   2.483634  0.056328  0.172733   486.723   446.403   

           MD_7  MD_8     MD_10  ...  MD_

In [6]:
# mordred
data = pd.read_csv("source/smiles.csv", index_col=0)
data['Mol'] = data['Smiles'].apply(Chem.MolFromSmiles)

calc_mordred = Calculator(descriptors, ignore_3D=True)
desc_mordred = calc_mordred.pandas(data['Mol'])

mordred = desc_mordred.astype(str)
masks = mordred.apply(lambda d: d.str.contains('[a-zA-Z]' ,na=False))
mordred = mordred[~masks]
mordred = mordred.astype(float)
df_mordred = mordred.dropna(how="any", axis="columns")

print(df_mordred.shape)
df_mordred_2 = remove_all_zero_col(df_mordred)
print(df_mordred_2.shape)
print(df_mordred_2)
df_mordred_2.to_csv("descriptors/mordred.csv", index=False)

  1%|          | 1/100 [00:03<05:32,  3.36s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|▍         | 4/100 [00:04<03:20,  2.09s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|██▎       | 23/100 [00:05<00:09,  8.27it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 100/100 [00:07<00:00, 12.93it/s]


(100, 1369)
(100, 1151)
              ABC      ABCGG  nBase     SpAbs_A   SpMax_A  SpDiam_A  \
Name                                                                  
OPS1    51.112698  37.059654    0.0   86.597960  2.689922  5.293219   
OPS2    77.061286  54.950651    0.0  116.949390  2.700910  5.308129   
OPS3    57.644671  41.246677    0.0   93.653582  2.695287  5.300279   
OPS4    57.644671  41.246677    0.0   93.653582  2.695287  5.300279   
OPS5    62.426407  44.258073    0.0  106.095220  2.696397  5.301785   
...           ...        ...    ...         ...       ...       ...   
OPS96   23.213203  16.981009    1.0   40.440782  2.432547  4.865094   
OPS97   11.968445   9.625522    1.0   20.264831  2.459954  4.919908   
OPS98   16.796872  13.157944    1.0   28.736047  2.534281  5.068561   
OPS99   19.165481  15.333598    1.0   31.429967  2.555884  5.111768   
OPS100  29.745439  22.966163    1.0   46.124046  2.600674  5.201348   

            SpAD_A   SpMAD_A   LogEE_A     VE1_A  ..