# Calculate normal descriptors 
- 2D descriptors
- 3D descriptors
- Fingerprints

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import normalize,StandardScaler


from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Descriptors import rdMolDescriptors
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors


# Load the molecule/property table; the "SMILES" column is what the
# descriptor cells below consume.
path = "../database/small_db.csv"
df = pd.read_csv(filepath_or_buffer=path)
df

Unnamed: 0.1,Unnamed: 0,boiling temperature,melting temperature,density,viscosity,SMILES
0,0,-0.999826,-1.072944,-1.466345,-0.857140,CCCCCC
1,1,-0.407466,-0.997391,-1.355887,-0.708313,CCCCCCC
2,2,0.113976,-0.218786,-1.268441,-0.613162,CCCCCCCC
3,3,-0.769429,0.961131,-0.918658,-0.293072,C1CCCCC1
4,4,-0.769429,0.944862,-0.458417,-0.422331,c1ccccc1
...,...,...,...,...,...,...
157,157,0.230720,-1.437014,0.098475,-0.366290,CCC[N+](=O)[O-]
158,158,-0.422740,2.176619,-0.914055,-0.745890,CCC#N
159,159,-0.029463,2.408080,-0.775983,-0.608380,CCCC#N
160,160,-0.292849,1.903800,-1.006104,-0.668010,CC(C)C#N


In [2]:
#make mol objects
smiles_list=df["SMILES"]
mol_list=[Chem.MolFromSmiles(s) for s in smiles_list]

# Chem.MolFromSmiles signals a parse failure by returning None instead of
# raising. Fail here, naming the offending entries, rather than letting a
# None propagate into the descriptor cells where the error is much harder
# to trace back to its row.
invalid_smiles=[s for s, mol in zip(smiles_list, mol_list) if mol is None]
if invalid_smiles:
    raise ValueError(
        f"Could not parse {len(invalid_smiles)} SMILES string(s): {invalid_smiles}"
    )


In [3]:
# Standard RDKit 2D descriptors: evaluate every descriptor registered in
# Descriptors.descList for each molecule.

# descList holds (name, function) pairs; the calculator only needs the names.
desc_list = [name for name, _func in Descriptors.descList]
rdkit_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(desc_list)
rdkit_desc_list = list(map(rdkit_calculator.CalcDescriptors, mol_list))

# One row per molecule, one column per descriptor, plus the SMILES as a key.
rdkit_df = pd.DataFrame(rdkit_desc_list)
rdkit_df.columns = desc_list
rdkit_df["SMILES"] = smiles_list


rdkit_df

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,SMILES
0,2.231806,1.355000,2.231806,1.355000,0.462954,86.178,72.066,86.109550,38,0,...,0,0,0,0,0,0,0,1,0,CCCCCC
1,2.245694,1.361111,2.245694,1.361111,0.476310,100.205,84.077,100.125201,44,0,...,0,0,0,0,0,0,0,2,0,CCCCCCC
2,2.255899,1.364796,2.255899,1.364796,0.480611,114.232,96.088,114.140851,50,0,...,0,0,0,0,0,0,0,3,0,CCCCCCCC
3,1.500000,1.500000,1.500000,1.500000,0.422316,84.162,72.066,84.093900,36,0,...,0,0,0,0,0,0,0,0,0,C1CCCCC1
4,2.000000,2.000000,2.000000,2.000000,0.442628,78.114,72.066,78.046950,30,0,...,0,0,0,0,0,0,0,0,0,c1ccccc1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,9.404861,-0.319444,9.404861,0.097222,0.368378,89.094,82.038,89.047678,36,0,...,0,0,0,0,0,0,0,0,0,CCC[N+](=O)[O-]
158,7.625000,0.625000,7.625000,0.625000,0.403814,55.080,50.040,55.042199,22,0,...,0,0,0,0,0,0,0,0,0,CCC#N
159,7.816250,0.694444,7.816250,0.694444,0.455070,69.107,62.051,69.057849,28,0,...,0,0,0,0,0,0,0,0,0,CCCC#N
160,7.893519,0.189815,7.893519,0.189815,0.418494,69.107,62.051,69.057849,28,0,...,0,0,0,0,0,0,0,0,0,CC(C)C#N


In [4]:
#3D descriptors (dragon descriptor)
def calc_3d_desc(m, seed=42):
    """Compute conformer-based 3D descriptors for one molecule.

    Explicit hydrogens are added, a single 3D conformer is embedded with
    ETKDGv2, and four RDKit descriptor families are evaluated on it.

    Parameters
    ----------
    m : rdkit.Chem.Mol
        Input molecule. It is not modified; ``Chem.AddHs`` returns a copy.
    seed : int or None, default 42
        Random seed for the conformer embedding. A fixed seed makes the
        descriptors reproducible across runs. Pass ``None`` to keep RDKit's
        default (unseeded) behaviour.

    Returns
    -------
    list of float
        AUTOCORR3D, MORSE, RDF and WHIM values concatenated in that order.

    Raises
    ------
    ValueError
        If no 3D conformer could be embedded for the molecule.
    """
    mol_h = Chem.AddHs(m)

    params = AllChem.ETKDGv2()
    if seed is not None:
        # The embedding is stochastic; without a seed the 3D descriptors
        # differ on every run of the notebook.
        params.randomSeed = seed

    # EmbedMolecule returns the new conformer id, or -1 when embedding failed.
    if AllChem.EmbedMolecule(mol_h, params) == -1:
        raise ValueError(
            "3D conformer embedding failed for " + Chem.MolToSmiles(m)
        )

    return rdMolDescriptors.CalcAUTOCORR3D(mol_h) + rdMolDescriptors.CalcMORSE(mol_h) + \
        rdMolDescriptors.CalcRDF(mol_h) + rdMolDescriptors.CalcWHIM(mol_h)


# One row of 3D descriptors per molecule.
dragon_df = pd.DataFrame([calc_3d_desc(mol) for mol in mol_list])

# The descriptor vector is unlabeled, so columns are numbered positionally.
dragon_df.columns = [f"desc_dragon{idx}" for idx in range(dragon_df.shape[1])]
dragon_df["SMILES"] = smiles_list
dragon_df

Unnamed: 0,desc_dragon0,desc_dragon1,desc_dragon2,desc_dragon3,desc_dragon4,desc_dragon5,desc_dragon6,desc_dragon7,desc_dragon8,desc_dragon9,...,desc_dragon619,desc_dragon620,desc_dragon621,desc_dragon622,desc_dragon623,desc_dragon624,desc_dragon625,desc_dragon626,desc_dragon627,SMILES
0,0.122,0.396,0.668,0.671,0.690,0.551,0.333,0.000,0.000,0.0,...,0.629,0.476,16.005,7.343,10.376,15.752,11.813,16.775,14.224,CCCCCC
1,0.106,0.348,0.606,0.632,0.620,0.556,0.408,0.226,0.000,0.0,...,0.558,0.460,21.759,10.676,14.563,21.436,16.401,22.744,19.561,CCCCCCC
2,0.094,0.310,0.552,0.592,0.561,0.537,0.449,0.313,0.163,0.0,...,0.563,0.477,26.072,13.293,17.782,25.700,19.900,27.207,23.546,CCCCCCCC
3,0.146,0.503,0.959,0.879,0.334,0.000,0.000,0.000,0.000,0.0,...,0.541,0.428,11.502,4.591,6.823,11.277,7.951,12.198,9.965,C1CCCCC1
4,0.224,0.606,0.962,0.737,0.224,0.000,0.000,0.000,0.000,0.0,...,0.496,0.281,7.951,3.458,4.684,7.765,5.372,8.547,5.993,c1ccccc1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,0.190,0.570,0.884,0.691,0.298,0.000,0.000,0.000,0.000,0.0,...,0.500,0.597,8.240,6.214,6.197,8.497,6.110,8.587,9.899,CCC[N+](=O)[O-]
158,0.269,0.758,0.970,0.324,0.000,0.000,0.000,0.000,0.000,0.0,...,0.492,0.556,5.091,3.690,3.808,5.205,3.908,5.460,6.024,CCC#N
159,0.203,0.609,0.919,0.656,0.243,0.000,0.000,0.000,0.000,0.0,...,0.533,0.574,7.409,5.308,5.581,7.537,5.777,7.889,8.926,CCCC#N
160,0.204,0.609,0.934,0.815,0.000,0.000,0.000,0.000,0.000,0.0,...,0.424,0.550,7.827,5.327,5.747,7.975,5.979,8.378,9.122,CC(C)C#N


In [5]:
#fingerprint
from rdkit.Avalon.pyAvalonTools import GetAvalonFP

def calc_fp(m):
    """Return the Avalon fingerprint of ``m`` as a list of 0.0/1.0 floats.

    The fingerprint is generated with ``GetAvalonFP`` default settings and
    serialised to a bit string; each character becomes one float.
    """
    bit_string = GetAvalonFP(m).ToBitString()
    return list(map(float, bit_string))
# Fingerprint every molecule; each inner list is one row of bits.
fp_list = [calc_fp(mol) for mol in mol_list]

fp_df = pd.DataFrame(fp_list)
# Columns are named after the bit position within the fingerprint.
fp_df.columns = [f"fp_avalon{bit}" for bit in range(fp_df.shape[1])]
fp_df["SMILES"] = smiles_list
fp_df

Unnamed: 0,fp_avalon0,fp_avalon1,fp_avalon2,fp_avalon3,fp_avalon4,fp_avalon5,fp_avalon6,fp_avalon7,fp_avalon8,fp_avalon9,...,fp_avalon503,fp_avalon504,fp_avalon505,fp_avalon506,fp_avalon507,fp_avalon508,fp_avalon509,fp_avalon510,fp_avalon511,SMILES
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCCCCC
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCCCCCC
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCCCCCCC
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,C1CCCCC1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,c1ccccc1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,CCC[N+](=O)[O-]
158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCC#N
159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCCC#N
160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CC(C)C#N
