In [14]:
import pandas as pd
import numpy as np
import rdkit
import padelpy
import PaDEL_pywrapper
from PaDEL_pywrapper import PaDEL
from PaDEL_pywrapper.descriptor import WienerNumbers, ZagrebIndex, Topological, TopologicalCharge, VAdjMa, PetitjeanNumber
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler
from padelpy import from_smiles

In [15]:
input_file = '../1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv'
df = pd.read_csv(input_file)
print(df.head())

  Molecule ChEMBL ID  Molecular Weight  #RO5 Violations  AlogP Compound Key  \
0      CHEMBL3235962            421.42              1.0   5.76           22   
1      CHEMBL3235983            434.36              1.0   5.45           44   
2      CHEMBL1650511            467.41              1.0   7.09            5   
3      CHEMBL2443068            438.83              1.0   5.39           9b   
4      CHEMBL3959823            358.44              0.0   3.86           9n   

                                              Smiles Standard Type  \
0  N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc(C(F)(F...          IC50   
1  C[C@H](NC(=O)N1CCc2ccccc2[C@H]1c1ccc(C(F)(F)F)...          IC50   
2  FC(F)(F)c1ccccc1-c1cc(C(F)(F)F)c2[nH]c(C3=NOC4...          IC50   
3  O=C1CC2(CCN(C(=O)Nc3ccc(C(F)(F)F)cc3)CC2)Oc2c(...          IC50   
4  Cc1cccc(CN(C(=O)c2ccccc2)[C@@H](C(N)=O)c2ccccc...          IC50   

   Standard Value Standard Units  pChEMBL Value  ...  Action Type  \
0          83.000             nM   

In [16]:
column_names = df.columns
column_names

Index(['Molecule ChEMBL ID', 'Molecular Weight', '#RO5 Violations', 'AlogP',
       'Compound Key', 'Smiles', 'Standard Type', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment',
       'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Action Type',
       'InChI', 'Potency', 'Molecular Weight_standardized',
       'AlogP_standardized', 'Standard Value_standardized',
       'pChEMBL Value_standardized', 'Ligand Efficiency LE_standardized',
       'Ligand Efficiency LLE_standardized',
       'Ligand Efficiency SEI_standardized'],
      dtype='object')

In [17]:
smiles_list = df["Smiles"].tolist()
smiles_list_chem = [Chem.MolFromSmiles(x) for x in smiles_list]


descriptors = [WienerNumbers, ZagrebIndex, Topological, TopologicalCharge, VAdjMa, PetitjeanNumber]
padel = PaDEL(descriptors)
padel_dict = padel.calculate(smiles_list_chem)
padel_df = pd.DataFrame.from_dict(padel_dict)

PaDEL-Descriptor is a software for calculating molecular
descriptors and fingerprints. The software calculates
1875 descriptors (1444 1D and 2D descriptors, and 431
3D descriptors) and 12 types of fingerprints.

###################################

Should you publish results based on the PaDEL descriptors,
please cite:

Yap, C.W. (2011), PaDEL-descriptor: An open source software
to calculate molecular descriptors and fingerprints.
J. Comput. Chem., 32: 1466-1474. https://doi.org/10.1002/jcc.21707

###################################





In [18]:
def calculate_topological_descriptors(smiles):
    m = Chem.MolFromSmiles(smiles)
    dict = {}
    #Trying to use a dictionary
    
    dict["Molecule ChEMBL ID"] = df["Molecule ChEMBL ID"]
    dict["Standard Value"] = df["Standard Value"]
    dict["Smiles"] = df["Smiles"]
    dict["BalabanJ"] = Descriptors.BalabanJ(m)
    dict["TPSA"] = Descriptors.TPSA(m)
    dict["Ipc"] = Descriptors.Ipc(m)
    dict["HallKierAlpha"] = Descriptors.HallKierAlpha(m)
    dict["Kappa1"] = Descriptors.Kappa1(m)
    dict["Kappa2"] = Descriptors.Kappa2(m)
    dict["Kappa3"] = Descriptors.Kappa3(m)
    dict["Chi0"] = Descriptors.Chi0(m)
    dict["Chi1"] = Descriptors.Chi1(m)
    dict["Chi0n"] = Descriptors.Chi0n(m)
    dict["Chi1n"] = Descriptors.Chi1n(m)
    dict["Chi2n"] = Descriptors.Chi2n(m)
    dict["Chi3n"] = Descriptors.Chi3n(m)
    dict["Chi4n"] = Descriptors.Chi4n(m)
    dict["Chi0v"] = Descriptors.Chi0v(m)
    dict["Chi1v"] = Descriptors.Chi1v(m)
    dict["Chi2v"] = Descriptors.Chi2v(m)
    dict["Chi3v"] = Descriptors.Chi3v(m)
    dict["Chi4v"] = Descriptors.Chi4v(m)
    dict["PEOE_VSA1"] = Descriptors.PEOE_VSA1(m)
    dict["PEOE_VSA2"] = Descriptors.PEOE_VSA2(m)
    dict["PEOE_VSA3"] = Descriptors.PEOE_VSA3(m)
    dict["PEOE_VSA4"] = Descriptors.PEOE_VSA4(m)
    dict["PEOE_VSA5"] = Descriptors.PEOE_VSA5(m)
    dict["PEOE_VSA6"] = Descriptors.PEOE_VSA6(m)
    dict["PEOE_VSA7"] = Descriptors.PEOE_VSA7(m)
    dict["PEOE_VSA8"] = Descriptors.PEOE_VSA8(m)
    dict["PEOE_VSA9"] = Descriptors.PEOE_VSA9(m)
    dict["PEOE_VSA10"] = Descriptors.PEOE_VSA10(m)
    dict["PEOE_VSA11"] = Descriptors.PEOE_VSA11(m)
    dict["PEOE_VSA12"] = Descriptors.PEOE_VSA12(m)
    dict["PEOE_VSA13"] = Descriptors.PEOE_VSA13(m)
    dict["PEOE_VSA14"] = Descriptors.PEOE_VSA14(m)
    
    return dict

smiles_list = df["Smiles"].tolist()
rdk_topological_dict = [calculate_topological_descriptors(x) for x in smiles_list]
rdk_df = pd.DataFrame.from_dict(rdk_topological_dict)
topological_df = pd.concat([rdk_df, padel_df], axis = 1)
topological_df


Unnamed: 0,Molecule ChEMBL ID,Standard Value,Smiles,BalabanJ,TPSA,Ipc,HallKierAlpha,Kappa1,Kappa2,Kappa3,...,JGI6,JGI7,JGI8,JGI9,JGI10,JGT,VAdjMat,WPATH,WPOL,Zagreb
0,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,1.690515,56.13,1.039014e+07,-3.79,20.481470,8.149739,4.237594,...,0.017088,0.016634,0.011785,0.008406,0.008588,0.482208,5.954196,2680.0,52.0,166.0
1,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,2.023307,32.34,2.699429e+06,-2.78,21.917604,7.806619,4.364645,...,0.023407,0.022724,0.018507,0.011464,0.012370,0.609510,5.906891,2349.0,52.0,162.0
2,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,1.612535,50.27,3.098475e+07,-3.04,21.787307,7.495135,3.954292,...,0.023434,0.019966,0.012460,0.010282,0.007505,0.579610,6.044394,3076.0,57.0,190.0
3,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,1.392979,58.64,4.780479e+06,-2.74,20.529456,7.501947,3.961512,...,0.012577,0.021862,0.014073,0.007778,0.009110,0.608409,5.906891,2757.0,52.0,166.0
4,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,2.046188,63.40,1.415232e+06,-3.40,18.392847,8.319859,4.416427,...,0.014694,0.008605,0.008151,0.006562,0.004938,0.378021,5.754888,1730.0,41.0,136.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,1.551374,57.26,7.802733e+05,-2.29,18.499572,7.527031,4.456538,...,0.023059,0.010995,0.013450,0.010074,0.007778,0.544606,5.700440,1961.0,37.0,136.0
650,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,2.335188,59.42,1.494366e+04,-1.59,12.856309,5.370257,2.626550,...,0.017755,0.017284,0.012818,0.009125,0.014437,0.511274,5.169925,637.0,25.0,90.0
651,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,2.020584,102.50,4.517308e+04,-2.42,11.658106,4.687837,2.147235,...,0.013835,0.010978,0.008736,0.005000,0.006870,0.408149,5.247928,725.0,26.0,100.0
652,0 CHEMBL3235962 1 CHEMBL3235983 2 ...,0 83.000 1 10.000 2 0.41...,0 N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc...,2.332605,50.19,2.005623e+04,-1.75,13.676346,5.484467,3.172661,...,0.020375,0.018229,0.010254,0.014063,0.006173,0.535801,5.247928,731.0,26.0,96.0


In [21]:
topological_df.to_csv('topological-descriptors.csv',index=False)

In [22]:
#Standardize values
topological_df_scaled = topological_df.copy()
scalar_columns = [col for col in topological_df_scaled.columns if 'Molecule ChEMBL ID' not in col 
                                                             and 'Standard Value' not in col
                                                             and 'Smile' not in col]

# Apply StandardScaler to scalar descriptors
scaler = StandardScaler()
topological_df_scaled[scalar_columns] = scaler.fit_transform(topological_df_scaled[scalar_columns])

# Output the scaled dataframe
topological_df_scaled.head()
output_file = 'topological-descriptors-standardized.csv'
topological_df_scaled.to_csv(output_file, index=False)