In [43]:
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors

In [44]:
input_file = '../TRPM8-bootcamp-project/1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv'
df = pd.read_csv(input_file)
print(df.head())

  Molecule ChEMBL ID  Molecular Weight  #RO5 Violations  AlogP Compound Key  \
0      CHEMBL3235962            421.42              1.0   5.76           22   
1      CHEMBL3235983            434.36              1.0   5.45           44   
2      CHEMBL1650511            467.41              1.0   7.09            5   
3      CHEMBL2443068            438.83              1.0   5.39           9b   
4      CHEMBL3959823            358.44              0.0   3.86           9n   

                                              Smiles Standard Type  \
0  N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc(C(F)(F...          IC50   
1  C[C@H](NC(=O)N1CCc2ccccc2[C@H]1c1ccc(C(F)(F)F)...          IC50   
2  FC(F)(F)c1ccccc1-c1cc(C(F)(F)F)c2[nH]c(C3=NOC4...          IC50   
3  O=C1CC2(CCN(C(=O)Nc3ccc(C(F)(F)F)cc3)CC2)Oc2c(...          IC50   
4  Cc1cccc(CN(C(=O)c2ccccc2)[C@@H](C(N)=O)c2ccccc...          IC50   

   Standard Value Standard Units  pChEMBL Value  ...  Action Type  \
0          83.000             nM   

In [46]:
column_names = df.columns
smiles_column = df["Smiles"]
test = smiles_column[0]
column_names

Index(['Molecule ChEMBL ID', 'Molecular Weight', '#RO5 Violations', 'AlogP',
       'Compound Key', 'Smiles', 'Standard Type', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment',
       'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Action Type',
       'InChI', 'Potency', 'Molecular Weight_standardized',
       'AlogP_standardized', 'Standard Value_standardized',
       'pChEMBL Value_standardized', 'Ligand Efficiency LE_standardized',
       'Ligand Efficiency LLE_standardized',
       'Ligand Efficiency SEI_standardized'],
      dtype='object')

In [48]:
def calculate_topological_descriptors(smiles):
    m = Chem.MolFromSmiles(smiles)
    dict = {}
    #Trying to use a dictionary
    
    dict["BalabanJ"] = Descriptors.BalabanJ(m)
    dict["TPSA"] = Descriptors.TPSA(m)
    dict["Ipc"] = Descriptors.Ipc(m)
    dict["HallKierAlpha"] = Descriptors.HallKierAlpha(m)
    dict["Kappa1"] = Descriptors.Kappa1(m)
    dict["Kappa2"] = Descriptors.Kappa2(m)
    dict["Kappa3"] = Descriptors.Kappa3(m)
    dict["Chi0"] = Descriptors.Chi0(m)
    dict["Chi1"] = Descriptors.Chi1(m)
    dict["Chi0n"] = Descriptors.Chi0n(m)
    dict["Chi1n"] = Descriptors.Chi1n(m)
    dict["Chi2n"] = Descriptors.Chi2n(m)
    dict["Chi3n"] = Descriptors.Chi3n(m)
    dict["Chi4n"] = Descriptors.Chi4n(m)
    dict["Chi0v"] = Descriptors.Chi0v(m)
    dict["Chi1v"] = Descriptors.Chi1v(m)
    dict["Chi2v"] = Descriptors.Chi2v(m)
    dict["Chi3v"] = Descriptors.Chi3v(m)
    dict["Chi4v"] = Descriptors.Chi4v(m)
    dict["PEOE_VSA1"] = Descriptors.PEOE_VSA1(m)
    dict["PEOE_VSA2"] = Descriptors.PEOE_VSA2(m)
    dict["PEOE_VSA3"] = Descriptors.PEOE_VSA3(m)
    dict["PEOE_VSA4"] = Descriptors.PEOE_VSA4(m)
    dict["PEOE_VSA5"] = Descriptors.PEOE_VSA5(m)
    dict["PEOE_VSA6"] = Descriptors.PEOE_VSA6(m)
    dict["PEOE_VSA7"] = Descriptors.PEOE_VSA7(m)
    dict["PEOE_VSA8"] = Descriptors.PEOE_VSA8(m)
    dict["PEOE_VSA9"] = Descriptors.PEOE_VSA9(m)
    dict["PEOE_VSA10"] = Descriptors.PEOE_VSA10(m)
    dict["PEOE_VSA11"] = Descriptors.PEOE_VSA11(m)
    dict["PEOE_VSA12"] = Descriptors.PEOE_VSA12(m)
    dict["PEOE_VSA13"] = Descriptors.PEOE_VSA13(m)
    dict["PEOE_VSA14"] = Descriptors.PEOE_VSA14(m)
    
    return dict

smiles_list = df["Smiles"].tolist()
topological_dict = [calculate_topological_descriptors(x) for x in smiles_list]
pd.DataFrame.from_dict(topological_dict)



Unnamed: 0,BalabanJ,TPSA,Ipc,HallKierAlpha,Kappa1,Kappa2,Kappa3,Chi0,Chi1,Chi0n,...,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14
0,1.690515,56.13,1.039014e+07,-3.79,20.481470,8.149739,4.237594,22.009861,14.863523,16.356336,...,0.000000,42.464569,53.443011,12.232143,23.237965,0.000000,0.000000,0.000000,0.00000,12.207413
1,2.023307,32.34,2.699429e+06,-2.78,21.917604,7.806619,4.364645,22.267220,13.891884,15.611579,...,0.000000,30.331835,42.167647,6.544756,11.605292,11.859062,0.000000,0.000000,0.00000,18.383712
2,1.612535,50.27,3.098475e+07,-3.04,21.787307,7.495135,3.954292,23.432511,15.557210,17.277205,...,0.000000,29.775636,55.009291,6.420822,22.160304,11.312736,5.824404,0.000000,0.00000,12.352597
3,1.392979,58.64,4.780479e+06,-2.74,20.529456,7.501947,3.961512,21.518297,14.167252,15.760802,...,0.000000,17.667307,36.398202,31.618542,22.570358,11.350563,5.783245,0.000000,0.00000,12.207413
4,2.046188,63.40,1.415232e+06,-3.40,18.392847,8.319859,4.416427,19.225404,13.058551,15.208421,...,0.000000,78.359856,30.183374,12.108208,0.000000,6.041841,0.000000,5.907180,5.90718,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,1.551374,57.26,7.802733e+05,-2.29,18.499572,7.527031,4.456538,18.733840,12.325518,13.554656,...,0.000000,12.132734,46.616234,29.800917,11.250838,5.817863,0.000000,0.000000,0.00000,12.207413
650,2.335188,59.42,1.494366e+04,-1.59,12.856309,5.370257,2.626550,13.120956,8.613392,9.513644,...,11.336786,23.733674,19.056471,10.586085,6.606882,5.007624,4.877147,5.879988,0.00000,5.969305
651,2.020584,102.50,4.517308e+04,-2.42,11.658106,4.687837,2.147235,13.242276,9.275188,9.925115,...,21.534149,0.000000,12.137122,12.393687,12.741600,15.578699,0.000000,11.704393,0.00000,0.000000
652,2.332605,50.19,2.005623e+04,-1.75,13.676346,5.484467,3.172661,13.991199,8.969234,10.682746,...,11.336786,25.980209,18.050640,11.984273,0.000000,15.701992,5.783245,5.879988,0.00000,0.000000


In [27]:
def calculate_balabanj(smiles):
    m = Chem.MolFromSmiles(smiles)
    return Descriptors.BalabanJ(m)

smiles_list = df["Smiles"].tolist()
balabanj_column = [calculate_balabanj(x) for x in smiles_list]
balabanj_column

[1.6905150975450596,
 2.0233069412096425,
 1.6125353878401474,
 1.392979393447132,
 2.0461878689391466,
 2.15770453831901,
 1.4539291429331016,
 1.467129963154029,
 1.39014536550327,
 1.59609170008441,
 1.591346714550777,
 1.591346714550777,
 1.5971632377073288,
 1.5788006659029055,
 1.619493819673559,
 1.8085024044233937,
 1.7907614407296388,
 1.566204224670086,
 1.586829504564997,
 1.6158845462044182,
 1.659760156751071,
 1.8305026049166073,
 1.8296687951888386,
 1.7923320979674997,
 1.6973253798956067,
 2.576000355142546,
 2.1458635151056513,
 1.5938292095468638,
 1.5936597409515423,
 1.6352480035238133,
 2.140253910412202,
 2.1458635151056513,
 2.437159260034696,
 1.5701325260942531,
 2.1645382342586394,
 2.1093207363941167,
 2.0969223460866084,
 2.0152642029082966,
 1.7071023321182226,
 1.7941582472190978,
 1.8223205641589661,
 1.5402589622370515,
 1.7626275290697762,
 1.6150255918233778,
 1.7683752502242274,
 1.7540063767544296,
 1.8063642118728755,
 1.6073251949816119,
 1.962962