##### Importing the required libraries

In [44]:
import pandas as pd

###### Loading the bioactivity_preprocessed data

In [45]:
# Read the preprocessed bioactivity table (path is relative to the working directory) and display it.
df = pd.read_csv("bioactivity_preprocessed_data3.csv")
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,inactive
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0,inactive
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0,inactive
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0,inactive
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0,inactive
...,...,...,...,...
991,CHEMBL1401639,COc1cc(C2C(C#N)=C(N)OC3=C2C(=O)CC(C)(C)C3)c([N...,100000.0,inactive
992,CHEMBL1448985,O=C(NCCc1ccccc1)c1ccc2nc(-c3ccco3)c(-c3ccco3)n...,59120.0,inactive
993,CHEMBL1519374,O=C1C(Cl)=C(Nc2ccc3c[nH]nc3c2)C(=O)c2ccccc21,18650.0,inactive
994,CHEMBL1540372,CC1(C)CC(=O)C(=Cc2cn(CCC#N)nc2-c2ccc(Cl)cc2)C(...,31810.0,inactive


#### Calculate Lipinski descriptors

###### Import libraries

In [46]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors,Lipinski

#### Calculate descriptors

In [60]:
def lipinski(smiles, verbose=False):
    """Compute the four Lipinski rule-of-five descriptors for each SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, e.g. a pandas Series or a plain list.
    verbose : bool, default False
        When True, print a message for every entry that could not be parsed.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with a fresh 0..n-1 index and
        float columns "MW", "LogP", "NumHDonors", "NumHAcceptors". Entries that
        are not strings, or that RDKit cannot parse, produce a row of NaN so the
        result stays positionally aligned with the input. Empty input yields an
        empty frame that still carries the four columns.
    """
    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    rows = []
    for position, elem in enumerate(smiles):
        # Non-string entries (e.g. NaN from a blank CSV cell) are not valid
        # MolFromSmiles input, and MolFromSmiles itself returns None when the
        # string cannot be parsed; both cases are treated as "no molecule".
        mol = Chem.MolFromSmiles(elem) if isinstance(elem, str) else None

        if mol is None:
            if verbose:
                print(f"lipinski: could not parse SMILES at position {position}: {elem!r}")
            # Keep a placeholder row so row i still corresponds to input i.
            rows.append([np.nan] * len(columnNames))
            continue

        rows.append([
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHAcceptors(mol),
        ])

    # Build the array once (no repeated vstack) and force an (n, 4) shape so
    # that zero or one input rows still match the four column names.
    baseData = np.asarray(rows, dtype=float).reshape(len(rows), len(columnNames))
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)
    return descriptors
    
   

In [61]:
# Compute the descriptor table from the SMILES column of the loaded frame.
df_lipinski = lipinski(smiles=df["canonical_smiles"])

In [62]:
# Display the descriptor table returned by lipinski() (columns MW, LogP, NumHDonors, NumHAcceptors).
df_lipinski

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,240.262,2.80020,0.0,4.0
1,268.224,2.44040,1.0,4.0
2,407.426,1.38560,1.0,5.0
3,306.277,3.84800,1.0,5.0
4,372.772,2.36698,0.0,8.0
...,...,...,...,...
991,399.403,3.06288,1.0,8.0
992,409.445,5.12230,1.0,5.0
993,323.739,3.50440,2.0,4.0
994,381.863,4.45878,0.0,5.0


In [63]:
# Display the bioactivity frame that was loaded from CSV earlier in the notebook.
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,inactive
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0,inactive
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0,inactive
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0,inactive
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0,inactive
...,...,...,...,...
991,CHEMBL1401639,COc1cc(C2C(C#N)=C(N)OC3=C2C(=O)CC(C)(C)C3)c([N...,100000.0,inactive
992,CHEMBL1448985,O=C(NCCc1ccccc1)c1ccc2nc(-c3ccco3)c(-c3ccco3)n...,59120.0,inactive
993,CHEMBL1519374,O=C1C(Cl)=C(Nc2ccc3c[nH]nc3c2)C(=O)c2ccccc21,18650.0,inactive
994,CHEMBL1540372,CC1(C)CC(=O)C(=Cc2cn(CCC#N)nc2-c2ccc(Cl)cc2)C(...,31810.0,inactive


In [None]:
# Put the descriptor columns to the right of the bioactivity columns.
# concat along columns matches rows by index label, not by position.
df_combined = pd.concat([df, df_lipinski], axis="columns")