In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, rdmolops, AllChem, DataStructs, Draw
import pandas as pd
import os
import subprocess

In [17]:
def readCSV(csv_file):
    '''
    Function which reads data from
    csv file and return pandas dataframe
    with ChEMBL ID, Smiles, Name and
    #RO5 Violations
    '''
    df = pd.read_csv(csv_file)
    return df

def count_descriptors(s):
    '''
    Function which calculates descriptors for
    each particle. It takes smiles and returns
    dictionary with molecule as key and descriptors
    as values
    '''
    
    smiles = Chem.MolFromSmiles(s)
    smiles = Chem.AddHs(smiles)
    dictionary = {}
    dictionary["mol_wt"] = Chem.Descriptors.MolWt(smiles)#"{:.2f}".format(Chem.Descriptors.MolWt(smiles))
    dictionary["log_p"] = Descriptors.MolLogP(smiles)#"{:.2f}".format(Descriptors.MolLogP(smiles))
    dictionary["HBD"] = rdMolDescriptors.CalcNumHBD(smiles)
    dictionary["HBA"] = rdMolDescriptors.CalcNumHBA(smiles)
    
    return dictionary

def create_table_descriptors(csv_file):
    '''
    Function which creates table which sums descriptors
    As an input it takes file with smiles and returns
    pandas dataframe with CHEMBL Id, smiles and descriptors
    '''
    
    results = readCSV(csv_file)

    smiles_tmp = list(results["SMILES"])
    pubchem_tmp = list(results["CID"])
    prob_tmp = list(results["probability"])

    mol_wt = []
    log_p = []
    HBD = []
    HBA = []
    smiles = []
    pubchem = []
    prob = []

    for smile in smiles_tmp:
        counter = count_descriptors(smile)
        if counter["mol_wt"] < 500 and counter["log_p"] < 5 and counter["HBD"] < 5 and counter["HBA"] < 10:
            mol_wt.append(counter["mol_wt"])
            log_p.append(counter["log_p"])
            HBD.append(counter["HBD"])
            HBA.append(counter["HBA"])
            smiles.append(smile)
            idx = smiles_tmp.index(smile)
            pubchem.append(pubchem_tmp[idx])
            prob.append(prob_tmp[idx])

    data = {}
    data["Smiles"] = smiles
    data["Mol wt"] = mol_wt
    data["Log_p"] = log_p
    data["HBD"] = HBD
    data["HBA"] = HBA
    data["Pubchem ID"] = pubchem
    data["Probability"] = prob
    
    df = pd.DataFrame(data)
    
    return df

In [18]:
results = create_table_descriptors("./sorted-results-filtered-cid.csv")

print(results.head())

                                              Smiles   Mol wt    Log_p  HBD  \
0  CC#CCOC1=CC=C(C=C1)S(=O)(=O)N2CCCN(CC2C(=O)NO)...  471.535  1.49950    2   
1  CC#CCOC1=CC=C(C=C1)S(=O)(=O)N2CCCN(CC2C(=O)NO)...  457.552  1.85930    2   
2  CC#CCOC1=CC=C(C=C1)S(=O)(=O)N2CC3C(C(C2C(=O)NO...  454.501  0.50210    2   
3  CC(C)CC1C(OCCCCN2C=C(CC(NC1=O)C(=O)NC)C3=CC=CC...  458.559  1.76130    4   
4  CC(C)C(CC1=CC(=C(C=C1)OC)OCCCOC)CC(C(CC(C)C(=O...  491.673  3.44968    3   

   HBA  Pubchem ID  Probability  
0    6    10183755     0.999986  
1    6    44390323     0.999986  
2    8    44286663     0.999984  
3    6    11059553     0.999983  
4    7    10345514     0.999983  


In [19]:
print(len(results))

333736


In [20]:
results.to_csv("./results-lipinski.csv")