In [35]:
import sqlite3
import pandas as pd
import pickle
import timeit
from bioservices import ChEMBL
#BioServices Documentation
#http://bioservices.readthedocs.io/en/master/references.html#module-bioservices.chembl

In [36]:
#Download the ChEMBL SQLite release via the ChEMBL FTP site and extract it first.
#The path must be the plain absolute location of the .db file (no quote characters inside the string),
#otherwise SQLite resolves it as a relative path and cannot open the database.
conn = sqlite3.connect("/ihome/gidakwo/ml_files/SMILES/chembl_23/chembl_23_sqlite/chembl_23.db")
cur = conn.cursor()

#Fetch every row of the assays table and report how many there are.
#conn is left open on purpose: the following cell reads from it again.
cur.execute('''SELECT * FROM assays;''')
all_assays = cur.fetchall()
print(len(all_assays))

1302147


In [37]:
#Pull the complete assays table into pandas through the open SQLite connection
df = pd.read_sql_query("SELECT * FROM assays;", conn)
print('Shape of Entire Assays Table: ', df.shape)

#Keep only the rows whose assay_type equals 'B' (the binding assays)
df_B = df.loc[df['assay_type'].eq('B')]
print('Shape of Assay Subset of Type Binding (B): ', df_B.shape)

#The assays table carries no per-assay count of compounds/activities, which the ranking needs.
#BioServices is therefore queried by ChEMBL ID to obtain those details.

Shape of Arrays Table:  (1302147, 23)
Shape of AssyType B:  (272421, 23)


In [38]:
#Client for the ChEMBL web services, with verbose output switched off
s = ChEMBL(verbose=False)

#Look up the assay details for the ChEMBL ID of every binding assay
dictAssay_B = s.get_assays_by_chemblId(df_B['chembl_id'])

#Save the raw response to disk as a pickle file
with open("dictAssay_B.pkl", "wb") as f:
    pickle.dump(dictAssay_B, f)

#Unwrap the 'assay' entry of each returned record into a flat list
listAssay_B = [record['assay'] for record in dictAssay_B]

751.6798039558344


In [51]:
#Rank the assays by number of bioactivities (largest first), restrict them to
#'Homo sapiens' and keep the first 20 rows of that ranking
B_numB = (
    pd.DataFrame(listAssay_B, dtype=object)
    .fillna('')
    .sort_values('numBioactivities', ascending=False)
    .loc[lambda ranked: ranked['assayOrganism'] == 'Homo sapiens']
    .iloc[:20, :]
)

In [52]:
#Display the shortlisted assays: at most 20 'Homo sapiens' rows, ordered by numBioactivities (descending)
B_numB

Unnamed: 0,assayDescription,assayOrganism,assayStrain,assayType,chemblId,journal,numBioactivities
140672,PUBCHEM_BIOASSAY: qHTS for Inhibitors of Tau F...,Homo sapiens,Unspecified,B,CHEMBL1614421,,49809
140669,PUBCHEM_BIOASSAY: qHTS Assay for Inhibitors of...,Homo sapiens,Unspecified,B,CHEMBL1614364,,12273
147222,PUBCHEM_BIOASSAY: qHTS Assay for Compounds Blo...,Homo sapiens,Unspecified,B,CHEMBL1613933,,7272
157937,DRUGMATRIX: Thromboxane Synthetase enzyme inhi...,Homo sapiens,Unspecified,B,CHEMBL1909116,,1742
157944,DRUGMATRIX: Carbonic Anhydrase II enzyme inhib...,Homo sapiens,Unspecified,B,CHEMBL1909123,,1742
157951,DRUGMATRIX: Cyclooxygenase COX-1 enzyme inhibi...,Homo sapiens,Unspecified,B,CHEMBL1909130,,1742
157876,DRUGMATRIX: Platelet Activating Factor (PAF) r...,Homo sapiens,Unspecified,B,CHEMBL1909187,,1742
157875,DRUGMATRIX: Phosphodiesterase PDE5 enzyme inhi...,Homo sapiens,Unspecified,B,CHEMBL1909186,,1742
157873,DRUGMATRIX: Phosphodiesterase PDE3 enzyme inhi...,Homo sapiens,Unspecified,B,CHEMBL1909184,,1742
157883,"DRUGMATRIX: Protease, Cathepsin G enzyme inhib...",Homo sapiens,Unspecified,B,CHEMBL1909194,,1742


In [None]:
from rdkit import Chem
import molvs as mv

#Function to get parent of a smiles
#Source: https://www.wildcardconsulting.dk/useful-information/a-deep-tox21-neural-network-with-rdkit-and-keras/
def parent(smiles):
    """Return the SMILES of the charge parent of ``smiles``.

    The input is parsed with RDKit, run through the MolVS
    ``Standardizer.charge_parent`` step and written back out as SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string to standardize.

    Returns
    -------
    str
        SMILES of the charge parent, or the sentinel string "NaN" when the
        input cannot be parsed or standardized. clean_data filters rows on
        exactly this sentinel, so it is kept as a string.
    """
    st = mv.Standardizer() #MolVS standardizer
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            #Nothing to standardize when parsing produced no molecule
            return "NaN"
        return Chem.MolToSmiles(st.charge_parent(mol))
    except Exception:
        #Catch ordinary failures only: KeyboardInterrupt/SystemExit must still propagate,
        #so that a long-running .apply(parent) can be interrupted
        return "NaN"

#Clean and standardize the data
def clean_data(data):
    """Drop unusable rows and add standardized-structure columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'smiles' column. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``data`` that have a non-null
        'smiles', a 'smiles_parent' different from the "NaN" sentinel returned
        by ``parent``, and more than 3 atoms. Two columns are added:
        'smiles_parent' (output of ``parent``) and 'NumAtoms' (atom count of
        the parent structure as reported by RDKit).
    """
    #remove missing smiles; copy so the new columns are written to an
    #independent frame instead of a slice of the caller's DataFrame
    data = data[data['smiles'].notnull()].copy()

    #Standardize and get parent with molvs
    data["smiles_parent"] = data["smiles"].apply(parent)
    data = data[data['smiles_parent'] != "NaN"].copy()

    #Filter small fragments away
    def NumAtoms(smile):
        return Chem.MolFromSmiles(smile).GetNumAtoms()

    data["NumAtoms"] = data["smiles_parent"].apply(NumAtoms)
    return data[data["NumAtoms"] > 3].copy()

In [91]:
mergedChEMBLDF = pd.DataFrame()

#Process each shortlisted assay in turn, identified by its ChEMBL ID
for i in B_numB['chemblId']:
    #Bioactivity records of this assay (BioServices returns nested, JSON-like dictionaries)
    b = s.get_assays_bioactivities(i)['bioactivities']
    df__ = pd.DataFrame(b) #Holds the ChEMBL IDs of the assayed compounds, but no SMILES

    #Fetch SMILES and InChIKey by looking every compound up by its ChEMBL ID
    c = s.get_compounds_by_chemblId(df__['ingredient_cmpd_chemblid'])
    df_c = pd.DataFrame([entry['compound'] for entry in c]) #Unwrap the nested 'compound' dictionaries

    #Put bioactivity and compound columns side by side, then keep only the relevant ones
    assay_data = pd.concat([df__, df_c], axis=1)
    assay_data = assay_data[['chemblId','stdInChiKey', 'value', 'operator', 'smiles']]

    #Standardize to the charge parent, filter, and write one CSV file per assay
    assay_data = clean_data(assay_data)
    assay_data.to_csv('/ihome/gidakwo/ml_files/SMILES/' + i + '.csv', index=False)

In [42]:
#https://github.com/deepchem/deepchem/tree/master/deepchem/molnet/load_function

In [94]:
#List the column names of assay_data as left behind by the last iteration of the download loop
#NOTE(review): the saved output lists the full merged bioactivity + compound column set and lacks
#'smiles_parent'/'NumAtoms', so it appears to predate the column selection and clean_data step — re-run to confirm
assay_data.columns

Index(['activity_comment', 'assay_chemblid', 'assay_description', 'assay_type',
       'bioactivity_type', 'ingredient_cmpd_chemblid', 'name_in_reference',
       'operator', 'organism', 'parent_cmpd_chemblid', 'reference',
       'target_chemblid', 'target_confidence', 'target_name', 'units', 'value',
       'acdAcidicPka', 'acdBasicPka', 'acdLogd', 'acdLogp', 'alogp',
       'chemblId', 'knownDrug', 'molecularFormula', 'molecularWeight',
       'numRo5Violations', 'passesRuleOfThree', 'rotatableBonds', 'smiles',
       'species', 'stdInChiKey'],
      dtype='object')

In [97]:
import molvs as mv