In [22]:
import sqlite3
import pandas as pd
import pickle
import timeit
from bioservices import ChEMBL
#BioServices Documentation
#http://bioservices.readthedocs.io/en/master/references.html#module-bioservices.chembl

In [23]:
# Record the wall-clock start time of the run. `start` is read again further down,
# where the elapsed time is computed as `time.time() - start`, so both ends must
# keep using the same clock (time.time()).
import time
from datetime import timedelta
start = time.time()

In [27]:
# Count activity rows per assay in the local ChEMBL SQLite dump, restricted to
# assays whose assay_type is 'B', then shortlist the assays with the most rows.
CHEMBL_DB_PATH = "/ihome/gidakwo/ml_files/SMILES/chembl_23/chembl_23_sqlite/chembl_23.db"
N_TOP_ASSAYS = 20  # how many of the most-populated assays are carried forward

# The literal is single-quoted on purpose: SQLite parses "B" as an identifier first
# and only falls back to a string when no such column exists.
# The WHERE clause on B.* discards unmatched rows, so the LEFT JOIN behaves like an
# inner join here.
query = """SELECT count(*), A.assay_id, B.chembl_id, B.assay_organism, B.description, B.assay_type
    FROM activities AS A
    LEFT JOIN assays AS B ON B.assay_id = A.assay_id
    WHERE B.assay_type = 'B'
    GROUP BY A.assay_id;"""

conn = sqlite3.connect(CHEMBL_DB_PATH)
try:
    # Result columns keep their raw SQL names, hence the "count(*)" sort key.
    Assay_info = pd.read_sql_query(query, conn).sort_values("count(*)", ascending=False)
finally:
    conn.close()  # release the database handle even if the query fails

B_numB = Assay_info.iloc[:N_TOP_ASSAYS, :]

In [28]:
# Display the shortlisted assays.
# NOTE(review): the table rendered below has columns such as assayDescription,
# chemblId and numBioactivities, which the SELECT list in the query cell does not
# produce -- the saved output looks stale; re-run on a fresh kernel to confirm.
B_numB

Unnamed: 0,assayDescription,assayOrganism,assayStrain,assayType,chemblId,journal,numBioactivities
140672,PUBCHEM_BIOASSAY: qHTS for Inhibitors of Tau F...,Homo sapiens,Unspecified,B,CHEMBL1614421,,49809
140669,PUBCHEM_BIOASSAY: qHTS Assay for Inhibitors of...,Homo sapiens,Unspecified,B,CHEMBL1614364,,12273
147222,PUBCHEM_BIOASSAY: qHTS Assay for Compounds Blo...,Homo sapiens,Unspecified,B,CHEMBL1613933,,7272
157937,DRUGMATRIX: Thromboxane Synthetase enzyme inhi...,Homo sapiens,Unspecified,B,CHEMBL1909116,,1742
157944,DRUGMATRIX: Carbonic Anhydrase II enzyme inhib...,Homo sapiens,Unspecified,B,CHEMBL1909123,,1742
157951,DRUGMATRIX: Cyclooxygenase COX-1 enzyme inhibi...,Homo sapiens,Unspecified,B,CHEMBL1909130,,1742
157876,DRUGMATRIX: Platelet Activating Factor (PAF) r...,Homo sapiens,Unspecified,B,CHEMBL1909187,,1742
157875,DRUGMATRIX: Phosphodiesterase PDE5 enzyme inhi...,Homo sapiens,Unspecified,B,CHEMBL1909186,,1742
157873,DRUGMATRIX: Phosphodiesterase PDE3 enzyme inhi...,Homo sapiens,Unspecified,B,CHEMBL1909184,,1742
157883,"DRUGMATRIX: Protease, Cathepsin G enzyme inhib...",Homo sapiens,Unspecified,B,CHEMBL1909194,,1742


In [29]:
from rdkit import Chem
import molvs as mv

#Function to get the charge parent of a SMILES string
#Source: https://www.wildcardconsulting.dk/useful-information/a-deep-tox21-neural-network-with-rdkit-and-keras/
def parent(smiles):
    """Return the SMILES of the charge parent of ``smiles``.

    The input is parsed with ``Chem.MolFromSmiles``, passed through the MolVS
    ``Standardizer.charge_parent`` step and written back with ``Chem.MolToSmiles``.

    Parameters
    ----------
    smiles : str
        SMILES string to standardize.

    Returns
    -------
    str
        SMILES of the charge parent, or the sentinel string ``"NaN"`` when the
        input cannot be parsed or standardized.
    """
    st = mv.Standardizer() #MolVS standardizer
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit reports an unparsable SMILES by returning None rather than raising.
            return "NaN"
        return Chem.MolToSmiles(st.charge_parent(mol))
    except Exception:
        # Catch Exception, not a bare except: a kernel interrupt (KeyboardInterrupt)
        # must still be able to stop a long-running .apply(parent).
        return "NaN"

#Clean and standardize the data
def clean_data(data):
    """Drop unusable rows and add standardized-structure columns.

    Steps, in order:
      1. remove rows whose ``smiles`` is null;
      2. add ``smiles_parent`` by applying ``parent`` to ``smiles`` and remove rows
         where it returned the sentinel string ``"NaN"``;
      3. add ``NumAtoms`` (atom count of ``smiles_parent``) and keep only rows with
         more than 3 atoms.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``smiles`` column. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Filtered rows with the extra ``smiles_parent`` and ``NumAtoms`` columns.
    """
    #remove missing smiles
    # .copy() gives an independent frame, so the column assignment below does not
    # write into a slice of the caller's data (the SettingWithCopyWarning case).
    data = data[data['smiles'].notnull()].copy()

    #Standardize and get parent with molvs
    data["smiles_parent"] = data["smiles"].apply(parent)
    data = data[data['smiles_parent'] != "NaN"].copy()

    #Filter small fragments away
    def NumAtoms(smile):
        return Chem.MolFromSmiles(smile).GetNumAtoms()

    data["NumAtoms"] = data["smiles_parent"].apply(NumAtoms)
    data = data[data["NumAtoms"] > 3]
    return data

In [30]:
# Build one wide bioactivity table: one row per compound (keyed by stdInChiKey) and
# one "<assay>_value" / "<assay>_operator" column pair per shortlisted assay, then
# write it to CSV.
OUTPUT_CSV = '/ihome/gidakwo/ml_files/SMILES/mergedChEMBLDF.csv'
keep_cols = ['chemblId','stdInChiKey', 'value', 'operator', 'smiles']

# The ChEMBL client is imported at the top of the notebook but never instantiated in
# any cell; create it here so this cell runs on a fresh kernel.
s = ChEMBL(verbose=False)

# The SQL shortlist exposes the assay identifier as `chembl_id`; fall back to
# `chemblId` for a shortlist that uses that spelling.
assay_id_col = 'chembl_id' if 'chembl_id' in B_numB.columns else 'chemblId'

per_assay_frames = []
for assay_chembl_id in B_numB[assay_id_col]: #For every assay in the shortlist
    #Nested dictionary (outputs of the bioservices package are usually json format)
    bioactivities = s.get_assays_bioactivities(assay_chembl_id)['bioactivities']
    df__ = pd.DataFrame(bioactivities) #Has the ChEMBL ID of each compound in the assay, no SMILES

    #To get the SMILES and InChiKey for each compound:
    compounds = s.get_compounds_by_chemblId(df__['ingredient_cmpd_chemblid'])
    df_c = pd.DataFrame([entry['compound'] for entry in compounds]) #Another nested dictionary

    # Column-wise concat aligns on the row index.
    # NOTE(review): this assumes the service returns compounds in request order -- confirm.
    assay_data = pd.concat([df__, df_c], axis = 1) #Merge bioactivity and compound dataframes
    assay_data_ = clean_data(assay_data.loc[:, keep_cols]) #Select relevant columns, add charge parent

    assay_id = assay_chembl_id[6:] #Drop the first six characters (the "CHEMBL" prefix in the IDs shown above)
    # Rename by column name instead of assigning a positional list, so a change in
    # column order cannot silently mislabel the data.
    assay_data_ = (assay_data_
                   .drop(['NumAtoms', 'chemblId'], axis=1)
                   .rename(columns={'value': assay_id + '_value',
                                    'operator': assay_id + '_operator'}))
    per_assay_frames.append(assay_data_)

# Concatenate once and collapse duplicates once, instead of re-grouping the growing
# table on every iteration. groupby(...).first() keeps the first non-null entry per
# column for each compound, in order of first appearance.
if per_assay_frames:
    mergedChEMBLDF = (pd.concat(per_assay_frames)
                      .groupby('stdInChiKey', as_index=False, sort=False)
                      .first()
                      .fillna('NA'))
else:
    mergedChEMBLDF = pd.DataFrame() #Empty shortlist: nothing to merge

mergedChEMBLDF.to_csv(OUTPUT_CSV, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [31]:
# Preview the first rows of the merged per-compound table.
mergedChEMBLDF.head()

Unnamed: 0,stdInChiKey,1613933_operator,1613933_value,1614063_operator,1614063_value,1614364_operator,1614364_value,1614421_operator,1614421_value,1794557_operator,...,3705123_operator,3705123_value,3705362_operator,3705362_value,3705488_operator,3705488_value,3706373_operator,3706373_value,smiles,smiles_parent
0,RJYFVJPEXKPGNZ-UHFFFAOYSA-N,,,,,,,=,1412.5,,...,,,,,,,,,COc1ccccc1C2CCN(CC2)C(=Nc3ccc(Br)cc3)S,COc1ccccc1C1CCN(C(S)=Nc2ccc(Br)cc2)CC1
1,GNWGGHKGEXNNMK-UHFFFAOYSA-N,,,,,,,=,17782.8,,...,,,,,,,,,Cc1ccc(O)c(NC(=O)c2onc(c2)c3ccc(F)cc3)c1,Cc1ccc(O)c(NC(=O)c2cc(-c3ccc(F)cc3)no2)c1
2,LVDHDRPSCLIFHH-RDRPBHBLSA-N,,,,,=,5011.9,=,10000.0,,...,,,,,,,,,COc1ccccc1NC(=O)C(=O)N\N=C(/C)\CC(=O)Nc2ccc(OC...,COc1ccccc1NC(=O)C(=O)NN=C(C)CC(=O)Nc1ccc(OCc2c...
3,OJENVBWGZAMKEC-UHFFFAOYSA-N,,,,,,,=,707.9,,...,,,,,,,,,CC(=O)c1ccc(NC(=O)c2oc(cc2)c3ccc(Cl)cc3)cc1,CC(=O)c1ccc(NC(=O)c2ccc(-c3ccc(Cl)cc3)o2)cc1
4,HGUWAIXVEMNODZ-UHFFFAOYSA-N,,,,,,,=,19952.6,,...,,,,,,,,,Cc1nc2ncnn2c(N3CCN(CC3)c4cccc(c4)C(F)(F)F)c1C,Cc1nc2ncnn2c(N2CCN(c3cccc(C(F)(F)F)c3)CC2)c1C


In [32]:
# (rows, columns) of the merged table.
mergedChEMBLDF.shape

(65382, 43)

In [34]:
# Seconds elapsed since `start` was recorded, printed in timedelta's default
# H:MM:SS.ffffff text form.
elapsed = time.time() - start
print(timedelta(seconds=elapsed))

0:27:20.731797
