In [27]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
import gensim
from gensim import models
import time
import pickle
import json
import urllib
import requests
import os
import gzip

import zipfile


global_random_state = 42
k_fold_splits = 2

In [None]:
gene_symbol = "PPARG"
# Use an API call to find all related bioassays. For now, use the assays related to PPAR gamma
assays_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/genesymbol/{0}/aids/TXT".format(gene_symbol)
r = requests.get(assays_url)
relevant_assays = [int(x) for x in r.text.split('\n') if len(x) > 0]
print(len(relevant_assays))

assay_dir = "/media/data/pubchem/Data"

# Each assay n is in a file starting from 0001 to 1000, etc

assay_paths = list()

for assay_num in relevant_assays:
    print(assay_num)
    # Round assay into down to nearest thousand
    assay_num_rounded_lower = assay_num - (assay_num % 1000) + 1
    assay_num_rounded_upper = assay_num_rounded_lower + 999
    expected_folder_name = "{0:0>7}_{1:0>7}".format(assay_num_rounded_lower,assay_num_rounded_upper)
    expected_name = "{0}.zip".format(expected_folder_name)
    expected_path = os.path.join(assay_dir,expected_name)
    assay_paths.append(expected_path)
    archive = zipfile.ZipFile(expected_path, 'r')
    
    with archive.open(expected_folder_name + '/' + str(assay_num) + ".csv.gz") as f:
        with gzip.open(f) as g:
            for line in g:
                print(line)
                break


df = pd.read_csv(assay_paths[0])
print(df.head())


#aid = 1032
# Use an API call to find all related bioassays. For now, use the assays related to PPAR gamma
#assay_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{0}/description/JSON".format(aid)
#json.load(urllib.urlopen(assay_url))

912
631
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activation_Primary at 8 uM\n'
731
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activation at 8uM\n'
1032
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activation\n'
1048
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Ratio\n'
1049
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Ratio\n'
1051
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Ratio\n'
1297
b'PUBCHEM_RESULT_TAG,PUBCHEM_SI

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 data validity,EC50 binding domains\n'
142314
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,SEI,BEI,LE,LLE,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
155978
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,SEI,BEI,LE,LLE,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
155979
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
157264
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
157265
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Log EC50 activity comment,Log EC50 standard flag,Log EC50 qualifier,Log EC50 published value,Log EC50 standard value\n'
157269
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Log Ki activity comment,Log Ki standard flag,Lo

243396
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Efficacy activity comment,Efficacy standard flag,Efficacy qualifier,Efficacy published value,Efficacy standard value\n'
243412
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Efficacy activity comment,Efficacy standard flag,Efficacy qualifier,Efficacy published value,Efficacy standard value\n'
244311
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Fold activation activity comment,Fold activation standard flag,Fold activation qualifier,Fold activation published value,Fold activation standard value\n'
244312
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Fold activation a

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value\n'
260320
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,IC50,SEI,BEI,LE,LLE,IC50 activity comment,IC50 standard flag,IC50 qualifier,IC50 published value,IC50 standard value,IC50 binding domains\n'
260323
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
261945
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,IC50,SEI,BEI,LE,LLE,EC50 activity comment,EC50 stand

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value\n'
277007
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Efficacy activity comment,Efficacy standard flag,Efficacy qualifier,Efficacy published value,Efficacy standard value\n'
277014
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Selectivity ratio activity comment,Selectivity ratio standard flag,Selectivity ratio qualifier,Selectivity ratio published value,Selectivity ratio standard value\n'
277015
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Kd,SEI,BEI,LE,LLE,Kd activity comm

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
313755
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
313756
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
314545
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Ki,SEI,BEI,LE,LLE,Ki activity comment,Ki standard fla

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
361864
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
361865
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
361866
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity q

410083
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
410086
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
412999
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
413007
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,SEI,BEI,LE,LLE,EC50 activity commen

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
457062
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
457748
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
459526
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qu

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
483125
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
485564
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Ki,SEI,BEI,LE,LLE,Ki activity comment,Ki standard flag,Ki qualifier,Ki published value,Ki standard value,Ki binding domains\n'
489477
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity stand

b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
548202
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Ratio IC50 activity comment,Ratio IC50 standard flag,Ratio IC50 qualifier,Ratio IC50 published value,Ratio IC50 standard value\n'
550057
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,FC activity comment,FC standard flag,FC qualifier,FC published value,FC standard value\n'
550058
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,SEI,BEI,LE,LLE,EC50 activity comment,EC50 standard flag,EC50 qualifi

590197
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
591222
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,IC50,SEI,BEI,LE,LLE,IC50 activity comment,IC50 standard flag,IC50 qualifier,IC50 published value,IC50 standard value,IC50 binding domains\n'
594553
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activity standard flag,Activity qualifier,Activity published value,Activity standard value\n'
595117
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity activity comment,Activ

637379
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Ratio EC50 activity comment,Ratio EC50 standard flag,Ratio EC50 qualifier,Ratio EC50 published value,Ratio EC50 standard value\n'
637686
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,EC50 activity comment,EC50 standard flag,EC50 qualifier,EC50 published value,EC50 standard value,EC50 binding domains\n'
637690
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,IC50,SEI,BEI,LE,LLE,IC50 activity comment,IC50 standard flag,IC50 qualifier,IC50 published value,IC50 standard value,IC50 binding domains\n'
638936
b'PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,EC50,SEI,BEI,LE

In [None]:
df = pd.read_csv("AID_1032_datatable_all.csv")
df.head()

In [3]:
# Merge in the SMILES info for all compound IDs
cs = pd.read_csv("1032_CID_SMILES_Mapping.csv",sep='\t',header=0)
df = df.merge(cs,on="PUBCHEM_CID")
df.head()

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activation,SMILES
0,1,842121.0,6603010.0,Inactive,6.0,,,6.82,CCOCCCNCC(=O)NC1=CC=C(C=C1)OC(F)(F)F.Cl
1,2,842122.0,6602570.0,Inactive,2.0,,,2.45,COCCN1C(=NN=N1)CN2CCC(CC2)CC3=CC=CC=C3.Cl
2,3,842123.0,6602620.0,Inactive,1.0,,,1.41,COCCN1C(=NN=N1)CN2CCC(CC2)(C3=CC(=CC=C3)C(F)(F...
3,4,842124.0,644371.0,Inactive,0.0,,,-5.09,C1CCCN(CC1)CC(=O)NCCC2=CC=C(C=C2)F.C(=O)(C(=O)O)O
4,5,842125.0,6603130.0,Inactive,0.0,,,-2.24,COC1=CC=C(C=C1)C(=O)C(C2=CC=CC=C2)N3CCOCC3.Cl


In [4]:
# Binarize the inactive/active column

df["IS_ACTIVE"] = df["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"
df["IS_ACTIVE"].astype(bool)

df_active = df[df["IS_ACTIVE"] == True]
df_inactive = df[df["IS_ACTIVE"] == False]

df["IS_ACTIVE"].head()

0    False
1    False
2    False
3    False
4    False
Name: IS_ACTIVE, dtype: bool

In [5]:
# Plot histogram of active
%matplotlib inline

print("Active are: {}, Inactive are: {}".format(df_active.count()["PUBCHEM_CID"], df_inactive.count()["PUBCHEM_CID"]))

Active are: 670, Inactive are: 195584


In [19]:
# Now let's calculate fixed-length features which we need to train an ML algorithm

cids = list()
fingerprints = list()
activities = list()
mols = list()

num_parsed = 0
num_active = 0
num_inactive = 0

for index, row in df.iterrows() :
    cid = row["PUBCHEM_CID"]
    smiles_string = row["SMILES"]
    mol = Chem.MolFromSmiles(smiles_string)
    is_active = row["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"
    
    class_size = 670
    
    if num_active == class_size and num_inactive == class_size :
        print("We have enough data")
        break
    
    if mol is None:
        print("Molecule failed featurization")
        print(index)
    else: 
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,2,nBits=2048,useChirality=False,
                                                                     useBondTypes=False,useFeatures=False)
        # From RDKit documentation
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
        fingerprint = arr
        
        
        
        if(is_active and num_active < class_size) :
            cids.append(cid)
            num_active = num_active + 1
            fingerprints.append(fingerprint)
            activities.append(is_active)
            mols.append(mol)
        elif (not is_active and num_inactive < class_size) :
            cids.append(cid)
            num_inactive = num_inactive + 1
            fingerprints.append(fingerprint)
            activities.append(is_active)
            mols.append(mol)
            
        num_parsed = num_parsed + 1
    
    if index % 10000 == 0:
        print("Processed index: {0}".format(index))
        print("Active are: {}, Inactive are: {}".format(num_active,num_inactive))

X = np.array(fingerprints)
y = np.array(activities)
        
fingerprints = np.array(fingerprints)
activities = np.array(activities,dtype=int)

rus = RandomUnderSampler(random_state=global_random_state)
X, y = rus.fit_sample(fingerprints, activities)

# Pickle the data to save time in the future
with open('data.classification.undersampled.1032.pickle', 'wb') as f:
    pickle.dump((X,y), f, pickle.HIGHEST_PROTOCOL)

Processed index: 0
Active are: 0, Inactive are: 1
Processed index: 10000
Active are: 16, Inactive are: 670
Processed index: 20000
Active are: 81, Inactive are: 670
Processed index: 30000
Active are: 105, Inactive are: 670
Processed index: 40000
Active are: 173, Inactive are: 670
Processed index: 50000
Active are: 204, Inactive are: 670
Processed index: 60000
Active are: 234, Inactive are: 670
Processed index: 70000
Active are: 269, Inactive are: 670
Processed index: 80000
Active are: 316, Inactive are: 670
Processed index: 90000
Active are: 368, Inactive are: 670
Processed index: 100000
Active are: 399, Inactive are: 670
Processed index: 110000
Active are: 431, Inactive are: 670
Processed index: 120000
Active are: 463, Inactive are: 670
Processed index: 130000
Active are: 497, Inactive are: 670
Processed index: 140000
Active are: 521, Inactive are: 670
Processed index: 150000
Active are: 555, Inactive are: 670
Processed index: 160000
Active are: 577, Inactive are: 670
Processed index: 

In [21]:
with open('data.classification.undersampled.1032.pickle','rb') as f:
    (X,y) = pickle.load(f)

In [24]:
# Note - minor code used from https://github.com/LRParser/pubchem-sklearn/blob/master/pubchem_bioassay_sklearn.ipynb
skf = StratifiedKFold(n_splits=k_fold_splits,shuffle=True,random_state=global_random_state)

roc_auc_avg = 0
avg_predict_time = 0
for train_index, test_index in skf.split(X,y) :

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    print("Number of training samples is: {}".format(len(X_train)))
    print("Number of test samples is: {}".format(len(X_test)))

    classifier = DummyClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
avg_predict_time = avg_predict_time / k_fold_splits
roc_auc_avg = roc_auc_avg / k_fold_splits
print("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
print("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

Number of training samples is: 670
Number of test samples is: 670
Number of training samples is: 670
Number of test samples is: 670
Average roc_auc score of 2 folds is: 0.5149253731343284
Average elapsed prediction time over 2 folds in s is: 0.0019965171813964844


In [25]:
roc_auc_avg = 0
avg_predict_time = 0
for train_index, test_index in skf.split(X,y) :

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    print("Number of training samples is: {}".format(len(X_train)))
    print("Number of test samples is: {}".format(len(X_test)))

    classifier = RandomForestClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
avg_predict_time = avg_predict_time / k_fold_splits
roc_auc_avg = roc_auc_avg / k_fold_splits
print("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
print("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

Number of training samples is: 670
Number of test samples is: 670
Number of training samples is: 670
Number of test samples is: 670
Average roc_auc score of 2 folds is: 0.8223880597014925
Average elapsed prediction time over 2 folds in s is: 0.003376007080078125


In [29]:
# Let's check out training on embeddings

model = models.KeyedVectors.load_word2vec_format("vec.txt")
embeddings = list()

for mol in mols:
    try :
        info = {}
        rdMolDescriptors.GetMorganFingerprint(mol,0,bitInfo=info)
        keys = info.keys()
        keys_list = list(keys)
        totalvec = np.zeros(200)
        for k in keys_list:
            wordvec = model.wv[str(k)]
            totalvec = np.add(totalvec,wordvec)
        #print(totalvec)
        embeddings.append(totalvec)
    except Exception as e:
        print(e)
        pass
    
print(len(embeddings))
X = np.array(embeddings)



1340


In [30]:
# Now let's look at performance with embeddings, 70 MB

roc_auc_avg = 0
avg_predict_time = 0
for train_index, test_index in skf.split(X,y) :

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    print("Number of training samples is: {}".format(len(X_train)))
    print("Number of test samples is: {}".format(len(X_test)))

    classifier = RandomForestClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
avg_predict_time = avg_predict_time / k_fold_splits
roc_auc_avg = roc_auc_avg / k_fold_splits
print("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
print("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

Number of training samples is: 670
Number of test samples is: 670
Number of training samples is: 670
Number of test samples is: 670
Average roc_auc score of 2 folds is: 0.7291044776119403
Average elapsed prediction time over 2 folds in s is: 0.0015490055084228516


In [31]:
# Now let's look at performance with embeddings, 90 MB

model = models.KeyedVectors.load_word2vec_format("vec.txt")
embeddings = list()

for mol in mols:
    try :
        info = {}
        rdMolDescriptors.GetMorganFingerprint(mol,1,bitInfo=info)
        keys = info.keys()
        keys_list = list(keys)
        totalvec = np.zeros(200)
        for k in keys_list:
            wordvec = model.wv[str(k)]
            totalvec = np.add(totalvec,wordvec)
        #print(totalvec)
        embeddings.append(totalvec)
    except Exception as e:
        print(e)
        pass
    
print(len(embeddings))
X = np.array(embeddings)

roc_auc_avg = 0
avg_predict_time = 0
for train_index, test_index in skf.split(X,y) :

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    print("Number of training samples is: {}".format(len(X_train)))
    print("Number of test samples is: {}".format(len(X_test)))

    classifier = RandomForestClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
avg_predict_time = avg_predict_time / k_fold_splits
roc_auc_avg = roc_auc_avg / k_fold_splits
print("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
print("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

1340
Number of training samples is: 670
Number of test samples is: 670
Number of training samples is: 670
Number of test samples is: 670
Average roc_auc score of 2 folds is: 0.7552238805970151
Average elapsed prediction time over 2 folds in s is: 0.00133514404296875


In [32]:
# Now let's look at performance with full embeddings

model = models.KeyedVectors.load_word2vec_format("vec.txt")
embeddings = list()

for mol in mols:
    try :
        info = {}
        rdMolDescriptors.GetMorganFingerprint(mol,1,bitInfo=info)
        keys = info.keys()
        keys_list = list(keys)
        totalvec = np.zeros(200)
        for k in keys_list:
            wordvec = model.wv[str(k)]
            totalvec = np.add(totalvec,wordvec)
        #print(totalvec)
        embeddings.append(totalvec)
    except Exception as e:
        print(e)
        pass
    
print(len(embeddings))
X = np.array(embeddings)

roc_auc_avg = 0
avg_predict_time = 0
for train_index, test_index in skf.split(X,y) :

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    print("Number of training samples is: {}".format(len(X_train)))
    print("Number of test samples is: {}".format(len(X_test)))

    classifier = RandomForestClassifier(random_state=global_random_state)
    classifier.fit(X_train,y_train)
    start = time.time()
    y_pred = classifier.predict(X_test)
    elapsed = time.time() - start
    avg_predict_time = avg_predict_time + elapsed
    auc = roc_auc_score(y_test, y_pred, average='macro', sample_weight=None)
    roc_auc_avg = roc_auc_avg + auc
    
avg_predict_time = avg_predict_time / k_fold_splits
roc_auc_avg = roc_auc_avg / k_fold_splits
print("Average roc_auc score of {} folds is: {}".format(k_fold_splits, roc_auc_avg))
print("Average elapsed prediction time over {} folds in s is: {}".format(k_fold_splits, avg_predict_time))

1340
Number of training samples is: 670
Number of test samples is: 670
Number of training samples is: 670
Number of test samples is: 670
Average roc_auc score of 2 folds is: 0.7417910447761195
Average elapsed prediction time over 2 folds in s is: 0.0013414621353149414
