In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
import gensim
from gensim import models
import time
import pickle
import json
import urllib
import requests
import os
import gzip
import zipfile
from bisect import bisect_left

global_random_state = 42
k_fold_splits = 2

# Source: https://stackoverflow.com/questions/212358/binary-search-bisection-in-python/212971
def binary_search(a, x, lo=0, hi=None):  # can't use a to specify default for hi
    hi = hi if hi is not None else len(a)  # hi defaults to len(a)
    pos = bisect_left(a, x, lo, hi)  # find insertion position
    return (pos if pos != hi and a[pos] == x else -1)  # don't walk off the end


In [2]:
print("Loading from pickle")
with open('/media/data/pubchem/kekulesmiles_tuple.pickle',"rb") as f:
    cid_keys, smile_values = pickle.load(f)
print("Data loaded")

Loading from pickle
Data loaded


In [4]:
assay_dir = "/media/data/pubchem/Data"

def build_single_assay_dataset(assay_num) :
    
    print("Processing assay: {0}".format(assay_num))
    # Round assay into down to nearest thousand
    assay_num_rounded_lower = assay_num - (assay_num % 1000) + 1
    assay_num_rounded_upper = assay_num_rounded_lower + 999
    expected_folder_name = "{0:0>7}_{1:0>7}".format(assay_num_rounded_lower,assay_num_rounded_upper)
    expected_name = "{0}.zip".format(expected_folder_name)
    expected_path = os.path.join(assay_dir,expected_name)

    archive = zipfile.ZipFile(expected_path, 'r')

    fingerprints = list()
    activities = list()
    mols = list()
    num_parsed = 0
    
    with archive.open(expected_folder_name + '/' + str(assay_num) + ".csv.gz") as f:
        with gzip.open(f) as g:

            df = pd.read_csv(g)
            df = df.dropna(subset=["PUBCHEM_ACTIVITY_OUTCOME","PUBCHEM_CID"])
            df["PUBCHEM_CID"] = df["PUBCHEM_CID"].astype(int)
            df["IS_ACTIVE"] = df["PUBCHEM_ACTIVITY_OUTCOME"].apply(lambda x: True if "Active" in x else False)

            df_active = df[df["IS_ACTIVE"] == True]
            df_inactive = df[df["IS_ACTIVE"] == False]
            num_active = len(df_active)
            num_inactive = len(df_inactive)
            #print("Active are: {}, Inactive are: {}".format(num_active,num_inactive))

            active_added_count = 0
            inactive_added_count = 0
            num_parsed = 0

            for index, row in df.iterrows() :

                is_active = row["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"

                if not is_active and inactive_added_count >= num_active:
                    continue

                cid = int(row["PUBCHEM_CID"])
                cid_pos = binary_search(cid_keys,cid)

                if cid_pos == -1:
                    continue

                smiles_string = smile_values[cid_pos]
                mol = Chem.MolFromSmiles(smiles_string)

                if mol is None:
                    continue

                fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,2,nBits=2048,useChirality=False,
                                                                 useBondTypes=False,useFeatures=False)
                # From RDKit documentation
                arr = np.zeros((1,))
                DataStructs.ConvertToNumpyArray(fingerprint, arr)
                fingerprint = arr

                fingerprints.append(fingerprint)                
                activities.append(is_active)
                mols.append(mol)

                num_parsed = num_parsed + 1

                if is_active:
                    active_added_count = active_added_count + 1
                else :
                    inactive_added_count = inactive_added_count + 1


    X = fingerprints
    y = activities
    return X, y
    

def build_multi_assay_dataset(gene_symbol,max_num_assays,holdout_assay_num=None) :

    # Use an API call to find all related bioassays
    assays_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/genesymbol/{0}/aids/TXT".format(gene_symbol)
    r = requests.get(assays_url)
    relevant_assays = [int(x) for x in r.text.split('\n') if len(x) > 0]
    print("Found a total of: {0} assays linked to Gene Symbol {1}".format(len(relevant_assays),gene_symbol))

    # Each assay n is in a file starting from 0001 to 1000, etc

    fingerprints = list()
    activities = list()

    i = 0
    assays_used = list()

    for assay_num in relevant_assays:

        if holdout_assay_num and assay_num == holdout_assay_num:
            continue
        
        if i >= max_num_assays:
            break
            
        fp, ac = build_single_assay_dataset(assay_num)
        
        fingerprints.extend(fp)
        activities.extend(ac)
        
        assays_used.append(assay_num)

        i = i + 1
        
        
    return np.array(fingerprints), np.array(activities), relevant_assays, assays_used

def score_transfer_learning(X, y, X_holdout, y_holdout):
    classifier = DummyClassifier(random_state=global_random_state)
    classifier.fit(X,y)
    start = time.time()
    y_pred = classifier.predict(X_holdout)
    elapsed = time.time() - start
    auc = roc_auc_score(y_holdout, y_pred, average='macro', sample_weight=None)
    print("roc_auc score of {0} with DummyClassifier".format(auc))
    
    classifier = RandomForestClassifier(random_state=global_random_state)
    classifier.fit(X,y)
    start = time.time()
    y_pred = classifier.predict(X_holdout)
    elapsed = time.time() - start
    auc = roc_auc_score(y_holdout, y_pred, average='macro', sample_weight=None)

    print("roc_auc score of {0} with RandomForestClassifier".format(auc))
    

X, y, relevant_assays, assays_used = build_multi_assay_dataset("PPARG",5,1032)

#print("Core dataset size is: {0}".format(len(X)))
X_holdout, y_holdout = build_single_assay_dataset(1032)
#print("Comparison dataset size is: {0}".format(len(X_holdout)))

X_holdout = np.array(X_holdout)
y_holdout = np.array(y_holdout)

score_transfer_learning(X,y,X_holdout,y_holdout)


Found a total of: 912 assays linked to Gene Symbol PPARG
Processing assay: 631


  exec(code_obj, self.user_global_ns, self.user_ns)


Processing assay: 731
Processing assay: 1048
Processing assay: 1049
Processing assay: 1051
Processing assay: 1032


  if self.run_code(code, result):


roc_auc score of 0.5006612974730309 with DummyClassifier
roc_auc score of 0.9344059405940595 with RandomForestClassifier


In [6]:
def score_transfer_learning_for_gene(gene_symbol, ref_assay_size) :
    X, y, relevant_assays, assays_used = build_multi_assay_dataset(gene_symbol,ref_assay_size)
    
    # Find a workable holdoutassay
    X_holdout = None
    y_holdout = None
    i = 0
    while(X_holdout is None or len(X_holdout) < 100 and i < len(relevant_assays) - ref_assay_size - 1) :
        holdout_assay = relevant_assays[ref_assay_size + i]
        X_holdout, y_holdout = build_single_assay_dataset(holdout_assay)
        i = i + 1
    
    print("{0},{1},{2},{3},{4}".format(len(X),len(y),holdout_assay,len(X_holdout),len(y_holdout)))
    print("Scoring transferability for gene: {0}".format(gene_symbol))
    score_transfer_learning(X,y,X_holdout,y_holdout)

# The 10 most studied genes from https://www.nature.com/articles/d41586-017-07291-9 
gene_list = ["TP53","TNF","EGFR","VEGFA","APOE","IL6","TGFB1","MTHFR","ESR1","AKT1"]
for gene_symbol in gene_list:
    score_transfer_learning_for_gene(gene_symbol,5)

Found a total of: 176 assays linked to Gene Symbol TP53
Processing assay: 902




Processing assay: 903




Processing assay: 904
Processing assay: 924
Processing assay: 238133
Processing assay: 241436
Processing assay: 241501
Processing assay: 241830
Processing assay: 243263
Processing assay: 271647
Processing assay: 271651
Processing assay: 284164
Processing assay: 284165
Processing assay: 370665
Processing assay: 438428
Processing assay: 438429
Processing assay: 438431
Processing assay: 438435
Processing assay: 442448
Processing assay: 442449
Processing assay: 442450
Processing assay: 442451
Processing assay: 442452
Processing assay: 442453
Processing assay: 442454
Processing assay: 442455
Processing assay: 442456
Processing assay: 442457
Processing assay: 442458
Processing assay: 442459
Processing assay: 442460
Processing assay: 442461
Processing assay: 442462
Processing assay: 442463
Processing assay: 442464
Processing assay: 503473
Processing assay: 504706


  exec(code_obj, self.user_global_ns, self.user_ns)


6014,6014,504706,354,354
Scoring transferability for gene: TP53
roc_auc score of 0.5024875621890547 with DummyClassifier
roc_auc score of 0.7328065554580042 with RandomForestClassifier
Found a total of: 47 assays linked to Gene Symbol TNF
Processing assay: 1852
Processing assay: 2337
Processing assay: 2483
Processing assay: 2485
Processing assay: 2801
Processing assay: 210150
Processing assay: 215455
Processing assay: 215456
Processing assay: 215457
Processing assay: 215589
Processing assay: 247750
Processing assay: 461440
Processing assay: 526325
Processing assay: 651757
Processing assay: 732923
Processing assay: 732924
Processing assay: 733374
Processing assay: 733375
Processing assay: 733376
Processing assay: 733377
Processing assay: 733378
Processing assay: 733379
Processing assay: 733380
Processing assay: 733381
Processing assay: 733382
Processing assay: 733383
Processing assay: 733384
Processing assay: 736485
Processing assay: 742688
Processing assay: 1156798
Processing assay: 11

IndexError: list index out of range