In [None]:
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors
from sklearn.ensemble import RandomForestClassifier
import numpy

import warnings
# Install a blanket "ignore" filter: with no category/module argument this
# suppresses every warning raised for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the imported
# libraries — consider narrowing the filter to the specific noisy category.
warnings.filterwarnings('ignore')


In [None]:
# Load assay info. Note: This CSV was obtained from PubChem bioassay, via searching for AID 1030
# and downloading the datatable

ba_df = pd.read_csv("AID_1030_datatable_all.csv")

# Load compound info (tab-separated file whose first row is the header)
cs_df = pd.read_csv("AID_1030_compound_smiles.csv",sep='\t',header=0)

# Merge the two (default inner join: only CIDs present in both tables survive)
full_df = ba_df.merge(cs_df,on='PUBCHEM_CID')

# Cleanup the compound ID column
full_df["PUBCHEM_CID"] = full_df["PUBCHEM_CID"].astype(int)

compounds = list()
fingerprints = list()
activities = list()
n_skipped = 0

# Walk only the three columns that are actually needed. This avoids the
# per-row Series construction of DataFrame.iterrows(). The merge result has a
# fresh 0..n-1 index, so enumerate() yields the same counter iterrows() did.
records = zip(full_df["PUBCHEM_CID"],
              full_df["Smiles"],
              full_df["PUBCHEM_ACTIVITY_OUTCOME"])

for index, (cid, smiles, outcome) in enumerate(records):
    # A missing SMILES is read by pandas as NaN (a float); RDKit raises on
    # non-string input, so treat it like an unparseable structure instead.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    # Binary label: anything other than the exact outcome "Active" is False.
    # NOTE(review): this lumps any non-"Active" outcome (e.g. inconclusive
    # results, if present) in with the inactives — confirm that is intended.
    is_active = outcome == "Active"

    if mol is not None:
        # Radius-2 Morgan fingerprint folded to 2048 bits
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,2,nBits=2048,useChirality=False,
                                                                     useBondTypes=False,useFeatures=False)

        # From RDKit documentation: ConvertToNumpyArray resizes the target
        # array in place and keeps its dtype. An 8-bit integer dtype is enough
        # for 0/1 bits and uses 1/8 of the memory of the float64 default.
        arr = np.zeros((0,), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fingerprint, arr)

        compounds.append(cid)
        fingerprints.append(arr)
        activities.append(is_active)
    else:
        n_skipped += 1

    if index % 10000 == 0:
        print("Processed index: {0}".format(index))

# Only mention skipped rows when there are any, so a clean run prints as before
if n_skipped:
    print("Skipped {0} rows with missing or unparseable SMILES".format(n_skipped))

print("Processed all, pickling")

compounds_and_features = (compounds, fingerprints, activities)

# Pickle the data to save time in the future
with open('data.pickle', 'wb') as f:
    pickle.dump(compounds_and_features, f, pickle.HIGHEST_PROTOCOL)

Processed index: 0
Processed index: 10000
Processed index: 20000
Processed index: 30000
Processed index: 40000
Processed index: 50000
Processed index: 60000
Processed index: 70000
Processed index: 80000
Processed index: 90000
Processed index: 100000
Processed index: 110000
Processed index: 120000
Processed index: 130000
Processed index: 140000
Processed index: 150000
Processed index: 160000
Processed index: 170000
Processed index: 180000
Processed index: 190000
Processed index: 200000
Processed index: 210000
Processed index: 220000
Processed all, pickling

In [None]:
# Imports are repeated here so the notebook can be resumed from this cell
# without re-running the (slow) featurization cell above.

import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors

# Reset the three result names first, so values left over from an earlier
# run are not mistaken for freshly loaded data if reading the cache fails.
compounds = fingerprints = activities = None

# Restore the (compounds, fingerprints, activities) tuple from the cache file
with open('data.pickle', 'rb') as cache_file:
    compounds, fingerprints, activities = pickle.load(cache_file)

# Print the number of compounds successfully featurized
# (both list lengths are shown)
for loaded in (compounds, fingerprints):
    print(len(loaded))

# Show the first entry of each list as a quick sanity check
for loaded in (compounds, fingerprints, activities):
    print(loaded[0])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# r2_score is a regression metric and is not used in this cell; the import is
# kept only so that any later cell relying on it keeps working.
from sklearn.metrics import r2_score
from sklearn.metrics import balanced_accuracy_score

# Hold out 33% of the compounds for evaluation. Stratifying on the label keeps
# the active/inactive ratio the same in both partitions, so the test score is
# not distorted by an unlucky split of the (typically rare) actives.
X_train, X_test, y_train, y_test = train_test_split(
    fingerprints, activities, test_size=0.33, random_state=42, stratify=activities)

# 100 trees with a fixed seed for reproducibility; n_jobs=-1 trains the trees
# on all available cores and does not change the fitted model.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train,y_train)

# Mean accuracy on the held-out set
score = rf.score(X_test,y_test)

print("Random forest obtained a score of: {0}".format(score))

# Plain accuracy can look excellent when one class dominates, simply by
# predicting the majority class. Report the class balance and the balanced
# accuracy (mean per-class recall) so the score above can be put in context.
active_fraction = sum(y_test) / len(y_test)
balanced_score = balanced_accuracy_score(y_test, rf.predict(X_test))

print("Fraction of actives in the test set: {0}".format(active_fraction))
print("Random forest obtained a balanced accuracy of: {0}".format(balanced_score))