The correlation between chemical structure and biological activity is called Structure Activity Relationshop (SAR) 
or Quantitive SAR (QSAR). In general similar compounds are known to exhibit similar biological activities, and it is very important in drug discovery research to understand this correlation and apply it to drug design.

In addition, there are two types of problems such as classification problems to estimate which class a compound belongs to, such as cell death or toxicity, and regression problems to estimate continuous values such as % inhibition


In [None]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.Draw import IPythonConsole
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier

from tensorflow.python.keras.layers import Iput
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.Model import Model

In [2]:
mols = []
labels = []
with open('/home/oohnohnoh1/Desktop/GIT/Chemiinformatics_work/py4chemoinformatics/notebooks/ch09_compounds.txt') as f:
    header = f.readline()
    smiles_index = -1
    for i, title in enumerate(header.split("\t")):
        if title == "CANONICAL_SMILES":
            smiles_index = i
        elif title == "STANDARD_VALUE":
            value_index = i
        print (i, title)    
    for l in f:
        ls = l.split("\t")
        mol = Chem.MolFromSmiles(ls[smiles_index])
        mols.append(mol)
        val = float(ls[value_index])
        if val < 1000:
            labels.append('POS')
        else:
            labels.append('NEG')
            
labels = np.array(labels)

0 CMPD_CHEMBLID
1 MOLREGNO
2 PARENT_CMPD_CHEMBLID
3 PARENT_MOLREGNO
4 MOL_PREF_NAME
5 COMPOUND_KEY
6 MOLWEIGHT
7 ALOGP
8 PSA
9 NUM_RO5_VIOLATIONS
10 CANONICAL_SMILES
11 ACTIVITY_ID
12 STANDARD_TYPE
13 RELATION
14 STANDARD_VALUE
15 STANDARD_UNITS
16 PCHEMBL_VALUE
17 PUBLISHED_TYPE
18 PUBLISHED_RELATION
19 PUBLISHED_VALUE
20 PUBLISHED_UNITS
21 ACTIVITY_COMMENT
22 DATA_VALIDITY_COMMENT
23 POTENTIAL_DUPLICATE
24 BAO_ENDPOINT
25 UO_UNITS
26 QUDT_UNITS
27 ASSAY_ID
28 ASSAY_CHEMBLID
29 ASSAY_TYPE
30 DESCRIPTION
31 ASSAY_SRC_ID
32 ASSAY_SRC_DESCRIPTION
33 ASSAY_ORGANISM
34 ASSAY_STRAIN
35 ASSAY_TAX_ID
36 CURATED_BY
37 BAO_FORMAT
38 TID
39 TARGET_CHEMBLID
40 TARGET_TYPE
41 PROTEIN_ACCESSION
42 PREF_NAME
43 ORGANISM
44 CONFIDENCE_SCORE
45 TARGET_MAPPING
46 APD_NAME
47 APD_CONFIDENCE
48 DOC_ID
49 DOC_CHEMBLID
50 PUBMED_ID
51 JOURNAL
52 YEAR
53 VOLUME
54 ISSUE
55 FIRST_PAGE
56 CELL_ID
57 CELL_CHEMBL_ID
58 CELL_NAME
59 ACTIVITY_PARAMS
60 ACTIVITY_PROPS



In [3]:
header

'CMPD_CHEMBLID\tMOLREGNO\tPARENT_CMPD_CHEMBLID\tPARENT_MOLREGNO\tMOL_PREF_NAME\tCOMPOUND_KEY\tMOLWEIGHT\tALOGP\tPSA\tNUM_RO5_VIOLATIONS\tCANONICAL_SMILES\tACTIVITY_ID\tSTANDARD_TYPE\tRELATION\tSTANDARD_VALUE\tSTANDARD_UNITS\tPCHEMBL_VALUE\tPUBLISHED_TYPE\tPUBLISHED_RELATION\tPUBLISHED_VALUE\tPUBLISHED_UNITS\tACTIVITY_COMMENT\tDATA_VALIDITY_COMMENT\tPOTENTIAL_DUPLICATE\tBAO_ENDPOINT\tUO_UNITS\tQUDT_UNITS\tASSAY_ID\tASSAY_CHEMBLID\tASSAY_TYPE\tDESCRIPTION\tASSAY_SRC_ID\tASSAY_SRC_DESCRIPTION\tASSAY_ORGANISM\tASSAY_STRAIN\tASSAY_TAX_ID\tCURATED_BY\tBAO_FORMAT\tTID\tTARGET_CHEMBLID\tTARGET_TYPE\tPROTEIN_ACCESSION\tPREF_NAME\tORGANISM\tCONFIDENCE_SCORE\tTARGET_MAPPING\tAPD_NAME\tAPD_CONFIDENCE\tDOC_ID\tDOC_CHEMBLID\tPUBMED_ID\tJOURNAL\tYEAR\tVOLUME\tISSUE\tFIRST_PAGE\tCELL_ID\tCELL_CHEMBL_ID\tCELL_NAME\tACTIVITY_PARAMS\tACTIVITY_PROPS\n'

In [7]:
fps = [] # Fingerprints
for mol in mols:
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    fps.append(arr)
fps

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 1., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 

In [8]:
fps = np.array(fps) # Change the fingerprint to numpy array

In [9]:
x_train, x_test, y_train, y_test = train_test_split(fps, labels)

In [10]:
x_train

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
y_pred = rf.predict(x_test)

In [14]:
y_pred


array(['NEG', 'NEG', 'NEG', 'NEG', 'NEG', 'NEG', 'NEG', 'NEG', 'NEG',
       'NEG', 'NEG', 'NEG', 'NEG', 'NEG', 'POS', 'NEG', 'POS', 'NEG',
       'NEG'], dtype='<U3')

In [15]:
confusion_matrix(y_test, y_pred)

array([[14,  0],
       [ 3,  2]])

In [16]:
f1_score(y_test, y_pred, pos_label="POS")

0.5714285714285715

### Predict the efficacy of drugs (regression problem)

Regression models, as discussed earlier, are models that predict continuous values. This time, create a regression model of RandomForest, and evaluate its accuracy with R2. Let's use the data from hERG's assay data used in classification problem. 

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from math import log10


In [20]:
pIC50s = []

In [24]:
with open('/home/oohnohnoh1/Desktop/GIT/Chemiinformatics_work/py4chemoinformatics/notebooks/ch09_compounds.txt') as f:
    header = f.readline()
    for i, title in enumerate(header.split("\t")):
        #print (i, title)
        if title == "STANDARD_VALUE":
            value_index = i 
    for l in f:
        ls = l.split("\t")
        #print (ls)
        val = float(ls[value_index])
        pIC50 = 9 - log10(val)
        pIC50s.append(pIC50)
pIC50s = np.array(pIC50s)

In [25]:
pIC50s


array([5.40000019, 4.30000003, 8.        , 8.        , 6.49000491,
       5.4700002 , 5.82999953, 6.91998715, 5.        , 6.26000069,
       8.        , 2.85      , 6.59999765, 6.81998888, 5.41000016,
       5.        , 3.88999998, 4.30000003, 4.51999996, 3.52      ,
       3.85999999, 5.48999954, 4.04000002, 6.71999105, 4.97999981,
       6.7400002 , 7.79997073, 5.72000016, 6.49000491, 6.6999918 ,
       4.92000016, 7.51999306, 5.83999993, 5.43000027, 6.09000028,
       5.13999998, 3.04      , 5.0100001 , 7.48999149, 7.40000782,
       4.22999997, 3.61      , 5.25000025, 4.75999996, 6.35999585,
       4.56999997, 5.80000087, 6.85001154, 5.11999976, 5.86000134,
       6.49000491, 7.29998894, 7.7000571 , 6.26000069, 5.82000036,
       4.30000003, 5.66000032, 6.6999918 , 8.        , 5.7400002 ,
       6.77001116, 4.68000003, 3.        , 5.        , 4.73999997,
       5.48000016, 7.77006231, 5.52000025, 3.88999998, 4.04000002,
       6.7900027 , 5.84999925, 3.02      , 5.40000019, 4.30000