Click the badge below to open this notebook in Google Colab.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1r3QAoLsI-k6se1EubeepUs8p0Bqvapb_?usp=sharing)

The DeepChem and dataset setup below is adapted from the official DeepChem tutorial: [Modeling Solubility](https://github.com/deepchem/deepchem/blob/master/examples/tutorials/03_Modeling_Solubility.ipynb)

In [37]:
# Installing Deepchem
!pip install --pre deepchem




In [38]:
import deepchem

Convert IC50 to pIC50
To make the IC50 data more uniformly distributed, we convert IC50 to a negative logarithmic scale: $\mathrm{pIC50} = -\log_{10}(\mathrm{IC50})$, with IC50 expressed in molar units.

This custom function pIC50() will accept a DataFrame as input and will:

1. Take the IC50 values from the `IC50_uM_norm` column and convert them from µM to M by multiplying by $10^{-6}$
2. Take the molar value and apply $-\log_{10}$
3. Delete the `IC50_uM_norm` column and create a new `pIC50` column

In [39]:
import pandas as pd
# Read the IC50 table from the Colab working directory (/content); the
# file must be present at this path before the cell is run.
df=pd.read_csv('/content/HIV-all-without-pubchem-PAINS_IC50.csv')
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,Smiles,Activity,IC50_uM
0,O=C(NCc1nccs1)c1[nH]c2ccc(Br)cc2c1S(=O)(=O)N1C...,Yes,0.200000
1,Cc1c(C(C)(C)C)s/c(=N\S(=O)(=O)c2cc(Cl)ccc2N)n1C,No,15.848930
2,O=C(COc1ccc(Cl)cc1C(=O)c1cc(F)cc(F)c1)Nc1ccc(C...,Yes,0.067610
3,CCCOC(=O)N(C(=S)OC(C)COc1ccccc1)c1ccccc1,No,131.000000
4,Cc1[nH]nc(OCC(=O)Nc2ccc(C#CC(C)(C)CO)cc2Cl)c1-...,Yes,0.024000
...,...,...,...
9207,c1cc(C=CC(=S)O2)c2c(C(OC(C(OC3=O)(C(C)(C)C34C)...,No,1905.460718
9208,N1C(=O)C(C)=CN(COCCCOCC(c2ccccc2)=O)C1=O,No,10.715193
9209,N1C(=O)C(C)=CN(C(O2)CC(N(O)C(CCC[Se]c3ccccc3)O...,No,75.857758
9210,N1C(=O)C(C)=CN(C(O2)CC(N(O)C(CCCC[Se]c3ccccc3)...,No,67.608298


In [40]:
# Cap very large IC50 values so the later -log10 transform stays positive.
def norm_value(input, cap=100000):
    """Return a copy of ``input`` with ``IC50_uM`` capped and renamed.

    Every value of the ``IC50_uM`` column that is greater than ``cap`` is
    replaced by ``cap``; all other values (including NaN) pass through
    unchanged. With the default cap of 100000 the later µM -> M conversion
    gives at most 0.1 M, i.e. a pIC50 of at least 1.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame containing a numeric ``IC50_uM`` column. It is not modified.
    cap : numeric, optional
        Upper bound applied to ``IC50_uM`` (default 100000).

    Returns
    -------
    pandas.DataFrame
        New frame without ``IC50_uM`` and with the capped values appended
        as the last column, ``IC50_uM_norm``.

    Raises
    ------
    KeyError
        If ``input`` has no ``IC50_uM`` column.
    """
    capped = input['IC50_uM'].clip(upper=cap)
    # `columns=` replaces the positional axis argument, which raised a
    # FutureWarning and is rejected by pandas >= 2.0.
    return input.drop(columns='IC50_uM').assign(IC50_uM_norm=capped)

In [41]:
# Cap the IC50 values; the returned frame carries IC50_uM_norm in place of
# the IC50_uM column.
df_norm = norm_value(df)
# Bare last expression so the notebook renders the frame as a table.
df_norm

  x = input.drop('IC50_uM', 1)


Unnamed: 0,Smiles,Activity,IC50_uM_norm
0,O=C(NCc1nccs1)c1[nH]c2ccc(Br)cc2c1S(=O)(=O)N1C...,Yes,0.200000
1,Cc1c(C(C)(C)C)s/c(=N\S(=O)(=O)c2cc(Cl)ccc2N)n1C,No,15.848930
2,O=C(COc1ccc(Cl)cc1C(=O)c1cc(F)cc(F)c1)Nc1ccc(C...,Yes,0.067610
3,CCCOC(=O)N(C(=S)OC(C)COc1ccccc1)c1ccccc1,No,131.000000
4,Cc1[nH]nc(OCC(=O)Nc2ccc(C#CC(C)(C)CO)cc2Cl)c1-...,Yes,0.024000
...,...,...,...
9207,c1cc(C=CC(=S)O2)c2c(C(OC(C(OC3=O)(C(C)(C)C34C)...,No,1905.460718
9208,N1C(=O)C(C)=CN(COCCCOCC(c2ccccc2)=O)C1=O,No,10.715193
9209,N1C(=O)C(C)=CN(C(O2)CC(N(O)C(CCC[Se]c3ccccc3)O...,No,75.857758
9210,N1C(=O)C(C)=CN(C(O2)CC(N(O)C(CCCC[Se]c3ccccc3)...,No,67.608298


In [42]:
import numpy as np

def pIC50(input):
    """Return a copy of ``input`` with ``IC50_uM_norm`` converted to pIC50.

    Each value of ``IC50_uM_norm`` is multiplied by 10**-6 (micromolar to
    molar) and transformed with ``-log10``.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame containing a numeric ``IC50_uM_norm`` column. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame without ``IC50_uM_norm`` and with the result appended as
        the last column, ``pIC50``.

    Raises
    ------
    KeyError
        If ``input`` has no ``IC50_uM_norm`` column.

    Notes
    -----
    Values are not validated: ``np.log10`` yields ``-inf`` for 0 and NaN
    for negative numbers, so such rows end up with ``inf``/NaN pIC50.
    """
    molar = input['IC50_uM_norm'] * (10 ** -6)  # converts µM to M
    # `columns=` replaces the positional axis argument, which raised a
    # FutureWarning and is rejected by pandas >= 2.0.
    return input.drop(columns='IC50_uM_norm').assign(pIC50=-np.log10(molar))

In [43]:
# Convert the capped IC50 values to pIC50; the returned frame carries a
# pIC50 column in place of IC50_uM_norm.
df_final = pIC50(df_norm)
# Bare last expression so the notebook renders the frame as a table.
df_final

  x = input.drop('IC50_uM_norm', 1)


Unnamed: 0,Smiles,Activity,pIC50
0,O=C(NCc1nccs1)c1[nH]c2ccc(Br)cc2c1S(=O)(=O)N1C...,Yes,6.698970
1,Cc1c(C(C)(C)C)s/c(=N\S(=O)(=O)c2cc(Cl)ccc2N)n1C,No,4.800000
2,O=C(COc1ccc(Cl)cc1C(=O)c1cc(F)cc(F)c1)Nc1ccc(C...,Yes,7.169989
3,CCCOC(=O)N(C(=S)OC(C)COc1ccccc1)c1ccccc1,No,3.882729
4,Cc1[nH]nc(OCC(=O)Nc2ccc(C#CC(C)(C)CO)cc2Cl)c1-...,Yes,7.619789
...,...,...,...
9207,c1cc(C=CC(=S)O2)c2c(C(OC(C(OC3=O)(C(C)(C)C34C)...,No,2.720000
9208,N1C(=O)C(C)=CN(COCCCOCC(c2ccccc2)=O)C1=O,No,4.970000
9209,N1C(=O)C(C)=CN(C(O2)CC(N(O)C(CCC[Se]c3ccccc3)O...,No,4.120000
9210,N1C(=O)C(C)=CN(C(O2)CC(N(O)C(CCCC[Se]c3ccccc3)...,No,4.170000


In [44]:
# Persist the pIC50 table for the DeepChem loader. index=False keeps the
# pandas row index out of the file, so no unnamed index column appears when
# the CSV is read back.
df_final.to_csv("/content/HIV-all-without-pubchem-PAINS_pIC50.csv", index=False)

In [45]:
# NOTE: from here on `df` is rebound to the *path* (a str) of the pIC50 CSV
# written above; it no longer refers to the DataFrame loaded earlier.
df = r"/content/HIV-all-without-pubchem-PAINS_pIC50.csv"
# Bare last expression echoes the path.
df

'/content/HIV-all-without-pubchem-PAINS_pIC50.csv'

In [46]:
# CSV loader that predicts the single task column "pIC50" and featurizes
# the molecules with ConvMolFeaturizer.
# feature_field names the CSV column holding the SMILES strings; it
# replaces the deprecated smiles_field argument.
loader = deepchem.data.CSVLoader(tasks=["pIC50"],
                                 feature_field="Smiles",
                                 featurizer=deepchem.feat.ConvMolFeaturizer())



In [47]:
# `df` is the CSV path string at this point, not a DataFrame.
# RDKit reports some SMILES it cannot parse (see the "Explicit valence"
# messages in the output). NOTE(review): confirm how the loader treats
# those rows (presumably they are dropped from the dataset).
dataset = loader.featurize(df)

[19:09:11] Explicit valence for atom # 7 N, 5, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
[19:09:14] Explicit valence for atom # 10 Cl, 2, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
[19:09:14] Explicit valence for atom # 10 Cl, 2, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
[19:09:14] Explicit valence for atom # 10 Cl, 2, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRa

## Introducing Scaffold splitter

In [48]:
    # Scaffold-based split of the featurized dataset into three subsets;
    # it plays the role that train_test_split plays in sklearn.
    splitter = deepchem.splits.ScaffoldSplitter()
    # Only train and valid are needed for this example, so test gets just 1%.
    train, valid, test = splitter.train_valid_test_split(
        dataset, frac_train=0.7, frac_valid=0.29, frac_test=0.01)
    # Normalizer for the y values, built from the training subset only.
    normalizer = deepchem.trans.NormalizationTransformer(
        transform_y=True, dataset=train, move_mean=True)
    # Apply the same transformer to each subset, in the order train, valid, test.
    train, valid, test = (
        normalizer.transform(subset) for subset in (train, valid, test)
    )

In [49]:
# Report how many samples ended up in the training and validation subsets
# (one id per sample).
print(f"Size of the training data: {len(train.ids)}")
print(f"Size of the validation data: {len(valid.ids)}")
# The test subset is printed via its repr, which includes the shapes.
print(test)

Size of the training data: 6285
Size of the validation data: 2604
<DiskDataset X.shape: (90,), y.shape: (90, 1), w.shape: (90, 1), ids: ['O=c1cc(-c2ccc(O)cc2)oc2cc(O)c(-c3c(O)cc(O)c4c(=O)cc(-c5ccc(O)cc5)oc34)c(O)c12'
 'N#Cc1cc(Cl)cc(Oc2c(CCOc3cccnc3)c[nH]c(=O)c2Br)c1'
 'CCN(c1nc(Nc2ccc(C#N)cc2)nc(Oc2c(C)cc(C)cc2C)n1)N1CCOCC1' ...
 'O=C(O)c1nc(Cc2c(Br)sc3ccc(Cl)cc23)nc(O)c1O'
 'N#Cc1cc(Cl)cc(Oc2cc(OCc3n[nH]c4ccccc34)ccc2Cl)c1'
 'O=C(NCc1nccs1)c1[nH]c2ccc(Br)cc2c1S(=O)(=O)N1CCCC1'], task_names: ['pIC50']>


In [50]:
from deepchem.models import GraphConvModel
# Graph-convolution regressor. The positional 1 is presumably the number of
# tasks (only pIC50 here) — confirm against the GraphConvModel signature.
model = GraphConvModel(1,
                       batch_size=50,
                       mode="regression")

In [55]:
# Fitting the model
# NOTE(review): execution counts jump from In[50] to In[55], so this cell
# may have been run more than once on the same `model`; if so, the scores
# reported below reflect more than 100 epochs. Re-run from a fresh kernel
# to confirm.
model.fit(train, nb_epoch=100)

0.06186284422874451

In [56]:
# Evaluation metric: DeepChem's pearson_r2_score wrapped in a Metric object.
metric = deepchem.metrics.Metric(deepchem.metrics.pearson_r2_score)

In [57]:
# Reversing the transformation and getting the metric scores on 2 datasets
train_scores = model.evaluate(train, [metric], [normalizer])
valid_scores = model.evaluate(valid, [metric], [normalizer])
test_scores = model.evaluate(test, [metric], [normalizer])
print(f"Train Scores: {train_scores}")
print(f"Validation Scores: {valid_scores}")
print(f"Test Scores: {test_scores}")

Train Scores: {'pearson_r2_score': 0.9444810202672272}
Validation Scores: {'pearson_r2_score': 0.21640650798450672}
Test Scores: {'pearson_r2_score': 0.3755627201252817}
